Skip to content

Commit 35d7265

Browse files
committed
Update on "Improved perfs for vectorized bilinear interpolate cpu uint8 RGB-case (channels last)"
## Description - Based on #96651 - Improved perfs for vectorized bilinear interpolate uint8 RGB-case, channels last - unified RGB and RGBA processing code such that RGB input is not copied into RGBA - Performances are more close to Pillow-SIMD (`Pillow (9.0.0.post1)`) - RGBA case perfs are the same after refactoring (see Source link below) - Fixed mem pointer alignment, added more comments (reviews from #96651) ## Results ``` [------------------------------------------------------------------------------------------ Resize -----------------------------------------------------------------------------------------] | Pillow (9.0.0.post1) | torch (2.1.0a0+git0968a5d) PR | torch (2.1.0a0+git5309c44) nightly | Speed-up: PR vs nightly 1 threads: ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 3 torch.uint8 channels_last bilinear 256 -> 32 aa=True | 39.0 | 56.6 | 133.2 | 2.4 3 torch.uint8 channels_last bilinear 256 -> 32 aa=False | | 36.9 | 112.8 | 3.1 3 torch.uint8 channels_last bilinear 256 -> 224 aa=True | 128.1 | 152.5 | 305.4 | 2.0 3 torch.uint8 channels_last bilinear 256 -> 224 aa=False | | 141.1 | 288.7 | 2.0 3 torch.uint8 channels_last bilinear 256 -> 320 aa=True | 179.6 | 208.8 | 442.5 | 2.1 3 torch.uint8 channels_last bilinear 256 -> 320 aa=False | | 206.4 | 436.9 | 2.1 3 torch.uint8 channels_last bilinear 520 -> 32 aa=True | 113.3 | 132.1 | 464.8 | 3.5 3 torch.uint8 channels_last bilinear 520 -> 32 aa=False | | 57.2 | 365.5 | 6.4 3 torch.uint8 channels_last bilinear 520 -> 224 aa=True | 281.7 | 327.4 | 722.4 | 2.2 3 torch.uint8 channels_last bilinear 520 -> 224 aa=False | | 230.2 | 593.5 | 2.6 3 torch.uint8 channels_last bilinear 712 -> 32 aa=True | 186.9 | 210.5 | 833.8 | 4.0 3 torch.uint8 channels_last bilinear 712 -> 32 aa=False | | 75.6 | 651.4 | 8.6 3 torch.uint8 channels_last bilinear 712 -> 224 aa=True | 410.3 | 450.9 | 1128.4 | 2.5 3 torch.uint8 channels_last bilinear 712 -> 224 aa=False | | 298.7 | 917.6 | 3.1 ``` Note: for other cases (see Source below) speed-up is roughly around 1.0 +/- 0.1 which may be attributed to noisy measurements ... [Source](https://gist.github.com/vfdev-5/1c0778904a07ce40401306548b9525e8#file-20230315-162238-pr_vs_nightly_speedup-md) ## Context - #90771 cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 [ghstack-poisoned]
2 parents 3534a6c + 8ce7530 commit 35d7265

File tree

1 file changed

+101
-69
lines changed

1 file changed

+101
-69
lines changed

aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h

Lines changed: 101 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,16 @@ Like PIL, Pillow is licensed under the open source HPND License
3535

3636
namespace {
3737

38-
static __m128i inline mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr) {
39-
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)ptr));
38+
static __m128i inline mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
39+
int32_t v;
40+
if (i32_aligned) {
41+
v = *(const int32_t*)ptr;
42+
} else {
43+
uint8_t aligned_ptr[4];
44+
std::memcpy(aligned_ptr, ptr, 4);
45+
v = *(int32_t*)aligned_ptr;
46+
}
47+
return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(v));
4048
}
4149

4250
// TODO: We may want to hard-code an unrolled version for the case where
@@ -418,7 +426,8 @@ void ImagingResampleHorizontalConvolution8u4x(
418426
bool is_last_line) {
419427

420428
// Interpolation horizontal pass processing together 4 vertical lines.
421-
// - Input data format is RGBA with R,G,B,A being uint8, we can encode 4 values as a single uint32 value.
429+
// - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
430+
// we can encode 4 values as a single uint32 value.
422431
// - We split the size of weight vector for a given output index as a sum: K = n * 4 + m * 2 + k.
423432
// - We load and process 4 weights values in a loop ("block 4") then we process 2 weights values
424433
// in another loop ("block 2") and finally we process 1 weights value in the final loop ("block 1").
@@ -459,33 +468,38 @@ void ImagingResampleHorizontalConvolution8u4x(
459468

460469
TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
461470

462-
// Precompute xmax limits for block 4 and block 2
463-
// lineIn0 + stride * (x + xmin) + 16 <= lineIn0 + stride * (xmax + xmin)
464-
// --> x <= xmax - 16.0 / stride
471+
// out_xsize = output width, out_x = output x index
472+
// ids_min is the input offset index corresponding to out_x
473+
// ids_size is the interpolation size for out_x
474+
475+
// Let's precompute ids_size limits for block 4 and block 2.
476+
//
477+
// In block 4 (4 means we process 4 weight values together), we read input data
478+
// with _mm_loadu_si128, i.e. 16 bytes, per one line:
479+
// lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min)
480+
// --> i <= ids_size - 16.0 / stride
465481
// Strict boundary:
466-
// --> x < xmax + 1 - int(ceil(16.0 / stride)) = xmax - b4_delta
482+
// --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
467483
// Soft boundary for reading inside the buffer except its boundaries:
468-
// --> x < xmax + 1 - int(16.0 / stride) = xmax - b4_delta_soft
484+
// --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
469485
// RGBA: b4_delta = b4_delta_soft = 3
470486
// RGB : b4_delta = 5
471487
// RGB : b4_delta_soft = 4
472488
const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
473489

474-
// lineIn0 + stride * (x + xmin) + 8 <= lineIn0 + stride * (xmax + xmin)
475-
// --> x <= xmax - 8.0 / stride
490+
// In block 2 (2 means we process 2 weights values together), we read input data
491+
// with _mm_loadl_epi64, i.e. 8 bytes, per one line:
492+
// lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min)
493+
// --> i <= ids_size - 8.0 / stride
476494
// Strict boundary:
477-
// --> x < xmax + 1 - int(ceil(8.0 / stride)) = xmax - b2_delta
495+
// --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
478496
// Soft boundary for reading inside the buffer except its boundaries:
479-
// --> x < xmax + 1 - int(8.0 / stride) = xmax - b2_delta_soft
497+
// --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
480498
// RGBA: b2_delta = b2_delta_soft = 1
481499
// RGB : b2_delta = 2
482500
// RGB : b2_delta_soft = 1
483501
const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
484502

485-
// out_xsize = output width, out_x = output x index
486-
// xmax = interpolation size, x = interpolation index (horizontal <-> x dimension)
487-
// xmin = input x start index corresponding to output x index (out_x)
488-
489503
const auto max_out_x_strided = out_xsize * stride;
490504
const auto max_in_x_strided = in_xsize * stride;
491505

@@ -563,11 +577,11 @@ void ImagingResampleHorizontalConvolution8u4x(
563577
const auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]);
564578

565579
// Load 4 pixels (2 per line) from input lines 0 and 1:
566-
// RGBA: source = [
580+
// RGBA: source1 = [
567581
// r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0
568582
// R0 G0 B0 A0 R1 G1 B1 A1 0 0 0 0 0 0 0 0
569583
// ]
570-
// RGB: source = [
584+
// RGB: source1 = [
571585
// r0 g0 b0 r1 g1 b1 r2 0 0 0 0 0 0 0 0
572586
// R0 G0 B0 R1 G1 B1 R2 0 0 0 0 0 0 0 0
573587
// ]
@@ -592,30 +606,31 @@ void ImagingResampleHorizontalConvolution8u4x(
592606
}
593607

594608
// block 1
609+
const auto i32_aligned = num_channels == 4;
595610
for (; i < ids_size - 1; i++) {
596611
// Load 1 value from weight vector
597612
// mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...]
598613
const auto mmk = _mm256_set1_epi32(k[i]);
599614

600615
// Load 2 pixels (one per line) from input lines 0 and 1:
601-
// RGBA: source = [
602-
// r0 g0 b0 a0 0 0 0 0 0 0 0 0 0 0 0 0
603-
// R0 G0 B0 A0 0 0 0 0 0 0 0 0 0 0 0 0
616+
// RGBA: pix1 = [
617+
// r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0
618+
// R0 0 0 0 G0 0 0 0 B0 0 0 0 A0 0 0 0
604619
// ]
605-
// RGB: source = [
606-
// r0 g0 b0 r1 0 0 0 0 0 0 0 0 0 0 0 0
607-
// R0 G0 B0 R1 0 0 0 0 0 0 0 0 0 0 0 0
620+
// RGB: pix1 = [
621+
// r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0
622+
// R0 0 0 0 G0 0 0 0 B0 0 0 0 R1 0 0 0
608623
// ]
609624
auto pix1 = _mm256_inserti128_si256(_mm256_castsi128_si256(
610-
mm_cvtepu8_epi32(lineIn0_min + stride * i)),
611-
mm_cvtepu8_epi32(lineIn1_min + stride * i), 1);
625+
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
626+
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
612627
// Compute output value as C += w0 * C0 for each channel in 32-bit precision
613628
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
614629

615630
// Same as above for lines 2 and 3
616631
auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(
617-
mm_cvtepu8_epi32(lineIn2_min + stride * i)),
618-
mm_cvtepu8_epi32(lineIn3_min + stride * i), 1);
632+
mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned)),
633+
mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned), 1);
619634
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
620635
}
621636

@@ -625,18 +640,18 @@ void ImagingResampleHorizontalConvolution8u4x(
625640
// For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
626641
// lines 0, 1 and 2 wont go out of allocated memory bounds
627642
auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
628-
mm_cvtepu8_epi32(lineIn0_min + stride * i)),
629-
mm_cvtepu8_epi32(lineIn1_min + stride * i), 1);
643+
mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
644+
mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
630645
sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
631646

632-
auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i);
647+
auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned);
633648
__m128i p1;
634649
if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
635650
uint8_t output1[4];
636651
std::memcpy(output1, lineIn3_min + stride * i, 3);
637-
p1 = mm_cvtepu8_epi32(output1);
652+
p1 = mm_cvtepu8_epi32(output1, true);
638653
} else {
639-
p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i);
654+
p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned);
640655
}
641656
auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(p0), p1, 1);
642657
sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
@@ -664,14 +679,18 @@ void ImagingResampleHorizontalConvolution8u4x(
664679
const auto out_x_strided = stride * out_x;
665680

666681
if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) {
667-
// This is a boundary case when we want to write 4 bytes to the output buffer but
668-
// the 4th bytes is already computed. It means that we can not overwrite it.
682+
// This is a boundary case when we want to write 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
683+
// The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct
684+
// value which was preveiously computed by another line. In other words, it means that we can not overwrite
685+
// it by simply writing 4 bytes from the register to the output. We'll do the following:
669686
// v----------|
670-
// Output = [... X1 X2 X3 | A B C D ...]
671-
// First, we store store next 4 bytes (A B C D)
672-
// Second, we write 4 bytes to (X1 X2 X3 | A) -> (U V W | Z)
673-
// [... U V W | Z B C D ...]
674-
// Third, we overwrite next 4 bytes (Z B C D) with stored values (A B C D)
687+
// Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
688+
// First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
689+
// Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
690+
// Output = [... R G B | X G1 B1 R2 ...]
691+
// Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
692+
// Output = [... R G B | R1 G1 B1 R2 ...]
693+
675694
char next0[4];
676695
std::memcpy(next0, lineOut0 + out_x_strided + stride, 4);
677696
std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4);
@@ -728,7 +747,8 @@ void ImagingResampleHorizontalConvolution8u(
728747
bool is_last_line) {
729748

730749
// Interpolation horizontal pass processing only one vertical line.
731-
// - Input data format is RGBA with R,G,B,A being uint8, we can encode 4 values as a single uint32 value.
750+
// - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
751+
// we can encode 4 values as a single uint32 value.
732752
// - We split the size of weight vector for a given output index as a sum: K = n * 8 + m * 4 + k * 2 + l.
733753
// - We load and process 8 weights values in a loop ("block 8") then 4 weights and 2 weights values in
734754
// in another loops ("block 4" and "block 2") and finally we process 1 weight value in the final loop ("block 1").
@@ -790,43 +810,50 @@ void ImagingResampleHorizontalConvolution8u(
790810
const auto mask_low128 = (num_channels == 3) ? masks_low128_c3_c4[0] : masks_low128_c3_c4[1];
791811

792812
// out_xsize = output width, out_x = output x index
793-
// ids_size = interpolation size
794-
// ids_min = input x start index corresponding to output x index (out_x)
813+
// ids_min is the input offset index corresponding to out_x
814+
// ids_size is the interpolation size for out_x
795815

796816
const auto stride = num_channels * 1; // num channels * sizeof(uint8)
797817
const auto zero = _mm_setzero_si128();
798818

799819
TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
800820

801-
// Precompute xmax limits for block 8, block 4 and block 2
802-
// lineIn + stride * (x + xmin) + 32 <= lineIn + stride * (xmax + xmin)
803-
// --> x <= xmax - 32.0 / stride
821+
// Let's precompute ids_size limits for block 8, block 4 and block 2
822+
//
823+
// In block 8 (8 means we process 8 weight values together), we read at
824+
// most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB)
825+
// lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min)
826+
// --> i <= ids_size - 32.0 / stride
804827
// Strict boundary:
805-
// --> x < xmax + 1 - int(ceil(32.0 / stride)) = xmax - b8_delta
828+
// --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta
806829
// Soft boundary for reading inside the buffer except its boundaries:
807-
// --> x < xmax + 1 - int(32.0 / stride) = xmax - b8_delta_soft
830+
// --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft
808831
// RGBA: b8_delta = b8_delta_soft = 7
809832
// RGB : b8_delta = 10
810833
// RGB : b8_delta_soft = 9
811834
const auto b8_delta = (stride == 4) ? 7 : ((is_last_line) ? 10 : 9);
812835

813-
// lineIn + stride * (x + xmin) + 16 <= lineIn0 + stride * (xmax + xmin)
814-
// --> x <= xmax - 16.0 / stride
836+
// In block 4 (4 means we process 4 weight values together), we read
837+
// 16 bytes of input data.
838+
// lineIn + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min)
839+
// --> i <= ids_size - 16.0 / stride
815840
// Strict boundary:
816-
// --> x < xmax + 1 - int(ceil(16.0 / stride)) = xmax - b4_delta
841+
// --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
817842
// Soft boundary for reading inside the buffer except its boundaries:
818-
// --> x < xmax + 1 - int(16.0 / stride) = xmax - b4_delta_soft
843+
// --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
819844
// RGBA: b4_delta = b4_delta_soft = 3
820845
// RGB : b4_delta = 5
821846
// RGB : b4_delta_soft = 4
822847
const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
823848

824-
// lineIn0 + stride * (x + xmin) + 8 <= lineIn0 + stride * (xmax + xmin)
825-
// --> x <= xmax - 8.0 / stride
849+
// In block 2 (2 means we process 2 weight values together), we read
850+
// 8 bytes of input data.
851+
// lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min)
852+
// --> i <= ids_size - 8.0 / stride
826853
// Strict boundary:
827-
// --> x < xmax + 1 - int(ceil(8.0 / stride)) = xmax - b2_delta
854+
// --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
828855
// Soft boundary for reading inside the buffer except its boundaries:
829-
// --> x < xmax + 1 - int(8.0 / stride) = xmax - b2_delta_soft
856+
// --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
830857
// RGBA: b2_delta = b2_delta_soft = 1
831858
// RGB : b2_delta = 2
832859
// RGB : b2_delta_soft = 1
@@ -972,6 +999,7 @@ void ImagingResampleHorizontalConvolution8u(
972999
}
9731000

9741001
// block 1
1002+
const auto i32_aligned = num_channels == 4;
9751003
for (; i < ids_size - 1; i++) {
9761004
// Load 1 value from weight vector
9771005
// mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...]
@@ -983,7 +1011,7 @@ void ImagingResampleHorizontalConvolution8u(
9831011
// RGB: pix = [
9841012
// r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0
9851013
// ]
986-
auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i);
1014+
auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i, i32_aligned);
9871015
// Compute output value as C += w0 * C0 for each channel in 32-bit precision
9881016
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
9891017
}
@@ -996,9 +1024,9 @@ void ImagingResampleHorizontalConvolution8u(
9961024
if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
9971025
uint8_t output[4];
9981026
std::memcpy(output, p, 3);
999-
pix = mm_cvtepu8_epi32(output);
1027+
pix = mm_cvtepu8_epi32(output, true);
10001028
} else {
1001-
pix = mm_cvtepu8_epi32(p);
1029+
pix = mm_cvtepu8_epi32(p, true);
10021030
}
10031031
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
10041032
}
@@ -1021,14 +1049,17 @@ void ImagingResampleHorizontalConvolution8u(
10211049
// as they are out of memory bounds.
10221050
std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 3);
10231051
} else {
1024-
// This is a boundary case when we want to write 4 bytes to the output buffer but
1025-
// the 4th bytes is already computed. It means that we can not overwrite it.
1052+
// This is a boundary case when we want to write 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
1053+
// The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct
1054+
// value which was preveiously computed by another line. In other words, it means that we can not overwrite
1055+
// it by simply writing 4 bytes from the register to the output. We'll do the following:
10261056
// v----------|
1027-
// Output = [... X1 X2 X3 | A B C D ...]
1028-
// First, we store store next 4 bytes (A B C D)
1029-
// Second, we write 4 bytes to (X1 X2 X3 | A) -> (U V W | Z)
1030-
// [... U V W | Z B C D ...]
1031-
// Third, we overwrite next 4 bytes (Z B C D) with stored values (A B C D)
1057+
// Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
1058+
// First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
1059+
// Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
1060+
// Output = [... R G B | X G1 B1 R2 ...]
1061+
// Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
1062+
// Output = [... R G B | R1 G1 B1 R2 ...]
10321063

10331064
char next[4];
10341065
std::memcpy(next, lineOut + out_x_strided + stride, 4);
@@ -1250,6 +1281,7 @@ void ImagingResampleVerticalConvolution8u(
12501281

12511282
// block 1
12521283
const auto b1_usable_vec_stride = (4 / data_stride) * data_stride;
1284+
const auto i32_aligned = num_channels == 4;
12531285
for (; j < data_size - 4; j += b1_usable_vec_stride) {
12541286
auto sss = initial;
12551287
int64_t i = 0;
@@ -1285,7 +1317,7 @@ void ImagingResampleVerticalConvolution8u(
12851317

12861318
for (; i < ids_size; i++) {
12871319
auto mmk = _mm_set1_epi32(k[i]);
1288-
auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size);
1320+
auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size, i32_aligned);
12891321
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
12901322
}
12911323
sss = _mm_srai_epi32(sss, coefs_precision);
@@ -1331,9 +1363,9 @@ void ImagingResampleVerticalConvolution8u(
13311363
if (num_channels == 3) {
13321364
uint8_t input[4];
13331365
std::memcpy(input, p, 3);
1334-
pix = mm_cvtepu8_epi32(input);
1366+
pix = mm_cvtepu8_epi32(input, true);
13351367
} else {
1336-
pix = mm_cvtepu8_epi32(p);
1368+
pix = mm_cvtepu8_epi32(p, true);
13371369
}
13381370
sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
13391371
}

0 commit comments

Comments
 (0)