@@ -35,8 +35,16 @@ Like PIL, Pillow is licensed under the open source HPND License
3535
3636namespace {
3737
38- static __m128i inline mm_cvtepu8_epi32 (const uint8_t * C10_RESTRICT ptr) {
39- return _mm_cvtepu8_epi32 (_mm_cvtsi32_si128 (*(int32_t *)ptr));
38+ static __m128i inline mm_cvtepu8_epi32 (const uint8_t * C10_RESTRICT ptr, bool i32_aligned) {
39+ int32_t v;
40+ if (i32_aligned) {
41+ v = *(const int32_t *)ptr;
42+ } else {
43+ uint8_t aligned_ptr[4 ];
44+ std::memcpy (aligned_ptr, ptr, 4 );
45+ v = *(int32_t *)aligned_ptr;
46+ }
47+ return _mm_cvtepu8_epi32 (_mm_cvtsi32_si128 (v));
4048}
4149
4250// TODO: We may want to hard-code an unrolled version for the case where
@@ -418,7 +426,8 @@ void ImagingResampleHorizontalConvolution8u4x(
418426 bool is_last_line) {
419427
420428 // Interpolation horizontal pass processing together 4 vertical lines.
421- // - Input data format is RGBA with R,G,B,A being uint8, we can encode 4 values as a single uint32 value.
429+ // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
430+ // we can encode 4 values as a single uint32 value.
422431 // - We split the size of weight vector for a given output index as a sum: K = n * 4 + m * 2 + k.
423432 // - We load and process 4 weights values in a loop ("block 4") then we process 2 weights values
424433 // in another loop ("block 2") and finally we process 1 weights value in the final loop ("block 1").
@@ -459,33 +468,38 @@ void ImagingResampleHorizontalConvolution8u4x(
459468
460469 TORCH_INTERNAL_ASSERT (stride == 3 || stride == 4 );
461470
462- // Precompute xmax limits for block 4 and block 2
463- // lineIn0 + stride * (x + xmin) + 16 <= lineIn0 + stride * (xmax + xmin)
464- // --> x <= xmax - 16.0 / stride
471+ // out_xsize = output width, out_x = output x index
472+ // ids_min is the input offset index corresponding to out_x
473+ // ids_size is the interpolation size for out_x
474+
475+ // Let's precompute ids_size limits for block 4 and block 2.
476+ //
477+ // In block 4 (4 means we process 4 weight values together), we read input data
478+ // with _mm_loadu_si128, i.e. 16 bytes, per one line:
479+ // lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min)
480+ // --> i <= ids_size - 16.0 / stride
465481 // Strict boundary:
466- // --> x < xmax + 1 - int(ceil(16.0 / stride)) = xmax - b4_delta
482+ // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
467483 // Soft boundary for reading inside the buffer except its boundaries:
468- // --> x < xmax + 1 - int(16.0 / stride) = xmax - b4_delta_soft
484+ // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
469485 // RGBA: b4_delta = b4_delta_soft = 3
470486 // RGB : b4_delta = 5
471487 // RGB : b4_delta_soft = 4
472488 const auto b4_delta = (stride == 4 ) ? 3 : ((is_last_line) ? 5 : 4 );
473489
474- // lineIn0 + stride * (x + xmin) + 8 <= lineIn0 + stride * (xmax + xmin)
475- // --> x <= xmax - 8.0 / stride
490+ // In block 2 (2 means we process 2 weights values together), we read input data
491+ // with _mm_loadl_epi64, i.e. 8 bytes, per one line:
492+ // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min)
493+ // --> i <= ids_size - 8.0 / stride
476494 // Strict boundary:
477- // --> x < xmax + 1 - int(ceil(8.0 / stride)) = xmax - b2_delta
495+ // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
478496 // Soft boundary for reading inside the buffer except its boundaries:
479- // --> x < xmax + 1 - int(8.0 / stride) = xmax - b2_delta_soft
497+ // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
480498 // RGBA: b2_delta = b2_delta_soft = 1
481499 // RGB : b2_delta = 2
482500 // RGB : b2_delta_soft = 1
483501 const auto b2_delta = (stride == 4 ) ? 1 : ((is_last_line) ? 2 : 1 );
484502
485- // out_xsize = output width, out_x = output x index
486- // xmax = interpolation size, x = interpolation index (horizontal <-> x dimension)
487- // xmin = input x start index corresponding to output x index (out_x)
488-
489503 const auto max_out_x_strided = out_xsize * stride;
490504 const auto max_in_x_strided = in_xsize * stride;
491505
@@ -563,11 +577,11 @@ void ImagingResampleHorizontalConvolution8u4x(
563577 const auto mmk = _mm256_set1_epi32 (*(int32_t *)&k[i]);
564578
565579 // Load 4 pixels (2 per line) from input lines 0 and 1:
566- // RGBA: source = [
580+ // RGBA: source1 = [
567581 // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0
568582 // R0 G0 B0 A0 R1 G1 B1 A1 0 0 0 0 0 0 0 0
569583 // ]
570- // RGB: source = [
584+ // RGB: source1 = [
571585 // r0 g0 b0 r1 g1 b1 r2 0 0 0 0 0 0 0 0
572586 // R0 G0 B0 R1 G1 B1 R2 0 0 0 0 0 0 0 0
573587 // ]
@@ -592,30 +606,31 @@ void ImagingResampleHorizontalConvolution8u4x(
592606 }
593607
594608 // block 1
609+ const auto i32_aligned = num_channels == 4 ;
595610 for (; i < ids_size - 1 ; i++) {
596611 // Load 1 value from weight vector
597612 // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...]
598613 const auto mmk = _mm256_set1_epi32 (k[i]);
599614
600615 // Load 2 pixels (one per line) from input lines 0 and 1:
601- // RGBA: source = [
602- // r0 g0 b0 a0 0 0 0 0 0 0 0 0 0 0 0 0
603- // R0 G0 B0 A0 0 0 0 0 0 0 0 0 0 0 0 0
616+ // RGBA: pix1 = [
617+ // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0
618+ // R0 0 0 0 G0 0 0 0 B0 0 0 0 A0 0 0 0
604619 // ]
605- // RGB: source = [
606- // r0 g0 b0 r1 0 0 0 0 0 0 0 0 0 0 0 0
607- // R0 G0 B0 R1 0 0 0 0 0 0 0 0 0 0 0 0
620+ // RGB: pix1 = [
621+ // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0
622+ // R0 0 0 0 G0 0 0 0 B0 0 0 0 R1 0 0 0
608623 // ]
609624 auto pix1 = _mm256_inserti128_si256 (_mm256_castsi128_si256 (
610- mm_cvtepu8_epi32 (lineIn0_min + stride * i)),
611- mm_cvtepu8_epi32 (lineIn1_min + stride * i), 1 );
625+ mm_cvtepu8_epi32 (lineIn0_min + stride * i, i32_aligned )),
626+ mm_cvtepu8_epi32 (lineIn1_min + stride * i, i32_aligned ), 1 );
612627 // Compute output value as C += w0 * C0 for each channel in 32-bit precision
613628 sss0 = _mm256_add_epi32 (sss0, _mm256_madd_epi16 (pix1, mmk));
614629
615630 // Same as above for lines 2 and 3
616631 auto pix2 = _mm256_inserti128_si256 (_mm256_castsi128_si256 (
617- mm_cvtepu8_epi32 (lineIn2_min + stride * i)),
618- mm_cvtepu8_epi32 (lineIn3_min + stride * i), 1 );
632+ mm_cvtepu8_epi32 (lineIn2_min + stride * i, i32_aligned )),
633+ mm_cvtepu8_epi32 (lineIn3_min + stride * i, i32_aligned ), 1 );
619634 sss1 = _mm256_add_epi32 (sss1, _mm256_madd_epi16 (pix2, mmk));
620635 }
621636
@@ -625,18 +640,18 @@ void ImagingResampleHorizontalConvolution8u4x(
625640 // For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes
626641 // lines 0, 1 and 2 wont go out of allocated memory bounds
627642 auto pix = _mm256_inserti128_si256 (_mm256_castsi128_si256 (
628- mm_cvtepu8_epi32 (lineIn0_min + stride * i)),
629- mm_cvtepu8_epi32 (lineIn1_min + stride * i), 1 );
643+ mm_cvtepu8_epi32 (lineIn0_min + stride * i, i32_aligned )),
644+ mm_cvtepu8_epi32 (lineIn1_min + stride * i, i32_aligned ), 1 );
630645 sss0 = _mm256_add_epi32 (sss0, _mm256_madd_epi16 (pix, mmk));
631646
632- auto p0 = mm_cvtepu8_epi32 (lineIn2_min + stride * i);
647+ auto p0 = mm_cvtepu8_epi32 (lineIn2_min + stride * i, i32_aligned );
633648 __m128i p1;
634649 if (num_channels == 3 && C10_UNLIKELY (is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
635650 uint8_t output1[4 ];
636651 std::memcpy (output1, lineIn3_min + stride * i, 3 );
637- p1 = mm_cvtepu8_epi32 (output1);
652+ p1 = mm_cvtepu8_epi32 (output1, true );
638653 } else {
639- p1 = mm_cvtepu8_epi32 (lineIn3_min + stride * i);
654+ p1 = mm_cvtepu8_epi32 (lineIn3_min + stride * i, i32_aligned );
640655 }
641656 auto pix2 = _mm256_inserti128_si256 (_mm256_castsi128_si256 (p0), p1, 1 );
642657 sss1 = _mm256_add_epi32 (sss1, _mm256_madd_epi16 (pix2, mmk));
@@ -664,14 +679,18 @@ void ImagingResampleHorizontalConvolution8u4x(
664679 const auto out_x_strided = stride * out_x;
665680
666681 if (num_channels == 3 && C10_UNLIKELY (out_x_strided + 4 >= max_out_x_strided)) {
667- // This is a boundary case when we want to write 4 bytes to the output buffer but
668- // the 4th bytes is already computed. It means that we can not overwrite it.
682+ // This is a boundary case when we want to write 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
683+ // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct
684+ // value which was preveiously computed by another line. In other words, it means that we can not overwrite
685+ // it by simply writing 4 bytes from the register to the output. We'll do the following:
669686 // v----------|
670- // Output = [... X1 X2 X3 | A B C D ...]
671- // First, we store store next 4 bytes (A B C D)
672- // Second, we write 4 bytes to (X1 X2 X3 | A) -> (U V W | Z)
673- // [... U V W | Z B C D ...]
674- // Third, we overwrite next 4 bytes (Z B C D) with stored values (A B C D)
687+ // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
688+ // First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
689+ // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
690+ // Output = [... R G B | X G1 B1 R2 ...]
691+ // Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
692+ // Output = [... R G B | R1 G1 B1 R2 ...]
693+
675694 char next0[4 ];
676695 std::memcpy (next0, lineOut0 + out_x_strided + stride, 4 );
677696 std::memcpy (lineOut0 + out_x_strided, (uint8_t *) &o0, 4 );
@@ -728,7 +747,8 @@ void ImagingResampleHorizontalConvolution8u(
728747 bool is_last_line) {
729748
730749 // Interpolation horizontal pass processing only one vertical line.
731- // - Input data format is RGBA with R,G,B,A being uint8, we can encode 4 values as a single uint32 value.
750+ // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
751+ // we can encode 4 values as a single uint32 value.
732752 // - We split the size of weight vector for a given output index as a sum: K = n * 8 + m * 4 + k * 2 + l.
733753 // - We load and process 8 weights values in a loop ("block 8") then 4 weights and 2 weights values in
734754 // in another loops ("block 4" and "block 2") and finally we process 1 weight value in the final loop ("block 1").
@@ -790,43 +810,50 @@ void ImagingResampleHorizontalConvolution8u(
790810 const auto mask_low128 = (num_channels == 3 ) ? masks_low128_c3_c4[0 ] : masks_low128_c3_c4[1 ];
791811
792812 // out_xsize = output width, out_x = output x index
793- // ids_size = interpolation size
794- // ids_min = input x start index corresponding to output x index ( out_x)
813+ // ids_min is the input offset index corresponding to out_x
814+ // ids_size is the interpolation size for out_x
795815
796816 const auto stride = num_channels * 1 ; // num channels * sizeof(uint8)
797817 const auto zero = _mm_setzero_si128 ();
798818
799819 TORCH_INTERNAL_ASSERT (stride == 3 || stride == 4 );
800820
801- // Precompute xmax limits for block 8, block 4 and block 2
802- // lineIn + stride * (x + xmin) + 32 <= lineIn + stride * (xmax + xmin)
803- // --> x <= xmax - 32.0 / stride
821+ // Let's precompute ids_size limits for block 8, block 4 and block 2
822+ //
823+ // In block 8 (8 means we process 8 weight values together), we read at
824+ // most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB)
825+ // lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min)
826+ // --> i <= ids_size - 32.0 / stride
804827 // Strict boundary:
805- // --> x < xmax + 1 - int(ceil(32.0 / stride)) = xmax - b8_delta
828+ // --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta
806829 // Soft boundary for reading inside the buffer except its boundaries:
807- // --> x < xmax + 1 - int(32.0 / stride) = xmax - b8_delta_soft
830+ // --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft
808831 // RGBA: b8_delta = b8_delta_soft = 7
809832 // RGB : b8_delta = 10
810833 // RGB : b8_delta_soft = 9
811834 const auto b8_delta = (stride == 4 ) ? 7 : ((is_last_line) ? 10 : 9 );
812835
813- // lineIn + stride * (x + xmin) + 16 <= lineIn0 + stride * (xmax + xmin)
814- // --> x <= xmax - 16.0 / stride
836+ // In block 4 (4 means we process 4 weight values together), we read
837+ // 16 bytes of input data.
838+ // lineIn + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min)
839+ // --> i <= ids_size - 16.0 / stride
815840 // Strict boundary:
816- // --> x < xmax + 1 - int(ceil(16.0 / stride)) = xmax - b4_delta
841+ // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
817842 // Soft boundary for reading inside the buffer except its boundaries:
818- // --> x < xmax + 1 - int(16.0 / stride) = xmax - b4_delta_soft
843+ // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
819844 // RGBA: b4_delta = b4_delta_soft = 3
820845 // RGB : b4_delta = 5
821846 // RGB : b4_delta_soft = 4
822847 const auto b4_delta = (stride == 4 ) ? 3 : ((is_last_line) ? 5 : 4 );
823848
824- // lineIn0 + stride * (x + xmin) + 8 <= lineIn0 + stride * (xmax + xmin)
825- // --> x <= xmax - 8.0 / stride
849+ // In block 2 (2 means we process 2 weight values together), we read
850+ // 8 bytes of input data.
851+ // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min)
852+ // --> i <= ids_size - 8.0 / stride
826853 // Strict boundary:
827- // --> x < xmax + 1 - int(ceil(8.0 / stride)) = xmax - b2_delta
854+ // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
828855 // Soft boundary for reading inside the buffer except its boundaries:
829- // --> x < xmax + 1 - int(8.0 / stride) = xmax - b2_delta_soft
856+ // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
830857 // RGBA: b2_delta = b2_delta_soft = 1
831858 // RGB : b2_delta = 2
832859 // RGB : b2_delta_soft = 1
@@ -972,6 +999,7 @@ void ImagingResampleHorizontalConvolution8u(
972999 }
9731000
9741001 // block 1
1002+ const auto i32_aligned = num_channels == 4 ;
9751003 for (; i < ids_size - 1 ; i++) {
9761004 // Load 1 value from weight vector
9771005 // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...]
@@ -983,7 +1011,7 @@ void ImagingResampleHorizontalConvolution8u(
9831011 // RGB: pix = [
9841012 // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0
9851013 // ]
986- auto pix = mm_cvtepu8_epi32 (lineIn_min + stride * i);
1014+ auto pix = mm_cvtepu8_epi32 (lineIn_min + stride * i, i32_aligned );
9871015 // Compute output value as C += w0 * C0 for each channel in 32-bit precision
9881016 sss = _mm_add_epi32 (sss, _mm_madd_epi16 (pix, mmk));
9891017 }
@@ -996,9 +1024,9 @@ void ImagingResampleHorizontalConvolution8u(
9961024 if (num_channels == 3 && C10_UNLIKELY (is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
9971025 uint8_t output[4 ];
9981026 std::memcpy (output, p, 3 );
999- pix = mm_cvtepu8_epi32 (output);
1027+ pix = mm_cvtepu8_epi32 (output, true );
10001028 } else {
1001- pix = mm_cvtepu8_epi32 (p);
1029+ pix = mm_cvtepu8_epi32 (p, true );
10021030 }
10031031 sss = _mm_add_epi32 (sss, _mm_madd_epi16 (pix, mmk));
10041032 }
@@ -1021,14 +1049,17 @@ void ImagingResampleHorizontalConvolution8u(
10211049 // as they are out of memory bounds.
10221050 std::memcpy (lineOut + out_x_strided, (uint8_t *) &o, 3 );
10231051 } else {
1024- // This is a boundary case when we want to write 4 bytes to the output buffer but
1025- // the 4th bytes is already computed. It means that we can not overwrite it.
1052+ // This is a boundary case when we want to write 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
1053+ // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct
1054+ // value which was preveiously computed by another line. In other words, it means that we can not overwrite
1055+ // it by simply writing 4 bytes from the register to the output. We'll do the following:
10261056 // v----------|
1027- // Output = [... X1 X2 X3 | A B C D ...]
1028- // First, we store store next 4 bytes (A B C D)
1029- // Second, we write 4 bytes to (X1 X2 X3 | A) -> (U V W | Z)
1030- // [... U V W | Z B C D ...]
1031- // Third, we overwrite next 4 bytes (Z B C D) with stored values (A B C D)
1057+ // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
1058+ // First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
1059+ // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
1060+ // Output = [... R G B | X G1 B1 R2 ...]
1061+ // Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
1062+ // Output = [... R G B | R1 G1 B1 R2 ...]
10321063
10331064 char next[4 ];
10341065 std::memcpy (next, lineOut + out_x_strided + stride, 4 );
@@ -1250,6 +1281,7 @@ void ImagingResampleVerticalConvolution8u(
12501281
12511282 // block 1
12521283 const auto b1_usable_vec_stride = (4 / data_stride) * data_stride;
1284+ const auto i32_aligned = num_channels == 4 ;
12531285 for (; j < data_size - 4 ; j += b1_usable_vec_stride) {
12541286 auto sss = initial;
12551287 int64_t i = 0 ;
@@ -1285,7 +1317,7 @@ void ImagingResampleVerticalConvolution8u(
12851317
12861318 for (; i < ids_size; i++) {
12871319 auto mmk = _mm_set1_epi32 (k[i]);
1288- auto pix = mm_cvtepu8_epi32 (lineIn_min + i * data_size);
1320+ auto pix = mm_cvtepu8_epi32 (lineIn_min + i * data_size, i32_aligned );
12891321 sss = _mm_add_epi32 (sss, _mm_madd_epi16 (pix, mmk));
12901322 }
12911323 sss = _mm_srai_epi32 (sss, coefs_precision);
@@ -1331,9 +1363,9 @@ void ImagingResampleVerticalConvolution8u(
13311363 if (num_channels == 3 ) {
13321364 uint8_t input[4 ];
13331365 std::memcpy (input, p, 3 );
1334- pix = mm_cvtepu8_epi32 (input);
1366+ pix = mm_cvtepu8_epi32 (input, true );
13351367 } else {
1336- pix = mm_cvtepu8_epi32 (p);
1368+ pix = mm_cvtepu8_epi32 (p, true );
13371369 }
13381370 sss = _mm_add_epi32 (sss, _mm_madd_epi16 (pix, mmk));
13391371 }
0 commit comments