Update on "Improved perfs for vectorized bilinear interpolate cpu uint8 RGB-case (channels last)"

vfdev-5 · vfdev-5 · commit 07d7584a06c8 · 2023-03-22T11:45:55.000Z
## Description - Based on #96651 - Improved perfs for vectorized **bilinear** interpolate uint8 RGB-case, **channels last** - unified RGB and RGBA processing code such that RGB input is not copied into RGBA - Performances are more close to Pillow-SIMD (labeled as `Pillow (9.0.0.post1)` in the results) - RGBA case perfs are the same after refactoring (see Source link below) - Fixed mem pointer alignment, added more comments (reviews from #96651) ## Results - `Pillow (9.0.0.post1)` == Pillow-SIMD ``` [-------------------------------------------------------------------------------------------------- Resize -------------------------------------------------------------------------------------------------] | Pillow (9.0.0.post1) | torch (2.1.0a0+git8d955df) PR | torch (2.1.0a0+git5309c44) nightly | Speed-up: PR vs nightly 1 threads: -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 3 torch.uint8 channels_last bilinear (256, 256) -> (32, 32) aa=True | 38.649 (+-0.306) | 55.828 (+-0.370) | 132.147 (+-1.236) | 2.367 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (32, 32) aa=False | | 36.826 (+-0.229) | 111.789 (+-1.175) | 3.036 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (224, 224) aa=True | 128.233 (+-1.313) | 153.827 (+-1.229) | 302.518 (+-2.632) | 1.967 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (224, 224) aa=False | | 143.886 (+-1.409) | 286.663 (+-2.494) | 1.992 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (320, 320) aa=True | 179.504 (+-1.825) | 211.569 (+-1.336) | 439.375 (+-4.014) | 2.077 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (320, 320) aa=False | | 209.888 (+-1.443) | 438.537 (+-4.143) | 2.089 (+-0.000) 3 torch.uint8 channels_last bilinear (520, 520) -> (32, 32) aa=True | 112.891 (+-1.118) | 129.373 (+-1.396) | 446.804 (+-3.283) | 3.454 
(+-0.000) 3 torch.uint8 channels_last bilinear (520, 520) -> (32, 32) aa=False | | 56.858 (+-0.227) | 374.244 (+-13.598) | 6.582 (+-0.000) 3 torch.uint8 channels_last bilinear (520, 520) -> (224, 224) aa=True | 282.917 (+-2.992) | 324.378 (+-1.694) | 720.197 (+-3.467) | 2.220 (+-0.000) 3 torch.uint8 channels_last bilinear (520, 520) -> (224, 224) aa=False | | 236.078 (+-1.679) | 592.834 (+-3.903) | 2.511 (+-0.000) 3 torch.uint8 channels_last bilinear (712, 712) -> (32, 32) aa=True | 185.595 (+-1.633) | 202.000 (+-1.920) | 787.868 (+-3.648) | 3.900 (+-0.000) 3 torch.uint8 channels_last bilinear (712, 712) -> (32, 32) aa=False | | 75.421 (+-0.512) | 651.016 (+-3.926) | 8.632 (+-0.000) 3 torch.uint8 channels_last bilinear (712, 712) -> (224, 224) aa=True | 409.691 (+-2.735) | 449.927 (+-2.500) | 1123.923 (+-14.988) | 2.498 (+-0.000) 3 torch.uint8 channels_last bilinear (712, 712) -> (224, 224) aa=False | | 306.691 (+-2.095) | 915.347 (+-4.486) | 2.985 (+-0.000) # More test-cases from #90771 3 torch.uint8 channels_last bilinear (64, 64) -> (224, 224) aa=True | 60.740 (+-0.278) | 78.745 (+-0.286) | 170.465 (+-1.830) | 2.165 (+-0.000) 3 torch.uint8 channels_last bilinear (224, 224) -> (270, 268) aa=True | 133.029 (+-1.619) | 162.393 (+-1.289) | 330.971 (+-3.249) | 2.038 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (1024, 1024) aa=True | 948.849 (+-2.749) | 896.127 (+-3.696) | 2805.510 (+-25.503) | 3.131 (+-0.000) 3 torch.uint8 channels_last bilinear (224, 224) -> (64, 64) aa=True | 52.505 (+-0.319) | 70.617 (+-0.344) | 135.933 (+-1.625) | 1.925 (+-0.000) 3 torch.uint8 channels_last bilinear (270, 268) -> (224, 224) aa=True | 138.671 (+-1.953) | 165.638 (+-1.473) | 321.112 (+-2.904) | 1.939 (+-0.000) 3 torch.uint8 channels_last bilinear (1024, 1024) -> (256, 256) aa=True | 689.492 (+-2.917) | 758.162 (+-3.719) | 2050.880 (+-22.188) | 2.705 (+-0.000) 3 torch.uint8 channels_last bilinear (64, 64) -> (224, 224) aa=False | | 77.300 (+-0.307) | 169.646 
(+-1.640) | 2.195 (+-0.000) 3 torch.uint8 channels_last bilinear (224, 224) -> (270, 268) aa=False | | 159.525 (+-1.225) | 329.754 (+-2.590) | 2.067 (+-0.000) 3 torch.uint8 channels_last bilinear (256, 256) -> (1024, 1024) aa=False | | 890.106 (+-3.358) | 2815.870 (+-22.589) | 3.164 (+-0.000) 3 torch.uint8 channels_last bilinear (224, 224) -> (64, 64) aa=False | | 52.399 (+-0.314) | 112.024 (+-1.225) | 2.138 (+-0.000) 3 torch.uint8 channels_last bilinear (270, 268) -> (224, 224) aa=False | | 148.780 (+-1.282) | 299.152 (+-3.353) | 2.011 (+-0.000) 3 torch.uint8 channels_last bilinear (1024, 1024) -> (256, 256) aa=False | | 479.273 (+-3.432) | 1698.601 (+-16.785) | 3.544 (+-0.000) 4 ``` Note: There is no perf regression for other cases. There are some cases (see Source below) with small speed-ups; for the rest, the speed-up is roughly 1.0 +/- 0.1, which may be attributed to noisy measurements ... [Source](https://gist.github.com/vfdev-5/1c0778904a07ce40401306548b9525e8#file-20230321-145513-pr_vs_nightly-speedup-md) ## Context - #90771 cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 [ghstack-poisoned]
diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h
@@ -35,7 +35,7 @@ Like PIL, Pillow is licensed under the open source HPND License
 
 namespace {
 
-static __m128i inline mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
+static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
   int32_t v;
   if (i32_aligned) {
     v = *(const int32_t*)ptr;
@@ -45,10 +45,21 @@ static __m128i inline mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32
   return _mm_cvtsi32_si128(v);
 }
 
-static __m128i inline mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
+static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
   return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned));
 }
 
+static inline void _write_endline_rgb_as_uint32(
+    uint8_t* C10_RESTRICT output,
+    uint32_t data
+) {
+  // data is (R G B X), output is (X1 X2 X3 | R1 G1 B1 R2 ...)
+  // Here we explicitly set X to R1 (the byte already stored at output[3]), so the 4-byte write below leaves R1 unchanged
+  uint8_t* data_ptr = reinterpret_cast<uint8_t*>(&data);
+  data_ptr[3] = output[3];
+  std::memcpy(output, data_ptr, 4);
+}
+
 // TODO: We may want to hard-code an unrolled version for the case where
 // num_channels=3 to hint the compiler to vectorize this (looks at original
 // PIL-SIMD's code).
@@ -680,36 +691,20 @@ void ImagingResampleHorizontalConvolution8u4x(
       // it by simply writing 4 bytes from the register to the output. We'll do the following:
       //               v----------|
       // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
-      // First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
-      // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
-      // Output = [... R G B | X G1 B1 R2 ...]
-      // Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
+      // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1)
+      // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1)
       // Output = [... R G B | R1 G1 B1 R2 ...]
 
-      char next0[4];
-      std::memcpy(next0, lineOut0 + out_x_strided + stride, 4);
-      std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4);
-      std::memcpy(lineOut0 + out_x_strided + stride, next0, 4);
-
-      char next1[4];
-      std::memcpy(next1, lineOut1 + out_x_strided + stride, 4);
-      std::memcpy(lineOut1 + out_x_strided, (uint8_t *) &o1, 4);
-      std::memcpy(lineOut1 + out_x_strided + stride, next1, 4);
-
-      char next2[4];
-      std::memcpy(next2, lineOut2 + out_x_strided + stride, 4);
-      std::memcpy(lineOut2 + out_x_strided, (uint8_t *) &o2, 4);
-      std::memcpy(lineOut2 + out_x_strided + stride, next2, 4);
+      _write_endline_rgb_as_uint32(lineOut0 + out_x_strided, o0);
+      _write_endline_rgb_as_uint32(lineOut1 + out_x_strided, o1);
+      _write_endline_rgb_as_uint32(lineOut2 + out_x_strided, o2);
 
       if (C10_UNLIKELY(is_last_line)) {
         // When we handle the last line, we can not access the next 4 bytes
         // as they are out of memory bounds.
         std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, num_channels);
       } else {
-        char next3[4];
-        std::memcpy(next3, lineOut3 + out_x_strided + stride, 4);
-        std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, 4);
-        std::memcpy(lineOut3 + out_x_strided + stride, next3, 4);
+        _write_endline_rgb_as_uint32(lineOut3 + out_x_strided, o3);
       }
     } else if (num_channels == 3) {
       // Memcpy 4-bytes is faster than 3-bytes and here
@@ -1036,16 +1031,10 @@ void ImagingResampleHorizontalConvolution8u(
         // it by simply writing 4 bytes from the register to the output. We'll do the following:
         //               v----------|
         // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
-        // First, we store next 4 bytes (R1 G1 B1 R2) in a temporary variable
-        // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | X)
-        // Output = [... R G B | X G1 B1 R2 ...]
-        // Third, we overwrite next 4 bytes of the output (X G1 B1 R2) with stored values (R1 G1 B1 R2)
+        // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1)
+        // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1)
         // Output = [... R G B | R1 G1 B1 R2 ...]
-
-        char next[4];
-        std::memcpy(next, lineOut + out_x_strided + stride, 4);
-        std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 4);
-        std::memcpy(lineOut + out_x_strided + stride, next, 4);
+        _write_endline_rgb_as_uint32(lineOut + out_x_strided, o);
       }
     } else if (num_channels == 3) {
       // Memcpy 4-bytes is faster than 3-bytes and here
@@ -1109,7 +1098,7 @@ void ImagingResampleVerticalConvolution8u(
       //    r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
       //    r4 g4 b4 a4  r5 g5 b5 a5  r6 g6 b6 a6  r7 g7 b7 a7
       // ]
-      // RGB: Load 8 pixels per line (however we can process only 8 pixels):
+      // RGB: Load 10 pixels per line (however we can process only 8 pixels):
       // source1 = [
       //    r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
       //    r4 g4 b4 r5  g5 b5 r6 g6  b6 r7 g7 b7  r8 g8 b8 r9