Simplify copy kernel

zasdfgbnm · zasdfgbnm · commit d3cbbca1316d · 2019-10-20T20:47:05.000-07:00
Using the new type promotion and dynamic casting added to `TensorIterator`, the copy kernels could be greatly simplified.

**Script:**

```python
import torch
import timeit
import pandas
import itertools
from tqdm import tqdm
import math

print(torch.__version__)
print()

_10M = 10 * 1024 ** 2

d = {}

for from_, to in tqdm(itertools.product(torch.testing.get_all_dtypes(), repeat=2)):
    if from_ not in d:
        d[from_] = {}
    a = torch.zeros(_10M, dtype=from_)
    min_ = math.inf
    for i in range(100):
        start = timeit.default_timer()
        a.to(to)
        end = timeit.default_timer()
        elapsed = end - start
        if elapsed < min_:
            min_ = elapsed
    d[from_][to] = int(elapsed * 1000 * 1000)

pandas.DataFrame(d)
```

**Before:**

![image](https://user-images.githubusercontent.com/1032377/67171274-2e93d000-f36b-11e9-8fa0-91edd7dbc8ec.png)

**After:**

![image](https://user-images.githubusercontent.com/1032377/67171200-d361dd80-f36a-11e9-9b22-66292e395a09.png)

[ghstack-poisoned]
diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h
@@ -221,11 +221,11 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
     const auto& SCALAR_TYPE C10_UNUSED = TYPE;                          \
     switch (TYPE) {                                                     \
       AT_QINT_PRIVATE_CASE_TYPE(                                        \
-          at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__)                    \
+          at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__)        \
       AT_QINT_PRIVATE_CASE_TYPE(                                        \
-          at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__)                 \
+          at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__)     \
       AT_QINT_PRIVATE_CASE_TYPE(                                        \
-          at::kQInt32, at::qint32, at::kInt, int, __VA_ARGS__)                      \
+          at::kQInt32, at::qint32, at::kInt, int, __VA_ARGS__)          \
       default:                                                          \
         AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
     }                                                                   \
@@ -351,6 +351,29 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
     }                                                                                                            \
   }()
 
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND_QINTS_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  [&] {                                                                                                          \
+    switch (TYPE) {                                                                                              \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__)                                           \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__)                                            \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__)                                          \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__)                                            \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__)                                            \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__)                                           \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__)                                          \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex<float>, __VA_ARGS__)                       \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex<double>, __VA_ARGS__)                     \
+      AT_QINT_PRIVATE_CASE_TYPE(at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__)                           \
+      AT_QINT_PRIVATE_CASE_TYPE(at::kQUInt8, at::quint8, at::kByte, uint8_t, __VA_ARGS__)                        \
+      AT_QINT_PRIVATE_CASE_TYPE(at::kQInt32, at::qint32, at::kInt, int, __VA_ARGS__)                             \
+      AT_PRIVATE_CASE_TYPE(SCALARTYPE1, decltype(c10::impl::ScalarTypeToCPPType<SCALARTYPE1>::t), __VA_ARGS__)   \
+      AT_PRIVATE_CASE_TYPE(SCALARTYPE2, decltype(c10::impl::ScalarTypeToCPPType<SCALARTYPE2>::t), __VA_ARGS__)   \
+      AT_PRIVATE_CASE_TYPE(SCALARTYPE3, decltype(c10::impl::ScalarTypeToCPPType<SCALARTYPE3>::t), __VA_ARGS__)   \
+      default:                                                                                                   \
+        AT_ERROR(#NAME, " not implemented for '", TYPE, "'");                                                    \
+    }                                                                                                            \
+  }()
+
 // ----------------------------------------------------------------------------
 // DEPRECATED MACROS, DON'T USE THESE
 // ----------------------------------------------------------------------------
diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
@@ -124,11 +124,12 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
 
   auto iter = TensorIterator();
   iter.set_check_mem_overlap(true);
+  iter.dont_compute_common_dtype();
   iter.add_output(self);
   iter.add_input(src);
   iter.dont_resize_outputs();
-  iter.dont_compute_common_dtype();
   iter.build();
+  iter.set_common_dtype(iter.dtype());
 
   if (iter.numel() == 0) {
     return self;
diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h
@@ -318,6 +318,10 @@ struct CAFFE2_API TensorIterator {
     resize_outputs_ = false;
   }
 
+  void set_common_dtype(ScalarType dtype) {
+    common_dtype_ = dtype;
+  }
+
   void build();
 
 protected:
diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp
@@ -4,69 +4,20 @@
 #include <ATen/native/Copy.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cpu/Loops.h>
-#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
 namespace {
 
-template <typename self_T>
-void copy_kernel_cast(TensorIterator& iter) {
-    if (isComplexType(iter.dtype(1))) {
-      AT_DISPATCH_COMPLEX_TYPES(iter.dtype(1), "copy_kernel_cast", [&] {
-        cpu_kernel(iter, [=](scalar_t a) -> self_T {
-            return c10::static_cast_with_inter_type<self_T>(std::real(a));
-          });
-        });
-    }
-    else {
-      AT_DISPATCH_ALL_TYPES_AND3(
-        ScalarType::Half,
-        ScalarType::Bool,
-        ScalarType::BFloat16,
-        iter.dtype(1),
-        "copy_kernel_cast",
-        [&] {
-          cpu_kernel(iter, [=](scalar_t a) -> self_T {
-            return c10::static_cast_with_inter_type<self_T>(a);
-          });
-        });
-    }
-}
-
 static void copy_kernel(TensorIterator& iter, bool non_blocking) {
-  ScalarType dtype = iter.dtype(0);
-  if (dtype == iter.dtype(1)) {
-    if (dtype == ScalarType::Half) {
-      cpu_kernel(iter, [=](at::Half a) -> at::Half { return a; });
-    } else if (dtype == ScalarType::BFloat16) {
-      cpu_kernel(iter, [=](at::BFloat16 a) -> at::BFloat16 { return a; });
-    } else if (isQIntType(dtype)) {
-      AT_DISPATCH_QINT_TYPES(dtype, "copy_kernel", [&] {
-        cpu_kernel(
-            iter,
-            [=](scalar_t a) -> scalar_t {return a; });
-      });
-    } else if (isComplexType(dtype)) {
-      AT_DISPATCH_COMPLEX_TYPES(dtype, "copy_kernel", [&] {
-          cpu_kernel(
-            iter,
-            [=](scalar_t a) -> scalar_t { return a; });
-        });
-    } else {
-      AT_DISPATCH_ALL_TYPES_AND(
-          ScalarType::Bool, dtype, "copy_kernel", [&] {
-            cpu_kernel_vec(
-                iter,
-                [=](scalar_t a) -> scalar_t { return a; },
-                [=](Vec256<scalar_t> a) { return a; });
-          });
-    }
-  } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, dtype, "copy_", [&] {
-      copy_kernel_cast<scalar_t>(iter);
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND_QINTS_AND3(
+    ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.common_dtype(), "copy_",
+    [&] {
+      cpu_kernel_vec(
+        iter,
+        [](scalar_t a) -> scalar_t { return a; },
+        [](Vec256<scalar_t> a) { return a; });
     });
-  }
 }
 
 } // anonymous namespace
diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu
@@ -8,20 +8,12 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <THC/THC.h>
-#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
 
 using namespace at::cuda;
 
-template <typename dst_t, typename src_t>
-void copy_kernel_impl(TensorIterator& iter) {
-  gpu_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t {
-    return c10::static_cast_with_inter_type<dst_t>(x);
-  });
-}
-
 // device-to-device copy, does type conversion
 static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   int64_t numel = iter.numel();
@@ -67,10 +59,7 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
         copy_stream));
   } else {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] {
-      using dst_t = scalar_t;
-      AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] {
-        copy_kernel_impl<dst_t, scalar_t>(iter);
-      });
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t x) { return x; });
     });
   }
 

Original file line number	Diff line number	Diff line change
`@@ -318,6 +318,10 @@ struct CAFFE2_API TensorIterator {`
`318`	`318`	`resize_outputs_ = false;`
`319`	`319`	`}`
`320`	`320`
	`321`	`+ void set_common_dtype(ScalarType dtype) {`
	`322`	`+ common_dtype_ = dtype;`
	`323`	`+ }`
	`324`	`+`
`321`	`325`	`void build();`
`322`	`326`
`323`	`327`	`protected:`