Commit f164d70

Simplify copy kernel
Using the new type promotion and dynamic casting added to `TensorIterator`, the copy kernels can be greatly simplified.

**Script:**

```python
import torch
import timeit
import pandas
import itertools
from tqdm import tqdm
import math

print(torch.__version__)
print()

_10M = 10 * 1024 ** 2
d = {}
for from_, to in tqdm(itertools.product(torch.testing.get_all_dtypes(), repeat=2)):
    if from_ not in d:
        d[from_] = {}
    a = torch.zeros(_10M, dtype=from_)
    min_ = math.inf
    for i in range(100):
        start = timeit.default_timer()
        a.to(to)
        end = timeit.default_timer()
        elapsed = end - start
        if elapsed < min_:
            min_ = elapsed
    # record the best of 100 runs, in microseconds
    d[from_][to] = int(min_ * 1000 * 1000)
pandas.DataFrame(d)
```

**Before:**

![image](https://user-images.githubusercontent.com/1032377/67171274-2e93d000-f36b-11e9-8fa0-91edd7dbc8ec.png)

**After:**

![image](https://user-images.githubusercontent.com/1032377/67171200-d361dd80-f36a-11e9-9b22-66292e395a09.png)

ghstack-source-id: 1269ecc
Pull Request resolved: #28428
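The timing script above only measures speed. As a value-level complement, a minimal sanity-check sketch (not part of the original commit; the dtype list and device choice are arbitrary) could look like:

```python
# Illustrative sanity check: dtype conversions should still produce the
# expected values after the kernel change. Use a CUDA device, when
# available, to exercise the CUDA copy kernel touched by this commit.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
src = torch.arange(10, dtype=torch.int64, device=device)
for to in (torch.float32, torch.float16, torch.int32, torch.uint8):
    out = src.to(to)
    assert out.dtype == to
    # the values 0..9 are exactly representable in every dtype tested,
    # so they must survive the round trip back to int64
    assert out.to(torch.int64).eq(src).all()
```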
1 parent 11f4039 commit f164d70

aten/src/ATen/native/cuda/Copy.cu

Lines changed: 5 additions & 13 deletions
```diff
@@ -8,20 +8,12 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <THC/THC.h>
-#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
 
 using namespace at::cuda;
 
-template <typename dst_t, typename src_t>
-void copy_kernel_impl(TensorIterator& iter) {
-  gpu_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t {
-    return c10::static_cast_with_inter_type<dst_t>(x);
-  });
-}
-
 // device-to-device copy, does type conversion
 static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   int64_t numel = iter.numel();
@@ -66,11 +58,11 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
         cudaMemcpyDeviceToDevice,
         copy_stream));
   } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] {
-      using dst_t = scalar_t;
-      AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] {
-        copy_kernel_impl<dst_t, scalar_t>(iter);
-      });
+    // this is intentionally done after build because copy has a "promotion"
+    // rule that always "promotes" to the target dtype.
+    iter.promote_common_dtype();
+    AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, iter.dtype(0), "copy_", [&] {
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t x) { return x; });
     });
   }
 
```
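The comment added in the diff refers to copy's own "promotion" rule: unlike binary ops, which compute a common dtype from both inputs, a copy always casts the source values to the destination tensor's dtype, so only `iter.dtype(0)` needs to be dispatched. A small illustration in Python (not part of the commit):

```python
# Illustration of the copy "promotion" rule: copy_ always casts the
# source values to the destination tensor's dtype.
import torch

dst = torch.empty(4, dtype=torch.float16)
src = torch.arange(4, dtype=torch.int64)
dst.copy_(src)      # int64 values are cast into the float16 destination
print(dst)          # tensor([0., 1., 2., 3.], dtype=torch.float16)
print(dst.dtype)    # torch.float16 -- the destination dtype wins
```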
