aten/src/ATen/native/cuda/Copy.cu (18 changes: 5 additions & 13 deletions)
@@ -8,20 +8,12 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <THC/THC.h>
-#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
 
 using namespace at::cuda;
 
-template <typename dst_t, typename src_t>
-void copy_kernel_impl(TensorIterator& iter) {
-  gpu_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t {
-    return c10::static_cast_with_inter_type<dst_t>(x);
-  });
-}
-
 // device-to-device copy, does type conversion
 static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   int64_t numel = iter.numel();
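For context, a hedged sketch of the idea behind c10::static_cast_with_inter_type, the helper the removed copy_kernel_impl called per element: route the conversion through an intermediate type so that pairs such as float -> uint8_t stay well defined on device. The trait and function below are hypothetical stand-ins, not the actual c10/util/TypeCast.h definitions:

#include <cstdint>

// Hypothetical stand-in for an intermediate-type trait: most types cast
// directly, but uint8_t routes through int64_t so that converting a
// negative floating-point value avoids undefined behavior.
template <typename T>
struct inter_copy_type { using type = T; };

template <>
struct inter_copy_type<uint8_t> { using type = int64_t; };

// Sketch of the per-element conversion the removed kernel performed.
template <typename dst_t, typename src_t>
__host__ __device__ dst_t cast_with_inter_type(src_t x) {
  return static_cast<dst_t>(
      static_cast<typename inter_copy_type<dst_t>::type>(x));
}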
@@ -66,11 +58,11 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
           cudaMemcpyDeviceToDevice,
           copy_stream));
   } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] {
-      using dst_t = scalar_t;
-      AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] {
-        copy_kernel_impl<dst_t, scalar_t>(iter);
-      });
+    // this is intentionally done after build because copy has a "promotion"
+    // rule that always "promotes" to the target dtype.
+    iter.promote_common_dtype();
+    AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, iter.dtype(0), "copy_", [&] {
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t x) { return x; });
     });
   }

Review comment from the author (Collaborator), on the AT_DISPATCH_ALL_TYPES_AND3 line:

    I see a PR trying to enable BFloat16 for CUDA:
    https://github.com/pytorch/pytorch/pull/27259/files#diff-6684cb81a1865b7d52d9f2f1789cd0ceR68-R70
    So I include it in this PR as well, so that I can benchmark it.
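The net effect of the rewrite: the old shape nests two dispatches and instantiates a conversion kernel per (dst dtype, src dtype) pair, while the new shape first promotes both operands to a common dtype, so the loop body is an identity and a single dispatch suffices, with the cast folded into how elements are loaded and stored. A minimal standalone sketch of the two shapes (plain CUDA, nothing from ATen; all names hypothetical), compilable with nvcc:

#include <cstdio>
#include <cuda_runtime.h>

// Old shape: one kernel per (dst_t, src_t) pair; with D dtypes, the nested
// AT_DISPATCH in the removed code instantiates on the order of D*D kernels.
template <typename dst_t, typename src_t>
__global__ void convert_copy(dst_t* dst, const src_t* src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = static_cast<dst_t>(src[i]);
}

// New shape: after promotion both operands share one scalar_t, so the body
// is an identity and only D kernels are instantiated; the dtype conversion
// happens while loading/storing rather than inside the op.
template <typename scalar_t>
__global__ void identity_copy(scalar_t* dst, const scalar_t* src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = src[i];
}

int main() {
  const int n = 4;
  float h_src[n] = {1.5f, -2.5f, 3.5f, 4.5f};
  float* d_src;
  float* d_dst;
  cudaMalloc(&d_src, n * sizeof(float));
  cudaMalloc(&d_dst, n * sizeof(float));
  cudaMemcpy(d_src, h_src, n * sizeof(float), cudaMemcpyHostToDevice);
  identity_copy<float><<<1, 32>>>(d_dst, d_src, n);
  float h_dst[n];
  cudaMemcpy(h_dst, d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%f\n", h_dst[i]);
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}

With D supported dtypes this trims kernel instantiations from roughly D*D to D, which appears to be the main compile-time and binary-size win of the change.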