Commit 27ecdf9

Update on "Simplify copy kernel"
Using the new type promotion and dynamic casting added to `TensorIterator`, the copy kernels can be greatly simplified. For benchmarks, see #28352 (comment). [ghstack-poisoned]
2 parents: 25e7b33 + bb3ad0c
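For context, here is a minimal sketch (not part of this commit; shapes and dtypes are arbitrary) of the kind of cross-dtype copy that now flows through TensorIterator's type promotion and dynamic casting instead of a separate conversion step inside the copy kernel:

#include <ATen/ATen.h>

int main() {
  // Destination and source deliberately have different dtypes,
  // so copy_ must cast int32 elements to float.
  at::Tensor dst = at::empty({4}, at::kFloat);
  at::Tensor src = at::arange(4, at::kInt);

  // After this change, the cast happens inside the copy kernel via
  // TensorIterator's dynamic casting rather than through an explicit
  // intermediate conversion.
  dst.copy_(src);
  return 0;
}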

4 files changed: 8 additions and 15 deletions

aten/src/ATen/native/TensorIterator.cpp

Lines changed: 2 additions & 4 deletions
@@ -235,10 +235,8 @@ void TensorIterator::compute_types() {
       }
     }

-    if (op.tensor.defined()) {
-      if (op.tensor.scalar_type() != common_dtype_) {
-        has_promotion_ = true;
-      }
+    if (op.tensor.defined() && op.tensor.scalar_type() != common_dtype_) {
+      have_differing_types_ = true;
     }

     if (op.tensor.defined() && op.device != op.tensor.device()) {
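
The renamed flag records only the fact that some defined operand's dtype differs from the computed common dtype; whether a cast is actually required is decided later from the common-dtype strategy. A simplified, standalone restatement of that check (illustrative types, not the real TensorIterator code):

#include <vector>

enum class ScalarType { Float, Int };

struct Operand {
  bool defined;
  ScalarType dtype;
};

// True if any defined operand's dtype differs from the common dtype.
bool have_differing_types(const std::vector<Operand>& ops, ScalarType common_dtype) {
  bool differing = false;
  for (const auto& op : ops) {
    if (op.defined && op.dtype != common_dtype) {
      differing = true;
    }
  }
  return differing;
}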

aten/src/ATen/native/TensorIterator.h

Lines changed: 3 additions & 7 deletions
@@ -287,8 +287,8 @@ struct CAFFE2_API TensorIterator {
   /// CUDA reductions.
   bool is_final_output() const { return final_output_; }

-  bool has_promotion() const {
-    return has_promotion_;
+  bool needs_dynamic_casting() const {
+    return (common_dtype_strategy_ != CommonDTypeStrategy::NONE) && have_differing_types_;
   }

   void set_check_mem_overlap(bool check_mem_overlap) {
@@ -330,10 +330,6 @@
     resize_outputs_ = false;
   }

-  void set_common_dtype(ScalarType dtype) {
-    common_dtype_ = dtype;
-  }
-
   void build();

 protected:
@@ -370,7 +366,7 @@
   bool promote_gpu_output_dtypes_ = false;
   bool final_output_ = true;
   bool check_mem_overlap_ = false;
-  bool has_promotion_ = false;
+  bool have_differing_types_ = false;
 };

 /// A container-like struct that acts as if it contains splits of a
 /// TensorIterator that can use 32-bit indexing. Taken together the splits cover
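
needs_dynamic_casting() is true only when a common-dtype strategy is in effect and at least one operand's dtype differs from the common dtype. A hedged sketch of how a kernel entry point might branch on it (illustrative caller and helper names, not code from this commit):

#include <ATen/native/TensorIterator.h>

// Illustrative only: pick between a plain element-wise kernel and one that
// casts each operand to/from the common dtype on load/store.
void launch_elementwise(at::TensorIterator& iter) {
  if (iter.needs_dynamic_casting()) {
    // Casting path: per-element loads/stores go through the stored dtypes.
    // launch_casting_kernel(iter);   // hypothetical helper
  } else {
    // Fast path: all operands already have the common dtype.
    // launch_plain_kernel(iter);     // hypothetical helper
  }
}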

aten/src/ATen/native/cuda/Copy.cu

Lines changed: 1 addition & 2 deletions
@@ -60,9 +60,8 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   } else {
     // this is done intentionally done after build because copy has a "promotion"
     // rule that always "promote" to target dtype.
-    iter.set_common_dtype(iter.dtype());
     iter.promote_common_dtype();
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] {
+    AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, iter.dtype(0), "copy_", [&] {
       gpu_kernel(iter, []GPU_LAMBDA(scalar_t x) { return x; });
     });
   }
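
The dispatch macro now covers BFloat16 as well. A small illustrative use of the same macro on a CPU tensor (hypothetical function, current ATen API assumed; not part of this commit):

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

// Illustrative only: fill a contiguous CPU tensor with ones, dispatching over
// the same dtype set the copy kernel now handles, including BFloat16.
void fill_with_one(at::Tensor& t) {
  AT_DISPATCH_ALL_TYPES_AND3(at::kHalf, at::kBool, at::kBFloat16,
                             t.scalar_type(), "fill_with_one", [&] {
    auto* data = t.data_ptr<scalar_t>();
    for (int64_t i = 0; i < t.numel(); ++i) {
      data[i] = static_cast<scalar_t>(1);
    }
  });
}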

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 2 additions & 2 deletions
@@ -158,7 +158,7 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
       strides[i] = inner_strides[i];
     }

-    if (iter.has_promotion()) {
+    if (iter.needs_dynamic_casting()) {
       launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
         void* out = data[0] + strides[0] * idx;
         arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
@@ -172,7 +172,7 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     }
   } else {
     auto offset_calc = make_offset_calculator<traits::arity + 1>(iter);
-    if (iter.has_promotion()) {
+    if (iter.needs_dynamic_casting()) {
       launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
         auto offsets = offset_calc.get(idx);
         void* out = data[0] + offsets[0];
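
Both launch paths in gpu_kernel_impl now branch on the renamed predicate. Conceptually, the casting path loads each input through its stored dtype, evaluates the functor in the compute type, and casts the result back to the output dtype. A much-simplified, CPU-side sketch of that idea with hypothetical helpers (not the actual fetch/cast machinery used here):

#include <cstdint>

enum class DType { Float, Int };

// Hypothetical stand-ins for the real dynamic-cast load/store helpers.
static float load_as_float(const void* p, DType t) {
  return t == DType::Float
      ? *static_cast<const float*>(p)
      : static_cast<float>(*static_cast<const int32_t*>(p));
}

static void store_from_float(void* p, DType t, float v) {
  if (t == DType::Float) {
    *static_cast<float*>(p) = v;
  } else {
    *static_cast<int32_t*>(p) = static_cast<int32_t>(v);
  }
}

// One element of a casting copy: read src in its dtype, write dst in its dtype.
static void copy_element_with_cast(void* dst, DType dst_t,
                                   const void* src, DType src_t) {
  store_from_float(dst, dst_t, load_as_float(src, src_t));
}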
