
Commit b9f099e

zasdfgbnm authored and facebook-github-bot committed
Make TensorIterator stop promoting types by copying (#28427)
Summary:
Pull Request resolved: #28427

Fixes: #26401

This PR fixes the issue by using the newly added dynamic cast inside `TensorIterator` so that, instead of converting the type at the beginning (which generates extra kernel launches), the `TensorIterator` does a load-cast-compute-store for each element while looping. So there is only one read and one write of memory.

**nvprof:**
```python
import torch

_100M = 100 * 1024 ** 2
r = torch.randn(_100M, dtype=torch.float32, device='cuda')
d = torch.randn(_100M, dtype=torch.float64, device='cuda')
torch.cuda.synchronize()
torch.cuda.profiler.start()
r.add_(d)
torch.cuda.profiler.stop()
torch.cuda.synchronize()
```
```
==11407== NVPROF is profiling process 11407, command: /home/xgao/anaconda3/bin/python simple.py
==11407== Profiling application: /home/xgao/anaconda3/bin/python simple.py
==11407== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.0611ms      1  2.0611ms  2.0611ms  2.0611ms  _ZN2at6native18elementwise_kernelILi512ELi1EZNS0_15gpu_kernel_implIZZZNS0_15add_kernel_cudaERNS_14TensorIteratorEN3c106ScalarEENKUlvE_clEvENKUlvE1_clEvEUlddE_EEvS4_RKT_EUliE_EEviT1_
      API calls:  100.00%  1.05006s      1  1.05006s  1.05006s  1.05006s  cudaLaunchKernel
                    0.00%  2.7740us      2  1.3870us     673ns  2.1010us  cudaGetDevice
                    0.00%  2.3730us      1  2.3730us  2.3730us  2.3730us  cudaSetDevice
                    0.00%     830ns      1     830ns     830ns     830ns  cudaGetLastError
```

**benchmark**
```python
import torch
print(torch.__version__)
print(torch.version.git_version)

_100M = 100 * 1024 ** 2
r = torch.randn(_100M, dtype=torch.float32, device='cuda')
d = torch.randn(_100M, dtype=torch.float64, device='cuda')
torch.cuda.synchronize()
%timeit r.add_(d); torch.cuda.synchronize()
```
original
```
1.4.0a0+7d277b0
7d277b0
6.83 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
```
after
```
1.4.0a0+f0f2f65
f0f2f65
2.08 ms ± 139 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

For more benchmarks, see: #28344

Test Plan: Imported from OSS

Differential Revision: D18170997

Pulled By: ezyang

fbshipit-source-id: 9c82c1c89583f3e6202c5d790b9b73ad9f960fad
1 parent 688a9db commit b9f099e
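
As a rough mental model of the change described above, here is a minimal, self-contained plain-C++ sketch (hypothetical function names, exact promotion and precision rules elided; not the actual TensorIterator/CUDA implementation). The old path materializes a converted temporary of the mismatched operand before computing, while the new path casts each element on the fly inside the compute loop, so every operand is read once and the output written once:

```cpp
#include <cstddef>
#include <vector>

// Old behaviour (simplified): first materialize a casted temporary of the
// mismatched operand (an extra full read and write), then run the
// element-wise loop over that temporary.
void add_with_copy(std::vector<float>& r, const std::vector<double>& d) {
  std::vector<float> tmp(d.size());                 // extra allocation
  for (std::size_t i = 0; i < d.size(); ++i) {
    tmp[i] = static_cast<float>(d[i]);              // extra pass over the data
  }
  for (std::size_t i = 0; i < r.size(); ++i) {
    r[i] += tmp[i];
  }
}

// New behaviour (simplified): one fused pass that loads, casts, computes and
// stores each element, so each operand is touched exactly once.
void add_with_dynamic_cast(std::vector<float>& r, const std::vector<double>& d) {
  for (std::size_t i = 0; i < r.size(); ++i) {
    double a = static_cast<double>(r[i]);           // load + cast
    double sum = a + d[i];                          // compute
    r[i] = static_cast<float>(sum);                 // cast + store
  }
}
```

In the actual change, this per-element cast is done inside the CUDA kernel via the dynamic-cast helpers used in `Loops.cuh` below.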

File tree

5 files changed: 93 additions & 44 deletions

aten/src/ATen/native/TensorIterator.cpp

Lines changed: 19 additions & 12 deletions

@@ -148,7 +148,7 @@ static void validate_dtype(OperandInfo& op, ScalarType common_dtype, CommonDType
   }
 }
 
-static void maybe_promote_common_dtype(OperandInfo& op, ScalarType common_dtype) {
+static void maybe_copy_casting_to_common_dtype(OperandInfo& op, ScalarType common_dtype) {
   if (op.tensor.defined() && op.tensor.scalar_type() != common_dtype)
   {
     op.dtype = common_dtype;
@@ -165,7 +165,7 @@ static void maybe_promote_common_dtype(OperandInfo& op, ScalarType common_dtype)
 void TensorIterator::compute_types() {
   bool missing_dtypes = false;
   bool missing_output_dtypes = false;
-  ScalarType common_dtype = dtype();
+  common_dtype_ = dtype();
   for (auto& op : operands_) {
     if (!op.tensor.defined() && !op.is_type_defined()) {
       missing_dtypes = true;
@@ -183,31 +183,33 @@ void TensorIterator::compute_types() {
   bool compute_common_dtype_only_for_inputs = (common_dtype_strategy_ == CommonDTypeStrategy::PROMOTE_INPUTS);
 
   bool may_have_differing_types = true;
+  bool common_device_is_cuda = false;
 
   if (missing_dtypes || compute_common_dtype) {
     auto operands = compute_common_dtype_only_for_inputs ? at::ArrayRef<OperandInfo>(operands_).slice(noutputs()) : operands_;
     auto common_type = compute_common_type_(operands);
     auto common_device = std::get<0>(common_type);
-    common_dtype = std::get<1>(common_type);
+    common_device_is_cuda = common_device.is_cuda();
+    common_dtype_ = std::get<1>(common_type);
     may_have_differing_types = !std::get<2>(common_type);
     bool has_cpu_scalar = false;
     for (auto& op : operands_) {
       if (!op.is_type_defined()) {
         op.device = common_device;
-        op.dtype = common_dtype;
+        op.dtype = common_dtype_;
       } else if (compute_common_dtype &&
-                 (op.device != common_device || op.dtype != common_dtype)) {
+                 (op.device != common_device || op.dtype != common_dtype_)) {
         if (allow_cpu_scalars_ && op.tensor.defined() && op.tensor.dim() == 0 &&
-            common_device.is_cuda() && op.tensor.device().is_cpu() &&
+            common_device_is_cuda && op.tensor.device().is_cpu() &&
             !has_cpu_scalar) {
           // don't cast CPU scalars in CUDA ops that directly support them.
           op.device = op.tensor.device();
           op.dtype = op.tensor.scalar_type();
           has_cpu_scalar = true;
         } else if (promote_gpu_output_dtypes_ && op.tensor.defined() &&
             !op.is_output &&
-            op.tensor.scalar_type() == kHalf && common_dtype == kFloat &&
-            op.tensor.device().is_cuda() && common_device.is_cuda()) {
+            op.tensor.scalar_type() == kHalf && common_dtype_ == kFloat &&
+            op.tensor.device().is_cuda() && common_device_is_cuda) {
           // allow input tensor type upcasting for fp16 to fp32 in fused kernel
           // on GPU
           op.device = op.tensor.device();
@@ -217,7 +219,7 @@ void TensorIterator::compute_types() {
           if (compute_common_dtype_only_for_inputs && op.is_output) {
             op.dtype = op.tensor.scalar_type();
           } else {
-            op.dtype = common_dtype;
+            op.dtype = common_dtype_;
           }
         }
       }
@@ -226,12 +228,17 @@ void TensorIterator::compute_types() {
 
   for (auto &op : operands_) {
     if (may_have_differing_types) {
-      validate_dtype(op, common_dtype, common_dtype_strategy_);
-      if (compute_common_dtype && (!compute_common_dtype_only_for_inputs || !op.is_output)) {
-        maybe_promote_common_dtype(op, common_dtype);
+      validate_dtype(op, common_dtype_, common_dtype_strategy_);
+      bool cast_by_copy = compute_common_dtype && !common_device_is_cuda && (!compute_common_dtype_only_for_inputs || !op.is_output);
+      if (cast_by_copy) {
+        maybe_copy_casting_to_common_dtype(op, common_dtype_);
       }
     }
 
+    if (op.tensor.defined() && op.tensor.scalar_type() != common_dtype_) {
+      have_differing_types_ = true;
+    }
+
    if (op.tensor.defined() && op.device != op.tensor.device()) {
      if (op.is_output) {
        AT_ERROR("output with device ", op.tensor.device(),

aten/src/ATen/native/TensorIterator.h

Lines changed: 7 additions & 0 deletions

@@ -191,6 +191,7 @@ struct CAFFE2_API TensorIterator {
   IntArrayRef strides(int arg) const { return operands_[arg].stride_bytes; }
   void* data_ptr(int arg) const;
   ScalarType dtype(int arg=0) const { return operands_[arg].tensor.scalar_type(); }
+  ScalarType common_dtype() const { return common_dtype_; }
   ScalarType input_dtype(int arg=0) const { return operands_[num_outputs_ + arg].dtype; }
   Device device(int arg=0) const { return operands_[arg].device; }
   DeviceType device_type(int arg=0) const { return device(arg).type(); }
@@ -286,6 +287,10 @@ struct CAFFE2_API TensorIterator {
   /// CUDA reductions.
   bool is_final_output() const { return final_output_; }
 
+  bool needs_dynamic_casting() const {
+    return (common_dtype_strategy_ != CommonDTypeStrategy::NONE) && have_differing_types_;
+  }
+
   void set_check_mem_overlap(bool check_mem_overlap) {
     check_mem_overlap_ = check_mem_overlap;
   }
@@ -352,6 +357,7 @@ struct CAFFE2_API TensorIterator {
   SmallVector<OperandInfo, 4> operands_;
   int num_outputs_ = 0;
   CommonDTypeStrategy common_dtype_strategy_ = CommonDTypeStrategy::CHECK;
+  ScalarType common_dtype_ = ScalarType::Undefined;
   bool has_coalesced_dimensions_ = false;
   bool accumulate_ = false;
   bool resize_outputs_ = true;
@@ -360,6 +366,7 @@ struct CAFFE2_API TensorIterator {
   bool promote_gpu_output_dtypes_ = false;
   bool final_output_ = true;
   bool check_mem_overlap_ = false;
+  bool have_differing_types_ = false;
 };
 /// A container-like struct that acts as if it contains splits of a
 /// TensorIterator that can use 32-bit indexing. Taken together the splits cover

aten/src/ATen/native/cuda/BinaryOpsKernel.cu

Lines changed: 21 additions & 21 deletions

@@ -13,7 +13,7 @@
 namespace at { namespace native {
 
 void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
-  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "add_cuda/sub_cuda", [&]() {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "add_cuda/sub_cuda", [&]() {
     auto alpha = alpha_scalar.to<scalar_t>();
     gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
       return a + alpha * b;
@@ -26,19 +26,19 @@ static void sub_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
 }
 
 void div_kernel_cuda(TensorIterator& iter) {
-  if (!isIntegralType(iter.dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) {
+  if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) {
     // optimization for floating-point types: if the second operand is a CPU
     // scalar, compute a * reciprocal(b). Note that this may lose one bit of
     // precision compared to computing the division.
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "div_cuda", [&]() {
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "div_cuda", [&]() {
       auto inv_b = scalar_t(1.0 / iter.scalar_value<scalar_t>(2));
       iter.remove_operand(2);
       gpu_kernel(iter, [inv_b]GPU_LAMBDA(scalar_t a) -> scalar_t {
         return a * inv_b;
       });
     });
   } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "div_cuda", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "div_cuda", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a / b;
      });
@@ -47,13 +47,13 @@ void div_kernel_cuda(TensorIterator& iter) {
 }
 
 void mul_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context]
     gpu_kernel_with_scalars(iter, []GPU_LAMBDA(bool a, bool b) -> bool {
       return a && b;
     });
   } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "mul_cuda", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "mul_cuda", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a * b;
      });
@@ -62,22 +62,22 @@ void mul_kernel_cuda(TensorIterator& iter) {
 }
 
 void atan2_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "atan2_cuda", [&]() {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "atan2_cuda", [&]() {
     gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
       return ::atan2(a, b);
     });
   });
 }
 
 void logical_xor_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "logical_xor_cuda", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
         return bool(a) != bool(b);
       });
     });
   } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "logical_xor_cuda", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "logical_xor_cuda", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return static_cast<scalar_t>(bool(a) != bool(b));
      });
@@ -86,14 +86,14 @@ void logical_xor_kernel_cuda(TensorIterator& iter) {
 }
 
 void lt_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "lt_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
         return a < b;
       });
     });
   } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "lt_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "lt_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a < b;
      });
@@ -102,14 +102,14 @@ void lt_kernel_cuda(TensorIterator& iter) {
 }
 
 void le_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "le_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
        return a <= b;
      });
    });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "le_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "le_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a <= b;
      });
@@ -118,14 +118,14 @@ void le_kernel_cuda(TensorIterator& iter) {
 }
 
 void gt_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "gt_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
        return a > b;
      });
    });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "gt_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "gt_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a > b;
      });
@@ -134,14 +134,14 @@ void gt_kernel_cuda(TensorIterator& iter) {
 }
 
 void ge_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "ge_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
        return a >= b;
      });
    });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "ge_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "ge_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a >= b;
      });
@@ -150,14 +150,14 @@ void ge_kernel_cuda(TensorIterator& iter) {
 }
 
 void eq_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "eq_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
        return a == b;
      });
    });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "eq_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "eq_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a == b;
      });
@@ -166,14 +166,14 @@ void eq_kernel_cuda(TensorIterator& iter) {
 }
 
 void ne_kernel_cuda(TensorIterator& iter) {
-  if (iter.dtype() == ScalarType::Bool) {
+  if (iter.common_dtype() == ScalarType::Bool) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.input_dtype(), "ne_cpu", [&]() {
       gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
        return a != b;
      });
    });
  } else {
-    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "ne_cpu", [&]() {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.common_dtype(), "ne_cpu", [&]() {
      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
        return a != b;
      });

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 45 additions & 11 deletions

@@ -35,6 +35,7 @@
 #include <ATen/detail/FunctionTraits.h>
 #include <ATen/native/TensorIterator.h>
 #include <c10/macros/Macros.h>
+#include <c10/util/TypeCast.h>
 
 // Marks a lambda as executable on both the host and device. The __host__
 // attribute is important so that we can access static type information from
@@ -116,6 +117,20 @@ invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[]
   return invoke_impl<traits>(f, data, strides, i, Indices{});
 }
 
+template <typename traits, typename func_t, typename index_t, size_t... I>
+C10_HOST_DEVICE typename traits::result_type
+invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i,
+            c10::guts::index_sequence<I...>) {
+  return f(c10::fetch_and_cast<typename traits::template arg<I>::type>(dtypes[I], data[I] + i * strides[I])...);
+}
+
+template <typename func_t, typename index_t, typename traits = function_traits<func_t>>
+C10_HOST_DEVICE typename traits::result_type
+invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) {
+  using Indices = c10::guts::make_index_sequence<traits::arity>;
+  return invoke_impl<traits>(f, data, strides, dtypes, i, Indices{});
+}
+
 template <typename func_t>
 void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
   using traits = function_traits<func_t>;
@@ -130,6 +145,10 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     data[i] = (char*)iter.data_ptr(i);
   }
 
+  at::detail::Array<ScalarType, ntensors> dtypes;
+  for (int i = 0; i < ntensors; i++) {
+    dtypes[i] = iter.tensor(i).scalar_type();
+  }
 
   int64_t numel = iter.numel();
   if (iter.is_trivial_1d()) {
@@ -138,19 +157,35 @@ void gpu_kernel_impl(TensorIterator& iter, const func_t& f) {
     for (int i = 0; i < ntensors; i++) {
      strides[i] = inner_strides[i];
    }
-
 
-    launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
-      arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx);
-      *out = invoke(f, &data.data[1], &strides.data[1], idx);
-    });
+    if (iter.needs_dynamic_casting()) {
+      launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+        void* out = data[0] + strides[0] * idx;
+        arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
+      });
+    } else {
+      launch_kernel<launch_size_1d, 1>(numel, [=]GPU_LAMBDA(int idx) {
+        arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx);
+        *out = invoke(f, &data.data[1], &strides.data[1], idx);
+      });
+    }
   } else {
     auto offset_calc = make_offset_calculator<traits::arity + 1>(iter);
-    launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
-      auto offsets = offset_calc.get(idx);
-      arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
-      *out = invoke(f, &data.data[1], &offsets.data[1], 1);
-    });
+    if (iter.needs_dynamic_casting()) {
+      launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+        auto offsets = offset_calc.get(idx);
+        void* out = data[0] + offsets[0];
+        arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
+      });
+    } else {
+      launch_kernel<launch_size_nd, launch_bound2>(numel, [=]GPU_LAMBDA(int idx) {
+        auto offsets = offset_calc.get(idx);
+        arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
+        *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+      });
+    }
   }
 }
 
@@ -174,7 +209,6 @@ void gpu_kernel(TensorIterator& iter, const func_t& f) {
   }
 
   gpu_kernel_impl(iter, f);
-  iter.cast_outputs();
 }
 
 template <typename func_t>
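
The new `invoke` overload above relies on `c10::fetch_and_cast` and `c10::cast_and_store` from `c10/util/TypeCast.h`: fetch reads a value whose dtype is only known at runtime and converts it to the kernel's compile-time scalar type, and store converts the computed result back to the output tensor's runtime dtype. As a rough, host-only sketch of that pattern (hypothetical stand-in helpers with a toy `Dtype` enum, not the c10 implementations, which are also device-capable and cover all ScalarTypes):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>

enum class Dtype { Float, Double, Int };

// Read a value of runtime-known dtype from ptr and convert it to dest_t.
template <typename dest_t>
dest_t fetch_and_cast(Dtype dtype, const void* ptr) {
  switch (dtype) {
    case Dtype::Float:  { float v;   std::memcpy(&v, ptr, sizeof(v)); return static_cast<dest_t>(v); }
    case Dtype::Double: { double v;  std::memcpy(&v, ptr, sizeof(v)); return static_cast<dest_t>(v); }
    case Dtype::Int:    { int32_t v; std::memcpy(&v, ptr, sizeof(v)); return static_cast<dest_t>(v); }
  }
  throw std::runtime_error("unsupported dtype");
}

// Convert value to the runtime-known dtype of the destination and write it to ptr.
template <typename src_t>
void cast_and_store(Dtype dtype, void* ptr, src_t value) {
  switch (dtype) {
    case Dtype::Float:  { float v = static_cast<float>(value);     std::memcpy(ptr, &v, sizeof(v)); return; }
    case Dtype::Double: { double v = static_cast<double>(value);   std::memcpy(ptr, &v, sizeof(v)); return; }
    case Dtype::Int:    { int32_t v = static_cast<int32_t>(value); std::memcpy(ptr, &v, sizeof(v)); return; }
  }
  throw std::runtime_error("unsupported dtype");
}
```

With these two primitives, a kernel templated on a single `scalar_t` can consume and produce tensors of other dtypes without any preparatory copy, which is what the `needs_dynamic_casting()` branches of `gpu_kernel_impl` above do.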
