Commit 174e1ba

Natalia Gimelshein authored and facebook-github-bot committed
Small fixes to improve TensorIterator overhead for the common case of inputs and outputs of the same type (#27457)
Summary:

1) Short-circuits the common-type computation and type-promotion logic for the common case where the operands and the result all have the same type (see the sketch below).
2) Improves the performance of the memory-overlap check by returning MemOverlapStatus::FULL when the two tensors are the same, and skips the call from TensorIterator altogether when input and output are the same tensor.
3) Stores strides in a StrideVector typedef (SmallVector<int64_t, 6>) instead of the inline-size-5 DimVector, so the vector need not reallocate in the common binary-op case: the `strides` vector must hold at least 2*num_tensors elements, which is 6 for an operation with two inputs and one output.
4) If `offset` is 0 (the common, non-broadcasting case), does not fill the `strides` vector with zeros, because all of its values are subsequently overwritten.

Combined, these changes reduce the overhead of a simple in-place operation from 1.02 us to 0.74 us.

Pull Request resolved: #27457

Test Plan: should be covered by existing tests

Differential Revision: D17784532

Pulled By: ngimel

fbshipit-source-id: e6a8ee58be5de14461bdbc2e2b0b6d16a96c309f
1 parent 3ac4267 · commit 174e1ba
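To make item (1) of the summary concrete, here is a minimal standalone sketch of the same-type scan. This is hedged: `Operand` and `scan_common_type` are hypothetical stand-ins for ATen's OperandInfo and compute_common_type_; the committed implementation is in the TensorIterator.cpp hunks below.

    #include <tuple>
    #include <vector>

    enum class ScalarType { Undefined, Int, Float, Double };

    // Hypothetical stand-in for OperandInfo: only what the scan needs.
    struct Operand {
      bool defined;
      int dim;
      ScalarType type;
    };

    // One cheap pass over the operands: if every defined, non-scalar operand
    // already has the same dtype, report it with all_same_type == true so the
    // caller can skip the full result_type() promotion machinery.
    std::tuple<ScalarType, bool> scan_common_type(const std::vector<Operand>& ops) {
      ScalarType common = ScalarType::Undefined;
      for (const auto& op : ops) {
        if (!op.defined) continue;
        if (op.dim == 0) {
          return std::make_tuple(ScalarType::Undefined, false);  // scalars: fall back
        }
        if (common == ScalarType::Undefined) common = op.type;
        if (op.type != common) {
          return std::make_tuple(ScalarType::Undefined, false);  // mixed dtypes
        }
      }
      return std::make_tuple(common, common != ScalarType::Undefined);
    }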

3 files changed: 56 additions, 23 deletions

aten/src/ATen/MemoryOverlap.cpp (1 addition, 0 deletions)

@@ -39,6 +39,7 @@ MemOverlapStatus get_overlap_status(const Tensor& a, const Tensor& b) {
 }
 
 MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b) {
+  if (a == b) return MemOverlapStatus::FULL;
   if (!a->is_contiguous() || !b->is_contiguous()) {
     return MemOverlapStatus::TOO_HARD;
   }
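The second half of item (2), skipping the overlap check inside TensorIterator when input and output are the same tensor, is not part of the hunk above. A hedged sketch of what such a caller-side guard looks like follows; the loop shape is illustrative rather than the committed code, while assert_no_partial_overlap, unsafeGetTensorImpl, noutputs(), ntensors(), and operands_ are real ATen/TensorIterator names:

    // Inside TensorIterator setup: compare TensorImpl pointers first, so an
    // in-place op like x.add_(1) never pays for the overlap analysis.
    for (int i = 0; i < noutputs(); i++) {
      auto& output = operands_[i].tensor;
      for (int j = noutputs(); j < ntensors(); j++) {
        auto& input = operands_[j].tensor;
        if (output.unsafeGetTensorImpl() != input.unsafeGetTensorImpl()) {
          assert_no_partial_overlap(output, input);
        }
      }
    }

The two halves are complementary: even when the call is not skipped, the new `a == b` early return reports FULL without walking the contiguity checks, and assert_no_partial_overlap, as the name suggests, only errors on a PARTIAL overlap.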

aten/src/ATen/native/TensorIterator.cpp (46 additions, 16 deletions)

@@ -12,6 +12,7 @@ using DimMask = TensorIterator::DimMask;
 using PtrVector = TensorIterator::PtrVector;
 using loop_t = TensorIterator::loop_t;
 using loop2d_t = TensorIterator::loop2d_t;
+using StrideVector = TensorIterator::StrideVector;
 
 void TensorIterator::reorder_dimensions() {
   // Sort the dimensions based on strides in ascending order with reduced dims
@@ -86,19 +87,44 @@ Device compute_device(at::ArrayRef<OperandInfo> operands) {
   return kCPU;
 }
 
-static std::tuple<Device, ScalarType> compute_common_type_(at::ArrayRef<OperandInfo> operands) {
+static std::tuple<Device, ScalarType, bool> compute_common_type_(at::ArrayRef<OperandInfo> operands) {
   // See [Result type computation] in TensorIterator.h
   auto device = compute_device(operands);
+  auto common_type = ScalarType::Undefined;
+  bool all_same_type = true;
+  for (const auto& op: operands){
+    if (!op.tensor.defined()) continue;
+    //don't handle scalars
+    if (op.tensor.dim() > 0){
+      ScalarType current = op.tensor.scalar_type();
+      if (current == ScalarType::Undefined){
+        all_same_type = false;
+        break;
+      }
+      if (common_type == ScalarType::Undefined) common_type = current;
+      if (common_type != current) {
+        all_same_type = false;
+        break;
+      }
+    } else {
+      all_same_type = false;
+      break;
+    }
+  }
+  if (all_same_type) {
+    return std::make_tuple(device, common_type, true);
+  }
+  //TODO refactor so that no tensor copies are done
   std::vector<Tensor> tensors;
   std::transform(std::begin(operands), std::end(operands), std::back_inserter(tensors),
                  [](const OperandInfo& op) { return op.tensor; });
   auto dtype = at::native::result_type(tensors);
-  auto result = std::make_tuple(device, dtype);
+  auto result = std::make_tuple(device, dtype, false);
   TORCH_INTERNAL_ASSERT(dtype != ScalarType::Undefined);
   return result;
 }
 
-std::tuple<Device, ScalarType> TensorIterator::compute_common_type() {
+std::tuple<Device, ScalarType, bool> TensorIterator::compute_common_type() {
   return compute_common_type_(operands_);
 }
 
@@ -199,11 +225,13 @@ void TensorIterator::compute_types() {
       }
     }
 
-    if (!compute_common_dtype_only_for_inputs) {
-      validate_dtype(op, common_dtype, ninputs());
-    }
-    if (!compute_common_dtype_only_for_inputs || !op.is_output) {
-      maybe_promote_common_dtype(op, common_dtype);
+    if (!std::get<2>(common_type)) {
+      if (!compute_common_dtype_only_for_inputs) {
+        validate_dtype(op, common_dtype, ninputs());
+      }
+      if (!compute_common_dtype_only_for_inputs || !op.is_output) {
+        maybe_promote_common_dtype(op, common_dtype);
+      }
     }
 
     if (op.tensor.defined() && op.device != op.tensor.device()) {
@@ -221,8 +249,8 @@ void TensorIterator::compute_types() {
   }
 }
 
-DimVector TensorIterator::compatible_stride(int element_size) const {
-  auto stride = DimVector();
+StrideVector TensorIterator::compatible_stride(int element_size) const {
+  auto stride = StrideVector();
   int64_t next_stride = element_size;
   for (int dim = 0; dim < ndim(); dim++) {
     stride.push_back(next_stride);
@@ -369,9 +397,9 @@ int64_t TensorIterator::numel() const {
   return numel;
 }
 
-DimVector TensorIterator::get_dim_strides(int dim) const {
+StrideVector TensorIterator::get_dim_strides(int dim) const {
   auto dims = ndim();
-  auto inner_strides = DimVector();
+  auto inner_strides = StrideVector();
   for (auto& op : operands_) {
     inner_strides.push_back(dims == 0 ? 0 : op.stride_bytes[dim]);
   }
@@ -478,8 +506,8 @@ void TensorIterator::for_each(loop2d_t loop) {
   }
 }
 
-DimVector TensorIterator::get_strides() const {
-  DimVector strides;
+StrideVector TensorIterator::get_strides() const {
+  StrideVector strides;
   for (int dim = 0; dim < ndim(); dim++) {
     for (int arg = 0; arg < ntensors(); arg++) {
       strides.push_back(operands_[arg].stride_bytes[dim]);
@@ -751,9 +779,11 @@ void TensorIterator::compute_strides() {
     auto original_shape = op.tensor.sizes();
    auto original_stride = op.tensor.strides();
    auto element_size_in_bytes = op.tensor.element_size();
-
-    op.stride_bytes.resize(ndim(), 0);
    auto offset = ndim() - original_shape.size();
+    if (offset > 0)
+      op.stride_bytes.resize(ndim(), 0);
+    else
+      op.stride_bytes.resize(ndim());
    for (size_t i = 0; i < original_shape.size(); i++) {
      if (original_shape[i] == 1) {
        op.stride_bytes[offset + i] = 0;
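Why the inline size of 6 matters for item (3): after coalescing, a typical binary op has three tensors and a 2-D loop, so get_strides() pushes exactly 3 * 2 = 6 entries. A small demonstration sketch, assuming c10/util/SmallVector.h and using placeholder stride values:

    #include <c10/util/SmallVector.h>
    #include <cstdint>

    using StrideVector = c10::SmallVector<int64_t, 6>;

    // ntensors * ndim = 3 * 2 = 6 entries fit exactly in the inline storage,
    // so no heap allocation occurs; an inline capacity of 5 (DimVector) would
    // spill to the heap on the sixth push_back for every such operation.
    StrideVector demo_strides() {
      StrideVector strides;
      for (int dim = 0; dim < 2; ++dim) {
        for (int arg = 0; arg < 3; ++arg) {
          strides.push_back(0);  // placeholder for operands_[arg].stride_bytes[dim]
        }
      }
      return strides;
    }

The compute_strides() hunk above implements item (4) of the summary: when offset == 0, every slot of stride_bytes is written by the loop that follows, so the zero-filling resize overload is only needed in the broadcasting case.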

aten/src/ATen/native/TensorIterator.h (9 additions, 7 deletions)

@@ -52,7 +52,7 @@
 // (See https://github.com/pytorch/pytorch/issues/9515)
 //
 // Note that TensorIterator currently supports type conversions on 0-dim
-// tensors and arithmetic operators. Other type conversions will raise an
+// tensors and arithmetic operators. Other type conversions will raise an
 // exception.
 
 namespace at {

(The first hunk is a whitespace-only change as rendered here.)

@@ -71,6 +71,7 @@ struct DimCounter {
 };
 
 struct CAFFE2_API OperandInfo {
+  using StrideVector = SmallVector<int64_t, 6>;
   OperandInfo() {}
   explicit OperandInfo(const Tensor& t) : tensor(t) {
     if (t.defined()) {
@@ -85,7 +86,7 @@ struct CAFFE2_API OperandInfo {
   }
 
   /// Stride after broadcasting. The stride is in bytes, not number of elements.
-  DimVector stride_bytes;
+  StrideVector stride_bytes;
 
   /// The tensor operand. Note that the strides, data pointer, and
   /// other attributes may differ due to dimension reordering and
@@ -134,6 +135,7 @@ enum class CommonDTypeStrategy : uint8_t {
 struct CAFFE2_API TensorIterator {
   using DimMask = std::bitset<64>;
   using PtrVector = SmallVector<char*, 4>;
+  using StrideVector = SmallVector<int64_t, 6>;
 
   TensorIterator() {}
 
@@ -254,16 +256,16 @@ struct CAFFE2_API TensorIterator {
   /// Create a strides array for a Tensor with shape of this iterator. The
   /// parameter `element_size` specifies the size of Tensor's data type in
   /// bytes (e.g. `4` for `float`)
-  DimVector compatible_stride(int element_size) const;
+  StrideVector compatible_stride(int element_size) const;
 
   /// Inverts the re-ordering done by reorder_dimensions. This can only be
   /// called *before* coalesce_dimensions() is called.
   DimVector invert_perm(IntArrayRef input) const;
 
   /// Helper functions for CPU iteration
-  DimVector get_dim_strides(int dim) const;
-  DimVector get_strides() const;
-  DimVector get_inner_strides() const { return get_dim_strides(0); }
+  StrideVector get_dim_strides(int dim) const;
+  StrideVector get_strides() const;
+  StrideVector get_inner_strides() const { return get_dim_strides(0); }
   PtrVector get_data_ptrs(ArrayRef<char*> base, IntArrayRef counter) const;
   PtrVector get_base_ptrs() const;
 
@@ -328,7 +330,7 @@ struct CAFFE2_API TensorIterator {
   void reorder_dimensions();
   void permute_dimensions(IntArrayRef perm);
   void compute_types();
-  std::tuple<Device, ScalarType> compute_common_type();
+  std::tuple<Device, ScalarType, bool> compute_common_type();
   void allocate_outputs();
 #ifdef BUILD_NAMEDTENSOR
   void compute_names();
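The consuming side of the widened compute_common_type() signature is the std::get<2>(common_type) check in the TensorIterator.cpp hunk earlier. Purely as an illustration of the shape of that check, the same unpacking with C++17 structured bindings (hypothetical; the committed code uses std::get, which does not require C++17):

    auto [device, dtype, all_same_type] = compute_common_type();
    if (!all_same_type) {
      // mixed dtypes: fall through to validate_dtype / maybe_promote_common_dtype
    }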
