Commit d68bd71

Speed-up "advanced" indexing operations
This speeds up "advanced" indexing (indexing a tensor by a tensor) on CPU and GPU. There's still a bunch of work to do, including speeding up indexing by a byte (boolean) mask and speeding up the derivative calculation for advanced indexing.

Here are some speed comparisons against indexing on master, using a little [benchmark script](https://gist.github.com/colesbury/c369db72aad594e5e032c8fda557d909) with 16 OpenMP threads and on a P100:

| Test case             | CPU (old vs. new)     | CUDA (old vs. new)   |
|-----------------------|-----------------------|----------------------|
| 1024x1024 -> 512x1024 | 225 us vs. **57 us**  | 297 us vs. **47 us** |
| 1024x1024 -> 1024x512 | 208 us vs. **153 us** | 335 us vs. **54 us** |
| 50x50 -> 20000x50     | 617 us vs. **77 us**  | 239 us vs. **54 us** |
| 50x50 -> 50x20000     | 575 us vs. **236 us** | 262 us vs. **58 us** |
| 2x5x10 -> 10          | 65 us vs. **18 us**   | 612 us vs. **93 us** |
1 parent 4574ea3 commit d68bd71

19 files changed: +555 -96 lines
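Before the per-file diffs, here is a minimal sketch of the operation the benchmark table above measures: "advanced" indexing gathers elements of one tensor using another tensor of indices. The factory calls below are assumptions about the ATen C++ API of the time and are not part of this commit; only `Tensor::index` comes from the changed headers.

```cpp
#include <ATen/ATen.h>

// Roughly the "1024x1024 -> 512x1024" benchmark case: gather 512 rows of a
// 1024x1024 matrix using a LongTensor of row indices (src[rows] in Python).
at::Tensor gather_rows() {
  at::Tensor src  = at::randn({1024, 1024});
  at::Tensor rows = at::randint(1024, {512}, at::kLong);  // assumed factory signature
  return src.index({rows});  // the native `index` op this commit rewrites
}
```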

aten/src/ATen/core/Tensor.h

Lines changed: 2 additions & 2 deletions
@@ -352,8 +352,8 @@ class CAFFE2_API Tensor {
   Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) const;
   Tensor index(TensorList indices) const;
   Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source);
-  Tensor index_put(TensorList indices, const Tensor & values) const;
-  Tensor & index_put_(TensorList indices, const Tensor & values);
+  Tensor index_put(TensorList indices, const Tensor & values, bool accumulate=false) const;
+  Tensor & index_put_(TensorList indices, const Tensor & values, bool accumulate=false);
   Tensor inverse() const;
   Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const;
   bool is_distributed() const;
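The new `accumulate` flag controls whether `index_put_` overwrites the indexed locations or adds into them. A minimal sketch of the difference; the factory calls are assumptions about the contemporaneous ATen API, not code from this commit:

```cpp
#include <ATen/ATen.h>

void accumulate_demo() {
  at::Tensor t    = at::zeros({5});
  at::Tensor idx  = at::zeros({3}, at::kLong);  // all three indices point at element 0
  at::Tensor vals = at::ones({3});

  t.index_put_({idx}, vals);                       // accumulate=false (default): a write wins, t[0] == 1
  t.index_put_({idx}, vals, /*accumulate=*/true);  // accumulate=true: all three values are added, t[0] += 3
}
```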

aten/src/ATen/core/TensorMethods.h

Lines changed: 4 additions & 4 deletions
@@ -314,11 +314,11 @@ inline Tensor Tensor::index(TensorList indices) const {
 inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) {
   return type().index_copy_(*this, dim, index, source);
 }
-inline Tensor Tensor::index_put(TensorList indices, const Tensor & values) const {
-  return type().index_put(*this, indices, values);
+inline Tensor Tensor::index_put(TensorList indices, const Tensor & values, bool accumulate) const {
+  return type().index_put(*this, indices, values, accumulate);
 }
-inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values) {
-  return type().index_put_(*this, indices, values);
+inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values, bool accumulate) {
+  return type().index_put_(*this, indices, values, accumulate);
 }
 inline Tensor Tensor::inverse() const {
   return type().inverse(*this);

aten/src/ATen/core/Type.h

Lines changed: 2 additions & 2 deletions
@@ -257,8 +257,8 @@ struct CAFFE2_API Type {
   virtual Tensor irfft(const Tensor & self, int64_t signal_ndim, bool normalized, bool onesided, IntList signal_sizes) const = 0;
   virtual Tensor index(const Tensor & self, TensorList indices) const = 0;
   virtual Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) const = 0;
-  virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values) const = 0;
-  virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values) const = 0;
+  virtual Tensor index_put(const Tensor & self, TensorList indices, const Tensor & values, bool accumulate) const = 0;
+  virtual Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & values, bool accumulate) const = 0;
   virtual Tensor inverse(const Tensor & self) const = 0;
   virtual Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan) const = 0;
   virtual bool is_distributed(const Tensor & self) const = 0;

aten/src/ATen/native/Indexing.cpp

Lines changed: 184 additions & 21 deletions
@@ -21,10 +21,12 @@
 // adjacent (e.g. x[[0, 1], :, [2, 3]]). In this case, self and the index
 // tensors are transposed to the front: x.transpose(1, 2)[[0, 1], [2, 3]]
 
+#include <ATen/native/Indexing.h>
 
-#include "ATen/ATen.h"
-#include "ATen/NativeFunctions.h"
-#include "ATen/ExpandUtils.h"
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/ExpandUtils.h>
+#include <ATen/native/TensorIterator.h>
 
 #include <algorithm>
 #include <functional>
@@ -33,6 +35,9 @@
 
 namespace at { namespace native {
 
+DEFINE_DISPATCH(index_stub);
+DEFINE_DISPATCH(index_put_stub);
+
 [[noreturn]]
 static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) {
   std::stringstream ss;
@@ -226,34 +231,192 @@ static std::tuple<Tensor, Tensor> makeLinearIndex(Tensor self, TensorList orig)
   return std::make_tuple(self, linearIndex);
 }
 
-Tensor index(const Tensor & self, TensorList indices) {
-  AT_CHECK(indices.size() <= (size_t)self.dim(),
-           "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")");
+static bool all_strides_match(TensorList tensors) {
+  AT_ASSERT(tensors.size() >= 1);
+  auto strides = tensors[0].strides();
+  for (auto& tensor : tensors.slice(1)) {
+    if (!strides.equals(tensor.strides())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static std::string shapes_as_str(TensorList tensors) {
+  std::ostringstream os;
+  bool first = true;
+  for (auto& tensor : tensors) {
+    if (tensor.defined()) {
+      if (!first) {
+        os << ", ";
+      }
+      os << tensor.sizes();
+      first = false;
+    }
+  }
+  return os.str();
+}
+
+struct AdvancedIndex {
+  AdvancedIndex(const Tensor& src, TensorList indices);
+
+  Tensor src;
+  std::vector<Tensor> indices;
+  DimVector indexed_sizes;
+  DimVector indexed_strides;
+  int64_t dims_before;
+  int64_t dims_after;
+};
+
+static Tensor restride_src(const Tensor& src, int64_t dims_before, int64_t dims_indexed,
+                           IntList replacement_shape) {
+  auto shape = DimVector(src.sizes());
+  auto strides = DimVector(src.strides());
+  int end = dims_before + dims_indexed;
+  shape.erase(shape.begin() + dims_before, shape.begin() + end);
+  strides.erase(strides.begin() + dims_before, strides.begin() + end);
+  shape.insert(shape.begin() + dims_before, replacement_shape.begin(), replacement_shape.end());
+  strides.insert(strides.begin() + dims_before, replacement_shape.size(), 0);
+  return src.as_strided(shape, strides);
+}
+
+static Tensor reshape_indexer(const Tensor& index, int64_t dims_before, int64_t dims_after) {
+  auto orig_shape = index.sizes();
+  auto shape = DimVector();
+  shape.append(dims_before, 1);
+  shape.append(orig_shape.begin(), orig_shape.end());
+  shape.append(dims_after, 1);
+  return index.reshape(shape);
+}
+
+AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list)
+{
+  int64_t element_size_bytes = src.type().elementSizeInBytes();
+  int dims_before = 0, dims_after = 0, dims_indexed = 0;
+  IntList replacement_shape;
+  for (size_t dim = 0; dim < indices_list.size(); dim++) {
+    if (!indices_list[dim].defined()) {
+      if (dims_indexed == 0) {
+        dims_before++;
+      } else {
+        dims_after++;
+      }
+    } else {
+      dims_indexed++;
+      replacement_shape = indices_list[dim].sizes();
+      indexed_sizes.push_back(src.size(dim));
+      indexed_strides.push_back(src.stride(dim) * element_size_bytes);
+    }
+  }
+
+  this->dims_before = dims_before;
+  this->dims_after = dims_after;
+  this->src = restride_src(src, dims_before, dims_indexed, replacement_shape);
+
+  for (auto& index : indices_list) {
+    if (index.defined()) {
+      indices.push_back(reshape_indexer(index, dims_before, dims_after));
+    }
+  }
 
-  Tensor src, linearIndex;
-  std::tie(src, linearIndex) = makeLinearIndex(self, indices);
-  return src.take(linearIndex);
+  // For CUDA tensors, force all index tensors to have the same striding to
+  // simplify the CUDA kernel.
+  if (indices.size() >= 2 && this->src.type().device_type() == kCUDA) {
+    if (!all_strides_match(indices)) {
+      for (size_t i = 0; i < indices.size(); i++) {
+        indices[i] = indices[i].contiguous();
+      }
+    }
+  }
 }
 
-Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value) {
+static AdvancedIndex make_info(Tensor self, TensorList orig) {
+  checkIndexTensorTypes(orig);
+  // first expand ByteTensor (boolean masks) into 1 or more LongTensors
+  auto indices = expandByteTensors(self, orig);
+  // next broadcast all index tensors together
+  try {
+    indices = expand_outplace(indices);
+  } catch (std::exception& e) {
+    AT_ERROR("shape mismatch: indexing tensors could not be broadcast together"
+             " with shapes ", shapes_as_str(indices));
+  }
+  // add missing null Tensors so that it matches self.dim()
+  while (indices.size() < (size_t)self.dim()) {
+    indices.emplace_back();
+  }
+  // if the non-null indices are not all adjacent, transpose self and indices
+  // together so that they're adjacent at the front
+  if (!hasContiguousSubspace(indices)) {
+    std::tie(self, indices) = transposeToFront(self, indices);
+  }
+  return AdvancedIndex(self, indices);
+}
+
+static Tensor make_bogus_tensor(const Tensor& self, const AdvancedIndex& info) {
+  auto shape = DimVector(info.src.sizes());
+  auto strides = DimVector(shape.size(), 0);
+  strides[strides.size() - 1] = 1;
+  for (int dim = strides.size() - 2; dim >= 0; dim--) {
+    strides[dim] = strides[dim + 1] * shape[dim + 1];
+  }
+  return info.src.as_strided(shape, strides);
+}
+
+static std::unique_ptr<TensorIterator> make_index_iterator(const AdvancedIndex& info) {
+  auto builder = TensorIterator::Builder();
+  builder.dont_compute_common_dtype();
+  builder.add_output(Tensor(), &info.src.type());
+  builder.add_input(info.src);
+  for (auto& index : info.indices) {
+    builder.add_input(index);
+  }
+  return builder.build();
+}
+
+static std::unique_ptr<TensorIterator> make_index_put_iterator(const AdvancedIndex& info, const Tensor& value) {
+  if (!is_expandable_to(value.sizes(), info.src.sizes())) {
+    AT_ERROR("shape mismatch: value tensor of shape ", value.sizes(),
+             " cannot be broadcast to indexing result of shape ", info.src.sizes());
+  }
+  auto builder = TensorIterator::Builder();
+  builder.dont_compute_common_dtype();
+  builder.dont_resize_outputs();
+  builder.add_output(info.src);
+  builder.add_input(value, &info.src.type());
+  for (auto& index : info.indices) {
+    builder.add_input(index);
+  }
+  return builder.build();
+}
+
+Tensor index(const Tensor & self, TensorList indices) {
   AT_CHECK(indices.size() <= (size_t)self.dim(),
            "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")");
 
-  Tensor src, linearIndex, expandedValue;
-  std::tie(src, linearIndex) = makeLinearIndex(self, indices);
-  std::tie(expandedValue) = expand_inplace(linearIndex, value);
-  Tensor dst = src.clone();
-  return dst.put_(linearIndex, expandedValue);
+  auto info = make_info(self, indices);
+  auto iter = make_index_iterator(info);
+  index_stub(iter->device_type(), *iter, info.indexed_sizes, info.indexed_strides);
+  return iter->output();
 }
 
-Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value) {
+Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value, bool accumulate) {
+  return self.clone().index_put_(indices, value, accumulate);
+}
+
+Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value, bool accumulate) {
   AT_CHECK(indices.size() <= (size_t)self.dim(),
            "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")");
-
-  Tensor src, linearIndex, expandedValue;
-  std::tie(src, linearIndex) = makeLinearIndex(self, indices);
-  std::tie(expandedValue) = expand_inplace(linearIndex, value);
-  return src.put_(linearIndex, expandedValue);
+  if (accumulate && self.type().device_type() == kCUDA) {
+    Tensor src, linearIndex, expandedValue;
+    std::tie(src, linearIndex) = makeLinearIndex(self, indices);
+    std::tie(expandedValue) = expand_inplace(linearIndex, value);
+    return src.put_(linearIndex, expandedValue, true);
+  }
+  auto info = make_info(self, indices);
+  auto iter = make_index_put_iterator(info, value);
+  index_put_stub(iter->device_type(), *iter, info.indexed_sizes, info.indexed_strides, accumulate);
+  return self;
 }
 
 Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) {
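The core trick in `AdvancedIndex` above is to give the indexed dimensions stride 0 in the restrided `src`, so the iterator walks an output-shaped iteration space, and to recover the real byte offset at runtime from the index values via `indexed_strides`. Below is a standalone sketch of that offset arithmetic for a 2-D column gather, in plain C++ with illustrative names; it is not code from the commit, only a model of what the kernels compute:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Emulate src[:, idx] for a 4x5 row-major float matrix and idx = {4, 0, 2}.
  std::vector<float> src(4 * 5);
  for (size_t i = 0; i < src.size(); i++) src[i] = static_cast<float>(i);
  const int64_t idx[3] = {4, 0, 2};

  // Analogue of indexed_strides: byte stride of the indexed dimension (dim 1).
  const int64_t indexed_stride = 1 * sizeof(float);

  std::vector<float> out(4 * 3);
  for (int64_t row = 0; row < 4; row++) {
    // Stride-0 view: the same row base pointer is reused for every output column;
    // only the index value decides how far to move within the row.
    const char* row_base = reinterpret_cast<const char*>(src.data() + row * 5);
    for (int64_t col = 0; col < 3; col++) {
      out[row * 3 + col] =
          *reinterpret_cast<const float*>(row_base + idx[col] * indexed_stride);
    }
  }
  std::printf("out row 0: %g %g %g\n", out[0], out[1], out[2]);  // prints 4 0 2
  return 0;
}
```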

aten/src/ATen/native/Indexing.h

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+#pragma once
+
+// Indexing tensors by tensors
+
+#include <ATen/ATen.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace at { namespace native {
+
+using index_fn = void(*)(TensorIterator &, IntList indexed_sizes, IntList indexed_strides);
+using index_put_fn = void(*)(TensorIterator &, IntList indexed_sizes, IntList indexed_strides, bool accumulate);
+
+DECLARE_DISPATCH(index_fn, index_stub);
+DECLARE_DISPATCH(index_put_fn, index_put_stub);
+
+}} // namespace at::native
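These declarations are filled in by per-backend kernel files, presumably elsewhere in the commit's 19 changed files but not shown in this excerpt. A hedged sketch of what the CPU-side wiring would look like; the file path and kernel body are placeholders, not the commit's actual kernel:

```cpp
// e.g. aten/src/ATen/native/cpu/IndexKernel.cpp (illustrative path)
#include <ATen/native/Indexing.h>
#include <ATen/native/TensorIterator.h>

namespace at { namespace native {
namespace {

void index_kernel(TensorIterator& iter, IntList indexed_sizes, IntList indexed_strides) {
  // Placeholder body: the real kernel walks the iterator, turning each set of
  // index values into a byte offset into src via indexed_strides, then copies.
}

}  // anonymous namespace

REGISTER_DISPATCH(index_stub, &index_kernel);

}}  // namespace at::native
```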
