
Commit 92b82ec

*_scatter ops should preserve input stride/storage_offset
ghstack-source-id: 484c48a
Pull Request resolved: #91029
1 parent: 2e46969
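The change makes the *_scatter ops return an output whose size/stride/storage_offset match the base ("self") argument instead of a fresh contiguous clone. As a rough Python-level illustration of the intended behavior (this snippet is not part of the commit; it just exercises torch.slice_scatter on a non-contiguous base):

import torch

base = torch.zeros(4, 6).t()        # non-contiguous view: size (6, 4), stride (1, 6)
src = torch.ones(2, 4)
out = torch.slice_scatter(base, src, dim=0, start=0, end=2)

# With this change the output is expected to inherit the base's layout
# rather than coming back as a contiguous clone.
print(base.stride(), out.stride())                  # expected: (1, 6) (1, 6)
print(base.storage_offset(), out.storage_offset())  # expected: 0 0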

8 files changed, +313 -196 lines changed

aten/src/ATen/FunctionalTensorWrapper.cpp

Lines changed: 8 additions & 4 deletions
@@ -146,10 +146,14 @@ functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_st
 void FunctionalTensorWrapper::commit_update() {
   auto storage_impl = functional_storage_impl();
   storage_impl->add_update(value_, view_metas_);
-  // Invariant: commit_update() is called during an inplace operation.
-  // Tensor inputs to the operation are synced before running the op,
-  // so the current tensor must be up-to-date with its alias at this point.
-  generation_ = storage_impl->generation();
+  // As an optimization, we used to mark the tensor here as "up-to-date".
+  // That way, code like:
+  //   x = torch.ones(1'000'000)
+  //   x[0].add_(1)
+  // doesn't result in an unnecessary materialization of the base.
+  // This optimization results in the slice temporarily having incorrect
+  // stride/storage_offset though, and DCE should handle that optimization anyway.
+  // generation_ = storage_impl->generation();
 }

 bool FunctionalTensorWrapper::is_up_to_date() const {

aten/src/ATen/MemoryOverlap.cpp

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@ MemOverlap has_internal_overlap(TensorImpl* t) {
     return MemOverlap::No;
   }

-  auto strides = t->strides();
-  auto sizes = t->sizes();
+  auto strides = t->sym_strides();
+  auto sizes = t->sym_sizes();
   for (const auto i : c10::irange(strides.size())) {
     if (strides[i] == 0 && sizes[i] > 1) {
       return MemOverlap::Yes;

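has_internal_overlap now reads the symbolic strides/sizes, but the check itself is unchanged: any dimension with stride 0 and size > 1 means several logical elements alias a single memory location. A small Python illustration of that pattern (not from the commit; it just shows the situation the loop above flags):

import torch

x = torch.ones(1).expand(10)   # size (10,), stride (0,): all 10 elements share one slot
print(x.size(), x.stride())    # torch.Size([10]) (0,)

# In-place writes into such a tensor are rejected by ATen's overlap checks,
# which is why clone_preserve_strides below falls back to a plain clone() for it.
try:
    x.add_(1)
except RuntimeError as e:
    print("refused:", e)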
aten/src/ATen/native/Copy.cpp

Lines changed: 1 addition & 26 deletions
@@ -8,6 +8,7 @@
 #include <ATen/native/quantized/Copy.h>
 #include <ATen/native/mps/Copy.h>
 #include <ATen/native/vulkan/ops/Copy.h>
+#include <ATen/native/TensorShape.h>
 #include <ATen/quantized/Quantizer.h>
 #include <ATen/vulkan/Context.h>
 #include <ATen/metal/Context.h>
@@ -278,32 +279,6 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   return self;
 }

-// NB: cribbed from https://github.com/pytorch/pytorch/pull/88198
-at::Tensor clone_preserve_strides(const at::Tensor& self) {
-  TORCH_INTERNAL_ASSERT(self.has_storage());
-  // In cases where the input tensor has internal memory overlap, we cannot actually
-  // preserve the strides/storage_offset of the input tensor, because
-  // *_scatter ops will try to copy_() into the cloned tensor.
-  // However, this should **never** show up in functionalized user code;
-  // most aten ops that try to mutate a tensor with internal memory overlap would error anyway.
-  //
-  // The one place that this does come up is in autograd - if there's a select_scatter
-  // in the forward, then autograd will generate one for the backward.
-  // If the input to the select_scatter is grad_output, then this could be an expanded tensor
-  // with internal overlap.
-  //if (at::has_internal_overlap(self) == at::MemOverlap::Yes) {
-  //  return self.clone();
-  //}
-  auto dtype_size = self.dtype().itemsize();
-  auto nbytes = self.storage().sym_nbytes();
-  TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
-  auto numel = nbytes / dtype_size;
-  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
-  auto clone = self_full_size.clone();
-  auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
-  return out;
-}
-
 Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
   // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
   // (1) It isn't exposed to the frontend (no python bindings)

aten/src/ATen/native/TensorShape.cpp

Lines changed: 41 additions & 4 deletions
@@ -3801,22 +3801,58 @@ std::vector<Tensor> unflatten_dense_tensors(const Tensor& flat, TensorList tenso
   return outputs;
 }

+
+// Clones a tensor by cloning the underlying storage that it came from,
+// which allows us to replicate the exact strides/storage_offset in the cloned tensor.
+// Note [*_scatter ops preserve strides]
+// In order for functionalization to preserve stride correctness, the *_scatter
+// operators that it calls must preserve the striding behavior of their inputs.
+// Specifically, the output of *_scatter(base, mutated_view, ...)
+// should have identical size/stride/storage_offset to "base".
+at::Tensor clone_preserve_strides(const at::Tensor& self) {
+  TORCH_INTERNAL_ASSERT(self.has_storage());
+  // In cases where the input tensor has internal memory overlap, we cannot actually
+  // preserve the strides/storage_offset of the input tensor, because
+  // *_scatter ops will try to copy_() into the cloned tensor.
+  // However, this should **never** show up in functionalized user code;
+  // most aten ops that try to mutate a tensor with internal memory overlap would error anyway.
+  //
+  // The one place that this does come up is in autograd - if there's a select_scatter
+  // in the forward, then autograd will generate one for the backward.
+  // If the input to the select_scatter is grad_output, then this could be an expanded tensor
+  // with internal overlap.
+  if (at::has_internal_overlap(self) == at::MemOverlap::Yes) {
+    return self.clone();
+  }
+  auto dtype_size = self.dtype().itemsize();
+  auto nbytes = self.storage().sym_nbytes();
+  TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
+  auto numel = nbytes / dtype_size;
+  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
+  auto clone = self_full_size.clone();
+  auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
+  return out;
+}
+
+
 at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.slice(dim, start, end, step);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
   return output;
 }
 at::Tensor select_scatter_symint(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::SymInt index) {
-  auto output = self.clone();
+  auto output = clone_preserve_strides(self);
   auto slice = output.select_symint(dim, index);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
   return output;
 }
 at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64_t offset, int64_t dim1, int64_t dim2) {
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.diagonal(offset, dim1, dim2);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
@@ -3825,7 +3861,8 @@ at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64
 at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
   // See Note [as_strided_scatter backward support]
   TORCH_INTERNAL_ASSERT(!self.requires_grad() || self.is_contiguous(), "as_strided_scatter is currently only supported for contiguous inputs");
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.as_strided_symint(size, stride, std::move(storage_offset));
   TORCH_CHECK(slice.sym_sizes() == src.sym_sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sym_sizes(), ", slice size = ", slice.sym_sizes());
   slice.copy_(src);

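For reference, the idea behind clone_preserve_strides above can be sketched at the Python level: view the entire underlying storage as a flat 1-D tensor, clone that, and re-apply the original size/stride/storage_offset on top of the clone. The helper below is a hypothetical sketch (it assumes a recent PyTorch with Tensor.untyped_storage()), not the ATen implementation itself:

import torch

def clone_preserve_strides_sketch(t: torch.Tensor) -> torch.Tensor:
    # Number of elements covered by the whole storage, not just this view.
    numel = t.untyped_storage().nbytes() // t.element_size()
    # View the full storage as a flat contiguous 1-D tensor and clone it.
    full_clone = t.as_strided((numel,), (1,), 0).clone()
    # Re-apply the original layout on top of the cloned storage.
    return full_clone.as_strided(t.size(), t.stride(), t.storage_offset())

base = torch.arange(24.).reshape(4, 6)[:, 1::2]   # size (4, 3), stride (6, 2), offset 1
out = clone_preserve_strides_sketch(base)
print(out.size(), out.stride(), out.storage_offset())   # expected: (4, 3) (6, 2) 1
torch.testing.assert_close(out, base)

Cloning the whole storage is what lets the result carry an arbitrary storage_offset and strides without aliasing the input; the internal-overlap fallback in the diff exists because a copy_() into an overlapping clone would be rejected.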
aten/src/ATen/native/TensorShape.h

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@
 
 namespace at {
 namespace native {
+
+TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self);
+
 inline bool cat_should_skip_tensor(const Tensor& t) {
   return t.numel() == 0 && t.dim() == 1;
 }
