
Commit c47bdd7

bdhirsh authored and pytorchmergebot committed
*_scatter ops should preserve input stride/storage_offset (#91029)
It turns out that we *do* need to update *_scatter ops to return the exact same strides as their inputs. I added a test to `test/test_functionalization.py`, which now trips thanks to Ed's functionalization stride debugging check. It only actually ends up tripping silent correctness if you try to .backward() on that function. Pull Request resolved: #91029 Approved by: https://github.com/ezyang
1 parent a329161 · commit c47bdd7

22 files changed (+353, -184 lines)
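To make the invariant concrete, here is a small Python illustration (not part of the diff; it just exercises the public `torch.select_scatter` binding of one of the ops touched below):

```python
import torch

# A non-contiguous base: transposing gives size (3, 3) with stride (1, 3).
base = torch.ones(3, 3).t()
src = torch.zeros(3)

out = torch.select_scatter(base, src, 0, 0)

# With this change, the *_scatter output replicates the base's exact layout,
# which functionalization (and autograd, when you call .backward()) relies on.
# Previously the output was a plain contiguous clone with stride (3, 1).
assert out.size() == base.size()
assert out.stride() == base.stride()
assert out.storage_offset() == base.storage_offset()
```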

aten/src/ATen/FunctionalTensorWrapper.cpp

Lines changed: 8 additions & 4 deletions
@@ -146,10 +146,14 @@ functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_st
 void FunctionalTensorWrapper::commit_update() {
   auto storage_impl = functional_storage_impl();
   storage_impl->add_update(value_, view_metas_);
-  // Invariant: commit_update() is called during an inplace operation.
-  // Tensor inputs to the operation are synced before running the op,
-  // so the current tensor must be up-to-date with its alias at this point.
-  generation_ = storage_impl->generation();
+  // As an optimization, we used to mark the tensor here as "up-to-date".
+  // That way, code like:
+  //   x = torch.ones(1'000'000)
+  //   x[0].add_(1)
+  // doesn't result in an unnecessary materialization of the base.
+  // This optimization results in the slice temporarily having incorrect
+  // stride/storage_offset though, and DCE should handle that optimization anyway.
+  // generation_ = storage_impl->generation();
 }

 bool FunctionalTensorWrapper::is_up_to_date() const {
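For context, `commit_update()` runs when functionalization propagates a mutation on a view back to its base. A rough way to see that rewrite from Python (assuming the functorch `functionalize`/`make_fx` APIs, which are not part of this diff):

```python
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from functorch import functionalize  # torch.func.functionalize on newer releases

def f(x):
    # In-place add on a view: x[0] is a select into x. Functionalization
    # rewrites this into select + add + select_scatter on the base,
    # which is why select_scatter's output strides must match the base.
    x[0].add_(1)
    return x

g = make_fx(functionalize(f))(torch.ones(8))
print(g.code)  # expect aten.select / aten.add / aten.select_scatter in the graph
```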

aten/src/ATen/MemoryOverlap.cpp

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@ MemOverlap has_internal_overlap(TensorImpl* t) {
     return MemOverlap::No;
   }

-  auto strides = t->strides();
-  auto sizes = t->sizes();
+  auto strides = t->sym_strides();
+  auto sizes = t->sym_sizes();
   for (const auto i : c10::irange(strides.size())) {
     if (strides[i] == 0 && sizes[i] > 1) {
       return MemOverlap::Yes;
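The check above flags "internal overlap": a zero stride paired with a size greater than one means several indices alias one memory location. A quick illustration in plain PyTorch (not part of this diff):

```python
import torch

t = torch.ones(1, 4).expand(3, 4)  # expand() sets stride 0 on the broadcast dim
print(t.size(), t.stride())        # torch.Size([3, 4]) (0, 1) -> internal overlap

# In-place writes to such a tensor are rejected, which is why
# clone_preserve_strides (added later in this commit) falls back to a
# plain clone() for overlapping inputs:
try:
    t.add_(1)
except RuntimeError as e:
    print(e)
```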

aten/src/ATen/functorch/BatchRulesScatterOps.cpp

Lines changed: 8 additions & 1 deletion
@@ -5,13 +5,14 @@
 // LICENSE file in the root directory of this source tree.

 #include <ATen/functorch/BatchRulesHelper.h>
-#include <iostream>
 #include <ATen/Operators.h>
 #include <ATen/functorch/PlumbingHelper.h>
 #include <ATen/functorch/BatchedFallback.h>
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <ATen/native/IndexKernel.h>
 #include <ATen/native/IndexingUtils.h>
+#include <iostream>
+#include <torch/library.h>


 namespace at { namespace functorch {
@@ -1074,6 +1075,12 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
   VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule);
   VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule);
+  // as_strided_scatter does not work with the for-loop fallback today,
+  // because as_strided_scatter will return an output that matches
+  // the strides/storage_offset of its input.
+  // With the for-loop fallback, each input tensor is a slice into
+  // the larger batched tensor.
+  m.impl("as_strided_scatter", torch::CppFunction::makeFromBoxedFunction<&vmapErrorFallback>());
 }

 }}
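A sketch of the failure mode this guards against (assuming functorch's `vmap`; the error text comes from `vmapErrorFallback`, added below):

```python
import torch
from functorch import vmap  # torch.func.vmap on newer releases

base = torch.zeros(8, 4)  # a batch of 8 bases, each of shape (4,)
src = torch.ones(8, 2)    # a batch of 8 sources, each of shape (2,)

def f(b, s):
    return torch.as_strided_scatter(b, s, size=(2,), stride=(1,), storage_offset=0)

# Under the for-loop fallback, each per-example `b` is a slice into the
# batched tensor, so preserving its storage_offset in the output would be
# wrong. As of this commit, vmap raises instead of silently doing that:
try:
    vmap(f)(base, src)
except RuntimeError as e:
    print(e)
```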

aten/src/ATen/functorch/BatchedFallback.cpp

Lines changed: 4 additions & 0 deletions
@@ -396,5 +396,9 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
   }
 }

+void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  TORCH_CHECK(false, "Error: ", op.operator_name(), " requires special handling, and does not yet have a batching rule. Feel free to file a github issue!");
+}
+
 }
 } // namespace at

aten/src/ATen/functorch/BatchedFallback.h

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@ namespace functorch {
 // write batching rules for operators whenever possible.
 void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);

+void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+
 // The vmap fallback emits a warning by default, but it may be disabled if
 // the user finds it to be too annoying.
 TORCH_API bool isVmapFallbackWarningEnabled();

aten/src/ATen/native/Copy.cpp

Lines changed: 1 addition & 26 deletions
@@ -8,6 +8,7 @@
 #include <ATen/native/quantized/Copy.h>
 #include <ATen/native/mps/Copy.h>
 #include <ATen/native/vulkan/ops/Copy.h>
+#include <ATen/native/TensorShape.h>
 #include <ATen/quantized/Quantizer.h>
 #include <ATen/vulkan/Context.h>
 #include <ATen/metal/Context.h>
@@ -278,32 +279,6 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   return self;
 }

-// NB: cribbed from https://github.com/pytorch/pytorch/pull/88198
-at::Tensor clone_preserve_strides(const at::Tensor& self) {
-  TORCH_INTERNAL_ASSERT(self.has_storage());
-  // In cases where the input tensor has internal memory overlap, we cannot actually
-  // preserve the strides/storage_offset of the input tensor, because
-  // *_scatter ops will try to copy_() into the cloned tensor.
-  // However, this should **never** show up in functionalized user code;
-  // most aten ops that try to mutate a tensor with internal memory overlap would error anyway.
-  //
-  // The one place that this does come up is in autograd - if there's a select_scatter
-  // in the forward, then autograd will generate one for the backward.
-  // If the input to the select_scatter is grad_output, then this could be an expanded tensor
-  // with internal overlap.
-  //if (at::has_internal_overlap(self) == at::MemOverlap::Yes) {
-  //  return self.clone();
-  //}
-  auto dtype_size = self.dtype().itemsize();
-  auto nbytes = self.storage().sym_nbytes();
-  TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
-  auto numel = nbytes / dtype_size;
-  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
-  auto clone = self_full_size.clone();
-  auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
-  return out;
-}
-
 Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
   // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
   // (1) It isn't exposed to the frontend (no python bindings)
aten/src/ATen/native/TensorShape.cpp

Lines changed: 41 additions & 4 deletions
@@ -3801,22 +3801,58 @@ std::vector<Tensor> unflatten_dense_tensors(const Tensor& flat, TensorList tenso
   return outputs;
 }

+
+// Clones a tensor by cloning the underlying storage that it came from,
+// which allows us to replicate the exact strides/storage_offset in the cloned tensor.
+// Note [*_scatter ops preserve strides]
+// In order for functionalization to preserve stride correctness, the *_scatter
+// operators that it calls must preserve the striding behavior of their inputs.
+// Specifically, the output of *_scatter(base, mutated_view, ...)
+// should have identical size/stride/storage_offset to "base".
+at::Tensor clone_preserve_strides(const at::Tensor& self) {
+  TORCH_INTERNAL_ASSERT(self.has_storage());
+  // In cases where the input tensor has internal memory overlap, we cannot actually
+  // preserve the strides/storage_offset of the input tensor, because
+  // *_scatter ops will try to copy_() into the cloned tensor.
+  // However, this should **never** show up in functionalized user code;
+  // most aten ops that try to mutate a tensor with internal memory overlap would error anyway.
+  //
+  // The one place that this does come up is in autograd - if there's a select_scatter
+  // in the forward, then autograd will generate one for the backward.
+  // If the input to the select_scatter is grad_output, then this could be an expanded tensor
+  // with internal overlap.
+  if (at::has_internal_overlap(self) == at::MemOverlap::Yes) {
+    return self.clone();
+  }
+  auto dtype_size = self.dtype().itemsize();
+  auto nbytes = self.storage().sym_nbytes();
+  TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
+  auto numel = nbytes / dtype_size;
+  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
+  auto clone = self_full_size.clone();
+  auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
+  return out;
+}
+
+
 at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) {
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.slice(dim, start, end, step);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
   return output;
 }
 at::Tensor select_scatter_symint(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::SymInt index) {
-  auto output = self.clone();
+  auto output = clone_preserve_strides(self);
   auto slice = output.select_symint(dim, index);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
   return output;
 }
 at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64_t offset, int64_t dim1, int64_t dim2) {
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.diagonal(offset, dim1, dim2);
   TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
   slice.copy_(src);
@@ -3825,7 +3861,8 @@ at::Tensor diagonal_scatter(const at::Tensor& self, const at::Tensor& src, int64
 at::Tensor as_strided_scatter_symint(const at::Tensor& self, const at::Tensor& src, at::SymIntArrayRef size, at::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
   // See Note [as_strided_scatter backward support]
   TORCH_INTERNAL_ASSERT(!self.requires_grad() || self.is_contiguous(), "as_strided_scatter is currently only supported for contiguous inputs");
-  auto output = self.clone();
+  // See Note [*_scatter ops preserve strides]
+  auto output = clone_preserve_strides(self);
   auto slice = output.as_strided_symint(size, stride, std::move(storage_offset));
   TORCH_CHECK(slice.sym_sizes() == src.sym_sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sym_sizes(), ", slice size = ", slice.sym_sizes());
   slice.copy_(src);
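The helper's logic, restated as a Python sketch (a hypothetical translation, not code from the commit; `untyped_storage()` is the Python-side storage accessor in current PyTorch, and the internal-overlap fast path is reduced to a comment):

```python
import torch

def clone_preserve_strides_sketch(self: torch.Tensor) -> torch.Tensor:
    # The C++ helper first handles tensors with internal memory overlap by
    # falling back to a plain clone(); that branch is omitted here.
    nbytes = self.untyped_storage().nbytes()
    dtype_size = self.element_size()
    assert nbytes % dtype_size == 0
    numel = nbytes // dtype_size

    # View the *whole* storage as a flat contiguous tensor, clone it, then
    # re-view the clone with the original size/stride/storage_offset.
    full = self.as_strided((numel,), (1,), 0)
    clone = full.clone()
    return clone.as_strided(self.size(), self.stride(), self.storage_offset())

# The clone matches the input's layout exactly, unlike a plain clone():
x = torch.ones(4, 4).t()[1:, :]  # non-contiguous, with a nonzero storage_offset
y = clone_preserve_strides_sketch(x)
assert y.stride() == x.stride() and y.storage_offset() == x.storage_offset()
```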

aten/src/ATen/native/TensorShape.h

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@

 namespace at {
 namespace native {
+
+TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self);
+
 inline bool cat_should_skip_tensor(const Tensor& t) {
   return t.numel() == 0 && t.dim() == 1;
 }

test/functorch/test_aotdispatch.py

Lines changed: 40 additions & 30 deletions
@@ -613,8 +613,8 @@ def forward(self, primals_1):
     t_1 = torch.ops.aten.t.default(clone); clone = None
     select_scatter = torch.ops.aten.select_scatter.default(t_1, mul, 0, 0); t_1 = mul = None
     t_2 = torch.ops.aten.t.default(select_scatter); select_scatter = None
-    t_3 = torch.ops.aten.t.default(t_2); t_2 = None
-    return [t_3, 3, 3, 1, 3, 0]""")
+    t_4 = torch.ops.aten.t.default(t_2); t_2 = None
+    return [t_4, 3, 3, 1, 3, 0]""")

     def test_view_and_inplace_view(self):
         def f(a, b):
@@ -683,11 +683,12 @@ def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
     mul = torch.ops.aten.mul.Tensor(as_strided_1, 2); as_strided_1 = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = None
-    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
-    t_1 = torch.ops.aten.t.default(as_strided_5); as_strided_5 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = mul = None
+    as_strided_3 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0)
+    as_strided_6 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
+    t_1 = torch.ops.aten.t.default(as_strided_6); as_strided_6 = None
     mul_1 = torch.ops.aten.mul.Tensor(t_1, 3); t_1 = None
-    return [mul, mul_1, 4, 1, 0]""")
+    return [as_strided_3, mul_1, 4, 1, 0]""")

     def test_input_mutation_aliases_other_input(self):
         def f(a, b):
@@ -712,10 +713,11 @@ def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided = torch.ops.aten.as_strided.default(clone, [2], [1], 0)
     add = torch.ops.aten.add.Tensor(as_strided, 1); as_strided = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0); clone = None
-    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [2], [1], 2); as_strided_scatter = None
-    add_1 = torch.ops.aten.add.Tensor(add, as_strided_4); as_strided_4 = None
-    return [add, add_1]""")
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0); clone = add = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [2], [1], 0)
+    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [2], [1], 2); as_strided_scatter = None
+    add_1 = torch.ops.aten.add.Tensor(as_strided_2, as_strided_5); as_strided_5 = None
+    return [as_strided_2, add_1]""")

     def test_input_mutation_aliases_other_input2(self):
         def f(a, b):
@@ -736,10 +738,11 @@ def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided = torch.ops.aten.as_strided.default(clone, [2], [1], 0)
     add = torch.ops.aten.add.Tensor(as_strided, 1); as_strided = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0); clone = None
-    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [2, 2], [2, 1], 0); as_strided_scatter = None
-    add_1 = torch.ops.aten.add.Tensor(add, as_strided_4); as_strided_4 = None
-    return [add, add_1]""")
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0); clone = add = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [2], [1], 0)
+    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [2, 2], [2, 1], 0); as_strided_scatter = None
+    add_1 = torch.ops.aten.add.Tensor(as_strided_2, as_strided_5); as_strided_5 = None
+    return [as_strided_2, add_1]""")

     def test_input_mutation_aliases_and_output_alias(self):
         def f(a, b):
@@ -758,9 +761,11 @@ def inp_callable():
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
-    as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0); clone = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
     add = torch.ops.aten.add.Tensor(as_strided, 1); as_strided = None
-    return [add, 4, 1, 0]""")
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [4], [1], 0); clone = add = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
+    return [as_strided_2, 4, 1, 0]""")

     def test_input_aliased_with_mutation_output_alias(self):
         def f(a, b, c):
@@ -783,10 +788,12 @@ def inp_callable():
         self.assertExpectedInline(fw_graph.code.strip(), """\
 def forward(self, primals_1, primals_2):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
-    as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0); clone = None
+    as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
     mul = torch.ops.aten.mul.Tensor(as_strided_1, 2); as_strided_1 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = mul = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
     add = torch.ops.aten.add.Tensor(primals_2, 1); primals_2 = None
-    return [mul, add, 4, 1, 0]""")
+    return [as_strided_2, add, 4, 1, 0]""")

     def test_input_metadata_mutation_aliases(self):
         def f(a, b):
@@ -829,11 +836,12 @@ def forward(self, primals_1, primals_2):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
     mul = torch.ops.aten.mul.Tensor(as_strided, 2); as_strided = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = None
-    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
-    add = torch.ops.aten.add.Tensor(as_strided_2, 1); as_strided_2 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = mul = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0)
+    as_strided_3 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
+    add = torch.ops.aten.add.Tensor(as_strided_3, 1); as_strided_3 = None
     add_1 = torch.ops.aten.add.Tensor(primals_2, 1); primals_2 = None
-    return [mul, add, add_1]""")
+    return [as_strided_2, add, add_1]""")

     def test_input_mutation_aliases_bases_out_of_order(self):
         # This tests our calling convention: if b and d are aliased, then the outer calling convention
@@ -864,12 +872,13 @@ def forward(self, primals_1, primals_2, primals_3):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
     add = torch.ops.aten.add.Tensor(as_strided, 1); as_strided = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [4], [1], 0); clone = add = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0)
     add_1 = torch.ops.aten.add.Tensor(primals_2, primals_3); primals_2 = primals_3 = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [4], [1], 0); clone = None
-    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
-    t_1 = torch.ops.aten.t.default(as_strided_4); as_strided_4 = None
+    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
+    t_1 = torch.ops.aten.t.default(as_strided_5); as_strided_5 = None
     add_2 = torch.ops.aten.add.Tensor(add_1, t_1); add_1 = t_1 = None
-    return [add, add_2, 4, 1, 0, 4, 1, 0]""")
+    return [as_strided_2, add_2, 4, 1, 0, 4, 1, 0]""")

     # Mondo test that tests a combination of:
     # input is mutated, that aliases another input (so we make a synthetic base)
@@ -913,10 +922,11 @@ def forward(self, primals_1, primals_2):
     clone = torch.ops.aten.clone.default(primals_1); primals_1 = None
     as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
    mul = torch.ops.aten.mul.Tensor(as_strided_1, 2); as_strided_1 = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = None
-    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
-    add = torch.ops.aten.add.Tensor(as_strided_4, mul); as_strided_4 = None
-    return [mul, add, 2, 2, 1, 2, 0, 2, 2, 2, 1, 0]""")
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0); clone = mul = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0)
+    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0); as_strided_scatter = None
+    add = torch.ops.aten.add.Tensor(as_strided_5, as_strided_2); as_strided_5 = None
+    return [as_strided_2, add, 2, 2, 1, 2, 0, 2, 2, 2, 1, 0]""")

     def test_no_grad_input_output(self):
         def f(a, b):
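For readers unfamiliar with these tests: the expected strings above are AOTAutograd forward graphs for functions that mutate (possibly aliased) inputs. A rough standalone harness for producing such a graph (an assumed setup, not the helpers the real tests use):

```python
import torch
from functorch.compile import aot_function, nop

def f(a):
    a.mul_(2)   # input mutation: functionalized into clone + mul (+ *_scatter for aliased inputs)
    return a * 3

def print_compiler(gm, example_inputs):
    print(gm.code)  # a forward graph analogous to the expected strings above
    return gm

compiled = aot_function(f, fw_compiler=print_compiler, bw_compiler=nop)
compiled(torch.ones(4, requires_grad=True))
```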
