
Commit 680c177

Update on "[Quant][FX] Add backend config for onednn backend and fuse Linear-LeakyReLU"
**Summary**
Add a backend config for the onednn backend so that it can support more post-op fusion for int8 inference. As a first step, `Linear - LeakyReLU` fusion is implemented, building on previous PRs.

**Test plan**
python test_quantization.py TestFuseFx

cc jerryzh168 jianyuh raghuramank100 jamesr66a vkuzo jgong5 leslie-fang-intel mingfeima XiaobingSuper sanchitintel ashokei jingxu10

[ghstack-poisoned]
2 parents 700d4eb + f52839f commit 680c177
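For context, here is a minimal sketch of how a backend config like this is consumed from the FX quantization API. The entry point `get_onednn_backend_config` and the `"onednn"` qconfig backend string are assumptions based on how this stack was later exposed in torch.ao.quantization; exact names at this commit may differ:

```python
import torch
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
# Assumed entry point; this PR stack adds an onednn BackendConfig.
from torch.ao.quantization.backend_config import get_onednn_backend_config

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        self.leaky_relu = torch.nn.LeakyReLU(0.01)

    def forward(self, x):
        return self.leaky_relu(self.linear(x))

m = M().eval()
example_inputs = (torch.randn(1, 8),)
backend_config = get_onednn_backend_config()
qconfig_mapping = get_default_qconfig_mapping("onednn")

prepared = prepare_fx(m, qconfig_mapping, example_inputs,
                      backend_config=backend_config)
prepared(*example_inputs)  # calibration pass
quantized = convert_fx(prepared, backend_config=backend_config)
# With the onednn config, Linear + LeakyReLU is matched as one fusion
# pattern and quantized as a single unit for int8 inference.
```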

File tree

144 files changed: +2604 −2050 lines changed


.github/ci_commit_pins/multipy.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+7dd29931fa8e9bb7c970f05f8c0dc13b69e17494
```

.github/ci_commit_pins/xla.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-d43ed3c4d24ea17df0a57fed7f338a9c7b7d1ef0
+06cc6d4f596e764502e3ce5ab998c8986fd52e5b
```

.jenkins/pytorch/common_utils.sh

Lines changed: 5 additions & 1 deletion
```diff
@@ -158,10 +158,12 @@ function setup_torchdeploy_deps(){

 function checkout_install_torchdeploy() {
   local commit
+  commit=$(get_pinned_commit multipy)
   setup_torchdeploy_deps
   pushd ..
   git clone --recurse-submodules https://github.com/pytorch/multipy.git
   pushd multipy
+  git checkout "${commit}"
   python multipy/runtime/example/generate_examples.py
   pip install -e . --install-option="--cudatests"
   popd
@@ -197,7 +199,9 @@ function checkout_install_torchbench() {
   git clone https://github.com/pytorch/benchmark torchbench
   pushd torchbench
   git checkout no_torchaudio
-  python install.py
+  # Occasionally the installation may fail on one model but it is ok to continue
+  # to install and test other models
+  python install.py --continue_on_fail
   popd
 }

```

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -783,6 +783,7 @@ include_patterns = [
     'benchmarks/dynamo/**/*.py',
     'torch/_inductor/**/*.py',
     'test/inductor/**/*.py',
+    'test/test_weak.py',
     'torch/onnx/**/*.py',
     'torch/package/**/*.py',
     'torch/_decomp/**/*.py',
```

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -90,6 +90,7 @@ To develop PyTorch on your machine, here are some tips:
 * Python >= 3.7 (3.7.6+ recommended)

 ### Instructions
+_**Note**: If you get stuck at any step, check out the [tips and debugging](#tips-and-debugging) section below for common solutions_

 1. Uninstall all existing PyTorch installs. You may need to run `pip
    uninstall torch` multiple times. You'll know `torch` is fully
```

aten/src/ATen/FunctionalStorageImpl.cpp

Lines changed: 10 additions & 0 deletions
```diff
@@ -65,6 +65,16 @@ const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Ten


 c10::SymInt get_nbytes(const Tensor& value) {
+  // The functionalization story when wrapping tensors that don't have storage
+  // is a bit wonky, but fortunately for some models (e.g., dlrm) we never
+  // actually perform mutations on these tensors, so you never really get
+  // called out on it. For now, functionalization still creates "storages"
+  // for these tensors (which is wrong), but we don't give them any space.
+  // A more proper fix would be to have a SparseFunctionalTensorWrapper that
+  // models sparse correctly.
+  if (value.is_sparse()) {
+    return 0;
+  }
   if (value.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
     // Today, the two implementations of SymInt are in Python (proxy tensor),
     // and lazy tensor (LTC/XLA).
```
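The reason nbytes is ill-defined here is that a sparse tensor carries no single dense storage; its data is split across an indices tensor and a values tensor. A quick Python illustration using the standard torch API (not code from this commit):

```python
import torch

# Two non-zero entries out of 1000; the data lives in an indices tensor
# and a values tensor, not in one contiguous buffer.
s = torch.sparse_coo_tensor(
    indices=torch.tensor([[3, 512]]),
    values=torch.tensor([1.0, 2.0]),
    size=(1000,),
).coalesce()

print(s.is_sparse)        # True
print(s.indices().shape)  # torch.Size([1, 2])
print(s.values().shape)   # torch.Size([2])
# There is no single storage whose byte count describes `s`, which is
# why get_nbytes() above reports 0 for sparse tensors for now.
```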

aten/src/ATen/core/boxing/KernelFunction_impl.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -64,7 +64,7 @@ inline typename remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {

 template <>
 inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(c10::SymIntArrayRef x) {
-  return C10_AS_INTARRAYREF_SLOW(x);
+  return c10::asIntArrayRefSlow(x);
 }

 template <>
@@ -74,7 +74,7 @@ inline typename remove_symint<c10::optional<c10::SymInt>>::type unpackSymInt(c10

 template <>
 inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(at::OptionalSymIntArrayRef x) {
-  return x.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) : c10::nullopt;
+  return x.has_value() ? c10::make_optional(c10::asIntArrayRefSlow(*x)) : c10::nullopt;
 }

 template<class Return, class... Args>
```

aten/src/ATen/core/union_type.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -228,7 +228,7 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
 }

 UnionTypePtr UnionType::create(std::vector<TypePtr> reference) {
-  auto union_type = new UnionType(std::move(reference));
+  UnionTypePtr union_type(new UnionType(std::move(reference)));

   // Some very special-cased logic for `Optional`. This will be deleted
   // in a later PR
@@ -267,7 +267,7 @@ UnionTypePtr UnionType::create(std::vector<TypePtr> reference) {
    }
  }

-  return UnionTypePtr(union_type);
+  return union_type;
 }

 c10::optional<TypePtr> UnionType::subtractTypeSet(std::vector<TypePtr>& to_subtract) const {
```

aten/src/ATen/cuda/CUDAGeneratorImpl.cpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -101,6 +101,7 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index)
   : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index),
               DispatchKeySet(c10::DispatchKey::CUDA)} {
   at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl");
+  no_reset_rnn_state_.clear();
 }

 /**
@@ -113,6 +114,7 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
   at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed");
   seed_ = seed;
   philox_offset_per_thread_ = 0;
+  no_reset_rnn_state_.clear();
 }

 #define CAPTURE_DEFAULT_GENS_MSG \
```
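At the Python level, the intended effect (as read from this change) is that re-seeding the CUDA generator invalidates cached cuDNN RNN dropout state, so an RNN re-run under the same seed reproduces its dropout masks. A hedged repro sketch, requiring a CUDA build with cuDNN; the printed result reflects the goal of this change, not something verified here:

```python
import torch

if torch.cuda.is_available():
    rnn = torch.nn.LSTM(16, 16, num_layers=2, dropout=0.5).cuda().train()
    x = torch.ones(5, 3, 16, device="cuda")

    torch.cuda.manual_seed(42)  # clears no_reset_rnn_state_ ...
    y1, _ = rnn(x)              # ... so this call regenerates dropout state

    torch.cuda.manual_seed(42)  # same seed, flag cleared again
    y2, _ = rnn(x)

    # With the reset applied, both runs should use identical dropout masks.
    print(torch.allclose(y1, y2))
```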

aten/src/ATen/cuda/CUDAGeneratorImpl.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 #include <ATen/cuda/detail/PhiloxCudaStateRaw.cuh>
 #include <ATen/Context.h>
 #include <limits>
+#include <atomic>

 namespace at {
 /**
@@ -104,6 +105,10 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   uint64_t capture_epilogue();
   PhiloxCudaState philox_cuda_state(uint64_t increment);

+  bool reset_rnn_state() {
+    return !no_reset_rnn_state_.test_and_set();
+  }
+
   // Temporarily accommodates call sites that use philox_engine_inputs.
   // Allows incremental refactor of call sites to use philox_cuda_state.
   std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
@@ -118,6 +123,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
   int64_t* offset_extragraph_{};
   uint32_t offset_intragraph_ = 0;
   bool graph_expects_this_gen_ = false;
+  std::atomic_flag no_reset_rnn_state_;
 };

 namespace cuda {
```
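The `std::atomic_flag` added above implements a consume-once latch: `clear()` arms it (on construction and whenever the seed changes), and the first subsequent `reset_rnn_state()` call returns true while atomically disarming it, so exactly one caller sees the pending reset. A rough Python sketch of the same pattern, for illustration only (a lock stands in for the atomic flag):

```python
import threading

class ResetLatch:
    """Consume-once latch mirroring the atomic_flag pattern above.

    arm() marks that a reset is pending (analogous to
    no_reset_rnn_state_.clear()); consume() returns True exactly once
    per arming (analogous to !no_reset_rnn_state_.test_and_set()).
    A sketch for illustration, not the PyTorch implementation.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._armed = True  # a freshly constructed generator starts armed

    def arm(self):
        with self._lock:
            self._armed = True

    def consume(self):
        with self._lock:
            was_armed, self._armed = self._armed, False
            return was_armed

latch = ResetLatch()
assert latch.consume() is True   # first caller sees the pending reset
assert latch.consume() is False  # subsequent callers do not
latch.arm()                      # e.g., the seed changed
assert latch.consume() is True
```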
