Commit 08b98ca

Author: Michael Andreas Dagitses (committed)

Update on "switch Bazel to the shared generate-code genrule"

We were building it before, but now we use it in downstream rules. This enables us to eliminate the handwritten genrule.

Differential Revision: [D35645390](https://our.internmc.facebook.com/intern/diff/D35645390/)

[ghstack-poisoned]

2 parents: ae27cd6 + 7a5f331

File tree: 79 files changed, +2436 / -464 lines


.github/actions/teardown-rocm/action.yml

Lines changed: 9 additions & 2 deletions
@@ -14,5 +14,12 @@ runs:
       docker stop $(docker ps -q) || true
       # Prune all of the docker containers
       docker container prune -f
-      # Prune all of the docker images older than 1 day
-      docker system prune -af --filter "until=24h"
+      # Prune everything docker if there are more than 10 images (~200GB).
+      # This is easier than using a time filter, e.g., "until=24h".
+      image_count=$(docker images | wc -l)
+      if [[ ${image_count} -gt 10 ]]; then
+        echo "Purging all docker caches"
+        docker system prune -af
+      else
+        echo "Will not purge docker, only ${image_count} images found"
+      fi
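For illustration, here is a minimal Python sketch of the same threshold check, assuming the docker CLI is on PATH. The helper name maybe_prune_docker and its threshold parameter are hypothetical, not part of this commit (and note the shell step counts "docker images | wc -l", which includes the header line):

import subprocess

def maybe_prune_docker(threshold: int = 10) -> None:
    # "docker images -q" prints one image ID per line, so the line count is the image count.
    result = subprocess.run(["docker", "images", "-q"],
                            capture_output=True, text=True, check=True)
    image_count = len(result.stdout.splitlines())
    if image_count > threshold:
        print("Purging all docker caches")
        subprocess.run(["docker", "system", "prune", "-af"], check=True)
    else:
        print(f"Will not purge docker, only {image_count} images found")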

.jenkins/pytorch/test.sh

Lines changed: 7 additions & 2 deletions
@@ -40,6 +40,11 @@ PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}}
 
 if [[ $TEST_CONFIG == 'default' ]]; then
   export CUDA_VISIBLE_DEVICES=0
+  export HIP_VISIBLE_DEVICES=0
+fi
+
+if [[ $TEST_CONFIG == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HIP_VISIBLE_DEVICES=0,1
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then
@@ -51,8 +56,8 @@ if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
   export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1
 fi
 
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-  # Used so that only cuda specific versions of tests are generated
+if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  # Used so that only cuda/rocm specific versions of tests are generated
   # mainly used so that we're not spending extra cycles testing cpu
   # devices on expensive gpu machines
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
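As a rough sketch of what these variables do from the Python side (assuming a ROCm build of PyTorch, where HIP devices are exposed through the torch.cuda API):

import os

# Must be set before torch initializes the GPU runtime.
os.environ["HIP_VISIBLE_DEVICES"] = "0,1"   # ROCm builds; CUDA builds use CUDA_VISIBLE_DEVICES

import torch

# On a ROCm machine with more GPUs, only the two visible devices should be reported.
print(torch.cuda.device_count())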

aten/src/ATen/autocast_mode.cpp

Lines changed: 1 addition & 2 deletions
@@ -541,8 +541,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(ADD_NS(quantile), "quantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
   KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool, c10::string_view), fp32)
   KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
-  KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::optional<bool>, c10::optional<bool>), fp32)
-  KERNEL_CPU(ADD_NS(stft), "stft.center", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::string_view, bool, c10::optional<bool>, c10::optional<bool>), fp32)
+  KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::string_view, bool, c10::optional<bool>, c10::optional<bool>), fp32)
   KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional<at::ScalarType>), fp32)
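After this change there is a single stft registration under CPU autocast, and it runs in fp32. A minimal sketch of the user-visible effect, assuming a PyTorch build with CPU autocast (bfloat16) support:

import torch

x = torch.randn(4000, dtype=torch.bfloat16)
window = torch.hann_window(256)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    # stft is listed as an fp32 op under AutocastCPU, so the bf16 input is
    # cast to float32 before the FFT runs.
    spec = torch.stft(x, n_fft=256, hop_length=64, window=window,
                      center=False, return_complex=True)

print(spec.dtype)  # expected: torch.complex64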

aten/src/ATen/cuda/ScanUtils.cuh

Lines changed: 0 additions & 82 deletions
@@ -10,88 +10,6 @@
 namespace at {
 namespace cuda {
 
-// Extends the above Inclusive Scan to support segments. It has the same properties
-// but also takes a flag array that indicates the starts of "segments", i.e. individual
-// units to scan. For example, consider the following (+)-scan that is segmented:
-//
-// Input:  [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
-// Flags:  [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
-// Output:  1  4  6  4  5  2  3  5  1  5
-//
-// So we see that each "flag" resets the scan to that index.
-template <typename T, class BinaryOp, int Power2ScanSize>
-__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) {
-  // Reduce step ("upsweep")
-  #pragma unroll
-  for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
-    int index = (threadIdx.x + 1) * stride * 2 - 1;
-    if (index < Power2ScanSize) {
-      smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]);
-      bmem[index] = bmem[index] | bmem[index - stride];
-    }
-    __syncthreads();
-  }
-
-  // Post-reduce step ("downsweep")
-  #pragma unroll
-  for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
-    int index = (threadIdx.x + 1) * stride * 2 - 1;
-    if ((index + stride) < Power2ScanSize) {
-      smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]);
-      bmem[index + stride] = bmem[index + stride] | bmem[index];
-    }
-    __syncthreads();
-  }
-}
-
-// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
-  // FIXME: this is a slow, simple implementation; need up/down sweep,
-  // prevent smem conflicts
-  smem[threadIdx.x] = in;
-
-  __syncthreads();
-
-  for (int offset = 1; offset < blockDim.x; offset *= 2) {
-    T val = 0;
-
-    if (threadIdx.x >= offset) {
-      val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
-    }
-
-    __syncthreads();
-    if (threadIdx.x >= offset) {
-      smem[threadIdx.x] = val;
-    }
-
-    __syncthreads();
-  }
-
-  *out = smem[threadIdx.x];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
-  // FIXME: crappy implementation
-  // We kill write-after-read dependencies separately below, hence the `false`
-  inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
-  *out -= in;
-  *carry = smem[blockDim.x - 1];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
 // Inclusive prefix sum for binary vars using intra-warp voting +
 // shared memory
 template <typename T, bool KillWARDependency, class BinaryFunction>
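The removed comment documents segmented inclusive scan semantics with a worked example. As a plain-Python reference of those semantics (a hypothetical helper, not the removed CUDA kernel), this reproduces the same output:

def segmented_inclusive_scan(values, flags, binop=lambda a, b: a + b):
    # Each flag == 1 starts a new segment and resets the running value.
    out, running = [], None
    for v, f in zip(values, flags):
        running = v if (f or running is None) else binop(running, v)
        out.append(running)
    return out

values = [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
flags  = [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
print(segmented_inclusive_scan(values, flags))
# [1, 4, 6, 4, 5, 2, 3, 5, 1, 5] -- matches the example in the removed comment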

aten/src/ATen/native/SpectralOps.cpp

Lines changed: 0 additions & 20 deletions
@@ -907,17 +907,6 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional<int64_t> hop
   }
 }
 
-Tensor stft(
-    const Tensor& self, const int64_t n_fft, const optional<int64_t> hop_lengthOpt,
-    const optional<int64_t> win_lengthOpt, const c10::optional<Tensor>& window_opt,
-    const bool normalized,
-    const optional<bool> onesidedOpt, const optional<bool> return_complexOpt) {
-  return at::stft(
-      self, n_fft, hop_lengthOpt, win_lengthOpt, window_opt,
-      /*center=*/false, /*mode=*/"constant", normalized, onesidedOpt,
-      return_complexOpt);
-}
-
 // Create complex tensor from the old style of real tensor with size=(..., 2)
 // This is to support istft in the transition to requiring complex input.
 // NOTE: This may return a view of the input tensor, or might clone if necessary
@@ -1111,15 +1100,6 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
 #undef REPR
 }
 
-Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> hop_lengthOpt,
-             const optional<int64_t> win_lengthOpt, const Tensor& window,
-             const bool center, const bool normalized, const optional<bool> onesidedOpt,
-             const optional<int64_t> lengthOpt) {
-  return at::native::istft(
-      self, n_fft, hop_lengthOpt, win_lengthOpt, window, center, normalized,
-      onesidedOpt, lengthOpt, /*return_complex=*/false);
-}
-
 void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_) {
   const auto input_sizes = input.sizes();
   const auto input_strides = input.strides();
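These were C++-only convenience overloads (stft without center/pad_mode, istft returning a real tensor); the public Python API keeps explicit center and pad_mode arguments. A round-trip sketch using that public API:

import torch

x = torch.randn(4000)
window = torch.hann_window(256)

spec = torch.stft(x, n_fft=256, hop_length=64, window=window,
                  center=True, pad_mode="reflect", return_complex=True)
recon = torch.istft(spec, n_fft=256, hop_length=64, window=window,
                    center=True, length=x.numel())

print(torch.allclose(x, recon, atol=1e-5))  # expected: True, up to float32 rounding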

aten/src/ATen/native/TensorCompare.cpp

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ Tensor isfinite(const Tensor& self) {
 
   // Note: a complex value is finite iff both parts are finite
   if (self.is_complex()) {
-    return at::isfinite(self.abs());
+    return at::isfinite(at::real(self)).__iand__(at::isfinite(at::imag(self)));
   }
 
   return AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "isfinite", [&]() {
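The new implementation checks the real and imaginary parts separately, so a complex value is reported finite iff both parts are finite, e.g.:

import torch

z = torch.tensor([1.0 + 2.0j,
                  float("inf") + 0.0j,
                  complex(0.0, float("nan"))])
print(torch.isfinite(z))  # tensor([ True, False, False])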

aten/src/ATen/native/native_functions.yaml

Lines changed: 6 additions & 7 deletions
@@ -452,6 +452,7 @@
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
 
 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -461,6 +462,7 @@
     SparseCPU, SparseCUDA: add_sparse_
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
     MkldnnCPU: mkldnn_add_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
 
 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3219,6 +3221,7 @@
     SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3228,6 +3231,7 @@
     SparseCPU, SparseCUDA: mul_sparse_
     SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
     MkldnnCPU: mkldnn_mul_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -4314,12 +4318,7 @@
 
 - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
 
-# Overload without center & pad mode, needed for forward-compatibility
-- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
-  variants: function, method
-  cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
-
-- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
   variants: function, method
 
 - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
@@ -11670,7 +11669,7 @@
     CompositeExplicitAutograd: alias_copy
   tags: view_copy
 
-- func: to_padded_tensor(Tensor self, float padding) -> Tensor
+- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
   variants: method
   dispatch:
     NestedTensorCPU: NestedTensor_to_padded_tensor_generic
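The new optional output_size argument lets the padded result be grown to a caller-chosen shape; each entry must be at least the naturally padded size. A hedged usage sketch; the torch.nested namespace used here reflects the later public bindings and may differ from the prototype API at the time of this commit:

import torch

nt = torch.nested.nested_tensor([torch.ones(2, 3), torch.ones(3, 4)])

# The natural padded shape is (2, 3, 4); output_size may only grow it.
padded = torch.nested.to_padded_tensor(nt, 0.0, output_size=[2, 4, 6])
print(padded.shape)  # expected: torch.Size([2, 4, 6])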

aten/src/ATen/native/nested/NestedTensorMath.cpp

Lines changed: 134 additions & 2 deletions
@@ -303,7 +303,10 @@ Tensor nested_from_padded_generic(
       std::move(new_buffer), sizes);
 }
 
-Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding) {
+Tensor NestedTensor_to_padded_tensor_generic(
+    const Tensor& t,
+    double padding,
+    OptionalIntArrayRef output_size) {
   // TODO: skipped optimization for case of all 1x1 tensors
   auto& nt = *get_nested_tensor_impl(t);
   auto max_size = NestedTensor_get_max_size(nt);
@@ -356,7 +359,22 @@ Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding) {
     buffers.push_back(pad_tensor_to_shape(to_pad, max_size, padding));
     sizes_ptr += sizes_num_columns;
   }
-  return at::stack(buffers);
+  auto ret_val = at::stack(buffers);
+
+  // Pad output tensor to output_size if provided
+  if (output_size.has_value()) {
+    auto output_size_ = output_size.value();
+    TORCH_CHECK(
+        (int64_t)output_size_.size() == ret_val.dim(),
+        "Length of output_size does not match NestedTensor dims. Broadcasting is not supported.");
+    for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) {
+      TORCH_CHECK(
+          output_size_[i] >= ret_val.size(i),
+          "Value in output_size is less than NestedTensor padded size. Truncation is not supported.");
+    }
+    return pad_tensor_to_shape(ret_val, output_size_, padding);
+  }
+  return ret_val;
 }
 
 Tensor NestedTensor_embedding(
@@ -385,5 +403,119 @@ Tensor NestedTensor_embedding(
   return at::detail::make_tensor<NestedTensorImpl>(
       result_buffer.reshape({-1}), std::move(new_sizes));
 }
+
+std::pair<NestedTensorImpl*, NestedTensorImpl*>
+get_elementwise_nested_tensor_impl(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name) {
+  if (self.is_nested() && !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a nested self and non-nested other");
+  } else if (!(self.is_nested()) && other.is_nested()) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and nested other");
+  } else if (!(self.is_nested()) || !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and non-nested other");
+  }
+
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto other_ptr = get_nested_tensor_impl(other);
+
+  TORCH_CHECK(
+      self.dim() == other.dim(),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      at::equal(
+          self_ptr->get_nested_size_tensor(),
+          other_ptr->get_nested_size_tensor()),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      nested_tensor_impl_is_contiguous(self_ptr) &&
+          nested_tensor_impl_is_contiguous(other_ptr),
+      op_name,
+      " does not support non-contiguous NestedTensor inputs");
+  return std::make_pair(self_ptr, other_ptr);
+}
+
+template <typename Func>
+Tensor NestedTensor_elementwise_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  const auto& nt_self = *self_impl;
+  const auto& nt_other = *other_impl;
+  const auto& self_sizes = nt_self.get_nested_size_tensor();
+  return wrap_buffer(
+      f(nt_self.get_buffer().reshape({-1}),
+        nt_other.get_buffer().reshape({-1})),
+      self_sizes);
+}
+
+Tensor NestedTensor_add_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) {
+        return at::add(b1, b2, alpha);
+      });
+}
+
+Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "mul", [](const Tensor& b1, const Tensor& b2) {
+        return at::mul(b1, b2);
+      });
+}
+
+template <typename Func>
+Tensor& NestedTensor_elementwise__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  const auto& nt_self = *self_impl;
+  const auto& nt_other = *other_impl;
+  f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1}));
+  return self;
+}
+
+Tensor& NestedTensor_add__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) {
+        return b1.add_(b2, alpha);
+      });
+}
+
+Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "mul_", [](const Tensor& b1, const Tensor& b2) {
+        return b1.mul_(b2);
+      });
+}
+
 } // namespace native
 } // namespace at
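With these kernels registered, add and mul (and their in-place variants) operate componentwise on two nested tensors with identical nested sizes; broadcasting and non-contiguous inputs are rejected by get_elementwise_nested_tensor_impl. A usage sketch, again assuming the later torch.nested constructor name:

import torch

a = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])
b = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])

c = torch.add(a, b, alpha=2.0)  # componentwise: c_i = a_i + 2.0 * b_i
d = a * b                       # componentwise multiply
a.add_(b)                       # in-place variant, routed to NestedTensor_add__Tensor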

aten/src/ATen/native/nested/NestedTensorMath.h

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
 
 TORCH_API std::vector<int64_t> NestedTensor_get_max_size(const NestedTensorImpl& nt);
 
-TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding);
+TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding, OptionalIntArrayRef output_size);
 
 } // namespace native
 } // namespace at
