
Commit 333d087

Update on "[dtensor] move all tests to distribute/tensor folder"
as titled, mainly moving files.

cc H-Huang awgu kwen2501 fegin fduwjj wz337 wconstab d4l3k c-p-i-o ezyang SherlockNoMad EikanWang jgong5 wenzhe-nrv

[ghstack-poisoned]
2 parents 2573696 + 406135f commit 333d087

228 files changed (+2435, −1540 lines)


.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 0 additions & 13 deletions
@@ -6,19 +6,6 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 source $SCRIPTPATH/aarch64_ci_setup.sh
 
-tagged_version() {
-  GIT_DESCRIBE="git --git-dir /pytorch/.git describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
-  if ${GIT_DESCRIBE} --exact >/dev/null; then
-    ${GIT_DESCRIBE}
-  else
-    return 1
-  fi
-}
-
-if tagged_version >/dev/null; then
-  export OVERRIDE_PACKAGE_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
-fi
-
 ###############################################################################
 # Run aarch64 builder python
 ###############################################################################
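
Editor's note: the removed helper derived OVERRIDE_PACKAGE_VERSION from the latest release tag by stripping the leading "v" and everything after the first dash. A minimal Python sketch of that string transformation (illustration only, not part of the commit; the tag value is hypothetical):

import re

def override_package_version(git_describe_output: str) -> str:
    # Mirrors `sed -e 's/^v//' -e 's/-.*$//'`: drop the leading "v"
    # and anything from the first "-" onward.
    version = re.sub(r"^v", "", git_describe_output)
    version = re.sub(r"-.*$", "", version)
    return version

print(override_package_version("v2.6.0-rc1"))  # -> "2.6.0"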

.ci/pytorch/windows/internal/xpu_install.bat

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,9 @@ if not "%CUDA_VERSION%" == "xpu" (
     exit /b 0
 )
 
+set SRC_DIR=%NIGHTLIES_PYTORCH_ROOT%
+if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
+
 set XPU_INSTALL_MODE=%~1
 if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start
 if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start
@@ -101,6 +104,14 @@ goto xpu_install_end
 
 :xpu_bundle_install
 
+:: Install Level Zero SDK
+set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
+curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
+echo "Installing level zero SDK..."
+7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
+set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
+
+:: Install Bundle
 curl -o xpu_bundle.exe --retry 3 --retry-all-errors -k %XPU_BUNDLE_URL%
 echo "XPU Bundle installing..."
 start /wait "Intel Pytorch Bundle Installer" "xpu_bundle.exe" --action=install --eula=accept --silent --log-dir install_bundle
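
Editor's note: the added batch steps download a pinned Level Zero SDK zip, unpack it into a temp build directory, and prepend its include/ directory to the INCLUDE path. A rough Python equivalent of those steps, for readers less familiar with batch syntax (illustration only, not part of the commit):

import os
import urllib.request
import zipfile

LZ_URL = "https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip"

def install_level_zero_sdk(src_dir: str) -> None:
    temp_build = os.path.join(src_dir, "temp_build")
    os.makedirs(temp_build, exist_ok=True)

    # Download the pinned SDK archive (the `curl -k -L ...` step).
    archive = os.path.join(temp_build, "level_zero_sdk.zip")
    urllib.request.urlretrieve(LZ_URL, archive)

    # Unpack it (the `7z x ...` step).
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(os.path.join(temp_build, "level_zero"))

    # Prepend the headers to INCLUDE so the toolchain can find them.
    include_dir = os.path.join(temp_build, "level_zero", "include")
    os.environ["INCLUDE"] = include_dir + os.pathsep + os.environ.get("INCLUDE", "")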

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 1 addition & 1 deletion
@@ -3037,7 +3037,7 @@ Tensor& linalg_norm_out(const Tensor& X, const std::optional<Scalar>& opt_ord, O
 Tensor linalg_norm(const Tensor& X, std::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, std::optional<ScalarType> opt_dtype) {
   if (opt_dim.has_value()) {
     TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ",
-                "dim is specified, it mut be of length 1 or 2. Got ", *opt_dim);
+                "dim is specified, it must be of length 1 or 2. Got ", *opt_dim);
   } else {
     TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ",
                 "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D.");

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 35 additions & 16 deletions
@@ -1322,29 +1322,48 @@ Tensor randn_like(
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 namespace {
+
 template <typename scalar_t>
 void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
   scalar_t* r__data = result.data_ptr<scalar_t>();
 
   result.resize_({n});
   int64_t r__stride_0 = result.stride(0);
 
-  at::parallel_for(
-      0,
-      n,
-      internal::GRAIN_SIZE,
-      [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
-        for (const auto i : c10::irange(p_begin, p_end)) {
-          r__data[i * r__stride_0] = static_cast<scalar_t>(i);
-        }
-      });
-
-  for (int64_t i = 0; i < n - 1; i++) {
-    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
-    int64_t z = generator->random() % (n - i);
-    scalar_t sav = r__data[i * r__stride_0];
-    r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
-    r__data[(z + i) * r__stride_0] = sav;
+  // for small n, preserve old behavior
+  if (n < std::numeric_limits<uint32_t>::max() / 20) {
+    at::parallel_for(
+        0,
+        n,
+        internal::GRAIN_SIZE,
+        [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
+          for (const auto i : c10::irange(p_begin, p_end)) {
+            r__data[i * r__stride_0] = static_cast<scalar_t>(i);
+          }
+        });
+
+    for (int64_t i = 0; i < n - 1; i++) {
+      // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
+      int64_t z = generator->random() % (n - i);
+      scalar_t sav = r__data[i * r__stride_0];
+      r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
+      r__data[(z + i) * r__stride_0] = sav;
+    }
+    return;
+  }
+
+  // we need to pick a number uniformly distributed between 0 and n
+  // when n is of the same order of magnitude as the biggest number returned by
+  // random the % result is not uniformly distributed
+  // so we use random64(), you'd run out of RAM before you
+  // start seeing the skew
+  // use no-initialization Fischer-Yates variant
+  // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
+  for (int64_t i = 0; i < n; i++) {
+    int64_t z = (int64_t)(generator->random64() % (i + 1));
+    r__data[i * r__stride_0] = i;
+    r__data[i * r__stride_0] = r__data[z * r__stride_0];
+    r__data[z * r__stride_0] = i;
   }
 }
 } // namespace
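
Editor's note: the new path exists because `generator->random()` returns a 32-bit value, so for very large n the `% (n - i)` draw is no longer uniform; the commit switches to `random64()` and an inside-out Fisher–Yates shuffle, which also avoids a separate initialization pass. A minimal Python sketch of the inside-out variant (illustrative only; names here are not from the commit):

import random

def randperm_inside_out(n: int, rng: random.Random) -> list[int]:
    # Inside-out Fisher-Yates: element i swaps with a uniformly chosen
    # slot z in [0, i], so no separate "fill with 0..n-1" pass is needed.
    # https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
    out = [0] * n
    for i in range(n):
        z = rng.randrange(i + 1)  # plays the role of random64() % (i + 1)
        out[i] = out[z]
        out[z] = i
    return out

print(randperm_inside_out(10, random.Random(0)))  # a permutation of 0..9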

aten/src/ATen/native/cuda/layer_norm_kernel.cu

Lines changed: 38 additions & 1 deletion
@@ -745,12 +745,49 @@ void launch_vectorized_layer_norm_kernel(
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int warp_size = at::cuda::warp_size();
   const dim3 threads(warp_size, num_threads() / warp_size, 1);
-  const dim3 blocks(M);
+  dim3 blocks(M);
+
+#ifdef USE_ROCM
+  uint64_t workgroupSize = static_cast<uint64_t>(blocks.x) * static_cast<uint64_t>(threads.x);
+  // this caused invalid configuration problem
+  if (workgroupSize > std::numeric_limits<uint32_t>::max()) {
+    // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+    blocks.x = std::numeric_limits<uint32_t>::max() / threads.x;
+  }
+#endif
+
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(threads.y % 2 == 0 || threads.y == 1);
   int nshared = threads.y > 1 ? threads.y * 3/2 *sizeof(T_ACC) : 0;
   vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data,
     gamma_data, beta_data, mean_data, rstd_data, Y_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+#ifdef USE_ROCM
+  // the blocks.x contains the max grid x dimention without invalid configuration error
+  // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+  // Ensure all elements are processed. Prepare for next round
+  int64_t remaining = M - blocks.x;
+  const T* X_data2 = X_data;
+  T_ACC* mean_data2 = mean_data;
+  T_ACC* rstd_data2 = rstd_data;
+  T* Y_data2 = Y_data;
+
+  while (remaining > 0) {
+    X_data2 += N * blocks.x;
+    mean_data2 += blocks.x;
+    rstd_data2 += blocks.x;
+    Y_data2 += N * blocks.x;
+
+    blocks.x = (remaining > blocks.x) ? blocks.x : remaining;
+
+    vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data2,
+      gamma_data, beta_data, mean_data2, rstd_data2, Y_data2);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    remaining -= blocks.x;
+  }
+#endif
+
 }
 
 template <typename T, typename T_ACC>
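
Editor's note: on ROCm the kernel is now relaunched in chunks whenever M * threads.x would exceed the 32-bit limit, with each pass advancing the data pointers by the number of rows just processed. The chunking arithmetic, sketched in Python for clarity (illustrative, not from the commit; the tiny example values are made up):

def launch_in_chunks(M: int, threads_x: int, max_u32: int = 2**32 - 1) -> list[int]:
    # First launch uses at most max_u32 // threads_x blocks; follow-up
    # launches cover whatever rows remain, mirroring the ROCm while-loop.
    blocks = M
    if blocks * threads_x > max_u32:
        blocks = max_u32 // threads_x
    launches = [blocks]

    remaining = M - blocks
    while remaining > 0:
        blocks = min(blocks, remaining)
        launches.append(blocks)
        remaining -= blocks
    return launches

# Each entry is the grid.x of one launch; the sizes sum to M.
print(launch_in_chunks(M=10, threads_x=4, max_u32=15))  # -> [3, 3, 3, 1]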

benchmarks/dynamo/common.py

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@
     NamedTuple,
     Optional,
     Sequence,
-    Tuple,
     Type,
     TYPE_CHECKING,
 )
@@ -746,7 +745,7 @@ def timed(
     return (time_total, result) if return_result else time_total
 
 
-def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]:
+def _normalize_bench_inputs(example_inputs) -> tuple[tuple[Any], Mapping[str, Any]]:
    # NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary,
    # and consumed like `model(**example_inputs)`.
    # For other benchmarks, example_inputs are formatted as tuple and consumed
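
Editor's note: this change, like the similar ones in the benchmark files below, replaces typing.Tuple with the builtin tuple generic, which has been subscriptable as a type since Python 3.9 (PEP 585), so the import is no longer needed. A small sketch of the pattern (the function body here is illustrative, not the benchmark's actual logic):

from collections.abc import Mapping
from typing import Any

# Builtin generics (PEP 585): no `from typing import Tuple` required.
def normalize_inputs(example_inputs) -> tuple[tuple[Any, ...], Mapping[str, Any]]:
    if isinstance(example_inputs, dict):
        return (), example_inputs
    return tuple(example_inputs), {}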

benchmarks/dynamo/microbenchmarks/operator_inp_utils.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 import os
 from collections import Counter, defaultdict
 from functools import partial
-from typing import Any, Dict, Generator, Iterable, Tuple
+from typing import Any, Dict, Generator, Iterable
 
 import torch
 from torch.testing import make_tensor
@@ -263,7 +263,7 @@ def __init__(self, json_file_path):
 
     def get_inputs_for_operator(
         self, operator, dtype=None, device="cuda"
-    ) -> Generator[Tuple[Iterable[Any], Dict[str, Any]], None, None]:
+    ) -> Generator[tuple[Iterable[Any], Dict[str, Any]], None, None]:
         assert (
             str(operator) in self.operator_db
         ), f"Could not find {operator}, must provide overload"

benchmarks/dynamo/pr_time_benchmarks/expected_results.csv

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,27530000000,0.015
 
 
 
-basic_modules_ListOfLinears_eager,compile_time_instruction_count,930000000,0.015
+basic_modules_ListOfLinears_eager,compile_time_instruction_count,945667911,0.015
 
 
 
benchmarks/fastrnns/cells.py

Lines changed: 7 additions & 9 deletions
@@ -1,5 +1,3 @@
-from typing import Tuple
-
 import torch
 from torch import Tensor
 
@@ -27,12 +25,12 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
 
 def lstm_cell(
     input: Tensor,
-    hidden: Tuple[Tensor, Tensor],
+    hidden: tuple[Tensor, Tensor],
     w_ih: Tensor,
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
@@ -57,7 +55,7 @@ def flat_lstm_cell(
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@@ -75,11 +73,11 @@ def flat_lstm_cell(
 
 def premul_lstm_cell(
     igates: Tensor,
-    hidden: Tuple[Tensor, Tensor],
+    hidden: tuple[Tensor, Tensor],
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = igates + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
@@ -97,8 +95,8 @@ def premul_lstm_cell(
 
 
 def premul_lstm_cell_no_bias(
-    igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
-) -> Tuple[Tensor, Tensor]:
+    igates: Tensor, hidden: tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = igates + torch.mm(hx, w_hh.t()) + b_hh
 
benchmarks/fastrnns/custom_lstms.py

Lines changed: 21 additions & 21 deletions
@@ -1,7 +1,7 @@
 import numbers
 import warnings
 from collections import namedtuple
-from typing import List, Tuple
+from typing import List
 
 import torch
 import torch.jit as jit
@@ -131,8 +131,8 @@ def __init__(self, input_size, hidden_size):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = state
         gates = (
             torch.mm(input, self.weight_ih.t())
@@ -199,8 +199,8 @@ def __init__(self, input_size, hidden_size, decompose_layernorm=False):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = state
         igates = self.layernorm_i(torch.mm(input, self.weight_ih.t()))
         hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t()))
@@ -225,8 +225,8 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         inputs = input.unbind(0)
         outputs = torch.jit.annotate(List[Tensor], [])
         for i in range(len(inputs)):
@@ -242,8 +242,8 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         inputs = reverse(input.unbind(0))
         outputs = jit.annotate(List[Tensor], [])
         for i in range(len(inputs)):
@@ -266,11 +266,11 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: [forward LSTMState, backward LSTMState]
         outputs = jit.annotate(List[Tensor], [])
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
         for direction in self.directions:
@@ -300,10 +300,10 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: One state per layer
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
@@ -330,11 +330,11 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]
-    ) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
+        self, input: Tensor, states: List[List[tuple[Tensor, Tensor]]]
+    ) -> tuple[Tensor, List[List[tuple[Tensor, Tensor]]]]:
         # List[List[LSTMState]]: The outer list is for layers,
         # inner list is for directions.
-        output_states = jit.annotate(List[List[Tuple[Tensor, Tensor]]], [])
+        output_states = jit.annotate(List[List[tuple[Tensor, Tensor]]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
@@ -370,10 +370,10 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: One state per layer
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
