
Commit 333d087

Update on "[dtensor] move all tests to distribute/tensor folder"
as titled, mainly moving files.

cc H-Huang awgu kwen2501 fegin fduwjj wz337 wconstab d4l3k c-p-i-o ezyang SherlockNoMad EikanWang jgong5 wenzhe-nrv

[ghstack-poisoned]
2 parents 2573696 + 406135f commit 333d087

228 files changed (+2435, −1540 lines)


.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 0 additions & 13 deletions
@@ -6,19 +6,6 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 source $SCRIPTPATH/aarch64_ci_setup.sh
 
-tagged_version() {
-  GIT_DESCRIBE="git --git-dir /pytorch/.git describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
-  if ${GIT_DESCRIBE} --exact >/dev/null; then
-    ${GIT_DESCRIBE}
-  else
-    return 1
-  fi
-}
-
-if tagged_version >/dev/null; then
-  export OVERRIDE_PACKAGE_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
-fi
-
 ###############################################################################
 # Run aarch64 builder python
 ###############################################################################
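
Editor's note: the removed helper derived OVERRIDE_PACKAGE_VERSION from the latest release tag by stripping the leading "v" and everything after the first dash. A minimal Python sketch of that string transformation (illustration only, not part of the commit; the tag value is hypothetical):

import re

def override_package_version(git_describe_output: str) -> str:
    # Mirrors `sed -e 's/^v//' -e 's/-.*$//'`: drop the leading "v"
    # and anything from the first "-" onward.
    version = re.sub(r"^v", "", git_describe_output)
    version = re.sub(r"-.*$", "", version)
    return version

print(override_package_version("v2.6.0-rc1"))  # -> "2.6.0"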

.ci/pytorch/windows/internal/xpu_install.bat

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,9 @@ if not "%CUDA_VERSION%" == "xpu" (
     exit /b 0
 )
 
+set SRC_DIR=%NIGHTLIES_PYTORCH_ROOT%
+if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
+
 set XPU_INSTALL_MODE=%~1
 if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start
 if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start
@@ -101,6 +104,14 @@ goto xpu_install_end
 
 :xpu_bundle_install
 
+:: Install Level Zero SDK
+set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
+curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
+echo "Installing level zero SDK..."
+7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
+set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
+
+:: Install Bundle
 curl -o xpu_bundle.exe --retry 3 --retry-all-errors -k %XPU_BUNDLE_URL%
 echo "XPU Bundle installing..."
 start /wait "Intel Pytorch Bundle Installer" "xpu_bundle.exe" --action=install --eula=accept --silent --log-dir install_bundle
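
Editor's note: the added batch steps download a pinned Level Zero SDK zip, unpack it into a temp build directory, and prepend its include/ directory to the INCLUDE path. A rough Python equivalent of those steps, for readers less familiar with batch syntax (illustration only, not part of the commit):

import os
import urllib.request
import zipfile

LZ_URL = "https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip"

def install_level_zero_sdk(src_dir: str) -> None:
    temp_build = os.path.join(src_dir, "temp_build")
    os.makedirs(temp_build, exist_ok=True)

    # Download the pinned SDK archive (the `curl -k -L ...` step).
    archive = os.path.join(temp_build, "level_zero_sdk.zip")
    urllib.request.urlretrieve(LZ_URL, archive)

    # Unpack it (the `7z x ...` step).
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(os.path.join(temp_build, "level_zero"))

    # Prepend the headers to INCLUDE so the toolchain can find them.
    include_dir = os.path.join(temp_build, "level_zero", "include")
    os.environ["INCLUDE"] = include_dir + os.pathsep + os.environ.get("INCLUDE", "")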

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 1 addition & 1 deletion
@@ -3037,7 +3037,7 @@ Tensor& linalg_norm_out(const Tensor& X, const std::optional<Scalar>& opt_ord, O
 Tensor linalg_norm(const Tensor& X, std::string_view ord, OptionalIntArrayRef opt_dim, bool keepdim, std::optional<ScalarType> opt_dtype) {
   if (opt_dim.has_value()) {
     TORCH_CHECK(opt_dim->size() == 1 || opt_dim ->size() == 2, "linalg.norm: If ",
-                "dim is specified, it mut be of length 1 or 2. Got ", *opt_dim);
+                "dim is specified, it must be of length 1 or 2. Got ", *opt_dim);
   } else {
     TORCH_CHECK(X.dim() == 1 || X.dim() == 2, "linalg.norm: If ",
                 "dim is not specified but ord is, the input must be 1D or 2D. Got ", X.dim(), "D.");

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 35 additions & 16 deletions
@@ -1322,29 +1322,48 @@ Tensor randn_like(
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 namespace {
+
 template <typename scalar_t>
 void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
   scalar_t* r__data = result.data_ptr<scalar_t>();
 
   result.resize_({n});
   int64_t r__stride_0 = result.stride(0);
 
-  at::parallel_for(
-      0,
-      n,
-      internal::GRAIN_SIZE,
-      [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
-        for (const auto i : c10::irange(p_begin, p_end)) {
-          r__data[i * r__stride_0] = static_cast<scalar_t>(i);
-        }
-      });
-
-  for (int64_t i = 0; i < n - 1; i++) {
-    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
-    int64_t z = generator->random() % (n - i);
-    scalar_t sav = r__data[i * r__stride_0];
-    r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
-    r__data[(z + i) * r__stride_0] = sav;
+  // for small n, preserve old behavior
+  if (n < std::numeric_limits<uint32_t>::max() / 20) {
+    at::parallel_for(
+        0,
+        n,
+        internal::GRAIN_SIZE,
+        [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
+          for (const auto i : c10::irange(p_begin, p_end)) {
+            r__data[i * r__stride_0] = static_cast<scalar_t>(i);
+          }
+        });
+
+    for (int64_t i = 0; i < n - 1; i++) {
+      // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
+      int64_t z = generator->random() % (n - i);
+      scalar_t sav = r__data[i * r__stride_0];
+      r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
+      r__data[(z + i) * r__stride_0] = sav;
+    }
+    return;
+  }
+
+  // we need to pick a number uniformly distributed between 0 and n
+  // when n is of the same order of magnitude as the biggest number returned by
+  // random the % result is not uniformly distributed
+  // so we use random64(), you'd run out of RAM before you
+  // start seeing the skew
+  // use no-initialization Fischer-Yates variant
+  // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
+  for (int64_t i = 0; i < n; i++) {
+    int64_t z = (int64_t)(generator->random64() % (i + 1));
+    r__data[i * r__stride_0] = i;
+    r__data[i * r__stride_0] = r__data[z * r__stride_0];
+    r__data[z * r__stride_0] = i;
   }
 }
 } // namespace
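
Editor's note: the new path exists because `generator->random()` returns a 32-bit value, so for very large n the `% (n - i)` draw is no longer uniform; the commit switches to `random64()` and an inside-out Fisher–Yates shuffle, which also avoids a separate initialization pass. A minimal Python sketch of the inside-out variant (illustrative only; names here are not from the commit):

import random

def randperm_inside_out(n: int, rng: random.Random) -> list[int]:
    # Inside-out Fisher-Yates: element i swaps with a uniformly chosen
    # slot z in [0, i], so no separate "fill with 0..n-1" pass is needed.
    # https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
    out = [0] * n
    for i in range(n):
        z = rng.randrange(i + 1)  # plays the role of random64() % (i + 1)
        out[i] = out[z]
        out[z] = i
    return out

print(randperm_inside_out(10, random.Random(0)))  # a permutation of 0..9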

aten/src/ATen/native/cuda/layer_norm_kernel.cu

Lines changed: 38 additions & 1 deletion
@@ -745,12 +745,49 @@ void launch_vectorized_layer_norm_kernel(
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int warp_size = at::cuda::warp_size();
   const dim3 threads(warp_size, num_threads() / warp_size, 1);
-  const dim3 blocks(M);
+  dim3 blocks(M);
+
+#ifdef USE_ROCM
+  uint64_t workgroupSize = static_cast<uint64_t>(blocks.x) * static_cast<uint64_t>(threads.x);
+  // this caused invalid configuration problem
+  if (workgroupSize > std::numeric_limits<uint32_t>::max()) {
+    // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+    blocks.x = std::numeric_limits<uint32_t>::max() / threads.x;
+  }
+#endif
+
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(threads.y % 2 == 0 || threads.y == 1);
   int nshared = threads.y > 1 ? threads.y * 3/2 *sizeof(T_ACC) : 0;
   vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data,
     gamma_data, beta_data, mean_data, rstd_data, Y_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+#ifdef USE_ROCM
+  // the blocks.x contains the max grid x dimention without invalid configuration error
+  // Fix invalid configuration https://github.com/pytorch/pytorch/issues/136291
+  // Ensure all elements are processed. Prepare for next round
+  int64_t remaining = M - blocks.x;
+  const T* X_data2 = X_data;
+  T_ACC* mean_data2 = mean_data;
+  T_ACC* rstd_data2 = rstd_data;
+  T* Y_data2 = Y_data;
+
+  while (remaining > 0) {
+    X_data2 += N * blocks.x;
+    mean_data2 += blocks.x;
+    rstd_data2 += blocks.x;
+    Y_data2 += N * blocks.x;
+
+    blocks.x = (remaining > blocks.x) ? blocks.x : remaining;
+
+    vectorized_layer_norm_kernel<<<blocks, threads, nshared, stream>>>(N, eps, X_data2,
+      gamma_data, beta_data, mean_data2, rstd_data2, Y_data2);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    remaining -= blocks.x;
+  }
+#endif
+
 }
 
 template <typename T, typename T_ACC>
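
Editor's note: on ROCm the kernel is now relaunched in chunks whenever M * threads.x would exceed the 32-bit limit, with each pass advancing the data pointers by the number of rows just processed. The chunking arithmetic, sketched in Python for clarity (illustrative, not from the commit; the tiny example values are made up):

def launch_in_chunks(M: int, threads_x: int, max_u32: int = 2**32 - 1) -> list[int]:
    # First launch uses at most max_u32 // threads_x blocks; follow-up
    # launches cover whatever rows remain, mirroring the ROCm while-loop.
    blocks = M
    if blocks * threads_x > max_u32:
        blocks = max_u32 // threads_x
    launches = [blocks]

    remaining = M - blocks
    while remaining > 0:
        blocks = min(blocks, remaining)
        launches.append(blocks)
        remaining -= blocks
    return launches

# Each entry is the grid.x of one launch; the sizes sum to M.
print(launch_in_chunks(M=10, threads_x=4, max_u32=15))  # -> [3, 3, 3, 1]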

benchmarks/dynamo/common.py

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@
     NamedTuple,
     Optional,
     Sequence,
-    Tuple,
     Type,
     TYPE_CHECKING,
 )
@@ -746,7 +745,7 @@ def timed(
     return (time_total, result) if return_result else time_total
 
 
-def _normalize_bench_inputs(example_inputs) -> Tuple[Tuple[Any], Mapping[str, Any]]:
+def _normalize_bench_inputs(example_inputs) -> tuple[tuple[Any], Mapping[str, Any]]:
    # NOTE(bowbao): For huggingface benchmark, example_inputs are formatted as dictionary,
    # and consumed like `model(**example_inputs)`.
    # For other benchmarks, example_inputs are formatted as tuple and consumed
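
Editor's note: this change, like the similar ones in the benchmark files below, replaces typing.Tuple with the builtin tuple generic, which has been subscriptable as a type since Python 3.9 (PEP 585), so the import is no longer needed. A small sketch of the pattern (the function body here is illustrative, not the benchmark's actual logic):

from collections.abc import Mapping
from typing import Any

# Builtin generics (PEP 585): no `from typing import Tuple` required.
def normalize_inputs(example_inputs) -> tuple[tuple[Any, ...], Mapping[str, Any]]:
    if isinstance(example_inputs, dict):
        return (), example_inputs
    return tuple(example_inputs), {}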

benchmarks/dynamo/microbenchmarks/operator_inp_utils.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 import os
 from collections import Counter, defaultdict
 from functools import partial
-from typing import Any, Dict, Generator, Iterable, Tuple
+from typing import Any, Dict, Generator, Iterable
 
 import torch
 from torch.testing import make_tensor
@@ -263,7 +263,7 @@ def __init__(self, json_file_path):
 
     def get_inputs_for_operator(
         self, operator, dtype=None, device="cuda"
-    ) -> Generator[Tuple[Iterable[Any], Dict[str, Any]], None, None]:
+    ) -> Generator[tuple[Iterable[Any], Dict[str, Any]], None, None]:
         assert (
             str(operator) in self.operator_db
         ), f"Could not find {operator}, must provide overload"

benchmarks/dynamo/pr_time_benchmarks/expected_results.csv

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,27530000000,0.015
 
 
 
-basic_modules_ListOfLinears_eager,compile_time_instruction_count,930000000,0.015
+basic_modules_ListOfLinears_eager,compile_time_instruction_count,945667911,0.015
 
 
 
benchmarks/fastrnns/cells.py

Lines changed: 7 additions & 9 deletions
@@ -1,5 +1,3 @@
-from typing import Tuple
-
 import torch
 from torch import Tensor
 
@@ -27,12 +25,12 @@ def milstm_cell(x, hx, cx, w_ih, w_hh, alpha, beta_i, beta_h, bias):
 
 def lstm_cell(
     input: Tensor,
-    hidden: Tuple[Tensor, Tensor],
+    hidden: tuple[Tensor, Tensor],
     w_ih: Tensor,
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
@@ -57,7 +55,7 @@ def flat_lstm_cell(
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     gates = torch.mm(input, w_ih.t()) + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
@@ -75,11 +73,11 @@ def flat_lstm_cell(
 
 def premul_lstm_cell(
     igates: Tensor,
-    hidden: Tuple[Tensor, Tensor],
+    hidden: tuple[Tensor, Tensor],
     w_hh: Tensor,
     b_ih: Tensor,
     b_hh: Tensor,
-) -> Tuple[Tensor, Tensor]:
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = igates + torch.mm(hx, w_hh.t()) + b_ih + b_hh
 
@@ -97,8 +95,8 @@ def premul_lstm_cell(
 
 
 def premul_lstm_cell_no_bias(
-    igates: Tensor, hidden: Tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
-) -> Tuple[Tensor, Tensor]:
+    igates: Tensor, hidden: tuple[Tensor, Tensor], w_hh: Tensor, b_hh: Tensor
+) -> tuple[Tensor, Tensor]:
     hx, cx = hidden
     gates = igates + torch.mm(hx, w_hh.t()) + b_hh
 
benchmarks/fastrnns/custom_lstms.py

Lines changed: 21 additions & 21 deletions
@@ -1,7 +1,7 @@
 import numbers
 import warnings
 from collections import namedtuple
-from typing import List, Tuple
+from typing import List
 
 import torch
 import torch.jit as jit
@@ -131,8 +131,8 @@ def __init__(self, input_size, hidden_size):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = state
         gates = (
             torch.mm(input, self.weight_ih.t())
@@ -199,8 +199,8 @@ def __init__(self, input_size, hidden_size, decompose_layernorm=False):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         hx, cx = state
         igates = self.layernorm_i(torch.mm(input, self.weight_ih.t()))
         hgates = self.layernorm_h(torch.mm(hx, self.weight_hh.t()))
@@ -225,8 +225,8 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         inputs = input.unbind(0)
         outputs = torch.jit.annotate(List[Tensor], [])
         for i in range(len(inputs)):
@@ -242,8 +242,8 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, state: Tuple[Tensor, Tensor]
-    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        self, input: Tensor, state: tuple[Tensor, Tensor]
+    ) -> tuple[Tensor, tuple[Tensor, Tensor]]:
         inputs = reverse(input.unbind(0))
         outputs = jit.annotate(List[Tensor], [])
         for i in range(len(inputs)):
@@ -266,11 +266,11 @@ def __init__(self, cell, *cell_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: [forward LSTMState, backward LSTMState]
         outputs = jit.annotate(List[Tensor], [])
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
         for direction in self.directions:
@@ -300,10 +300,10 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: One state per layer
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
@@ -330,11 +330,11 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[List[Tuple[Tensor, Tensor]]]
-    ) -> Tuple[Tensor, List[List[Tuple[Tensor, Tensor]]]]:
+        self, input: Tensor, states: List[List[tuple[Tensor, Tensor]]]
+    ) -> tuple[Tensor, List[List[tuple[Tensor, Tensor]]]]:
         # List[List[LSTMState]]: The outer list is for layers,
         # inner list is for directions.
-        output_states = jit.annotate(List[List[Tuple[Tensor, Tensor]]], [])
+        output_states = jit.annotate(List[List[tuple[Tensor, Tensor]]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
@@ -370,10 +370,10 @@ def __init__(self, num_layers, layer, first_layer_args, other_layer_args):
 
     @jit.script_method
     def forward(
-        self, input: Tensor, states: List[Tuple[Tensor, Tensor]]
-    ) -> Tuple[Tensor, List[Tuple[Tensor, Tensor]]]:
+        self, input: Tensor, states: List[tuple[Tensor, Tensor]]
+    ) -> tuple[Tensor, List[tuple[Tensor, Tensor]]]:
         # List[LSTMState]: One state per layer
-        output_states = jit.annotate(List[Tuple[Tensor, Tensor]], [])
+        output_states = jit.annotate(List[tuple[Tensor, Tensor]], [])
         output = input
         # XXX: enumerate https://github.com/pytorch/pytorch/issues/14471
         i = 0
