Commit 08b98ca

Author: Michael Andreas Dagitses (committed)

Update on "switch Bazel to the shared generate-code genrule"

We were building it before, but now we use it in downstream rules. This enables us to eliminate the handwritten genrule.

Differential Revision: [D35645390](https://our.internmc.facebook.com/intern/diff/D35645390/)

[ghstack-poisoned]

2 parents: ae27cd6 + 7a5f331

File tree: 79 files changed, +2436 / -464 lines


.github/actions/teardown-rocm/action.yml

Lines changed: 9 additions & 2 deletions
@@ -14,5 +14,12 @@ runs:
       docker stop $(docker ps -q) || true
       # Prune all of the docker containers
       docker container prune -f
-      # Prune all of the docker images older than 1 day
-      docker system prune -af --filter "until=24h"
+      # Prune everything docker if there are more than 10 images (~200GB).
+      # This is easier than using a time filter, e.g., "until=24h".
+      image_count=$(docker images | wc -l)
+      if [[ ${image_count} -gt 10 ]]; then
+        echo "Purging all docker caches"
+        docker system prune -af
+      else
+        echo "Will not purge docker, only ${image_count} images found"
+      fi
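For illustration, here is a minimal Python sketch of the same threshold check, assuming the docker CLI is on PATH. The helper name maybe_prune_docker and its threshold parameter are hypothetical, not part of this commit (and note the shell step counts "docker images | wc -l", which includes the header line):

import subprocess

def maybe_prune_docker(threshold: int = 10) -> None:
    # "docker images -q" prints one image ID per line, so the line count is the image count.
    result = subprocess.run(["docker", "images", "-q"],
                            capture_output=True, text=True, check=True)
    image_count = len(result.stdout.splitlines())
    if image_count > threshold:
        print("Purging all docker caches")
        subprocess.run(["docker", "system", "prune", "-af"], check=True)
    else:
        print(f"Will not purge docker, only {image_count} images found")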

.jenkins/pytorch/test.sh

Lines changed: 7 additions & 2 deletions
@@ -40,6 +40,11 @@ PR_NUMBER=${PR_NUMBER:-${CIRCLE_PR_NUMBER:-}}
 
 if [[ $TEST_CONFIG == 'default' ]]; then
   export CUDA_VISIBLE_DEVICES=0
+  export HIP_VISIBLE_DEVICES=0
+fi
+
+if [[ $TEST_CONFIG == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  export HIP_VISIBLE_DEVICES=0,1
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *-slow-* || $TEST_CONFIG == 'slow' ]]; then
@@ -51,8 +56,8 @@ if [[ "$BUILD_ENVIRONMENT" == *slow-gradcheck* ]]; then
   export PYTORCH_TEST_WITH_SLOW_GRADCHECK=1
 fi
 
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-  # Used so that only cuda specific versions of tests are generated
+if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  # Used so that only cuda/rocm specific versions of tests are generated
   # mainly used so that we're not spending extra cycles testing cpu
   # devices on expensive gpu machines
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
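As a rough sketch of what these variables do from the Python side (assuming a ROCm build of PyTorch, where HIP devices are exposed through the torch.cuda API):

import os

# Must be set before torch initializes the GPU runtime.
os.environ["HIP_VISIBLE_DEVICES"] = "0,1"   # ROCm builds; CUDA builds use CUDA_VISIBLE_DEVICES

import torch

# On a ROCm machine with more GPUs, only the two visible devices should be reported.
print(torch.cuda.device_count())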

aten/src/ATen/autocast_mode.cpp

Lines changed: 1 addition & 2 deletions
@@ -541,8 +541,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(ADD_NS(quantile), "quantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
   KERNEL_CPU(ADD_NS(nanquantile), "nanquantile", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>, bool, c10::string_view), fp32)
   KERNEL_CPU(ADD_NS(nanquantile), "nanquantile.scalar", Tensor(const Tensor &, double, c10::optional<int64_t>, bool, c10::string_view), fp32)
-  KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::optional<bool>, c10::optional<bool>), fp32)
-  KERNEL_CPU(ADD_NS(stft), "stft.center", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::string_view, bool, c10::optional<bool>, c10::optional<bool>), fp32)
+  KERNEL_CPU(ADD_NS(stft), "stft", Tensor(const Tensor &, int64_t, c10::optional<int64_t>, c10::optional<int64_t>, const c10::optional<Tensor> &, bool, c10::string_view, bool, c10::optional<bool>, c10::optional<bool>), fp32)
   KERNEL_CPU(ADD_NS(cdist), "cdist", Tensor(const Tensor &, const Tensor &, double, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(cross), "cross", Tensor(const Tensor &, const Tensor &, c10::optional<int64_t>), fp32)
   KERNEL_CPU(ADD_NS(cumprod), "cumprod", Tensor(const Tensor &, int64_t, c10::optional<at::ScalarType>), fp32)
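After this change there is a single stft registration under CPU autocast, and it runs in fp32. A minimal sketch of the user-visible effect, assuming a PyTorch build with CPU autocast (bfloat16) support:

import torch

x = torch.randn(4000, dtype=torch.bfloat16)
window = torch.hann_window(256)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    # stft is listed as an fp32 op under AutocastCPU, so the bf16 input is
    # cast to float32 before the FFT runs.
    spec = torch.stft(x, n_fft=256, hop_length=64, window=window,
                      center=False, return_complex=True)

print(spec.dtype)  # expected: torch.complex64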

aten/src/ATen/cuda/ScanUtils.cuh

Lines changed: 0 additions & 82 deletions
@@ -10,88 +10,6 @@
 namespace at {
 namespace cuda {
 
-// Extends the above Inclusive Scan to support segments. It has the same properties
-// but also takes a flag array that indicates the starts of "segments", i.e. individual
-// units to scan. For example, consider the following (+)-scan that is segmented:
-//
-// Input:  [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
-// Flags:  [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
-// Output:  1  4  6  4  5  2  3  5  1  5
-//
-// So we see that each "flag" resets the scan to that index.
-template <typename T, class BinaryOp, int Power2ScanSize>
-__device__ void segmentedInclusivePrefixScan(T *smem, bool *bmem, BinaryOp binop) {
-  // Reduce step ("upsweep")
-  #pragma unroll
-  for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
-    int index = (threadIdx.x + 1) * stride * 2 - 1;
-    if (index < Power2ScanSize) {
-      smem[index] = bmem[index] ? smem[index] : binop(smem[index], smem[index - stride]);
-      bmem[index] = bmem[index] | bmem[index - stride];
-    }
-    __syncthreads();
-  }
-
-  // Post-reduce step ("downsweep")
-  #pragma unroll
-  for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
-    int index = (threadIdx.x + 1) * stride * 2 - 1;
-    if ((index + stride) < Power2ScanSize) {
-      smem[index + stride] = bmem[index + stride] ? smem[index + stride] : binop(smem[index + stride], smem[index]);
-      bmem[index + stride] = bmem[index + stride] | bmem[index];
-    }
-    __syncthreads();
-  }
-}
-
-// Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
-  // FIXME: this is a slow, simple implementation; need up/down sweep,
-  // prevent smem conflicts
-  smem[threadIdx.x] = in;
-
-  __syncthreads();
-
-  for (int offset = 1; offset < blockDim.x; offset *= 2) {
-    T val = 0;
-
-    if (threadIdx.x >= offset) {
-      val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
-    }
-
-    __syncthreads();
-    if (threadIdx.x >= offset) {
-      smem[threadIdx.x] = val;
-    }
-
-    __syncthreads();
-  }
-
-  *out = smem[threadIdx.x];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
-// Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency, class BinaryFunction>
-__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
-  // FIXME: crappy implementation
-  // We kill write-after-read dependencies separately below, hence the `false`
-  inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
-
-  *out -= in;
-  *carry = smem[blockDim.x - 1];
-
-  // Prevent write-after-read dependencies on smem usage above if necessary
-  if (KillWARDependency) {
-    __syncthreads();
-  }
-}
-
 // Inclusive prefix sum for binary vars using intra-warp voting +
 // shared memory
 template <typename T, bool KillWARDependency, class BinaryFunction>
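The removed comment documents segmented inclusive scan semantics with a worked example. As a plain-Python reference of those semantics (a hypothetical helper, not the removed CUDA kernel), this reproduces the same output:

def segmented_inclusive_scan(values, flags, binop=lambda a, b: a + b):
    # Each flag == 1 starts a new segment and resets the running value.
    out, running = [], None
    for v, f in zip(values, flags):
        running = v if (f or running is None) else binop(running, v)
        out.append(running)
    return out

values = [1, 3, 2, 4, 1, 2, 3, 2, 1, 4]
flags  = [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]
print(segmented_inclusive_scan(values, flags))
# [1, 4, 6, 4, 5, 2, 3, 5, 1, 5] -- matches the example in the removed comment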

aten/src/ATen/native/SpectralOps.cpp

Lines changed: 0 additions & 20 deletions
@@ -907,17 +907,6 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional<int64_t> hop
   }
 }
 
-Tensor stft(
-    const Tensor& self, const int64_t n_fft, const optional<int64_t> hop_lengthOpt,
-    const optional<int64_t> win_lengthOpt, const c10::optional<Tensor>& window_opt,
-    const bool normalized,
-    const optional<bool> onesidedOpt, const optional<bool> return_complexOpt) {
-  return at::stft(
-      self, n_fft, hop_lengthOpt, win_lengthOpt, window_opt,
-      /*center=*/false, /*mode=*/"constant", normalized, onesidedOpt,
-      return_complexOpt);
-}
-
 // Create complex tensor from the old style of real tensor with size=(..., 2)
 // This is to support istft in the transition to requiring complex input.
 // NOTE: This may return a view of the input tensor, or might clone if necessary
@@ -1111,15 +1100,6 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
 #undef REPR
 }
 
-Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> hop_lengthOpt,
-             const optional<int64_t> win_lengthOpt, const Tensor& window,
-             const bool center, const bool normalized, const optional<bool> onesidedOpt,
-             const optional<int64_t> lengthOpt) {
-  return at::native::istft(
-      self, n_fft, hop_lengthOpt, win_lengthOpt, window, center, normalized,
-      onesidedOpt, lengthOpt, /*return_complex=*/false);
-}
-
 void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_) {
   const auto input_sizes = input.sizes();
   const auto input_strides = input.strides();
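These were C++-only convenience overloads (stft without center/pad_mode, istft returning a real tensor); the public Python API keeps explicit center and pad_mode arguments. A round-trip sketch using that public API:

import torch

x = torch.randn(4000)
window = torch.hann_window(256)

spec = torch.stft(x, n_fft=256, hop_length=64, window=window,
                  center=True, pad_mode="reflect", return_complex=True)
recon = torch.istft(spec, n_fft=256, hop_length=64, window=window,
                    center=True, length=x.numel())

print(torch.allclose(x, recon, atol=1e-5))  # expected: True, up to float32 rounding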

aten/src/ATen/native/TensorCompare.cpp

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ Tensor isfinite(const Tensor& self) {
 
   // Note: a complex value is finite iff both parts are finite
   if (self.is_complex()) {
-    return at::isfinite(self.abs());
+    return at::isfinite(at::real(self)).__iand__(at::isfinite(at::imag(self)));
   }
 
   return AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "isfinite", [&]() {
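The new implementation checks the real and imaginary parts separately, so a complex value is reported finite iff both parts are finite, e.g.:

import torch

z = torch.tensor([1.0 + 2.0j,
                  float("inf") + 0.0j,
                  complex(0.0, float("nan"))])
print(torch.isfinite(z))  # tensor([ True, False, False])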

aten/src/ATen/native/native_functions.yaml

Lines changed: 6 additions & 7 deletions
@@ -452,6 +452,7 @@
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
 
 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -461,6 +462,7 @@
     SparseCPU, SparseCUDA: add_sparse_
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
     MkldnnCPU: mkldnn_add_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
 
 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3219,6 +3221,7 @@
     SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -3228,6 +3231,7 @@
     SparseCPU, SparseCUDA: mul_sparse_
     SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
     MkldnnCPU: mkldnn_mul_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck # TensorIterator
@@ -4314,12 +4318,7 @@
 
 - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
 
-# Overload without center & pad mode, needed for forward-compatibility
-- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
-  variants: function, method
-  cpp_no_default_args: ['hop_length', 'win_length', 'window', 'normalized']
-
-- func: stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+- func: stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
   variants: function, method
 
 - func: istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
@@ -11670,7 +11669,7 @@
     CompositeExplicitAutograd: alias_copy
   tags: view_copy
 
-- func: to_padded_tensor(Tensor self, float padding) -> Tensor
+- func: to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
   variants: method
   dispatch:
     NestedTensorCPU: NestedTensor_to_padded_tensor_generic
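The new optional output_size argument lets the padded result be grown to a caller-chosen shape; each entry must be at least the naturally padded size. A hedged usage sketch; the torch.nested namespace used here reflects the later public bindings and may differ from the prototype API at the time of this commit:

import torch

nt = torch.nested.nested_tensor([torch.ones(2, 3), torch.ones(3, 4)])

# The natural padded shape is (2, 3, 4); output_size may only grow it.
padded = torch.nested.to_padded_tensor(nt, 0.0, output_size=[2, 4, 6])
print(padded.shape)  # expected: torch.Size([2, 4, 6])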

aten/src/ATen/native/nested/NestedTensorMath.cpp

Lines changed: 134 additions & 2 deletions
@@ -303,7 +303,10 @@ Tensor nested_from_padded_generic(
       std::move(new_buffer), sizes);
 }
 
-Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding) {
+Tensor NestedTensor_to_padded_tensor_generic(
+    const Tensor& t,
+    double padding,
+    OptionalIntArrayRef output_size) {
   // TODO: skipped optimization for case of all 1x1 tensors
   auto& nt = *get_nested_tensor_impl(t);
   auto max_size = NestedTensor_get_max_size(nt);
@@ -356,7 +359,22 @@ Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding) {
     buffers.push_back(pad_tensor_to_shape(to_pad, max_size, padding));
     sizes_ptr += sizes_num_columns;
   }
-  return at::stack(buffers);
+  auto ret_val = at::stack(buffers);
+
+  // Pad output tensor to output_size if provided
+  if (output_size.has_value()) {
+    auto output_size_ = output_size.value();
+    TORCH_CHECK(
+        (int64_t)output_size_.size() == ret_val.dim(),
+        "Length of output_size does not match NestedTensor dims. Broadcasting is not supported.");
+    for (int64_t i = 0; i < (int64_t)ret_val.dim(); i++) {
+      TORCH_CHECK(
+          output_size_[i] >= ret_val.size(i),
+          "Value in output_size is less than NestedTensor padded size. Truncation is not supported.");
+    }
+    return pad_tensor_to_shape(ret_val, output_size_, padding);
+  }
+  return ret_val;
 }
 
 Tensor NestedTensor_embedding(
@@ -385,5 +403,119 @@ Tensor NestedTensor_embedding(
   return at::detail::make_tensor<NestedTensorImpl>(
       result_buffer.reshape({-1}), std::move(new_sizes));
 }
+
+std::pair<NestedTensorImpl*, NestedTensorImpl*>
+get_elementwise_nested_tensor_impl(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name) {
+  if (self.is_nested() && !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a nested self and non-nested other");
+  } else if (!(self.is_nested()) && other.is_nested()) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and nested other");
+  } else if (!(self.is_nested()) || !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and non-nested other");
+  }
+
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto other_ptr = get_nested_tensor_impl(other);
+
+  TORCH_CHECK(
+      self.dim() == other.dim(),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      at::equal(
+          self_ptr->get_nested_size_tensor(),
+          other_ptr->get_nested_size_tensor()),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      nested_tensor_impl_is_contiguous(self_ptr) &&
+          nested_tensor_impl_is_contiguous(other_ptr),
+      op_name,
+      " does not support non-contiguous NestedTensor inputs");
+  return std::make_pair(self_ptr, other_ptr);
+}
+
+template <typename Func>
+Tensor NestedTensor_elementwise_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  const auto& nt_self = *self_impl;
+  const auto& nt_other = *other_impl;
+  const auto& self_sizes = nt_self.get_nested_size_tensor();
+  return wrap_buffer(
+      f(nt_self.get_buffer().reshape({-1}),
+        nt_other.get_buffer().reshape({-1})),
+      self_sizes);
+}
+
+Tensor NestedTensor_add_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) {
+        return at::add(b1, b2, alpha);
+      });
+}
+
+Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "mul", [](const Tensor& b1, const Tensor& b2) {
+        return at::mul(b1, b2);
+      });
+}
+
+template <typename Func>
+Tensor& NestedTensor_elementwise__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  const auto& nt_self = *self_impl;
+  const auto& nt_other = *other_impl;
+  f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1}));
+  return self;
+}
+
+Tensor& NestedTensor_add__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) {
+        return b1.add_(b2, alpha);
+      });
+}
+
+Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "mul_", [](const Tensor& b1, const Tensor& b2) {
+        return b1.mul_(b2);
+      });
+}
+
 } // namespace native
 } // namespace at
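With these kernels registered, add and mul (and their in-place variants) operate componentwise on two nested tensors with identical nested sizes; broadcasting and non-contiguous inputs are rejected by get_elementwise_nested_tensor_impl. A usage sketch, again assuming the later torch.nested constructor name:

import torch

a = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])
b = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])

c = torch.add(a, b, alpha=2.0)  # componentwise: c_i = a_i + 2.0 * b_i
d = a * b                       # componentwise multiply
a.add_(b)                       # in-place variant, routed to NestedTensor_add__Tensor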

aten/src/ATen/native/nested/NestedTensorMath.h

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
 
 TORCH_API std::vector<int64_t> NestedTensor_get_max_size(const NestedTensorImpl& nt);
 
-TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding);
+TORCH_API Tensor NestedTensor_to_padded_tensor_generic(const Tensor& t, double padding, OptionalIntArrayRef output_size);
 
 } // namespace native
 } // namespace at
