Skip to content

Commit 6bbea20

Browse files
committed
Update on "[17/N] Add _reduce_scatter_base custom op with CPU/CUDA implementation"
Differential Revision: [D41415325](https://our.internmc.facebook.com/intern/diff/D41415325) [ghstack-poisoned]
2 parents f55bb12 + 1c7eddf commit 6bbea20

File tree

103 files changed

+3439
-520
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+3439
-520
lines changed

.bazelrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
build --cxxopt=--std=c++14
1+
build --cxxopt=--std=c++17
22
build --copt=-I.
33
# Bazel does not support including its cc_library targets as system
44
# headers. We work around this for generated code

.github/ci_commit_pins/vision.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
d710f3d1edc06afa244468cb96603ba6dbd4d9d5
1+
5b4f79d9ba8cbeeb8d6f0fbba3ba5757b718888b

.github/workflows/_linux-test.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ jobs:
192192
-w /var/lib/jenkins/workspace \
193193
"${DOCKER_IMAGE}"
194194
)
195+
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
195196
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
196197
197198
- name: Get workflow job id
@@ -216,6 +217,12 @@ jobs:
216217
with:
217218
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
218219

220+
- name: Collect backtraces from coredumps (if any)
221+
if: always()
222+
run: |
223+
# shellcheck disable=SC2156
224+
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
225+
219226
- name: Store Core dumps on S3
220227
uses: seemethere/upload-artifact-s3@v5
221228
if: failure()

.github/workflows/inductor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
with:
2121
build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
2222
docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
23-
cuda-arch-list: 8.6
23+
cuda-arch-list: '8.6'
2424
test-matrix: |
2525
{ include: [
2626
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ jobs:
7878
{ config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" },
7979
{ config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" },
8080
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
81-
{ config: "default", shard: 5, num_shards: 5, runner: "linux.2xlarge" },
81+
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
8282
{ config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
8383
]}
8484

aten/src/ATen/cpu/vec/vec256/vec256.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,51 @@ inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>&
222222
_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3. 4 bits apart
223223
}
224224

225+
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
226+
227+
template<>
228+
inline Vectorized<float> flip(const Vectorized<float> & v) {
229+
const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
230+
return _mm256_permutevar8x32_ps(v, mask_float);
231+
}
232+
233+
template<>
234+
inline Vectorized<double> flip(const Vectorized<double> & v) {
235+
return _mm256_permute4x64_pd(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3)
236+
}
237+
238+
template<>
239+
inline Vectorized<int64_t> flip(const Vectorized<int64_t> & v) {
240+
return _mm256_permute4x64_epi64(v, 27); // 27 == _MM_SHUFFLE(0, 1, 2, 3)
241+
}
242+
243+
template<>
244+
inline Vectorized<int32_t> flip(const Vectorized<int32_t> & v) {
245+
const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
246+
return _mm256_permutevar8x32_epi32(v, mask_int32);
247+
}
248+
249+
template<>
250+
inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
251+
const __m256i mask = _mm256_set_epi8(
252+
1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
253+
1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
254+
);
255+
auto reversed = _mm256_shuffle_epi8(v, mask);
256+
return _mm256_permute2x128_si256(reversed, reversed, 1);
257+
}
258+
259+
template<>
260+
inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
261+
const __m256i mask_int8 = _mm256_set_epi8(
262+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
263+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
264+
);
265+
auto reversed = _mm256_shuffle_epi8(v, mask_int8);
266+
return _mm256_permute2x128_si256(reversed, reversed, 1);
267+
}
268+
269+
225270
#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
226271

227272
}}}

aten/src/ATen/cpu/vec/vec512/vec512.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,56 @@ inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>&
190190
_mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
191191
}
192192

193+
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194+
195+
template<>
196+
inline Vectorized<float> flip(const Vectorized<float> & v) {
197+
const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
198+
8, 9, 10, 11, 12, 13, 14, 15);
199+
return _mm512_permutexvar_ps(mask, v);
200+
}
201+
202+
template<>
203+
inline Vectorized<double> flip(const Vectorized<double> & v) {
204+
const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
205+
return _mm512_permutexvar_pd(mask, v);
206+
}
207+
208+
template<>
209+
inline Vectorized<int64_t> flip(const Vectorized<int64_t> & v) {
210+
const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
211+
return _mm512_permutexvar_epi64(mask, v);
212+
}
213+
214+
template<>
215+
inline Vectorized<int32_t> flip(const Vectorized<int32_t> & v) {
216+
const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
217+
8, 9, 10, 11, 12, 13, 14, 15);
218+
return _mm512_permutexvar_epi32(mask, v);
219+
}
220+
221+
template<>
222+
inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
223+
const __m512i mask = _mm512_set_epi16(
224+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
225+
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
226+
);
227+
return _mm512_permutexvar_epi16(mask, v);
228+
}
229+
230+
template<>
231+
inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
232+
const __m512i mask1 = _mm512_set_epi8(
233+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
234+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
235+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
236+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
237+
);
238+
const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
239+
auto reversed_vec = _mm512_shuffle_epi8(v, mask1);
240+
return _mm512_permutexvar_epi64(mask2, reversed_vec);
241+
}
242+
193243
#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
194244

195245
}}}

aten/src/ATen/cpu/vec/vec_base.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,4 +1001,16 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
10011001
}
10021002
}
10031003

1004+
template <typename T>
1005+
inline Vectorized<T> flip(const Vectorized<T> & data) {
1006+
static constexpr int size = Vectorized<T>::size();
1007+
T output[size];
1008+
T buffer[size];
1009+
data.store(static_cast<void*>(buffer));
1010+
for (const auto i : c10::irange(size)) {
1011+
output[i] = buffer[size - i - 1];
1012+
}
1013+
return Vectorized<T>::loadu(static_cast<void*>(output));
1014+
}
1015+
10041016
}}}

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -325,12 +325,6 @@ Tensor empty_like(
325325
// See [Note: hacky wrapper removal for TensorOptions]
326326
TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
327327

328-
329-
TORCH_CHECK(
330-
!(options_.has_memory_format() && optional_memory_format.has_value()),
331-
"Cannot set memory_format both in TensorOptions and explicit argument; please delete "
332-
"the redundant setter.");
333-
334328
TensorOptions options =
335329
self.options()
336330
.merge_in(options_)

aten/src/ATen/native/cpu/IndexKernel.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,75 @@ void masked_select_kernel(TensorIterator& iter, int64_t result_stride) {
457457
});
458458
}
459459

// Vectorized kernel for horizontal flip: copies each row of the input into the
// output back-to-front. Only used when the iterator was set up so that the
// output walks backwards along dim 0 while the input walks forwards — i.e.
// strides[0] == -sizeof(scalar_t) and strides[1] == +sizeof(scalar_t)
// (enforced by the TORCH_INTERNAL_ASSERT below).
template <typename scalar_t>
void cpu_hflip_vec(at::TensorIterator& iter) {

  auto loop2d = [&](char** base, const int64_t *strides, int64_t size0, int64_t size1) {

    // Three operands are expected; only data[0] (output) and data[1] (input)
    // are read/written below. NOTE(review): the third operand is presumably
    // the dummy second input used by the generic flip path — confirm against
    // the iterator construction in flip().
    static constexpr int ntensors = 3;
    std::array<char*, ntensors> data_arr;
    std::copy_n(base, ntensors, data_arr.data());
    // Per TensorIterator's loop2d convention, strides[ntensors..2*ntensors-1]
    // are the outer-dimension strides.
    const int64_t *outer_strides = &strides[ntensors];

    using Vec = Vectorized<scalar_t>;

    constexpr auto stride = sizeof(scalar_t);
    // Output must step backwards and input forwards by exactly one element.
    TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);

    for (const auto j C10_UNUSED : c10::irange(size1)) {

      // vectorized loop with negative stride for output
      char** C10_RESTRICT data_ = data_arr.data();
      int64_t n = size0;

      // Local copies of the base pointers for this row (restrict-qualified to
      // help vectorization; data_arr itself is advanced at the end of the row).
      char* C10_RESTRICT data[ntensors];
      for (const auto arg : c10::irange(ntensors)) {
        data[arg] = data_[arg];
      }

      int64_t i = 0;

      // data[0] unaligned pre-pass
      // Scalar-copy the first `offset` elements so that the bulk of the
      // subsequent vector stores through data[0] land 32-byte aligned —
      // NOTE(review): alignment intent inferred from the % 32; confirm.
      int64_t offset = (j * n + (n - i - Vec::size())) % 32;
      // If the row is shorter than the pre-pass, fall through to scalar-only.
      offset = (offset >= n) ? n : offset;
      for (; i < offset; i++) {
        scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
        *out_ptr = *(scalar_t *)(data[1] + i * stride);
      }
      // Empirically found that it is faster to process 3 data items together vs 2 or 4
      for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
        auto out1 = Vec::loadu(data[1] + i * stride);
        auto out2 = Vec::loadu(data[1] + (i + Vec::size()) * stride);
        auto out3 = Vec::loadu(data[1] + (i + 2 * Vec::size()) * stride);
        // flip the vector: 1234 -> 4321
        out1 = flip(out1);
        out2 = flip(out2);
        out3 = flip(out3);
        // Stores walk backwards: the vector covering input [i, i+Vec::size())
        // lands at output offsets [-(i+Vec::size()-1), -i] (element units).
        out1.store(data[0] - (i + Vec::size() - 1) * stride);
        out2.store(data[0] - (i + 2 * Vec::size() - 1) * stride);
        out3.store(data[0] - (i + 3 * Vec::size() - 1) * stride);
      }
      // Scalar tail for the remaining (< 3 * Vec::size()) elements.
      if (i < n) {
        for (; i < n; i++) {
          scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
          *out_ptr = *(scalar_t *)(data[1] + i * stride);
        }
      }

      // advance: move all base pointers to the next outer-dimension row.
      for (const auto arg : c10::irange(data_arr.size())) {
        data_arr[arg] += outer_strides[arg];
      }
    }
  };

  int64_t grain_size = at::internal::GRAIN_SIZE;
  iter.for_each(loop2d, grain_size);
  // Copy results back if the iterator had to reinterpret the output dtype.
  iter.cast_outputs();
}
527+
528+
460529
void flip_kernel(TensorIterator& iter, const bool quantized) {
461530
if (quantized) {
462531
AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(iter.dtype(), "flip_quantized_cpu",
@@ -466,6 +535,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
466535
});
467536
});
468537
} else {
538+
// Special case: horizontal flip with vectorization and input is contiguous
539+
// Context: horizontal flip leads to strides[0] < 0 and
540+
// thus is_contiguous condition is not satisfied and non-vectorized code path is taken.
541+
auto output_strides = iter.strides(0);
542+
auto input_strides = iter.strides(1);
543+
if (iter.ndim() > 0 && output_strides[0] < 0 && input_strides[0] == iter.element_size(1)) {
544+
auto iter_dtype = iter.dtype();
545+
if (iter_dtype == kByte) {
546+
return cpu_hflip_vec<uint8_t>(iter);
547+
} else if (iter_dtype == kFloat) {
548+
return cpu_hflip_vec<float>(iter);
549+
} else if (iter_dtype == kInt) {
550+
return cpu_hflip_vec<int32_t>(iter);
551+
} else if (iter_dtype == kShort) {
552+
return cpu_hflip_vec<int16_t>(iter);
553+
} else if (iter_dtype == kLong) {
554+
return cpu_hflip_vec<int64_t>(iter);
555+
} else if (iter_dtype == kDouble) {
556+
return cpu_hflip_vec<double>(iter);
557+
}
558+
// other dtypes are handled below with cpu_kernel_vec
559+
}
560+
469561
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
470562
[&iter] { cpu_kernel_vec(iter,
471563
[](scalar_t a, scalar_t /*dummy input*/) -> scalar_t {

0 commit comments

Comments
 (0)