Commit 10ffcee

address comments

1 parent 06bc73b commit 10ffcee
2 files changed: +108 -11 lines changed

aten/src/ATen/WrapDimUtils.h

Lines changed: 12 additions & 5 deletions
@@ -46,11 +46,18 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector<std::vector<
 }
 
 // wrap negative dims in to_transform_dims
-static inline void maybe_wrap_dims(std::vector<int64_t>& to_transform_dims, int64_t tensor_total_dims) {
-  for (size_t i = 0; i < to_transform_dims.size(); i++) {
-    if (to_transform_dims[i] < 0) {
-      to_transform_dims[i] = (tensor_total_dims + (to_transform_dims[i] % tensor_total_dims)) % tensor_total_dims;
-    }
+static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_expr) {
+  if (dim_post_expr <= 0) {
+    dim_post_expr = 1; // this will make range [-1, 0]
+  }
+  int64_t min = -dim_post_expr;
+  int64_t max = dim_post_expr - 1;
+  for (auto& dim : dims) {
+    AT_CHECK(
+        dim >= min && dim <= max,
+        "Dimension out of range (expected to be in range of [",
+        min, ", ", max, "], but got ", dim, ")");
+    if (dim < 0) dim += dim_post_expr;
   }
 }
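
For context, here is a minimal standalone sketch of the new wrapping semantics (hypothetical code, not part of this commit; AT_CHECK is swapped for a plain throw so it compiles outside ATen). Unlike the old modulo-based wrapping, out-of-range dims now raise an error instead of being silently wrapped:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Standalone sketch of the new maybe_wrap_dims behavior.
void wrap_dims_sketch(std::vector<int64_t>& dims, int64_t dim_post_expr) {
  if (dim_post_expr <= 0) {
    dim_post_expr = 1;  // scalar tensors: the valid range becomes [-1, 0]
  }
  const int64_t min = -dim_post_expr;
  const int64_t max = dim_post_expr - 1;
  for (auto& dim : dims) {
    if (dim < min || dim > max) {
      throw std::out_of_range("Dimension out of range");
    }
    if (dim < 0) dim += dim_post_expr;  // e.g., -1 on a 3-d tensor wraps to 2
  }
}

int main() {
  std::vector<int64_t> dims = {-1, 0, -3};
  wrap_dims_sketch(dims, 3);  // dims becomes {2, 0, 0}
}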

aten/src/ATen/native/ReduceOps.cpp

Lines changed: 96 additions & 6 deletions
@@ -376,6 +376,84 @@ Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) {
 
 // MULTI DIM REDUCE ###########################################################
 
+// NB: this applies two optimizations:
+//   1. Reducing the dimensions in order of decreasing size, so that the
+//      larger dimensions are dealt with earlier and we work with fewer
+//      elements overall.
+//      E.g., reducing a tensor of shape [1, 10, 200] over dimensions {0, 1, 2}.
+//        If we reduce in the order [0, 1, 2], the input and output shapes
+//        of the iterations are:
+//          it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem)
+//          it 1:    [10, 200] (2000 elem) =>     [200] ( 200 elem)
+//          it 2:        [200] ( 200 elem) =>       [1] (   1 elem)
+//        Since we need to iterate through all input elements at each
+//        iteration, the total number of elements traversed is 4200.
+//        If we instead reduce in the order [2, 1, 0], i.e., with decreasing
+//        size, the input and output shapes of the iterations are:
+//          it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem)
+//          it 1:      [1, 10] (  10 elem) =>     [1] ( 1 elem)
+//          it 2:          [1] (   1 elem) =>     [1] ( 1 elem)
+//        The total number of elements traversed is 2011, much less than 4200.
+//   2. Preallocated buffer.
+//      Utilizing the `_out` variant, instead of allocating new output
+//      tensors at each iteration, we can use a preallocated buffer. Since
+//      the output numel decreases at each iteration, we can reuse the
+//      buffer throughout the loop.
+//      Note that we need two buffers: one containing the input, i.e., the
+//      output from the previous iteration, and one containing the output
+//      for this iteration.
+//      The largest output size is the output size of the first iteration.
+//      After that, the largest size we need is the output size of the
+//      second iteration. So we allocate
+//        1. a region of size `input.numel() / input.size(reduced_dims[0])`, and
+//        2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`.
+//      These two regions are allocated together as one contiguous flattened
+//      buffer tensor, with a variable `offset` indicating the starting
+//      position of the output region for the current iteration.
+//      E.g., reducing a tensor of shape [4, 3, 2] over dimensions {0, 1, 2}.
+//        Say we reduce in the order [0, 1, 2].
+//        The first buffer region has size `4 * 3 * 2 / 4 = 6`.
+//        The second buffer region has size `4 * 3 * 2 / (4 * 3) = 2`.
+//        So we allocate a tensor of size `6 + 2 = 8`:
+//              buffer: [ _, _, _, _, _, _, _, _]
+//  buffer region 1 -->   ^^^^^^^^^^^^^^^^  ^^^^  <-- buffer region 2
+//        1st iteration:
+//          (before reduction)
+//            input:        self (or input)
+//            input shape:  [ 4, 3, 2]
+//            output shape: [    3, 2]
+//            buffer:       [ _, _, _, _, _, _, _, _]
+//            offset:         ^--beginning of 1st buffer region, i.e., the
+//                               starting output location of the 1st iteration.
+//          (after reduction)
+//            buffer:       [ {output of 1st it}, _, _]
+//
+//        2nd iteration:
+//          (before reduction)
+//            input:        output of 1st it
+//            input shape:  [ 3, 2]
+//            output shape: [    2]
+//            buffer:       [ {output of 1st it}, _, _]
+//            offset:                             ^--beginning of 2nd buffer
+//                                                   region. We can't overwrite
+//                                                   the 1st region as it holds
+//                                                   the input to this
+//                                                   iteration's reduction.
+//          (after reduction)
+//            buffer:       [ {output of 1st it}, {output of 2nd it}]
+//
+//        3rd iteration:
+//          (before reduction)
+//            input:        output of 2nd it
+//            input shape:  [ 2]
+//            output shape: [ 1]
+//            buffer:       [ {output of 1st it}, {output of 2nd it}]
+//            offset:         ^--beginning of 1st buffer region. We can
+//                               safely overwrite it now.
+//          (after reduction)
+//            buffer:       [ {output of 3rd it}, {output of 2nd it}]
+//        Return {output of 3rd it}.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
           Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) {
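
To make optimization 1 concrete, here is a small self-contained program (hypothetical helper, not part of this commit) that reproduces the traversal counts quoted in the comment above:

#include <cstdint>
#include <iostream>
#include <vector>

// Cost of reducing `sizes` one dimension at a time in the given order:
// each iteration reads every element of the current, partially reduced tensor.
int64_t traversal_cost(std::vector<int64_t> sizes,
                       const std::vector<int64_t>& order) {
  int64_t numel = 1;
  for (int64_t s : sizes) numel *= s;
  int64_t cost = 0;
  for (int64_t dim : order) {
    cost += numel;        // read the whole current input
    numel /= sizes[dim];  // the reduced dimension collapses to size 1
    sizes[dim] = 1;
  }
  return cost;
}

int main() {
  std::vector<int64_t> sizes = {1, 10, 200};
  std::cout << traversal_cost(sizes, {0, 1, 2}) << "\n";  // 4200
  std::cout << traversal_cost(sizes, {2, 1, 0}) << "\n";  // 2011
}
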
@@ -386,28 +464,33 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
     return self;
   }
   int64_t ndims = self.dim();
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
   auto reduced_size = self.sizes().vec();
   auto dims = dims_.vec();
   maybe_wrap_dims(dims, ndims);
-  // Sort the reduced dimensions so that we reduce the largest dimension first.
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
   std::sort(dims.begin(), dims.end(),
             [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
-  int64_t reduced_numel = self.numel();
+  // Calculate the 1st buffer region size.
   int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
   int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
   // We separate `buffer` into two regions, one starting at 0, and another
   // starting at max_reduced_numel. These two regions are used alternately as
   // the output of a `reduce_1` along a particular dimension. `offset` will
   // indicate which region we should use next.
   // Have keepdim=true when reducing. We will squeeze later.
-  auto buffer = at::empty({buffer_size}, self.type());
+  auto buffer = at::empty({buffer_size}, self.options());
   int64_t offset = 0;
   Tensor t = self;
   for (auto& dim : dims) {
     reduced_numel /= reduced_size[dim];
     reduced_size[dim] = 1;
     auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
     t = reduce_1_out(res, t, dim, true);
+    // Switch to the other buffer region; this alternates `offset`
+    // between 0 and max_reduced_numel.
     offset = max_reduced_numel - offset;
   }
   // squeeze if needed
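
A minimal sketch of the two-region buffer bookkeeping above (standalone, hypothetical code; the real loop calls reduce_1_out where the comment stands), using the [4, 3, 2] example from the NB comment:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Reduce a [4, 3, 2] tensor over dims {0, 1, 2}, in that order.
  std::vector<int64_t> sizes = {4, 3, 2};
  int64_t numel = 4 * 3 * 2;                               // 24
  int64_t max_reduced_numel = numel / sizes[0];            // 6 (1st it output)
  int64_t region2 = max_reduced_numel / sizes[1];          // 2 (2nd it output)
  std::vector<float> buffer(max_reduced_numel + region2);  // 8 elements total

  int64_t offset = 0;
  for (size_t dim = 0; dim < sizes.size(); ++dim) {
    numel /= sizes[dim];
    // reduce_1_out would write `numel` outputs into
    // buffer[offset .. offset + numel); its input lives in the other region.
    std::cout << "it " << dim << ": write " << numel
              << " elems at offset " << offset << "\n";
    offset = max_reduced_numel - offset;  // flip between 0 and 6
  }
}

Running it prints offsets 0, 6, 0, matching the iteration walkthrough in the comment.
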
@@ -425,31 +508,36 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
   return t;
 }
 
+// See comments above reduce_multi_associative for details.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
           Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1_out(result, self, dims_[0], keepdim);
   }
   if (dims_.size() == 0) {
-    return result;
+    // reduce_out should be clone_out with an empty dims_
+    return result.resize_as_(self).copy_(self);
   }
   int64_t ndims = self.dim();
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
   auto reduced_size = self.sizes().vec();
   auto dims = dims_.vec();
   maybe_wrap_dims(dims, ndims);
   // Sort the reduced dimensions so that we reduce the largest dimension first.
   std::sort(dims.begin(), dims.end(),
             [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
-  int64_t reduced_numel = self.numel();
+  // Calculate the 1st buffer region size.
   int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
   int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
   // We separate `buffer` into two regions, one starting at 0, and another
   // starting at max_reduced_numel. These two regions are used alternately as
   // the output of a `reduce_1` along a particular dimension. `offset` will
   // indicate which region we should use next.
   // Have keepdim=true when reducing. We will squeeze later.
-  auto buffer = at::empty({buffer_size}, self.type());
+  auto buffer = at::empty({buffer_size}, self.options());
   int64_t offset = 0;
   Tensor t = self;
   int64_t last_reduction = dims.size() - 1;
@@ -463,6 +551,8 @@ inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self,
     } else {
       reduce_1_out(result, t, dim, true);
     }
+    // Switch to the other buffer region; this alternates `offset`
+    // between 0 and max_reduced_numel.
     offset = max_reduced_numel - offset;
     num_reduction++;
   }
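
One note on the empty-dims fix in reduce_multi_associative_out: reducing over no dimensions is the identity, so the `_out` variant must materialize a copy of `self` into `result` rather than return `result` untouched. A plain-vector sketch of the intended semantics (hypothetical, not ATen code):

#include <cassert>
#include <vector>

// Stand-in for `result.resize_as_(self).copy_(self)`.
void reduce_none_out(std::vector<float>& result,
                     const std::vector<float>& self) {
  result.assign(self.begin(), self.end());
}

int main() {
  std::vector<float> self = {1.0f, 2.0f, 3.0f};
  std::vector<float> result;
  reduce_none_out(result, self);  // identity reduction == clone
  assert(result == self);
}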
