Commit d7487bf

ssnl authored and facebook-github-bot committed
Speed-up multidim sum (#8992)
Summary:

1. Instead of using the non-`_out` variant, allocate a buffer and use the `_out` variant to write the intermediate results into the buffer.
2. Reduce dimensions in order of decreasing size.

Benchmark: sum a randn tensor of shape `[200, 1, 30, 40, 20, 1, 50]` along dimensions `[4, 6, 3, 0, 2, 5]`, averaged across 1000 runs:

```
before patch:
    CPU:  0.0441 s
    CUDA: 0.0273 s

after patch:
    CPU:  0.0234 s
    CUDA: 0.0047 s
```

Closes #8992

Differential Revision: D8681069

Pulled By: SsnL

fbshipit-source-id: 2c5d5af5c5a284f2e945181f2b24ee8c78becd50
1 parent 9ce1517 commit d7487bf
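
For reference, a minimal sketch of how the benchmark described in the summary could be reproduced from C++ through the public ATen API. The timing harness, the warm-up call, and the use of `at::randn`/`at::sum` here are assumptions of this sketch (CPU only; CUDA timing would additionally need device synchronization), not part of the commit.

```cpp
// Hypothetical benchmark sketch (not part of this commit): time the multi-dim
// sum from the commit message on CPU. Shape and dims come from the summary.
#include <ATen/ATen.h>
#include <chrono>
#include <iostream>
#include <vector>

int main() {
  auto t = at::randn({200, 1, 30, 40, 20, 1, 50});
  std::vector<int64_t> dims = {4, 6, 3, 0, 2, 5};

  at::sum(t, dims, /*keepdim=*/false);  // warm-up
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < 1000; ++i) {
    auto s = at::sum(t, dims, /*keepdim=*/false);
  }
  auto end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration<double>(end - start).count() / 1000
            << " s per call (CPU)\n";
  return 0;
}
```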

File tree: 2 files changed (+188, -22 lines)


aten/src/ATen/WrapDimUtils.h: 16 additions, 0 deletions

```diff
@@ -45,6 +45,22 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector<std::vector<
   return maybe_wrap_dim(dim, tensor_sizes[0].size());
 }
 
+// wrap each dim in dims based on dim_post_expr
+static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_expr) {
+  if (dim_post_expr <= 0) {
+    dim_post_expr = 1; // this will make range [-1, 0]
+  }
+  int64_t min = -dim_post_expr;
+  int64_t max = dim_post_expr - 1;
+  for (auto& dim : dims) {
+    AT_CHECK(
+        dim >= min && dim <= max,
+        "Dimension out of range (expected to be in range of [",
+        min, ", ", max, "], but got ", dim, ")");
+    if (dim < 0) dim += dim_post_expr;
+  }
+}
+
 // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
 // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
 // to be "skipped" (both for wrap dimension behavior and dimension size checking).
```
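
The new helper applies the single-dim `maybe_wrap_dim` rule to a whole list of dims. Below is a standalone sketch (plain C++, no ATen; `wrap_dims` and the exception type are illustrative stand-ins) of the wrapping rule it enforces: for an n-dimensional tensor, valid dims lie in `[-n, n-1]`, and a negative dim `d` maps to `d + n`.

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Illustration of the rule enforced by maybe_wrap_dims (not the ATen code itself).
void wrap_dims(std::vector<int64_t>& dims, int64_t ndim) {
  if (ndim <= 0) ndim = 1;  // scalar case: only accept the range [-1, 0]
  for (auto& dim : dims) {
    if (dim < -ndim || dim > ndim - 1) {
      throw std::out_of_range("Dimension out of range");
    }
    if (dim < 0) dim += ndim;  // wrap negative dims
  }
}

int main() {
  std::vector<int64_t> dims = {-1, -3, 0};  // for a 7-d tensor
  wrap_dims(dims, 7);
  for (auto d : dims) std::cout << d << ' ';  // prints: 6 4 0
  std::cout << '\n';
  return 0;
}
```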

aten/src/ATen/native/ReduceOps.cpp: 172 additions, 22 deletions

```diff
@@ -10,7 +10,6 @@
 #include <functional>
 #include <numeric>
 #include <vector>
-
 #include <map>
 
 namespace at {
@@ -377,7 +376,89 @@ Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) {
 
 // MULTI DIM REDUCE ###########################################################
 
-template <Tensor (reduce_1)(const Tensor &, int64_t, bool)>
+// NB: this applies two optimizations:
+//   1. Reducing the dimensions in order of decreasing size, so that the
+//      larger dimensions are dealt with earlier and we work with fewer elements
+//      overall.
+//      E.g., reducing a tensor of shape [1, 10, 200] over dimensions {0, 1, 2}.
+//      If we reduce in the order [0, 1, 2], the input and output
+//      shapes of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem)
+//        it 1:    [10, 200] (2000 elem) =>     [200] ( 200 elem)
+//        it 2:        [200] ( 200 elem) =>       [1] (   1 elem)
+//      Since we need to iterate through all input elements at each
+//      iteration, the total number of elements traversed is 4200.
+//      If we reduce in the order [2, 1, 0], i.e., with decreasing
+//      size, the input and output shapes of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem)
+//        it 1:      [1, 10] (  10 elem) =>     [1] ( 1 elem)
+//        it 2:          [1] (   1 elem) =>     [1] ( 1 elem)
+//      The total number of elements traversed is 2011, much less than 4200.
+//   2. Preallocated buffer.
+//      Utilizing the `_out` variant, instead of allocating new output tensors
+//      at each iteration, we can use a preallocated buffer. Since the output
+//      numel decreases at each iteration, we can reuse the buffer throughout
+//      the loop.
+//      Note that we need two buffers, one containing the input, i.e., the output
+//      from the previous iteration, and one containing the output of this
+//      iteration.
+//      The largest output size is the output size of the first iteration. After
+//      that, the largest size we need is the output size of the second
+//      iteration.
+//      So we allocate
+//        1. a region of size `input.numel() / input.size(reduced_dims[0])`, and
+//        2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`.
+//      These two regions are allocated together as a contiguous flattened
+//      buffer tensor, with a variable `offset` indicating the starting position
+//      of the output region for the current iteration.
+//      E.g., reducing a tensor of shape [4, 3, 2] over dimensions {0, 1, 2}.
+//      Say we reduce in the order [0, 1, 2].
+//      The first buffer region has size `4 * 3 * 2 / 4 = 6`.
+//      The second buffer region has size `4 * 3 * 2 / (4 * 3) = 2`.
+//      So we allocate a tensor of size `6 + 2 = 8`:
+//                buffer: [ _, _, _, _, _, _, _, _]
+//        buffer region 1-->^^^^^^^^^^^^^^^^  ^^^^<--buffer region 2
+//      1st iteration:
+//        (before reduction)
+//          input:        self (or input)
+//          input shape:  [ 4, 3, 2]
+//          output shape: [    3, 2]
+//          buffer:       [ _, _, _, _, _, _, _, _]
+//          offset:         ^--beginning of 1st buffer region, i.e., the
+//                             starting output location of the 1st iteration.
+//        (after reduction)
+//          buffer:       [ {output of 1st it}, _, _]
+//
+//      2nd iteration:
+//        (before reduction)
+//          input:        output of 1st it
+//          input shape:  [ 3, 2]
+//          output shape: [    2]
+//          buffer:       [ {output of 1st it}, _, _]
+//          offset:                              ^--beginning of 2nd
+//                                                  buffer region. We can't
+//                                                  overwrite the 1st region
+//                                                  as it contains input to
+//                                                  this iteration's reduction.
+//        (after reduction)
+//          buffer:       [ {output of 1st it}, {output of 2nd it}]
+//
+//      3rd iteration:
+//        (before reduction)
+//          input:        output of 2nd it
+//          input shape:  [ 2]
+//          output shape: [ 1]
+//          buffer:       [ {output of 1st it}, {output of 2nd it}]
+//          offset:         ^--beginning of 1st buffer region. We can
+//                             safely overwrite it now.
+//        (after reduction)
+//          buffer:       [ {output of 3rd it}, {output of 2nd it}]
+//      Return {output of 3rd it}.
+//
+// TODO: If two or more reduced dimensions are contiguous, reduce as if they are
+//       one large dimension.
+template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1(self, dims_[0], keepdim);
@@ -386,51 +467,120 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
     return self;
   }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
-  Tensor result = self;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim])
-      result = reduce_1(result, dim, keepdim);
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset` will
+  // indicate which region we should use next.
+  // Reduce with keepdim=true; we will squeeze later.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
+  Tensor t = self;
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    t = reduce_1_out(res, t, dim, true);
+    // switch to the other buffer region;
+    // this alternates `offset` between 0 and max_reduced_numel
+    offset = max_reduced_numel - offset;
   }
-  return result;
+  // squeeze if needed
+  if (!keepdim) {
+    std::vector<int64_t> squeezed_shape;
+    squeezed_shape.reserve(ndims - dims.size());
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = 0; dim < ndims; dim++) {
+      if (!reduce_dims[dim]) {
+        squeezed_shape.emplace_back(reduced_size[dim]);
+      }
+    }
+    return t.view(squeezed_shape);
+  }
+  return t;
 }
 
+// See comments above reduce_multi_associative for details.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
-         Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1_out(result, self, dims_[0], keepdim);
   }
+  if (dims_.size() == 0) {
+    // with empty dims_, reduce_out behaves like clone_out
+    return result.resize_as_(self).copy_(self);
+  }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the largest dimension first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset` will
+  // indicate which region we should use next.
+  // Reduce with keepdim=true; we will squeeze later.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
   Tensor t = self;
-  int64_t last_reduction = dims_.size()-1;
+  int64_t last_reduction = dims.size() - 1;
   int64_t num_reduction = 0;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim]) {
-      if (num_reduction < last_reduction) {
-        t = reduce_1(t, dim, keepdim);
-      } else {
-        reduce_1_out(result, t, dim, keepdim);
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    if (num_reduction < last_reduction) {
+      t = reduce_1_out(res, t, dim, true);
+    } else {
+      reduce_1_out(result, t, dim, true);
+    }
+    // switch to the other buffer region;
+    // this alternates `offset` between 0 and max_reduced_numel
+    offset = max_reduced_numel - offset;
+    num_reduction++;
+  }
+  // squeeze if needed (use in-place squeeze_)
+  if (!keepdim) {
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = ndims - 1; dim >= 0; dim--) {
+      if (reduce_dims[dim]) {
+        result.squeeze_(dim);
       }
-      num_reduction++;
     }
   }
   return result;
 }
 
-
 Tensor& _sum_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
   if (self.is_cuda()) {
     return at::_sum_cuda_out(result, self, dim, keepdim);
-  }
-  else {
+  } else {
     return _sum_out_cpu(result, self, dim, keepdim);
   }
 }
 
 Tensor _sum(const Tensor &self, IntList dims, bool keepdim) {
-  return reduce_multi_associative<_sum>(self, dims, keepdim);
+  return reduce_multi_associative<_sum, _sum_out>(self, dims, keepdim);
 }
 
 Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim)
```
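
To make the two optimizations in `reduce_multi_associative` concrete, here is a standalone sketch (plain C++, no ATen; the variable names and printed trace are illustrative only) of the planning arithmetic the new code performs: sort the reduced dims by decreasing size, compute the sizes of the two ping-pong buffer regions, and walk through the per-iteration output sizes for the shape used in the benchmark.

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Shape and reduced dims from the commit message's benchmark.
  std::vector<int64_t> size = {200, 1, 30, 40, 20, 1, 50};
  std::vector<int64_t> dims = {4, 6, 3, 0, 2, 5};  // already wrapped

  int64_t numel = std::accumulate(size.begin(), size.end(), int64_t(1),
                                  std::multiplies<int64_t>());

  // Optimization 1: reduce larger dimensions first.
  std::sort(dims.begin(), dims.end(),
            [&](int64_t i, int64_t j) { return size[i] > size[j]; });

  // Optimization 2: one flat buffer split into two regions used alternately.
  int64_t max_reduced_numel = numel / size[dims[0]];
  int64_t buffer_size = max_reduced_numel + max_reduced_numel / size[dims[1]];
  std::cout << "buffer elements: " << buffer_size << '\n';

  int64_t offset = 0;
  for (auto dim : dims) {
    numel /= size[dim];
    size[dim] = 1;
    std::cout << "reduce dim " << dim << " -> output numel " << numel
              << ", written at buffer offset " << offset << '\n';
    offset = max_reduced_numel - offset;  // flip between the two regions
  }
  return 0;
}
```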
