 #include <functional>
 #include <numeric>
 #include <vector>
-
 #include <map>
 
 namespace at {
@@ -377,7 +376,89 @@ Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) {
 
 // MULTI DIM REDUCE ###########################################################
 
-template <Tensor (reduce_1)(const Tensor &, int64_t, bool)>
+// NB: this applies two optimizations:
+// 1. Reducing the dimensions in the order of decreasing size, so that the
+//    larger dimensions are dealt with earlier and we work with fewer elements
+//    overall.
+//    E.g., consider reducing a tensor of shape [1, 10, 200] over dimensions
+//    {0, 1, 2}. If we reduce in the order [0, 1, 2], the input and output
+//    shapes of the iterations are:
+//      it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem)
+//      it 1:    [10, 200] (2000 elem) =>     [200] ( 200 elem)
+//      it 2:        [200] ( 200 elem) =>      [ 1] (   1 elem)
+//    Since we need to iterate through all input elements at each iteration,
+//    the total number of elements traversed is 4200.
+//    If we instead reduce in the order [2, 1, 0], i.e., with decreasing
+//    size, the input and output shapes of the iterations are:
+//      it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem)
+//      it 1:      [1, 10] (  10 elem) =>    [ 1] ( 1 elem)
+//      it 2:          [1] (   1 elem) =>    [ 1] ( 1 elem)
+//    The total number of elements traversed is 2011, much less than 4200.
+// 2. Preallocated buffer.
+//    Utilizing the `_out` variant, instead of allocating new output tensors
+//    at each iteration, we can use a preallocated buffer. Since the output
+//    numel in each iteration is decreasing, we can reuse the buffer throughout
+//    the loop.
+//    Note that we need two buffers, one containing the input, i.e., the output
+//    from the previous iteration, and one containing the output for this
+//    iteration.
+//    The largest output size is the output size of the first iteration. After
+//    that, the largest size we need is the output size of the second
+//    iteration.
+//    So we allocate
+//      1. a region of size `input.numel() / input.size(reduced_dims[0])`, and
+//      2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`.
+//    These two regions are allocated together as a contiguous flattened
+//    buffer tensor, with a variable `offset` indicating the starting position
+//    of the output region for the current iteration.
+//    E.g., reducing a tensor of shape [4, 3, 2] over dimensions {0, 1, 2}.
+//    Say we reduce in the order of [0, 1, 2].
+//    The first buffer region has size `4 * 3 * 2 / 4 = 6`.
+//    The second buffer region has size `4 * 3 * 2 / (4 * 3) = 2`.
+//    So we allocate a tensor of size `6 + 2 = 8`:
+//            buffer: [ _, _, _, _, _, _, _, _]
+//    buffer region 1-->^^^^^^^^^^^^^^^^ ^^^^<--buffer region 2
+//    1st iteration:
+//      (before reduction)
+//        input:        self (or input)
+//        input shape:  [ 4, 3, 2]
+//        output shape: [ 3, 2]
+//        buffer: [ _, _, _, _, _, _, _, _]
+//        offset:   ^--beginning of 1st buffer region, i.e., the
+//                     starting output location of the 1st iteration.
+//      (after reduction)
+//        buffer: [ {output of 1st it}, _, _]
+//
+//    2nd iteration:
+//      (before reduction)
+//        input:        output of 1st it
+//        input shape:  [ 3, 2]
+//        output shape: [ 2]
+//        buffer: [ {output of 1st it}, _, _]
+//        offset:                       ^--beginning of 2nd buffer region.
+//                                         We can't overwrite the 1st buffer
+//                                         region as it contains the input to
+//                                         this iteration's reduction.
+//      (after reduction)
+//        buffer: [ {output of 1st it}, {output of 2nd it}]
+//
+//    3rd iteration:
+//      (before reduction)
+//        input:        output of 2nd it
+//        input shape:  [ 2]
+//        output shape: [ 1]
+//        buffer: [ {output of 1st it}, {output of 2nd it}]
+//        offset:   ^--beginning of 1st buffer region. We can
+//                     safely overwrite it now.
+//      (after reduction)
+//        buffer: [ {output of 3rd it}, {output of 2nd it}]
+//    Return {output of 3rd it}.
+//
+// TODO: If two or more reduced dimensions are contiguous, reduce as if they
+//       were a single large dimension.
+template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1(self, dims_[0], keepdim);
@@ -386,51 +467,120 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
     return self;
   }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
-  Tensor result = self;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim])
-      result = reduce_1(result, dim, keepdim);
+  // `reduced_numel` and `reduced_size` will be updated in the loop below.
+  // Before the loop, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset`
+  // indicates which region we should use next.
+  // We reduce with keepdim=true and squeeze at the end if needed.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
+  Tensor t = self;
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    t = reduce_1_out(res, t, dim, true);
+    // Switch to the other buffer region: this alternates `offset` between
+    // 0 and max_reduced_numel.
+    offset = max_reduced_numel - offset;
   }
-  return result;
+  // squeeze if needed
+  if (!keepdim) {
+    std::vector<int64_t> squeezed_shape;
+    squeezed_shape.reserve(ndims - dims.size());
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = 0; dim < ndims; dim++) {
+      if (!reduce_dims[dim]) {
+        squeezed_shape.emplace_back(reduced_size[dim]);
+      }
+    }
+    return t.view(squeezed_shape);
+  }
+  return t;
 }
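
Aside: the traversal-count arithmetic in the comment above (4200 elements for the order [0, 1, 2] versus 2011 for [2, 1, 0] on a [1, 10, 200] tensor) can be checked with a small standalone C++ program. This is an illustrative sketch only, not part of the patch; the helper name elements_traversed is made up here.

#include <cstdint>
#include <cstdio>
#include <vector>

// Counts how many input elements are read when reducing `shape` one dimension
// at a time in the given `order`; each step reads every element of its input.
std::int64_t elements_traversed(std::vector<std::int64_t> shape,
                                const std::vector<std::int64_t>& order) {
  std::int64_t total = 0;
  for (std::int64_t dim : order) {
    std::int64_t numel = 1;
    for (std::int64_t s : shape) numel *= s;
    total += numel;   // this step traverses all elements of its current input
    shape[dim] = 1;   // the reduced dimension collapses to size 1 (keepdim)
  }
  return total;
}

int main() {
  // Shape [1, 10, 200] from the comment above reduce_multi_associative.
  std::printf("%lld\n", (long long)elements_traversed({1, 10, 200}, {0, 1, 2}));  // prints 4200
  std::printf("%lld\n", (long long)elements_traversed({1, 10, 200}, {2, 1, 0}));  // prints 2011
  return 0;
}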
 
+// See comments above reduce_multi_associative for details.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
-         Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1_out(result, self, dims_[0], keepdim);
   }
+  if (dims_.size() == 0) {
+    // With empty dims_, reduce_out is just a copy (i.e., clone_out).
+    return result.resize_as_(self).copy_(self);
+  }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+  // `reduced_numel` and `reduced_size` will be updated in the loop below.
+  // Before the loop, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset`
+  // indicates which region we should use next.
+  // We reduce with keepdim=true and squeeze at the end if needed.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
   Tensor t = self;
-  int64_t last_reduction = dims_.size()-1;
+  int64_t last_reduction = dims.size() - 1;
   int64_t num_reduction = 0;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim]) {
-      if (num_reduction < last_reduction) {
-        t = reduce_1(t, dim, keepdim);
-      } else {
-        reduce_1_out(result, t, dim, keepdim);
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    if (num_reduction < last_reduction) {
+      t = reduce_1_out(res, t, dim, true);
+    } else {
+      reduce_1_out(result, t, dim, true);
+    }
+    // Switch to the other buffer region: this alternates `offset` between
+    // 0 and max_reduced_numel.
+    offset = max_reduced_numel - offset;
+    num_reduction++;
+  }
+  // squeeze if needed (use in-place squeeze_)
+  if (!keepdim) {
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = ndims - 1; dim >= 0; dim--) {
+      if (reduce_dims[dim]) {
+        result.squeeze_(dim);
       }
-      num_reduction++;
     }
   }
   return result;
 }
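
Aside: a minimal standalone sketch (plain C++, no ATen, not part of this patch) replaying the two-region buffer bookkeeping that both helpers above use, on the [4, 3, 2] example from the earlier comment. The variable names mirror the patch; the printed output is purely illustrative.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Shape [4, 3, 2], reduced over dims {0, 1, 2} (already in decreasing-size order).
  std::vector<std::int64_t> size = {4, 3, 2};
  std::vector<std::int64_t> dims = {0, 1, 2};
  std::int64_t reduced_numel = 4 * 3 * 2;                                       // 24
  std::int64_t max_reduced_numel = reduced_numel / size[dims[0]];               // 6
  std::int64_t buffer_size =
      max_reduced_numel + max_reduced_numel / size[dims[1]];                    // 6 + 2 = 8
  std::printf("buffer size: %lld\n", (long long)buffer_size);
  std::int64_t offset = 0;
  for (std::int64_t dim : dims) {
    reduced_numel /= size[dim];
    std::printf("reduce dim %lld: write %lld element(s) at offset %lld\n",
                (long long)dim, (long long)reduced_numel, (long long)offset);
    offset = max_reduced_numel - offset;  // ping-pong between offsets 0 and 6
  }
  // Prints: 6 elements at offset 0, then 2 at offset 6, then 1 at offset 0,
  // matching the buffer diagram in the comment above reduce_multi_associative.
  return 0;
}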
 
-
 Tensor& _sum_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
   if (self.is_cuda()) {
     return at::_sum_cuda_out(result, self, dim, keepdim);
-  }
-  else {
+  } else {
     return _sum_out_cpu(result, self, dim, keepdim);
   }
 }
 
 Tensor _sum(const Tensor &self, IntList dims, bool keepdim) {
-  return reduce_multi_associative<_sum>(self, dims, keepdim);
+  return reduce_multi_associative<_sum, _sum_out>(self, dims, keepdim);
 }
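
Aside: a hypothetical sketch of how another associative reduction could plug into the same helper, mirroring the _sum instantiation above. It assumes single-dimension kernels _prod and _prod_out with the same signatures as _sum and _sum_out; that is an assumption for illustration, not something this patch adds.

Tensor _prod(const Tensor &self, IntList dims, bool keepdim) {
  // Assumed to exist: Tensor _prod(const Tensor &, int64_t, bool) and
  // Tensor& _prod_out(Tensor &, const Tensor &, int64_t, bool).
  return reduce_multi_associative<_prod, _prod_out>(self, dims, keepdim);
}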
 
 Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim)