@@ -376,6 +376,84 @@ Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) {
 
 // MULTI DIM REDUCE ###########################################################
 
+// NB: this applies two optimizations:
+//   1. Reducing the dimensions in order of decreasing size, so that the
+//      larger dimensions are handled earlier and we work with fewer
+//      elements overall.
+//      E.g., reducing a tensor of shape [1, 10, 200] over dimensions {0, 1, 2}.
+//      If we reduce in the order [0, 1, 2], the input and output shapes
+//      of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem)
+//        it 1:    [10, 200] (2000 elem) =>    [ 200] ( 200 elem)
+//        it 2:       [ 200] ( 200 elem) =>    [   1] (   1 elem)
+//      Since each iteration traverses all of its input elements, the
+//      total number of elements traversed is 4200.
+//      If we instead reduce in the order [2, 1, 0], i.e., by decreasing
+//      size, the input and output shapes of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem)
+//        it 1:      [1, 10] (  10 elem) =>    [ 1] ( 1 elem)
+//        it 2:         [ 1] (   1 elem) =>    [ 1] ( 1 elem)
+//      The total number of elements traversed is 2011, much less than 4200 (see the sketch after this hunk).
+//   2. Reusing a preallocated buffer.
+//      With the `_out` variant, instead of allocating new output tensors
+//      at each iteration, we can write into a preallocated buffer. Since
+//      the output numel shrinks at each iteration, we can reuse the buffer
+//      throughout the loop.
+//      Note that we need two buffer regions: one holding the input (i.e.,
+//      the output from the previous iteration), and one holding the output
+//      of the current iteration.
+//      The largest output is that of the first iteration. After that, the
+//      largest size we ever need is the output size of the second
+//      iteration.
+//      So we allocate
+//        1. a region of size `input.numel() / input.size(reduced_dims[0])`, and
+//        2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`.
+//      These two regions are allocated together as one contiguous flat
+//      buffer tensor, with a variable `offset` indicating the starting
+//      position of the output region for the current iteration.
+//      E.g., reducing a tensor of shape [4, 3, 2] over dimensions {0, 1, 2}.
+//      Say we reduce in the order [0, 1, 2].
+//      The first buffer region has size `4 * 3 * 2 / 4 = 6`.
+//      The second buffer region has size `4 * 3 * 2 / (4 * 3) = 2`.
+//      So we allocate a buffer tensor of size `6 + 2 = 8`:
+//        buffer: [ _, _, _, _, _, _, _, _]
+//        buffer region 1-->^^^^^^^^^^^^^^^^ ^^^^<--buffer region 2
+//      1st iteration:
+//        (before reduction)
+//        input:        self (or input)
+//        input shape:  [ 4, 3, 2]
+//        output shape: [    3, 2]
+//        buffer: [ _, _, _, _, _, _, _, _]
+//        offset:   ^--beginning of the 1st buffer region, i.e., the
+//                     starting output location of the 1st iteration
+//        (after reduction)
+//        buffer: [ {output of 1st it}, _, _]
+//
+//      2nd iteration:
+//        (before reduction)
+//        input:        output of 1st it
+//        input shape:  [ 3, 2]
+//        output shape: [    2]
+//        buffer: [ {output of 1st it}, _, _]
+//        offset:                       ^--beginning of the 2nd
+//                                         buffer region. We can't
+//                                         overwrite the 1st region,
+//                                         as it holds the input to
+//                                         this iteration's reduction.
+//        (after reduction)
+//        buffer: [ {output of 1st it}, {output of 2nd it}]
+//
+//      3rd iteration:
+//        (before reduction)
+//        input:        output of 2nd it
+//        input shape:  [ 2]
+//        output shape: [ 1]
+//        buffer: [ {output of 1st it}, {output of 2nd it}]
+//        offset:   ^--beginning of the 1st buffer region, which we
+//                     can now safely overwrite
+//        (after reduction)
+//        buffer: [ {output of 3rd it}, {output of 2nd it}]
+//      Return {output of 3rd it}.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
           Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) {
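
As a sanity check on optimization 1 in the NB comment above, here is a minimal standalone C++ sketch, not part of this patch (`elements_traversed` is a made-up helper), that reproduces the element-count arithmetic, assuming each single-dimension reduction reads every element of its input exactly once:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Count elements read across a sequence of single-dim reductions,
// assuming each reduction reads every element of its input once.
int64_t elements_traversed(const std::vector<int64_t>& shape,
                           const std::vector<int64_t>& dims) {
  int64_t numel = 1;
  for (int64_t s : shape) numel *= s;
  int64_t total = 0;
  for (int64_t d : dims) {
    total += numel;     // this reduction reads its whole input
    numel /= shape[d];  // its output becomes the next input
  }
  return total;
}

int main() {
  std::vector<int64_t> shape = {1, 10, 200};
  std::cout << elements_traversed(shape, {0, 1, 2}) << "\n";  // prints 4200
  std::cout << elements_traversed(shape, {2, 1, 0}) << "\n";  // prints 2011
}
```
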
@@ -386,28 +464,33 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
     return self;
   }
   int64_t ndims = self.dim();
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before the loop, they are simply the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
   auto reduced_size = self.sizes().vec();
   auto dims = dims_.vec();
   maybe_wrap_dims(dims, ndims);
-  // Sort the reduced dimensions so that we reduce the largest dimension first.
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
   std::sort(dims.begin(), dims.end(),
             [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
-  int64_t reduced_numel = self.numel();
+  // Calculate the size of the 1st buffer region.
   int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
   int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
   // We separate `buffer` into two regions, one starting at 0, and another
   // starting at max_reduced_numel. These two regions are used alternately as
   // the output of a `reduce_1` along a particular dimension. `offset` will
   // indicate which region we should use next.
   // Have keepdim=true when reducing. We will squeeze later.
-  auto buffer = at::empty({buffer_size}, self.type());
+  auto buffer = at::empty({buffer_size}, self.options());
   int64_t offset = 0;
   Tensor t = self;
   for (auto& dim : dims) {
     reduced_numel /= reduced_size[dim];
     reduced_size[dim] = 1;
     auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
     t = reduce_1_out(res, t, dim, true);
+    // Switch to the other buffer region: this alternates `offset`
+    // between 0 and max_reduced_numel.
     offset = max_reduced_numel - offset;
   }
   // squeeze if needed
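
The double-buffer bookkeeping can also be seen in isolation. This standalone sketch (again not from the patch) replays the `[4, 3, 2]` example from the NB comment: `offset = max_reduced_numel - offset` ping-pongs the write cursor between the two regions of the single flat allocation:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> shape = {4, 3, 2};
  std::vector<int64_t> dims  = {0, 1, 2};  // already sorted by decreasing size
  int64_t reduced_numel = 4 * 3 * 2;                           // 24
  int64_t max_reduced_numel = reduced_numel / shape[dims[0]];  // 6
  int64_t buffer_size =
      max_reduced_numel + max_reduced_numel / shape[dims[1]];  // 6 + 2 = 8
  int64_t offset = 0;
  for (int64_t d : dims) {
    reduced_numel /= shape[d];
    std::cout << "write " << reduced_numel << " elems at offset " << offset
              << " of buffer[" << buffer_size << "]\n";
    offset = max_reduced_numel - offset;  // ping-pong: 0 -> 6 -> 0
  }
  // Prints offsets 0 (6 elems), 6 (2 elems), 0 (1 elem), matching the
  // three iterations illustrated in the comment.
}
```
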
@@ -425,31 +508,36 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
   return t;
 }
 
+// See the comments above reduce_multi_associative for details.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
           Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1_out(result, self, dims_[0], keepdim);
   }
   if (dims_.size() == 0) {
-    return result;
+    // With empty dims_, reduce_out should behave like clone_out.
+    return result.resize_as_(self).copy_(self);
   }
   int64_t ndims = self.dim();
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before the loop, they are simply the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
   auto reduced_size = self.sizes().vec();
   auto dims = dims_.vec();
   maybe_wrap_dims(dims, ndims);
   // Sort the reduced dimensions so that we reduce the larger dimensions first.
   std::sort(dims.begin(), dims.end(),
             [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
-  int64_t reduced_numel = self.numel();
+  // Calculate the size of the 1st buffer region.
   int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
   int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
   // We separate `buffer` into two regions, one starting at 0, and another
   // starting at max_reduced_numel. These two regions are used alternately as
   // the output of a `reduce_1` along a particular dimension. `offset` will
   // indicate which region we should use next.
   // Have keepdim=true when reducing. We will squeeze later.
-  auto buffer = at::empty({buffer_size}, self.type());
+  auto buffer = at::empty({buffer_size}, self.options());
   int64_t offset = 0;
   Tensor t = self;
   int64_t last_reduction = dims.size() - 1;
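
The empty-`dims_` branch above is a behavior change: the `_out` variant now acts as a clone into `result` instead of returning `result` untouched. A hedged sketch of that behavior, using hypothetical single-dim reducers `sum_1`/`sum_1_out` (made-up names with the signatures the template parameters expect; they are not shown in this diff):

```cpp
// Hypothetical single-dim sum reducers matching the template parameters;
// assumed for illustration only, not taken from this patch.
Tensor sum_1(const Tensor& self, int64_t dim, bool keepdim);
Tensor& sum_1_out(Tensor& result, const Tensor& self, int64_t dim, bool keepdim);

void empty_dims_demo(const Tensor& self) {
  Tensor out = at::empty({0}, self.options());
  // With an empty dims_ list, this now resizes `out` to self's shape and
  // copies self into it (clone_out semantics), instead of returning `out`
  // untouched as before this patch.
  reduce_multi_associative_out<sum_1, sum_1_out>(out, self, /*dims_=*/{}, /*keepdim=*/false);
}
```
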
@@ -463,6 +551,8 @@ inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self,
     } else {
       reduce_1_out(result, t, dim, true);
     }
+    // Switch to the other buffer region: this alternates `offset`
+    // between 0 and max_reduced_numel.
     offset = max_reduced_numel - offset;
     num_reduction++;
   }
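
The diff does not show the call sites, but reusing the hypothetical `sum_1`/`sum_1_out` from the previous sketch, the multi-dim variants could plausibly be instantiated like this:

```cpp
// Plausible wiring of the templates to concrete multi-dim reductions
// (an assumption for illustration; not part of this patch).
Tensor sum_multi(const Tensor& self, IntList dims, bool keepdim) {
  return reduce_multi_associative<sum_1, sum_1_out>(self, dims, keepdim);
}

Tensor& sum_multi_out(Tensor& result, const Tensor& self, IntList dims, bool keepdim) {
  return reduce_multi_associative_out<sum_1, sum_1_out>(result, self, dims, keepdim);
}
```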