Commit d7487bf

ssnl authored and facebook-github-bot committed
Speed-up multidim sum (#8992)
Summary:

1. Instead of using the non-`_out` variant, allocate a buffer and use the `_out` variant to write the intermediate results into the buffer.
2. Reduce dimensions in order of decreasing size.

Benchmark: sum a randn tensor of shape `[200, 1, 30, 40, 20, 1, 50]` along dimensions `[4, 6, 3, 0, 2, 5]`, averaged across 1000 runs:

```
before patch:
    CPU:  0.0441 s
    CUDA: 0.0273 s

after patch:
    CPU:  0.0234 s
    CUDA: 0.0047 s
```

Closes #8992

Differential Revision: D8681069

Pulled By: SsnL

fbshipit-source-id: 2c5d5af5c5a284f2e945181f2b24ee8c78becd50
1 parent 9ce1517 commit d7487bf
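
For reference, a minimal sketch of how the benchmark described in the summary could be reproduced from C++ through the public ATen API. The timing harness, the warm-up call, and the use of `at::randn`/`at::sum` here are assumptions of this sketch (CPU only; CUDA timing would additionally need device synchronization), not part of the commit.

```cpp
// Hypothetical benchmark sketch (not part of this commit): time the multi-dim
// sum from the commit message on CPU. Shape and dims come from the summary.
#include <ATen/ATen.h>
#include <chrono>
#include <iostream>
#include <vector>

int main() {
  auto t = at::randn({200, 1, 30, 40, 20, 1, 50});
  std::vector<int64_t> dims = {4, 6, 3, 0, 2, 5};

  at::sum(t, dims, /*keepdim=*/false);  // warm-up
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < 1000; ++i) {
    auto s = at::sum(t, dims, /*keepdim=*/false);
  }
  auto end = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration<double>(end - start).count() / 1000
            << " s per call (CPU)\n";
  return 0;
}
```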

File tree: 2 files changed (+188, -22 lines)


aten/src/ATen/WrapDimUtils.h: 16 additions, 0 deletions

```diff
@@ -45,6 +45,22 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector<std::vector<
   return maybe_wrap_dim(dim, tensor_sizes[0].size());
 }
 
+// wrap each dim in dims based on dim_post_expr
+static inline void maybe_wrap_dims(std::vector<int64_t>& dims, int64_t dim_post_expr) {
+  if (dim_post_expr <= 0) {
+    dim_post_expr = 1; // this will make range [-1, 0]
+  }
+  int64_t min = -dim_post_expr;
+  int64_t max = dim_post_expr - 1;
+  for (auto& dim : dims) {
+    AT_CHECK(
+        dim >= min && dim <= max,
+        "Dimension out of range (expected to be in range of [",
+        min, ", ", max, "], but got ", dim, ")");
+    if (dim < 0) dim += dim_post_expr;
+  }
+}
+
 // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
 // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
 // to be "skipped" (both for wrap dimension behavior and dimension size checking).
```
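
The new helper applies the single-dim `maybe_wrap_dim` rule to a whole list of dims. Below is a standalone sketch (plain C++, no ATen; `wrap_dims` and the exception type are illustrative stand-ins) of the wrapping rule it enforces: for an n-dimensional tensor, valid dims lie in `[-n, n-1]`, and a negative dim `d` maps to `d + n`.

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Illustration of the rule enforced by maybe_wrap_dims (not the ATen code itself).
void wrap_dims(std::vector<int64_t>& dims, int64_t ndim) {
  if (ndim <= 0) ndim = 1;  // scalar case: only accept the range [-1, 0]
  for (auto& dim : dims) {
    if (dim < -ndim || dim > ndim - 1) {
      throw std::out_of_range("Dimension out of range");
    }
    if (dim < 0) dim += ndim;  // wrap negative dims
  }
}

int main() {
  std::vector<int64_t> dims = {-1, -3, 0};  // for a 7-d tensor
  wrap_dims(dims, 7);
  for (auto d : dims) std::cout << d << ' ';  // prints: 6 4 0
  std::cout << '\n';
  return 0;
}
```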

aten/src/ATen/native/ReduceOps.cpp: 172 additions, 22 deletions

```diff
@@ -10,7 +10,6 @@
 #include <functional>
 #include <numeric>
 #include <vector>
-
 #include <map>
 
 namespace at {
@@ -377,7 +376,89 @@ Tensor logsumexp(const Tensor &self, int64_t dim_, bool keepdim) {
 
 // MULTI DIM REDUCE ###########################################################
 
-template <Tensor (reduce_1)(const Tensor &, int64_t, bool)>
+// NB: this applies two optimizations:
+//   1. Reducing the dimensions in order of decreasing size, so that the
+//      larger dimensions are dealt with earlier and we work with fewer elements
+//      overall.
+//      E.g., reducing a tensor of shape [1, 10, 200] over dimensions {0, 1, 2}.
+//      If we reduce in the order [0, 1, 2], the input and output
+//      shapes of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [10, 200] (2000 elem)
+//        it 1:    [10, 200] (2000 elem) =>     [200] ( 200 elem)
+//        it 2:        [200] ( 200 elem) =>       [1] (   1 elem)
+//      Since we need to iterate through all input elements at each
+//      iteration, the total number of elements traversed is 4200.
+//      If we reduce in the order [2, 1, 0], i.e., with decreasing
+//      size, the input and output shapes of the iterations are:
+//        it 0: [1, 10, 200] (2000 elem) => [1, 10] (10 elem)
+//        it 1:      [1, 10] (  10 elem) =>     [1] ( 1 elem)
+//        it 2:          [1] (   1 elem) =>     [1] ( 1 elem)
+//      The total number of elements traversed is 2011, much less than 4200.
+//   2. Preallocated buffer.
+//      Utilizing the `_out` variant, instead of allocating new output tensors
+//      at each iteration, we can use a preallocated buffer. Since the output
+//      numel decreases at each iteration, we can reuse the buffer throughout
+//      the loop.
+//      Note that we need two buffers, one containing the input, i.e., the output
+//      from the previous iteration, and one containing the output of this
+//      iteration.
+//      The largest output size is the output size of the first iteration. After
+//      that, the largest size we need is the output size of the second
+//      iteration.
+//      So we allocate
+//        1. a region of size `input.numel() / input.size(reduced_dims[0])`, and
+//        2. a region of size `input.numel() / (input.size(reduced_dims[0]) * input.size(reduced_dims[1]))`.
+//      These two regions are allocated together as a contiguous flattened
+//      buffer tensor, with a variable `offset` indicating the starting position
+//      of the output region for the current iteration.
+//      E.g., reducing a tensor of shape [4, 3, 2] over dimensions {0, 1, 2}.
+//      Say we reduce in the order [0, 1, 2].
+//      The first buffer region has size `4 * 3 * 2 / 4 = 6`.
+//      The second buffer region has size `4 * 3 * 2 / (4 * 3) = 2`.
+//      So we allocate a tensor of size `6 + 2 = 8`:
+//                buffer: [ _, _, _, _, _, _, _, _]
+//        buffer region 1-->^^^^^^^^^^^^^^^^  ^^^^<--buffer region 2
+//      1st iteration:
+//        (before reduction)
+//          input:        self (or input)
+//          input shape:  [ 4, 3, 2]
+//          output shape: [    3, 2]
+//          buffer:       [ _, _, _, _, _, _, _, _]
+//          offset:         ^--beginning of 1st buffer region, i.e., the
+//                             starting output location of the 1st iteration.
+//        (after reduction)
+//          buffer:       [ {output of 1st it}, _, _]
+//
+//      2nd iteration:
+//        (before reduction)
+//          input:        output of 1st it
+//          input shape:  [ 3, 2]
+//          output shape: [    2]
+//          buffer:       [ {output of 1st it}, _, _]
+//          offset:                              ^--beginning of 2nd
+//                                                  buffer region. We can't
+//                                                  overwrite the 1st region
+//                                                  as it contains input to
+//                                                  this iteration's reduction.
+//        (after reduction)
+//          buffer:       [ {output of 1st it}, {output of 2nd it}]
+//
+//      3rd iteration:
+//        (before reduction)
+//          input:        output of 2nd it
+//          input shape:  [ 2]
+//          output shape: [ 1]
+//          buffer:       [ {output of 1st it}, {output of 2nd it}]
+//          offset:         ^--beginning of 1st buffer region. We can
+//                             safely overwrite it now.
+//        (after reduction)
+//          buffer:       [ {output of 3rd it}, {output of 2nd it}]
+//      Return {output of 3rd it}.
+//
+// TODO: If two or more reduced dimensions are contiguous, reduce as if they are
+//       one large dimension.
+template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1(self, dims_[0], keepdim);
@@ -386,51 +467,120 @@ inline Tensor reduce_multi_associative(const Tensor &self, IntList dims_, bool k
     return self;
   }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
-  Tensor result = self;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim])
-      result = reduce_1(result, dim, keepdim);
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the larger dimensions first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset` will
+  // indicate which region we should use next.
+  // Reduce with keepdim=true; we will squeeze later.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
+  Tensor t = self;
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    t = reduce_1_out(res, t, dim, true);
+    // switch to the other buffer region;
+    // this alternates `offset` between 0 and max_reduced_numel
+    offset = max_reduced_numel - offset;
   }
-  return result;
+  // squeeze if needed
+  if (!keepdim) {
+    std::vector<int64_t> squeezed_shape;
+    squeezed_shape.reserve(ndims - dims.size());
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = 0; dim < ndims; dim++) {
+      if (!reduce_dims[dim]) {
+        squeezed_shape.emplace_back(reduced_size[dim]);
+      }
+    }
+    return t.view(squeezed_shape);
+  }
+  return t;
 }
 
+// See comments above reduce_multi_associative for details.
 template <Tensor (reduce_1)(const Tensor &, int64_t, bool),
-         Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
+          Tensor& (reduce_1_out)(Tensor& result, const Tensor &, int64_t, bool)>
 inline Tensor& reduce_multi_associative_out(Tensor &result, const Tensor &self, IntList dims_, bool keepdim) {
   if (dims_.size() == 1) {
     return reduce_1_out(result, self, dims_[0], keepdim);
   }
+  if (dims_.size() == 0) {
+    // with empty dims_, reduce_out behaves like clone_out
+    return result.resize_as_(self).copy_(self);
+  }
   int64_t ndims = self.dim();
-  auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+  // `reduced_numel` and `reduced_size` will be updated in the loop.
+  // Before that, they are just the numel and sizes of `self`.
+  int64_t reduced_numel = self.numel();
+  auto reduced_size = self.sizes().vec();
+  auto dims = dims_.vec();
+  maybe_wrap_dims(dims, ndims);
+  // Sort the reduced dimensions so that we reduce the largest dimension first.
+  std::sort(dims.begin(), dims.end(),
+            [&](int64_t i, int64_t j){ return reduced_size[i] > reduced_size[j]; });
+  // Calculate the 1st buffer region size and the total buffer size.
+  int64_t max_reduced_numel = reduced_numel / reduced_size[dims[0]];
+  int64_t buffer_size = max_reduced_numel + max_reduced_numel / reduced_size[dims[1]];
+  // We separate `buffer` into two regions, one starting at 0, and another
+  // starting at max_reduced_numel. These two regions are used alternately as
+  // the output of a `reduce_1` along a particular dimension. `offset` will
+  // indicate which region we should use next.
+  // Reduce with keepdim=true; we will squeeze later.
+  auto buffer = at::empty({buffer_size}, self.options());
+  int64_t offset = 0;
   Tensor t = self;
-  int64_t last_reduction = dims_.size()-1;
+  int64_t last_reduction = dims.size() - 1;
   int64_t num_reduction = 0;
-  for (int64_t dim = ndims-1; dim >= 0; dim--) {
-    if (reduce_dims[dim]) {
-      if (num_reduction < last_reduction) {
-        t = reduce_1(t, dim, keepdim);
-      } else {
-        reduce_1_out(result, t, dim, keepdim);
+  for (auto& dim : dims) {
+    reduced_numel /= reduced_size[dim];
+    reduced_size[dim] = 1;
+    auto res = buffer.narrow(0, offset, reduced_numel).view(reduced_size);
+    if (num_reduction < last_reduction) {
+      t = reduce_1_out(res, t, dim, true);
+    } else {
+      reduce_1_out(result, t, dim, true);
+    }
+    // switch to the other buffer region;
+    // this alternates `offset` between 0 and max_reduced_numel
+    offset = max_reduced_numel - offset;
+    num_reduction++;
+  }
+  // squeeze if needed (use in-place squeeze_)
+  if (!keepdim) {
+    auto reduce_dims = dim_list_to_bitset(dims_, ndims);
+    for (int64_t dim = ndims - 1; dim >= 0; dim--) {
+      if (reduce_dims[dim]) {
+        result.squeeze_(dim);
       }
-      num_reduction++;
     }
   }
   return result;
 }
 
-
 Tensor& _sum_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
   if (self.is_cuda()) {
     return at::_sum_cuda_out(result, self, dim, keepdim);
-  }
-  else {
+  } else {
     return _sum_out_cpu(result, self, dim, keepdim);
   }
 }
 
 Tensor _sum(const Tensor &self, IntList dims, bool keepdim) {
-  return reduce_multi_associative<_sum>(self, dims, keepdim);
+  return reduce_multi_associative<_sum, _sum_out>(self, dims, keepdim);
 }
 
 Tensor& _sum_out(Tensor &result, const Tensor &self, IntList dims, bool keepdim)
```
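
To make the two optimizations in `reduce_multi_associative` concrete, here is a standalone sketch (plain C++, no ATen; the variable names and printed trace are illustrative only) of the planning arithmetic the new code performs: sort the reduced dims by decreasing size, compute the sizes of the two ping-pong buffer regions, and walk through the per-iteration output sizes for the shape used in the benchmark.

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Shape and reduced dims from the commit message's benchmark.
  std::vector<int64_t> size = {200, 1, 30, 40, 20, 1, 50};
  std::vector<int64_t> dims = {4, 6, 3, 0, 2, 5};  // already wrapped

  int64_t numel = std::accumulate(size.begin(), size.end(), int64_t(1),
                                  std::multiplies<int64_t>());

  // Optimization 1: reduce larger dimensions first.
  std::sort(dims.begin(), dims.end(),
            [&](int64_t i, int64_t j) { return size[i] > size[j]; });

  // Optimization 2: one flat buffer split into two regions used alternately.
  int64_t max_reduced_numel = numel / size[dims[0]];
  int64_t buffer_size = max_reduced_numel + max_reduced_numel / size[dims[1]];
  std::cout << "buffer elements: " << buffer_size << '\n';

  int64_t offset = 0;
  for (auto dim : dims) {
    numel /= size[dim];
    size[dim] = 1;
    std::cout << "reduce dim " << dim << " -> output numel " << numel
              << ", written at buffer offset " << offset << '\n';
    offset = max_reduced_numel - offset;  // flip between the two regions
  }
  return 0;
}
```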
