Commit fe2d9ec

New MaxPool1d without indices implementation
ghstack-source-id: f25c08c
Pull Request resolved: #43745
1 parent: 9063bce
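For orientation, this is the call pattern the change targets; a minimal sketch (not part of the commit) using the at::max_pool1d signature added below, which takes the new index-free CPU kernel when the input lives on CPU and does not require grad:

#include <ATen/ATen.h>

int main() {
  // CPU input that does not require grad: handled by the new
  // max_pool1d_impl / max_pool1d_stub path added in this commit.
  at::Tensor x = at::randn({2, 4, 10});
  at::Tensor y = at::max_pool1d(
      x,
      /*kernel_size=*/{3},
      /*stride=*/{2},
      /*padding=*/{1},
      /*dilation=*/{1},
      /*ceil_mode=*/false);
  // Inputs that require grad, or that live on CUDA, still go through
  // at::max_pool1d_with_indices (see max_pool1d in the first file below).
  return y.size(-1) == 5 ? 0 : 1;  // output width 5 for these parameters
}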

File tree

5 files changed: +269 −15 lines

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
#include <ATen/ATen.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/Parallel.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/MaxPooling.h>

namespace at {
namespace native {

DEFINE_DISPATCH(max_pool1d_stub);

namespace {

// Compute the output size for the given pooling parameters
inline int64_t output_size(
    int64_t input_size,
    int64_t kernel_size,
    int64_t stride,
    int64_t padding,
    int64_t dilation,
    bool ceil_mode) {
  int64_t num = input_size + 2 * padding - dilation * (kernel_size - 1) - 1;
  // Ensure last kernel window starts within bounds in ceil mode
  if (ceil_mode && stride - dilation * (kernel_size - 1) <= num % stride) {
    return (num + stride - 1) / stride + 1;
  }
  return num / stride + 1;
}

Tensor max_pool1d_impl(
    const Tensor& self,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {
  NoNamesGuard guard;

  TORCH_CHECK(
      self.dim() == 2 || self.dim() == 3,
      "max_pool1d() input tensor must have 2 or 3 dimensions but got ",
      self.dim());
  TORCH_CHECK(
      kernel_size.size() == 1,
      "max_pool1d() kernel_size must be an int or int list of size 1 but got size ",
      kernel_size.size());
  TORCH_CHECK(
      stride.size() == 0 || stride.size() == 1,
      "max_pool1d() stride must be None, an int or int list of size 1 but got size ",
      stride.size());
  TORCH_CHECK(
      padding.size() == 1,
      "max_pool1d() padding must be an int or int list of size 1 but got size ",
      padding.size());
  TORCH_CHECK(
      dilation.size() == 1,
      "max_pool1d() dilation must be an int or int list of size 1 but got size ",
      dilation.size());

  // If stride=None then set it to kernel_size
  if (stride.empty()) {
    stride = kernel_size;
  }

  const int64_t NB = self.dim() == 3 ? self.size(-3) : 1;
  const int64_t NC = self.size(-2);
  const int64_t IW = self.size(-1);
  const int64_t KW = kernel_size[0];
  const int64_t SJ = stride[0];
  const int64_t PJ = padding[0];
  const int64_t DJ = dilation[0];

  TORCH_CHECK(
      KW > 0,
      "max_pool1d() kernel_size must be greater than zero, but got ",
      KW);
  TORCH_CHECK(
      SJ > 0, "max_pool1d() stride must be greater than zero, but got ", SJ);
  TORCH_CHECK(
      PJ >= 0, "max_pool1d() padding must be non-negative, but got ", PJ);
  TORCH_CHECK(
      PJ <= KW / 2,
      "max_pool1d() padding should be at most half of kernel size, but got padding=",
      PJ,
      " and kernel_size=",
      KW);
  TORCH_CHECK(
      DJ > 0, "max_pool1d() dilation must be greater than zero, but got ", DJ);

  const int64_t OW = output_size(IW, KW, SJ, PJ, DJ, ceil_mode);
  TORCH_CHECK(OW >= 0, "max_pool1d() Invalid computed output size: ", OW);
  Tensor output = at::empty({NB, NC, OW}, self.options());

  PoolingParams1D params{NB, NC, IW, OW, KW, SJ, PJ, DJ};
  max_pool1d_stub(self.device().type(), output, self, params);

  if (self.dim() == 2) {
    output.squeeze_(0);
  }

  guard.reset();
  namedinference::propagate_names(output, self);

  return output;
}

} // namespace

Tensor max_pool1d(
    const Tensor& self,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {
  if (self.requires_grad() || !self.device().is_cpu()) {
    // Needs indices for grad and with_indices defines CUDA dispatch
    return std::get<0>(at::max_pool1d_with_indices(
        self, kernel_size, stride, padding, dilation, ceil_mode));
  }
  return max_pool1d_impl(
      self, kernel_size, stride, padding, dilation, ceil_mode);
}

} // namespace native
} // namespace at
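The output_size arithmetic above can be checked by hand against the new corner-case and error tests further down. A standalone sketch (not part of the commit) that reuses the same formula:

#include <cstdint>
#include <iostream>

// Copy of the output_size arithmetic from max_pool1d_impl above.
int64_t output_size(int64_t input_size, int64_t kernel_size, int64_t stride,
                    int64_t padding, int64_t dilation, bool ceil_mode) {
  int64_t num = input_size + 2 * padding - dilation * (kernel_size - 1) - 1;
  if (ceil_mode && stride - dilation * (kernel_size - 1) <= num % stride) {
    return (num + stride - 1) / stride + 1;
  }
  return num / stride + 1;
}

int main() {
  // Matches test_max_pool1d_corner_cases: input [[1, 2]] with
  // kernel=2, stride=1, padding=1, dilation=2 -> two outputs.
  std::cout << output_size(2, 2, 1, 1, 2, false) << "\n";  // prints 2
  // Same input with stride=2 and ceil_mode=true also yields two outputs,
  // because the extra window still starts within bounds.
  std::cout << output_size(2, 2, 2, 1, 2, true) << "\n";   // prints 2
  // Matches test_max_pool1d_errors: width-0 input with kernel=5 gives
  // the negative size reported as "Invalid computed output size: -4".
  std::cout << output_size(0, 5, 1, 0, 1, false) << "\n";  // prints -4
}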

aten/src/ATen/native/MaxPooling.h

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
#pragma once

#include <ATen/ATen.h>
#include <ATen/native/DispatchStub.h>

namespace at {
namespace native {

// TODO(Heitor) Template by dimension
struct PoolingParams1D {
  int64_t NB; // Number of batches
  int64_t NC; // Number of channels
  int64_t IW; // Input width
  int64_t OW; // Output width
  int64_t KW; // Kernel width
  int64_t SJ; // Column stride
  int64_t PJ; // Column padding
  int64_t DJ; // Column dilation

  // Return index of first output within bounds for this kernel index
  inline int64_t valid_kernel_start(int64_t kj) const {
    int64_t ij = kj * DJ - PJ;
    return ij < 0 ? (-ij + SJ - 1) / SJ : 0;
  }

  // Return index one past last output within bounds for this kernel index
  inline int64_t valid_kernel_end(int64_t kj) const {
    int64_t ij = (OW - 1) * SJ + kj * DJ - PJ;
    return ij >= IW ? OW - (ij - IW + SJ) / SJ : OW;
  }
};

using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&);

DECLARE_DISPATCH(pooling_fn, max_pool1d_stub);

} // namespace native
} // namespace at
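To make the two index helpers concrete, here is a trimmed copy of the struct (illustration only, not part of the commit; NB/NC dropped), evaluated on the [[1, 2]] corner case exercised in test_nn.py. Each kernel offset kj maps to a contiguous range of output positions whose input reads stay inside the unpadded input:

#include <cstdint>
#include <iostream>

// Trimmed copy of the helpers above, outside of ATen for illustration.
struct Params {
  int64_t IW, OW, KW, SJ, PJ, DJ;
  int64_t valid_kernel_start(int64_t kj) const {
    int64_t ij = kj * DJ - PJ;
    return ij < 0 ? (-ij + SJ - 1) / SJ : 0;
  }
  int64_t valid_kernel_end(int64_t kj) const {
    int64_t ij = (OW - 1) * SJ + kj * DJ - PJ;
    return ij >= IW ? OW - (ij - IW + SJ) / SJ : OW;
  }
};

int main() {
  // Input width 2, output width 2, kernel 2, stride 1, padding 1, dilation 2
  // (the [[1, 2]] -> [[2, 1]] corner case from test_nn.py below).
  Params p{/*IW=*/2, /*OW=*/2, /*KW=*/2, /*SJ=*/1, /*PJ=*/1, /*DJ=*/2};
  for (int64_t kj = 0; kj < p.KW; ++kj) {
    std::cout << "kj=" << kj
              << " writes outputs [" << p.valid_kernel_start(kj)
              << ", " << p.valid_kernel_end(kj) << ")\n";
  }
  // kj=0 writes outputs [1, 2): output 0 would read input index -1 (padding).
  // kj=1 writes outputs [0, 1): output 1 would read input index 2 (past the end).
}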

aten/src/ATen/native/Pooling.cpp

Lines changed: 0 additions & 12 deletions
@@ -107,18 +107,6 @@ Tensor avg_pool1d(
   return output.squeeze(2);
 }
 
-Tensor max_pool1d(
-    const Tensor& self,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    IntArrayRef dilation,
-    bool ceil_mode) {
-  auto output_and_indices = at::max_pool1d_with_indices(
-      self, kernel_size, stride, padding, dilation, ceil_mode);
-  return std::get<0>(output_and_indices);
-}
-
 Tensor max_pool2d(
     const Tensor& self,
     IntArrayRef kernel_size,
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/cpu/vec256/vec256.h>
#include <ATen/native/MaxPooling.h>

namespace at {
namespace native {

namespace {

template <typename scalar_t>
inline void max_pool1d_kernel(
    scalar_t* op,
    const scalar_t* ip,
    const PoolingParams1D& p) {
  for (int64_t kj = 0; kj < p.KW; ++kj) {
    int64_t oj = p.valid_kernel_start(kj);
    int64_t oe = p.valid_kernel_end(kj);
    int64_t ij = oj * p.SJ + kj * p.DJ - p.PJ;
    for (; oj < oe; ++oj, ij += p.SJ) {
      bool update_max = std::isnan(ip[ij]) || op[oj] < ip[ij];
      op[oj] = update_max ? ip[ij] : op[oj];
    }
  }
}

void max_pool1d_impl(
    Tensor& output,
    const Tensor& input,
    const PoolingParams1D& p) {
  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "max_pool1d_impl", [&] {
    scalar_t* const OP = output.data_ptr<scalar_t>();
    const scalar_t* const IP = input.contiguous().data_ptr<scalar_t>();

    // Value used for padding
    constexpr scalar_t FILL = std::numeric_limits<scalar_t>::has_infinity
        ? -std::numeric_limits<scalar_t>::infinity()
        : std::numeric_limits<scalar_t>::lowest();

    at::parallel_for(0, p.NB * p.NC, 0, [&](int64_t begin, int64_t end) {
      for (int64_t it = begin; it < end; ++it) {
        scalar_t* op = OP + it * p.OW;
        const scalar_t* ip = IP + it * p.IW;
        std::fill_n(op, p.OW, FILL);
        max_pool1d_kernel(op, ip, p);
      }
    });
  });
}

} // namespace

REGISTER_DISPATCH(max_pool1d_stub, &max_pool1d_impl);

} // namespace native
} // namespace at
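Putting the pieces together, a standalone replay (not part of the commit) of the fill-then-max loop above for a single channel reproduces the [[1, 2]] -> [[2, 1]] result asserted in test_max_pool1d_corner_cases below:

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Stand-alone replay of the inner loop above for one channel:
// input [1, 2], kernel 2, stride 1, padding 1, dilation 2 -> output [2, 1].
int main() {
  const int64_t IW = 2, OW = 2, KW = 2, SJ = 1, PJ = 1, DJ = 2;
  const std::vector<float> ip = {1.0f, 2.0f};
  // Output starts filled with -inf, the value used for padded positions.
  std::vector<float> op(OW, -std::numeric_limits<float>::infinity());

  for (int64_t kj = 0; kj < KW; ++kj) {
    // valid_kernel_start / valid_kernel_end, inlined for this example.
    int64_t ij0 = kj * DJ - PJ;
    int64_t oj = ij0 < 0 ? (-ij0 + SJ - 1) / SJ : 0;
    int64_t ije = (OW - 1) * SJ + kj * DJ - PJ;
    int64_t oe = ije >= IW ? OW - (ije - IW + SJ) / SJ : OW;
    int64_t ij = oj * SJ + kj * DJ - PJ;
    for (; oj < oe; ++oj, ij += SJ) {
      if (std::isnan(ip[ij]) || op[oj] < ip[ij]) {
        op[oj] = ip[ij];
      }
    }
  }
  std::cout << op[0] << " " << op[1] << "\n";  // prints "2 1"
}

Because each kernel offset writes a contiguous, increasing range of output positions, the inner loop needs no bounds checks and the padding is never materialized; the std::isnan branch lets NaN inputs overwrite the running maximum, matching the behaviour exercised by test_max_pool_nan_inf.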

test/test_nn.py

Lines changed: 49 additions & 3 deletions
@@ -42,7 +42,7 @@
     module_tests, criterion_tests, new_criterion_tests, loss_reference_fns, \
     ctcloss_reference, new_module_tests
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
-    dtypesIfCUDA, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, \
+    dtypesIfCUDA, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
     skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, largeCUDATensorTest, onlyOnCPUAndCUDA, \
     deviceCountAtLeast, expectedAlertNondeterministic, largeTensorTest
 from torch.nn import MultiheadAttention
@@ -9810,6 +9810,41 @@ def helper(n, c, h, w, kernel_size, stride=None,
         helper(10, 512, 31, 31, 3, stride=2)
         helper(1, 129, 8, 8, 3, stride=2)
 
+    @onlyCPU
+    @dtypes(torch.float)
+    def test_max_pool1d_errors(self, device, dtype):
+        def check(x, args, message):
+            model = torch.nn.MaxPool1d(*args)
+            with self.assertRaisesRegex(RuntimeError, r'max_pool1d\(\) ' + message):
+                model(torch.tensor(x, device=device, dtype=dtype))
+
+        # Pooling args: (kernel_size, stride, padding, dilation, return_indices, ceil_mode)
+        check(0, (1,), "input tensor must have 2 or 3 dimensions but got 0")
+        check([], (1,), "input tensor must have 2 or 3 dimensions but got 1")
+        check([[]], (1, 0), "stride must be greater than zero, but got 0")
+        check([[]], (1, 1, -1), "padding must be non-negative, but got -1")
+        check([[]], (1, 1, 2), "padding should be at most half of kernel size, but got padding=2 and kernel_size=1")
+        check([[]], (1, 1, 0, 0), "dilation must be greater than zero, but got 0")
+        check([[]], (5, 1, 0, 1), "Invalid computed output size: -4")
+
+    @onlyCPU
+    @dtypes(torch.float, torch.double)
+    def test_max_pool1d_corner_cases(self, device, dtype):
+        def check(x, args, expected):
+            model = torch.nn.MaxPool1d(*args)
+            tensor = torch.tensor(x, device=device, dtype=dtype)
+            self.assertEqual(model(tensor), torch.tensor(expected, device=device, dtype=dtype))
+
+        # Pooling args: (kernel_size, stride, padding, dilation, return_indices, ceil_mode)
+        check([[]], (1, None, 0, 1, False, False), [[]])
+        check([[[]]], (1, None, 0, 1, False, False), [[[]]])
+        check([[[]]], (2, 1, 1, 2, False, True), [[[]]])
+        check([[1]], (1, None, 0, 1, False, False), [[1]])
+        check([[1]], (2, None, 1, 2, False, False), [[float('-inf')]])
+        check([[1], [1]], (2, None, 1, 2, False, False), [[float('-inf')], [float('-inf')]])
+        check([[1, 2]], (2, 1, 1, 2, False, False), [[2, 1]])
+        check([[1, 2]], (2, 2, 1, 2, False, True), [[2, 2]])
+
     @onlyCUDA
     def test_max_pool2d(self, device):
         def helper(n, c, h, w, ks):
@@ -11328,15 +11363,22 @@ def test_max_pool_nan_inf(self, device, dtype):
             for num_dim in [1, 2, 3]:
                 fn_name = '{}max_pool{}d'.format(adaptive, num_dim)
                 fn = getattr(F, fn_name)
+
                 x = torch.full([1, 1] + num_dim * [3], nan, device=device, dtype=dtype, requires_grad=True)
                 res = fn(x, 1 if adaptive else 3)
                 res.backward(torch.randn_like(res))
                 self.assertTrue(math.isnan(res.item()))
+                x.requires_grad_(False)
+                res = fn(x, 1 if adaptive else 3)
+                self.assertTrue(math.isnan(res.item()))
 
                 x2 = torch.full([1, 1] + num_dim * [3], -inf, device=device, dtype=dtype, requires_grad=True)
                 res2 = fn(x2, 1 if adaptive else 3)
                 res2.backward(torch.randn_like(res2))
                 self.assertTrue(math.isinf(res2.item()))
+                x2.requires_grad_(False)
+                res2 = fn(x2, 1 if adaptive else 3)
+                self.assertTrue(math.isinf(res2.item()))
 
     @onlyOnCPUAndCUDA
     @dtypes(torch.float, torch.double)
@@ -11373,12 +11415,12 @@ def test_pooling_zero_stride(self, device):
                 fn_name = '{}_pool{}d'.format(op, num_dim)
                 fn = getattr(F, fn_name)
                 x = torch.ones([1, 2] + num_dim * [4], device=device, dtype=torch.float)
-                self.assertRaisesRegex(RuntimeError, "stride should not be zero",
+                self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero",
                                        lambda: fn(x, kernel_size=2, stride=0))
 
                 fn_module_name = '{}Pool{}d'.format(op.title(), num_dim)
                 fn_module = getattr(nn, fn_module_name)(kernel_size=2, stride=0)
-                self.assertRaisesRegex(RuntimeError, "stride should not be zero",
+                self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero",
                                        lambda: fn_module(x))
 
     @dtypesIfCUDA(*ALL_TENSORTYPES2)
@@ -11401,6 +11443,10 @@ def test_pool_invalid_size(self, device, dtype):
         for op in ('max', 'avg'):
             for num_dim in [1, 2, 3]:
                 fn_name = '{}_pool{}d'.format(op, num_dim)
+                if op == 'max':
+                    # New implementation without indices supports empty tensors
+                    # TODO(Heitor) change once with_indices code is updated
+                    fn_name += '_with_indices'
                 fn = getattr(F, fn_name)
                 # use a configuration that gives zero outputs only
                 # when doing a correct floor division by the stride
