Merged
Changes from all commits
967 commits
b366d67
Spelling fix in MultivariateNormal docstring (#7915)
aryamccarthy May 29, 2018
6541e96
[c10d] MPI Process Group Implementation (#7783)
teng-li May 29, 2018
42722c7
Fix Windows doc for import error (#7704)
peterjc123 May 29, 2018
dc12ca6
Moved condition for dilated grouped convolutions to CUDNN convolution…
iamannakogan May 29, 2018
d1bcd4f
Updates to caffe2 operator documentation (#7917)
inkawhich May 29, 2018
d2b1938
[auto] Update onnx to 307995b - Update from upstream (onnx/onnx#1038)
onnxbot May 29, 2018
cdf5574
Test if ASAN is actually working as part of ASAN tests. (#6050)
ezyang May 30, 2018
3d16f3d
Split up detail.h (#7836)
goldsborough May 30, 2018
0793719
Fix THCUNN SpatialDepthwiseConvolution assuming contiguity (#7952)
ssnl May 30, 2018
b1e76f4
Fix fbcode compatibility (#7939)
smessmer May 30, 2018
e779875
add test for correctness of transpose fusion (#7950)
anderspapitto May 30, 2018
b612652
[JIT][script] Fix emitted gather and slice for dynamic indices (#7861)
May 30, 2018
3c7e798
cache and use BLAS_SET_BY_USER so that it doesn't set itself to TRUE …
soumith May 30, 2018
722d471
Add unsafe flag to skip checking in prepare (#7832)
May 30, 2018
746fb43
Rename cuda::type to cuda::into_type and provide cuda::from_type. (#7…
gchanan May 30, 2018
641ad5d
Try to fix TORCH_CUDA_ARCH_LIST for PyTorch again (#7936)
ssnl May 30, 2018
b8bb35e
remove sort requirement from pad-sequence (#7928)
zou3519 May 30, 2018
d73405f
Fix checkBackend error message (#7926)
zou3519 May 30, 2018
53543ed
Split CI tests in half and run them in parallel (#7867)
yf225 May 30, 2018
60414b2
Handling of scalars in torch.Size (#5676)
zou3519 May 30, 2018
6594a71
[JIT] Fission and fusion passes for addmm (#7938)
May 30, 2018
5c2d4a4
Set smaller grain size for some cases (#7941)
cpuhrsch May 30, 2018
818d9c7
Fix returning scalar input in Python autograd function (#7934)
ssnl May 30, 2018
ce45183
Prevent git autocrlf for bash scripts (#7949)
kohr-h May 30, 2018
5c1ec5d
Delete unused file (#7919)
colesbury May 30, 2018
920fad6
Fix typo in autodiff formula for addmm (#7932)
May 30, 2018
f319fca
1) use meshgrid for flip() CPU implementation, only need one copy of …
weiyangfb May 30, 2018
921cb6f
[caffe2] YellowFin parameter update GPU code fix. (#6993)
edubois May 30, 2018
d71d7ae
[Caffe2] Keep name of caffe2_pybind11_state and caffe2_pybind11_state…
pooyadavoodi May 30, 2018
9ac676f
Allowing MatMul to create a gradient even with 3 inputs. useful if yo…
Swetko May 30, 2018
5a1fbf6
added const for local variables
weiyangfb May 31, 2018
c54fc26
Fix the cpp libtorch CUDA build (#7975)
orionr May 31, 2018
aafd333
Use mingfeima's mkldnn (#7977)
cpuhrsch May 31, 2018
3b18815
Fix the import part of the windows doc (#7979)
peterjc123 May 31, 2018
d2b8961
Change perf test folder after git checkout (#7980)
yf225 May 31, 2018
8f4711d
Move the broadcast check in MKL Add/Sum to runtime (#7978)
bddppq May 31, 2018
1664fe6
Use Glog's implementation of STL logging when possible. (#7206)
xkszltl May 31, 2018
519285c
[Hotfix] Bring back warnings and -Werror to ATen (#7866)
goldsborough May 31, 2018
49c468a
Enable ONNX backend Mean tests (#7985)
bddppq May 31, 2018
5904096
Add third wayt to determine IS_CONDA (#7971)
cpuhrsch May 31, 2018
5311b95
Fix EmbeddingBag max_norm option (#7959)
ssnl May 31, 2018
7ac3939
Raise error when torch.load a storage on a non-existing device (#7921)
zou3519 May 31, 2018
a911273
Make THStorage / THCStorage have void* data ptr. (#7964)
gchanan May 31, 2018
a5e6bda
Import/export observer symbols for DLL, which fixes the linking error…
xkszltl May 31, 2018
9fa2627
Remove python bindings for `torch.slice` (#7924)
sethah May 31, 2018
e0a9177
Build ONNX for PyTorch version of libcaffe2 (#7967)
orionr May 31, 2018
04578c0
support loading gzip (#6490)
li-roy May 31, 2018
6ff2039
Add memory leak check in CUDA tests (#7270)
ssnl May 31, 2018
d3c5a3e
Revert "Set smaller grain size for some cases" (#7988)
ezyang May 31, 2018
f7e821d
Entry for c10d in CODEOWNERS (#8001)
pietern May 31, 2018
80bb6b7
Fix a couple of typos (#7998)
dmitriy-serdyuk May 31, 2018
58ed5cd
Add on-stack observer cache for Observable (#7931)
May 31, 2018
e0e0125
Reduce grain size for Unary operations (#8003)
cpuhrsch May 31, 2018
b9466be
[auto] Update onnx to 8ec0e5f - Add index check for Transpose's type …
onnxbot Jun 1, 2018
bf67b80
Make AT_FORALL_SCALAR_TYPES usable outside of at::namespace. (#7935)
gchanan Jun 1, 2018
6489b5e
Remove WITH_ROCM cmake flag/variable (use USE_ROCM solely) (#8013)
bddppq Jun 1, 2018
a1bc89e
Mention the pytorch-ci-hud on the README. (#8004)
ezyang Jun 1, 2018
e4ed8c7
Re-enable build env check (#7969)
yf225 Jun 1, 2018
e0e7d67
Update nn.rst (#8029)
zuoxingdong Jun 1, 2018
f9ff686
Example for Transformed Distribution (#8011)
vishwakftw Jun 1, 2018
a475992
[auto] Update onnx to 33e9cd4 - Remove the usage of default value to …
onnxbot Jun 1, 2018
30bd61a
[auto] Update onnx to 1504a33 - Convert schema assert for duplicate t…
onnxbot Jun 1, 2018
89a2032
Support CUDA tensors in ProcessGroupGloo (#7694)
pietern Jun 1, 2018
b2edb83
[auto] Update onnx to 3fb9656 - Fix for fbcode CI (onnx/onnx#1062)
onnxbot Jun 1, 2018
661218b
propagate nan in some activations (#8033)
ssnl Jun 1, 2018
ff309bb
Fix profiler crash when no events register (#8034)
Jun 1, 2018
90ac702
Allow CI testing with different AVX configs (#8020)
yf225 Jun 1, 2018
61ed558
Support for generating ATen during the fbcode build, rather than comm…
anderspapitto Jun 1, 2018
afa0076
Factor python dependency out of interpreter (#7970)
zdevito Jun 1, 2018
dcbd232
[auto] Update onnx to 760c928 - add missing hasNInputShapes check for…
onnxbot Jun 1, 2018
94eba55
Support modules that output scalar in Gather (and data parallel) (#7973)
ssnl Jun 1, 2018
f4d9733
[auto] Update onnx to 9e7855d - Remove PyTorch generated Upsample tes…
onnxbot Jun 1, 2018
be7f239
[script] Add support for torch.zeros, torch.ones, etc. (#7799)
zdevito Jun 1, 2018
26f2a86
Add profiling annotations to NeuralNet[Operator|Data] (#8005)
yyetim Jun 1, 2018
22af5a0
Update from facebook 1ee4edd286a3 (#8040)
bwasti Jun 1, 2018
63ca74d
Skip CUDA memory leak test on BN tests on windows (#8043)
ssnl Jun 1, 2018
5c6520f
workaround for Sequential when one cannot retrieve python source (#8048)
soumith Jun 1, 2018
e532164
[auto] Update onnx to 0dbec2a - - Generate protoc type hints on Windo…
onnxbot Jun 1, 2018
3f7a4a3
[auto] Update onnx to 4f8ef17 - Remove erroneous documentation around…
onnxbot Jun 2, 2018
37f5a8c
[auto] Update onnx to e6a500e - Extract constant to initializer (onnx…
onnxbot Jun 2, 2018
2f50873
[auto] Update onnx to 033f956 - make gcc happy (onnx/onnx#1061)
onnxbot Jun 2, 2018
fd91212
Remove NO_PYTHON macros from Exceptions.h/cpp (#8007)
zrphercule Jun 2, 2018
fa87c79
[ready] Clean up torch.distributions (#8046)
vishwakftw Jun 2, 2018
f6dad69
Have a single THStorage and THCStorage type. (#8030)
gchanan Jun 2, 2018
cdfd577
Reduce usages of TensorUtils<T>::DataType in THC. (#8056)
gchanan Jun 2, 2018
ca3d3a8
Support to run ONNX Upsample operator (mode=nearest) in Caffe2 (#8037)
Jun 2, 2018
2c79dac
[auto] Update onnx to eb12f72 - Add conv transpose test cases (onnx/o…
onnxbot Jun 2, 2018
4d33785
[auto] Update onnx to bd98abb - Add a hook for doing post-processing …
onnxbot Jun 2, 2018
872c373
Skip ConvTraspose ONNX backend tests (#8074)
bddppq Jun 2, 2018
f34e86e
Post process onnx proto (#8064)
bddppq Jun 2, 2018
aa7001e
Add code for TensorBoard visualization of JIT GraphExecutors (#8050)
apaszke Jun 2, 2018
23a6738
[auto] Update onnx to cc26486 - bump version to 7 for prelu. (onnx/on…
onnxbot Jun 3, 2018
a5583ac
[auto] Update onnx to 356208d - add input tensor dimension checks to …
onnxbot Jun 3, 2018
3c7dec0
Move backtrace to its own header (#8096)
goldsborough Jun 4, 2018
a2d43b4
Fix and ignore some warnings (#8081)
bddppq Jun 4, 2018
9cf97a9
Do an additional sanity check that nvcc and CUDA include dir agree. (…
ezyang Jun 4, 2018
b5c3e23
use regex in kwarg parser (#8061)
sethah Jun 4, 2018
429b0c2
Removing remaining NO_PYTHON ifdefs (#8067)
zdevito Jun 4, 2018
e2cd664
Replace std::size_t with size_t (#8093)
goldsborough Jun 4, 2018
a5bbb22
Remove out-of-date comment (#8114)
colesbury Jun 4, 2018
b26f93b
[Caffe2] Enabling AMD GPU Backend for Caffe2 (#7955)
bddppq Jun 4, 2018
dcfe96c
Detect CUDNN related environment variables in cmake (#8082)
bddppq Jun 4, 2018
f12a507
Implement adaptive softmax (#5287)
elanmart Jun 4, 2018
5cf86c3
Make libshm also test if rt requires pthread. (#8112)
ezyang Jun 4, 2018
5b9f05b
[auto] Update onnx to 2d5ce4a - Remove empty model (onnx/onnx#1058)
onnxbot Jun 4, 2018
64a8fe2
Add missing pragma once. (#8118)
ezyang Jun 4, 2018
db97ae1
[auto] Update onnx to 2a87616 - Tests for LRN operator (onnx/onnx#903)
onnxbot Jun 4, 2018
33cbb42
Split SparseTensorImpl off from TensorImpl. (#7990)
ezyang Jun 4, 2018
b17cf72
[Caffe2] Support non peer access in muji and fix bug when reduced_aff…
daquexian Jun 4, 2018
c65b6c5
Skip OnnxBackendNodeModelTest::test_lrn_default_cuda that causes segf…
bddppq Jun 4, 2018
44e94b7
Replace most remaining usages of TensorUtils<T>::DataType. (#8124)
gchanan Jun 4, 2018
24231bf
Add utf-8 header to Python file with Unicode. (#8131)
ezyang Jun 4, 2018
73b6ce0
Add back lrn test (#8134)
bddppq Jun 4, 2018
1ff8957
Add non_blocking to Tensor/Module.to (#7312)
ssnl Jun 4, 2018
b27d0b9
Fix job name checking for AVX tests (#8135)
yf225 Jun 4, 2018
37faaac
Fix a corner case for ReShapeOp (#8142)
sunnieshang Jun 5, 2018
4b43c7f
cpu/ideep context converter (#8139)
Jun 5, 2018
84100bf
fix type mismatch while call torch._C._cuda_setDevice (#8065)
HisiFish Jun 5, 2018
4169fe2
docs: Add warning to torch.repeat() (#8116)
Ir1d Jun 5, 2018
0024401
Accelerate bernoulli number generation on CPU (#7171)
MlWoo Jun 5, 2018
caaf9c4
docs: add canonical_url and fix redirect link (#8155)
Ir1d Jun 5, 2018
886b7bc
docstring support for @script and @script_method (#7898)
zasdfgbnm Jun 5, 2018
a1f94e8
[auto] Update onnx to 968d28d - fix Node::isBefore (onnx/onnx#1075)
onnxbot Jun 5, 2018
f439c92
remove some unnecessary cudaGetDevices (#8089)
Jun 5, 2018
4ec08fa
Fix cuda.framework error on OSX. (#8136)
ezyang Jun 5, 2018
7899c03
[C++ API] Improve and use OrderedDict for parameters / modules (#7823)
goldsborough Jun 5, 2018
e6aefee
Fix __rshift__ bug (#8161)
vishwakftw Jun 5, 2018
78d66e1
Move non-generic Storage code needed by TensorUtils to non-generic C+…
gchanan Jun 5, 2018
bd0e2ce
Pinning opencv to < 3.4 in conda builds (#7923)
pjh5 Jun 5, 2018
7b03ecd
Adding -setup- path, and better code structure (#8122)
pjh5 Jun 5, 2018
0d699ea
Abstract parallelization to faciliate using threadpools (#8163)
cpuhrsch Jun 5, 2018
deacc51
[Caffe2] Update elementwise ops to support numpy style boradcast (#8070)
xiaomengy Jun 5, 2018
4628040
Export getCudnnHandle (#7726)
bstriner Jun 6, 2018
f12a021
[JIT] Support a single TensorList argument anywhere in the argument l…
Jun 6, 2018
891cee0
use the correct datatype format (#8144)
seravee Jun 6, 2018
423d99e
Add back onnx console scripts dropped during migration from onnx-caff…
bddppq Jun 6, 2018
b0cfcc4
Get rid of SOVERSION (again). (#8132)
ezyang Jun 6, 2018
4a5592b
Fix a corner case for ReShapeOp (#8178)
sunnieshang Jun 6, 2018
b330754
Better conv error message basing on weight shape (#8051)
ssnl Jun 6, 2018
48f7a3a
Add retry logic to sccache download for Windows build (#7697)
yf225 Jun 6, 2018
6d4503e
fix caffe2 docker build (#7411)
qigtang Jun 6, 2018
d143fe7
[ONNX] Fix type_as symbolic (#8183)
Jun 6, 2018
37400b9
Yangqing as an ONNX codeowner (#8185)
Jun 6, 2018
92e5b0a
Fix protobuf options (#8184)
bstriner Jun 6, 2018
54f970d
Add a loop unrolling pass to PyTorch JIT (#7672)
apaszke Jun 6, 2018
1391b52
[auto] Update onnx to 4e65fd8 - fuse consecutive squeezes (onnx/onnx#…
onnxbot Jun 6, 2018
e155663
[Caffe2] Merging setup.py with setup_caffe2.py (#8129)
pjh5 Jun 6, 2018
c4bdc2f
Fix scalar check for sparse tensors. (#8197)
zou3519 Jun 6, 2018
b2deb86
fix lint
soumith Jun 6, 2018
848d716
Add more annotations for arguments in ATen schema (#8192)
apaszke Jun 6, 2018
8070d34
use THCThrustAllocator in BCECriterion (#8188)
Jun 6, 2018
df5ce89
Allow parallel_apply to take in list[Tensor] (#8047)
ssnl Jun 6, 2018
573a61b
Docs for gradcheck and gradgradcheck; expose gradgradcheck (#8166)
ssnl Jun 6, 2018
6dcdbdb
Implement randperm for CUDA (#7606)
yf225 Jun 6, 2018
6abd687
Update c10d build to link against Caffe2 (#8201)
pietern Jun 6, 2018
896f8f6
add wipe_cache option (#8204)
lly-zero-one Jun 6, 2018
3eedb4f
Replace (non-data) TensorUtils calls with non-generic THCTensor calls…
gchanan Jun 6, 2018
3adcc6e
Fix c10d compiler warnings (#8206)
pietern Jun 6, 2018
4091b61
Bump gloo submodule (#8202)
pietern Jun 6, 2018
6c607b2
rm -rf aten/contrib (#8165)
goldsborough Jun 6, 2018
c498ff5
Fix tanh_op on ios build (#8207)
xiaomengy Jun 6, 2018
700249c
[auto] Update onnx to f28e2f1 - fix lrn spec (onnx/onnx#1090)
onnxbot Jun 6, 2018
5d55b8e
[cmake] deprecate caffe2_* specific cuda function in cmake. (#8200)
Yangqing Jun 6, 2018
eb6f70d
skip CUDA memory leak check on Windows altogether (#8213)
ssnl Jun 6, 2018
330e3c1
Record shape and type in autograd to validate gradients (#8168)
colesbury Jun 6, 2018
8627e3f
[auto] Update onnx to 18d70ff - Graph should only have one (input) kP…
onnxbot Jun 6, 2018
1c7e27d
Set up a c10 source folder (#7822)
smessmer Jun 6, 2018
3328c6c
Change the benchmark log format and also log flops (#8215)
lly-zero-one Jun 7, 2018
818118b
Move helper functions to unnamed namespace. (#8224)
yyetim Jun 7, 2018
6b31d57
[auto] Update onnx to e96d823 - Update Google benchmark to 1.4.1 (onn…
onnxbot Jun 7, 2018
635268e
Change new bernoulli implementation to be fully generic. (#8218)
gchanan Jun 7, 2018
cb4617c
Structure THTensor like THCTensor is structured. (#8217)
gchanan Jun 7, 2018
d9607a9
move THCP-related utils to cuda/utils.cpp. (#8221)
gchanan Jun 7, 2018
ae8ed57
[READY TO MERGE] Use ccache in macOS build (#8009)
yf225 Jun 7, 2018
4e8ea34
[NEEDS REVIEW] Add nan and inf probability check to multinomial (#7647)
yf225 Jun 7, 2018
03c9635
[READY TO MERGE] Enable tests that use DataLoader with multiple worke…
yf225 Jun 7, 2018
b53f08a
Don't copy unneeded grads when using a function for several derivativ…
t-vi Jun 7, 2018
4335d4c
Fix win mkldnn (#7718)
bstriner Jun 7, 2018
ff741c7
[Caffe2] Add ADD operator for IDEEP (#8220)
Jun 7, 2018
1745c75
Allow optional build and installation of native test binaries (#8225)
Yangqing Jun 7, 2018
0a754a6
Update MKL exporter to IDEEP ops (#8228)
vishar0 Jun 7, 2018
7cf88b1
[ideep] Add IDEEP Squeeze op (#8227)
vishar0 Jun 7, 2018
47cfcf0
[auto] Update onnx to 62e63e9 - Fix build errors inside protobuf-benc…
onnxbot Jun 7, 2018
3930eca
Use .cc since some downstream libraries are configured for C++ only. …
xkszltl Jun 7, 2018
910afc4
Rename SparseTensor to SparseTensorRef. (#8237)
ezyang Jun 7, 2018
dc0a78e
[caffe2] Build Android tests and binaries in CI (#7593)
Maratyszcza Jun 7, 2018
113187d
Remove core and util warnings (#8239)
orionr Jun 7, 2018
b3f9912
Remove .gitmodules.aten since it is in .gitmodules now (#8232)
Yangqing Jun 7, 2018
b593f0f
Fix: gradcheck forced float32 (#8230)
bhushan23 Jun 7, 2018
0bf9a0d
Print requires_grad and grad_fn in string repr of tensor (#8211)
colesbury Jun 7, 2018
6068030
Fix TEST_CUDA import in test_cuda (#8246)
yf225 Jun 7, 2018
3ffa722
Fix lifting cat into its constant version (#8174)
zdevito Jun 7, 2018
7344661
Don't override Tensor, Storage macros defined outside torch/csrc in t…
gchanan Jun 7, 2018
b281b75
[auto] Update onnx to 3a035f4 - Add retry logic to model downloading …
onnxbot Jun 7, 2018
f1feecb
Fully genericize THC/THCUNN (except for TensorUtils and DeviceTensorU…
gchanan Jun 7, 2018
9d9b82a
[cmake] Use CAFFE2_USE_* for public/cuda.cmake (#8248)
pietern Jun 7, 2018
9147596
Fix app size check (#8256)
xiaomengy Jun 7, 2018
0368766
wip on CPU impl
weiyangfb Jun 7, 2018
39e09f4
Stop BCELoss from returning negative results (#8147)
li-roy Jun 8, 2018
bba193e
Relax CUDA_HOME detection logic, to build when libraries are found. (…
dashesy Jun 8, 2018
db56a3a
Added backward function for kl_div target (#7839)
weiyangfb Jun 8, 2018
1f335f8
Change the output format of caffe2 observers (#8261)
lly-zero-one Jun 8, 2018
2a418d7
Remove TensorUtils<T>::getData, provide data<T>() in TH(C)Tensor. (#8…
gchanan Jun 8, 2018
a6aa526
[caffe2] Move submodule onnx-tensorrt forward (#7659)
pooyadavoodi Jun 8, 2018
8e2b170
[ideep] Add IDEEP fallbacks for Faster-RCNN ops (#8260)
vishar0 Jun 8, 2018
97930ba
un-genericize THCDeviceTensorUtils. (#8258)
gchanan Jun 8, 2018
58f009f
[caffe2] Fix ATen dispatch for ops with TensorList arg (#8226)
Jun 8, 2018
db5232d
[cmake] Add and export Modules_CUDA_fix (#8271)
Yangqing Jun 8, 2018
72d2262
[auto] Update onnx to 2508156 - Make error message more verbose (onnx…
onnxbot Jun 8, 2018
897e9ba
[auto] Update onnx to 39e4668 - fix optimizer does not set ir_version…
onnxbot Jun 8, 2018
9073280
[cmake] Make cudnn optional (#8265)
Yangqing Jun 8, 2018
0f21d0b
Move signal window functions to ATen; add Blackman window (#8130)
ssnl Jun 8, 2018
8f6ee5a
[ideep] Fuse Conv-Relu after IDEEP graph rewrite, skip group conv (#8…
vishar0 Jun 8, 2018
77c0460
[c10d] NCCL Process Group implementation (#8182)
teng-li Jun 8, 2018
d9482d6
Set up CI build for CUDA 9.2 + macOS (#8274)
yf225 Jun 8, 2018
c05ee5f
c10 build setup (#8264)
smessmer Jun 8, 2018
28cef82
Remove remaining TensorTypeUtils functions. (#8286)
gchanan Jun 8, 2018
b4d40f8
Create initial Python bindings for c10d (#8119)
pietern Jun 8, 2018
d4fb955
Add option USE_NVRTC which defaults to off (#8289)
Yangqing Jun 8, 2018
8c4f040
[build] Remove /torch/lib/THD/cmake in favor of /cmake (#7159)
Yangqing Jun 8, 2018
aa4a4b3
Have a single THTensor / THCTensor type. (#8288)
gchanan Jun 8, 2018
6e6e82f
[auto] Update onnx to 58efe0a - add float16 support back for math and…
onnxbot Jun 8, 2018
c4591e5
Some utils for compile-time programming (#7778)
smessmer Jun 9, 2018
7e7e8f6
Remove THC's FindMAGMA (#8299)
Yangqing Jun 9, 2018
5f59e8b
Entries for torch.distributed in CODEOWNERS (#8293)
pietern Jun 9, 2018
5f98923
Add depthwise convolution test for IDEEP (#8301)
Jun 9, 2018
3c09acc
Fix dividing by zero segfault in Reshape (#8302)
bddppq Jun 9, 2018
637f842
Removes unused THCTensorConv (#8229)
mruberry Jun 9, 2018
36a2076
Replace Variables to Tensors (#8309)
vishwakftw Jun 10, 2018
7c780b1
Clean up old sccache log before build (#8305)
yf225 Jun 10, 2018
942d016
Remove unused grad ops on mobile to reduce app size (#8297)
xiaomengy Jun 10, 2018
df7b419
Small fixes (#8296)
smessmer Jun 10, 2018
972e7c7
[auto] Update onnx to 5ed684e - Remove/replace /MX with /WX for MSVC …
onnxbot Jun 10, 2018
af1b560
Fix sample code for cuda stream (#8319)
Stonesjtu Jun 10, 2018
56b29e2
[auto] Update onnx to 4b4085c - Add missing warning ignoring flags to…
onnxbot Jun 10, 2018
5658078
[THD] fix broken THD build with NCCL (#8323)
teng-li Jun 11, 2018
4254f3b
Add docstring for `torch.sparse_coo_tensor` (#8152)
sethah Jun 11, 2018
027f6b9
add error when backend is not supported by DDP (#8325)
ailzhang Jun 11, 2018
5463e44
Fix collect_env.py for Windows (#8326)
peterjc123 Jun 11, 2018
61a68e3
Fix the script doesn't stop eariler on error for MSVC and Ninja (#8277)
peterjc123 Jun 11, 2018
58b1c73
Skip test_multinomial_invalid_probs_cuda on Windows (#8324)
yf225 Jun 11, 2018
8f92ab9
Support printing sparse tensors in ATen, fixes #8333. (#8334)
ezyang Jun 11, 2018
db4fa8f
[C++ API] Cursors (#8190)
goldsborough Jun 11, 2018
eee6226
Implement dim_arange operator (#8266)
Jun 11, 2018
63094e5
1. fixed flip CPU impl for non-continuous flip dims; 2. added more te…
weiyangfb Jun 11, 2018
44b18e4
nits
weiyangfb Jun 11, 2018
a35d2ad
Merge branch 'flip_tensor' of github.com:weiyangfb/pytorch into flip_…
weiyangfb Jun 11, 2018
a9ae3f1
1. removed for loop in pointwise CUDA kernel; 2. using templated (int…
weiyangfb Jun 15, 2018
8780087
added torch.flip.__doc__
weiyangfb Jun 15, 2018
0709c30
nits
weiyangfb Jun 15, 2018
55 changes: 55 additions & 0 deletions aten/src/ATen/native/TensorTransformations.cpp
@@ -0,0 +1,55 @@
#include "TensorTransformations.h"

#include "ATen/NativeFunctions.h"

namespace at {
namespace native {

Tensor flip_cpu(const Tensor& self, IntList dims) {
const int64_t total_dims = self.dim(), flip_dims_size = dims.size();
check_errors(total_dims, flip_dims_size, dims);

auto flip_dims_v = std::vector<int64_t>(dims);
std::sort(flip_dims_v.begin(), flip_dims_v.end());
auto final_indices = std::vector<at::Tensor>(total_dims);

auto indices = std::vector<at::Tensor>(flip_dims_size);
for (int64_t i = 0; i < flip_dims_size; i++) {
indices[i] = at::arange(self.type().toScalarType(at::ScalarType::Long), self.size(i) - 1, -1, -1);
// creates a meshgrid
auto temp = std::vector<int64_t>(flip_dims_size, 1);
temp[i] = indices[i].size(0);
indices[i] = indices[i].view(IntList(temp));
final_indices[flip_dims_v[i]] = indices[i];
}

// check whether the distance between any two adjacent flip dims is >= 2; if so, the
// output tensor has to be permuted back, because advanced indexing moves all
// non-consecutive indexed dimensions to the front of the result
bool to_permute = false;
int64_t first = flip_dims_v[0], second = flip_dims_v[0];
for (int64_t i = 1; i < flip_dims_size; i++) {
second = flip_dims_v[i];
if (second - first >= 2) {
to_permute = true;
break;
}
first = second;
}

if (to_permute) {
// permute output tensor
auto permute_order = std::vector<int64_t>(flip_dims_v);
for (int64_t i = 0; i < total_dims; i++) {
if (std::find(flip_dims_v.begin(), flip_dims_v.end(), i) == flip_dims_v.end()) {
permute_order.emplace_back(i);
}
}
auto out_tensor = self.index(TensorList(final_indices));
return out_tensor.permute(IntList(permute_order));
}

auto out_tensor = self.index(TensorList(final_indices));
return out_tensor;
}

}} // namespace at::native
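
A rough Python rendering of the indexing strategy used by flip_cpu above, for readers following the advanced-indexing trick: reversed index tensors are reshaped so they broadcast like a meshgrid, and the result is permuted back when the flipped dims are non-consecutive. This is only an illustrative sketch; the helper name is not part of the PR, and the expected values come from the tests further down.

import torch

def flip_via_indexing(t, dims):
    # hypothetical helper mirroring flip_cpu's strategy; not part of the PR
    dims = sorted(dims)
    index = [slice(None)] * t.dim()
    for i, d in enumerate(dims):
        rev = torch.arange(t.size(d) - 1, -1, -1, dtype=torch.long)  # reversed indices for dim d
        shape = [1] * len(dims)
        shape[i] = rev.numel()
        index[d] = rev.view(shape)  # meshgrid-style shape so the index tensors broadcast
    out = t[tuple(index)]
    # advanced indexing moves non-consecutive indexed dims to the front of the
    # result, so permute them back -- the same case flip_cpu handles explicitly
    if any(b - a >= 2 for a, b in zip(dims, dims[1:])):
        order = dims + [d for d in range(t.dim()) if d not in dims]
        out = out.permute(*order)
    return out

data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(2, 2, 2)
assert torch.equal(flip_via_indexing(data, [0, 2]),
                   torch.tensor([6, 5, 8, 7, 2, 1, 4, 3]).view(2, 2, 2))
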
34 changes: 34 additions & 0 deletions aten/src/ATen/native/TensorTransformations.h
@@ -0,0 +1,34 @@
#include "ATen/ATen.h"

namespace at {
namespace native {

static inline void check_errors(int64_t total_dims, int64_t flip_dims_size, IntList dims) {
// check that the number of flip dims is valid
AT_CHECK(flip_dims_size > 0,
"expected input tensor dims > 0, but got tensor dims size=", flip_dims_size);

// check duplicates in dims
auto flip_dims_v = std::vector<int64_t>(dims);
flip_dims_v.erase(std::unique(flip_dims_v.begin(), flip_dims_v.end()), flip_dims_v.end());
AT_CHECK((int64_t)flip_dims_v.size() == flip_dims_size,
"dims has duplicates, original flip dims size=", flip_dims_size,
", but unique flip dims size=", flip_dims_v.size());

// check that the number of flip dims does not exceed the tensor's total dims
AT_CHECK(flip_dims_size <= total_dims,
"expected flip dims size <= tensor total dims, but got flip dims size=",
flip_dims_size, " and tensor total dim=", total_dims);

// check that every flip dim is within range
auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end());

AT_CHECK(*min_max_d.first >= 0,
"expected flip dims axis >= 0, but got min flip dims=", *min_max_d.first);

AT_CHECK(*min_max_d.second < total_dims,
"expected flip dims axis < tensor total dims, but got max flip dims=",
*min_max_d.second, " and tensor total dim=", total_dims);
}

}} // namespace at::native
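
In terms of user-visible behavior, these checks correspond to the argument errors exercised in test_torch.py below. A quick illustrative sketch of what they reject (assumes a build that includes this PR):

import torch

data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(2, 2, 2)
for bad_dims in [(0, 1, 1),      # duplicate flip dims
                 (0, 1, 2, 3),   # more flip dims than tensor dims
                 (-1,),          # negative dim
                 (3,)]:          # dim out of range
    try:
        data.flip(*bad_dims)
    except RuntimeError as e:
        print("rejected", bad_dims, ":", e)
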
120 changes: 120 additions & 0 deletions aten/src/ATen/native/cuda/TensorTransformations.cu
@@ -0,0 +1,120 @@
#include "ATen/native/TensorTransformations.h"

#include "ATen/cuda/detail/IndexUtils.cuh"
#include "ATen/NativeFunctions.h"
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"

namespace at {
namespace native {

#define AT_APPLY_THREADS_PER_BLOCK 32 * 16
#define AT_APPLY_BLOCKS_PER_SM 4

template <typename scalar_t, typename IndexType>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernel_pointwise_flip_apply2(const cuda::detail::TensorInfo<scalar_t, IndexType> in_tensor_info,
cuda::detail::TensorInfo<scalar_t, IndexType> out_tensor_info,
IndexType N,
int flip_dim,
IndexType total_dims) {
for (IndexType linear_index = blockIdx.x * blockDim.x + threadIdx.x; linear_index < N; linear_index += gridDim.x * blockDim.x) {
IndexType dst_offset = 0;
if (flip_dim == 0) {
// flip 1st dim
dst_offset = (in_tensor_info.sizes[0] - 1 - linear_index / in_tensor_info.strides[0]) * in_tensor_info.strides[0] + linear_index % in_tensor_info.strides[0];
}
else {
// flip last dim
IndexType i = total_dims - 1;
dst_offset = linear_index / in_tensor_info.strides[0] * in_tensor_info.strides[0] + (in_tensor_info.sizes[i] - 1 - linear_index % in_tensor_info.strides[0]);
}
out_tensor_info.data[dst_offset] = in_tensor_info.data[linear_index];
}
}

template <typename scalar_t>
__global__
void flip_cuda_kernel(scalar_t* in_tensor, scalar_t* out_tensor, int64_t N, int64_t* flip_dims, int64_t flip_dims_size, int64_t* strides, int64_t* strides_contiguous, int64_t* shape, int64_t total_dims) {

int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x;
if (linear_index >= N) {
return;
}

int64_t cur_indices = linear_index, rem = 0, dst_offset = 0;
for (int64_t i = 0; i < total_dims; i++) {
int64_t temp = cur_indices;
cur_indices = cur_indices / strides_contiguous[i];
rem = temp - cur_indices * strides_contiguous[i];
// reverse the index along dim i if it is one of the flip dims
for (int64_t j = 0; j < flip_dims_size; j++) {
if (i == flip_dims[j]) {
cur_indices = shape[i] - 1 - cur_indices;
}
}
dst_offset += cur_indices * strides[i];
cur_indices = rem;
}
out_tensor[linear_index] = in_tensor[dst_offset];
}

// Flip tensor given a list of dims
Tensor flip_cuda(const Tensor& self, IntList dims) {
auto in_tensor = self;
const int64_t flip_dims_size = dims.size(), total_dims = in_tensor.dim(), N = in_tensor.numel();
check_errors(total_dims, flip_dims_size, dims);

int64_t block_size = 512;
dim3 dim_block(block_size);
dim3 dim_grid((N + block_size - 1) / block_size);

// use kernel_pointwise_flip_apply2 only when the single flip dim is the first or last dim, where collapseDims can reduce the amount of work
if (flip_dims_size == 1 && in_tensor.is_contiguous() && (dims[0] == 0 || dims[0] == total_dims - 1)) {
auto out_tensor = at::empty_like(self);
AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] {
using cuda_scalar_t = cuda::into_type<scalar_t>;
auto in_tensor_info = cuda::detail::getTensorInfo<cuda_scalar_t, int64_t>(in_tensor);
auto out_tensor_info = cuda::detail::getTensorInfo<cuda_scalar_t, int64_t>(out_tensor);
int flip_dim = in_tensor_info.collapseDims(dims[0]);
out_tensor_info.collapseDims(dims[0]);
kernel_pointwise_flip_apply2<cuda_scalar_t, int64_t>
<<<dim_grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(
in_tensor_info, out_tensor_info, N, flip_dim, total_dims);
});
return out_tensor;
}

auto flip_dims = std::vector<int64_t>(dims);
auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast<int64_t>(flip_dims.size())});

auto shape = std::vector<int64_t>(in_tensor.sizes());
auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast<int64_t>(shape.size())});

auto strides = std::vector<int64_t>(in_tensor.strides());
auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast<int64_t>(strides.size())});

auto out_tensor = at::empty_like(in_tensor);

// stride_contiguous holds the strides the tensor would have after calling contiguous(); it is used to recover the multi-dimensional index of each element of the (possibly non-contiguous) input
Tensor stride_contiguous = at::zeros(CPU(kLong), {total_dims});
int64_t* stride_contiguous_d = stride_contiguous.data<int64_t>();
int64_t tmp = N;
for (int64_t i = 0; i < total_dims; i++) {
tmp = tmp / shape[i];
stride_contiguous_d[i] = tmp;
}

AT_DISPATCH_ALL_TYPES_AND_HALF(in_tensor.type(), "flip_cuda", [&] {
using cuda_scalar_t = cuda::into_type<scalar_t>;
flip_cuda_kernel<<<dim_grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(
in_tensor.data<cuda_scalar_t>(), out_tensor.data<cuda_scalar_t>(), N, flip_dims_t.toType(CUDA(kLong)).data<int64_t>(), flip_dims_size, strides_t.toType(CUDA(kLong)).data<int64_t>(), stride_contiguous.toType(CUDA(kLong)).data<int64_t>(), shape_t.toType(CUDA(kLong)).data<int64_t>(), total_dims);
});

return out_tensor;
}

}} // namespace at::native
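
The generic kernel's index arithmetic, restated in Python for readability (function and parameter names here are mine): each output linear index is decomposed into a multi-dimensional index using the contiguous strides, the indices of flipped dims are reversed, and the result is recombined with the input's actual strides to locate the source element.

def flip_source_offset(linear_index, shape, strides, strides_contiguous, flip_dims):
    # illustrative mirror of flip_cuda_kernel's per-element computation
    cur, dst_offset = linear_index, 0
    for i in range(len(shape)):
        idx = cur // strides_contiguous[i]   # index along dim i
        cur -= idx * strides_contiguous[i]
        if i in flip_dims:
            idx = shape[i] - 1 - idx         # reverse this dim
        dst_offset += idx * strides[i]
    return dst_offset

# flipping dim 1 of a contiguous 2x3 tensor: output element 0 (index (0, 0))
# is read from input offset 2 (index (0, 2))
assert flip_source_offset(0, [2, 3], [3, 1], [3, 1], {1}) == 2
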
5 changes: 5 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -1088,6 +1088,11 @@
- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
variants: method

- func: flip(Tensor self, IntList dims) -> Tensor
dispatch:
CPU: flip_cpu
CUDA: flip_cuda

- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor
variants: function

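With this dispatch entry in place, the same Python call routes to either implementation depending on the tensor's backend (sketch; assumes a build that includes this PR):

import torch

x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(2, 2, 2)
x.flip(0)            # dispatches to flip_cpu
# x.cuda().flip(0)   # would dispatch to flip_cuda on a CUDA build
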
4 changes: 4 additions & 0 deletions test/test_autograd.py
@@ -2509,6 +2509,10 @@ class dont_convert(tuple):
('reshape', (S,), (S,), '1d'),
('reshape', (), (dont_convert(()),), 'scalar_to_scalar'),
('reshape', (), (1,), 'scalar_to_1d'),
('flip', (S, S, S), ([0],), 'd0'),
('flip', (S, S, S), ([0, 1, 2],), 'd012'),
('flip', (S, S, S), ([0, 2],), 'd02'),
('flip', (S, S, S), ([2, 0],), 'd20'),
('view_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)),
('view_as', (), (non_differentiable(torch.tensor(5.5)),), 'scalar'),
('view_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'),
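Each of these tuples makes the autograd test harness run numerical gradient checks on flip with the given dims. Roughly, an entry like ('flip', (S, S, S), ([0],), 'd0') amounts to something like the following sketch (the concrete value of S and the harness plumbing are assumptions, not shown in this diff):

import torch
from torch.autograd import gradcheck, gradgradcheck

S = 5  # placeholder size
x = torch.randn(S, S, S, dtype=torch.double, requires_grad=True)
assert gradcheck(lambda t: t.flip(0), (x,))
assert gradgradcheck(lambda t: t.flip(0), (x,))
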
7 changes: 7 additions & 0 deletions test/test_cuda.py
@@ -409,6 +409,10 @@ def tmp(t):
('zero', small_3d, lambda t: [],),
('zeros', small_3d, lambda t: [1, 2, 3, 4],),
('eye', small_2d, lambda t: [3, 4],),
('flip', small_3d, lambda t: [0], 'd0', types, True),
('flip', small_3d, lambda t: [0, 1, 2], 'd012', types, True),
('flip', small_3d, lambda t: [0, 2], 'd02', types, True),
('flip', small_3d, lambda t: [2, 0], 'd20', types, True),
('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types),
('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types),
('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types),
@@ -1372,6 +1376,9 @@ def test_gesv_batched_dims(self):
def test_view(self):
TestTorch._test_view(self, lambda t: t.cuda())

def test_flip(self):
TestTorch._test_flip(self, use_cuda=True)

def test_signal_window_functions(self):
TestTorch._test_signal_window_functions(self, device=torch.device('cuda'))

45 changes: 45 additions & 0 deletions test/test_torch.py
@@ -5953,6 +5953,51 @@ def test_permute(self):
self.assertEqual(perm, new)
self.assertEqual(x.size(), orig)

@staticmethod
def _test_flip(self, use_cuda=False):
if use_cuda:
cuda = torch.device("cuda")
data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], device=cuda).view(2, 2, 2)
# large data testing
large_data = torch.arange(0, 100000000, device=cuda).view(10000, 10000)
large_data.flip([0, 1])
else:
data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(2, 2, 2)

self.assertEqual(torch.tensor([5, 6, 7, 8, 1, 2, 3, 4]).view(2, 2, 2), data.flip(0))
self.assertEqual(torch.tensor([3, 4, 1, 2, 7, 8, 5, 6]).view(2, 2, 2), data.flip(1))
self.assertEqual(torch.tensor([2, 1, 4, 3, 6, 5, 8, 7]).view(2, 2, 2), data.flip(2))
self.assertEqual(torch.tensor([7, 8, 5, 6, 3, 4, 1, 2]).view(2, 2, 2), data.flip(0, 1))
self.assertEqual(torch.tensor([8, 7, 6, 5, 4, 3, 2, 1]).view(2, 2, 2), data.flip(0, 1, 2))

# cases that exercise the permute path (non-consecutive flip dims)
self.assertEqual(torch.tensor([6, 5, 8, 7, 2, 1, 4, 3]).view(2, 2, 2), data.flip(0, 2))
self.assertEqual(torch.tensor([6, 5, 8, 7, 2, 1, 4, 3]).view(2, 2, 2), data.flip(2, 0))

# flipping the same dim more than once is not allowed
self.assertRaises(RuntimeError, lambda: data.flip(0, 1, 1))
# an empty dims list is not allowed
self.assertRaises(TypeError, lambda: data.flip())
# more flip dims than tensor dims is not allowed
self.assertRaises(RuntimeError, lambda: data.flip(0, 1, 2, 3))
# negative dims are not allowed
self.assertRaises(RuntimeError, lambda: data.flip(-1))
# dims beyond the last dim are not allowed
self.assertRaises(RuntimeError, lambda: data.flip(3))

# test for non-contiguous case
if use_cuda:
expanded_data = torch.arange(1, 4, device=cuda).view(3, 1).expand(3, 2)
tranposed_data = torch.arange(1, 9, device=cuda).view(2, 2, 2).transpose(0, 1)
else:
expanded_data = torch.arange(1, 4).view(3, 1).expand(3, 2)
tranposed_data = torch.arange(1, 9).view(2, 2, 2).transpose(0, 1)
self.assertEqual(torch.tensor([3, 3, 2, 2, 1, 1]).view(3, 2), expanded_data.flip(0))
self.assertEqual(torch.tensor([8, 7, 4, 3, 6, 5, 2, 1]).view(2, 2, 2), tranposed_data.flip(0, 1, 2))

def test_flip(self):
self._test_flip(self, use_cuda=False)

def test_storage(self):
v = torch.randn(3, 5)
self.assertEqual(v.storage()[0], v.data[0][0])
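The expanded/transposed inputs at the end are the interesting cases: they are non-contiguous (and flip several dims at once), so on CUDA they go through the generic flip_cuda_kernel rather than the fast single-dim contiguous path. A small standalone restatement using the same values as the test above:

import torch

x = torch.arange(1, 9, dtype=torch.long).view(2, 2, 2).transpose(0, 1)
assert not x.is_contiguous()
assert torch.equal(x.flip(0, 1, 2),
                   torch.tensor([8, 7, 4, 3, 6, 5, 2, 1]).view(2, 2, 2))
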
3 changes: 3 additions & 0 deletions tools/autograd/derivatives.yaml
@@ -629,6 +629,9 @@
- name: t(Tensor self)
self: grad.t()

- name: flip(Tensor self, IntList dims)
self: grad.flip(dims)

- name: take(Tensor self, Tensor index)
self: zeros_like(self).put_(index, grad, true)

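The derivative rule encoded here is that flip is its own (linear) transpose: the gradient of a flip is the incoming gradient flipped along the same dims. A minimal check of that rule, assuming a build that includes this PR:

import torch

x = torch.randn(2, 3, 4, requires_grad=True)
y = x.flip(0, 2)
g = torch.randn_like(y)
y.backward(g)
assert torch.allclose(x.grad, g.flip(0, 2))
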
27 changes: 27 additions & 0 deletions torch/_torch_docs.py
@@ -4392,6 +4392,33 @@ def parse_kwargs(desc):
[-0.5872, 0.6932]])
""")

add_docstr(torch.flip,
r"""
flip(input, dims) -> Tensor

Reverse the order of an n-D tensor along the given axes in dims.

Args:
input (Tensor): the input tensor
dims (a list or tuple): axes to flip on

Example::

>>> x = torch.arange(8).view(2, 2, 2)
>>> x
tensor([[[ 0, 1],
[ 2, 3]],

[[ 4, 5],
[ 6, 7]]])
>>> torch.flip(x, [0, 1])
tensor([[[ 6, 7],
[ 4, 5]],

[[ 2, 3],
[ 0, 1]]])
""")

add_docstr(torch.take,
r"""
take(input, indices) -> Tensor