pytorch · ezyang · Mar 24, 2018 · Feb 17, 2018 · Mar 21, 2018 · Mar 24, 2018
diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml
@@ -206,17 +206,17 @@
 # Note: The upsampling backwards functions also include an IntList input_size
 # parameter, which is added by nn_parse.py
 
-- name: upsample_linear1d(Tensor self, IntList[1] output_size)
+- name: upsample_linear1d(Tensor self, IntList[1] output_size, bool align_corners)
   cname: TemporalUpSamplingLinear
   scalar_check:
     grad_input: 'false'
 
-- name: upsample_bilinear2d(Tensor self, IntList[2] output_size)
+- name: upsample_bilinear2d(Tensor self, IntList[2] output_size, bool align_corners)
   cname: SpatialUpSamplingBilinear
   scalar_check:
     grad_input: 'false'
 
-- name: upsample_trilinear3d(Tensor self, IntList[3] output_size)
+- name: upsample_trilinear3d(Tensor self, IntList[3] output_size, bool align_corners)
   cname: VolumetricUpSamplingTrilinear
   scalar_check:
     grad_input: 'false'

diff --git a/aten/src/ATen/nn_parse.py b/aten/src/ATen/nn_parse.py
@@ -286,19 +286,23 @@ def backward_declaration(base, thnn_functions):
                   if arg['name'] != 'inplace']
     arguments += base['buffers']
 
+    if 'upsample' in base['name']:
+        # Add input_size as a parameter to upsample backward functions, right after output_size.
+        # input_size also covers batch and channels: N + 2 entries for upsample_xxxNd (4 for 2d)
+        size = 2 + int(re.search(r'(\d+)d', base['name']).group(1))
+        input_size_arg = {'type': 'IntList', 'name': 'input_size', 'size': size}
+        for output_size_idx, arg in enumerate(arguments):
+            if arg['name'] == 'output_size':
+                break
+        arguments.insert(output_size_idx + 1, input_size_arg)
+
     # outputs from the forward may be inputs to the backwards
     for arg in arguments:
         if 'output' in arg:
             del arg['output']
 
     arguments += unique_args([output_arguments(f) for f in thnn_functions])
 
-    if 'upsample' in base['name']:
-        # Add input_size as parameter to upsample backwards functions
-        # Note that input_size is 4-dim for upsample_xxx2d
-        size = 2 + int(re.search(r'(\d+)d', base['name']).group(1))
-        arguments.append({'type': 'IntList', 'name': 'input_size', 'size': size})
-
     def initialize_output_arg(arg):
         # the mask array<bool, N> specifies which return values to compute
         arg['mask'] = True

diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp
@@ -232,7 +232,7 @@ static void test(Type & type) {
     for (int64_t i = 0; i < tensor.numel(); ++i) {
       REQUIRE(tensor[i].equal(one * i));
     }
-    for (size_t i = 0; i < tensor.numel(); ++i) {
+    for (size_t i = 0; i < static_cast<uint64_t>(tensor.numel()); ++i) {
       REQUIRE(tensor[i].equal(one * static_cast<int64_t>(i)));
     }
     for (int i = 0; i < tensor.numel(); ++i) {

diff --git a/aten/src/THCUNN/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/SpatialUpSamplingBilinear.cu
@@ -2,6 +2,7 @@
 // Originally developed by George Papandreou
 #include "THCUNN.h"
 #include "common.h"
+#include "linear_upsampling.h"
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
@@ -11,7 +12,7 @@
 
 template<typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel(const int n,
-    const Acctype rheight, const Acctype rwidth,
+    const Acctype rheight, const Acctype rwidth, const bool align_corners,
     const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -37,13 +38,13 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
       return;
     }
     //
-    const Acctype h1r = rheight * h2;
+    const Acctype h1r = linear_upsampling_compute_source_index<Acctype>(rheight, h2, align_corners);
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
     const Acctype h1lambda = h1r - h1;
     const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;
@@ -64,7 +65,7 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
 // Backward (adjoint) operation 1 <- 2 (accumulates)
 template <typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel_backward(const int n,
-    const Acctype rheight, const Acctype rwidth,
+    const Acctype rheight, const Acctype rwidth, const bool align_corners,
     THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -89,13 +90,13 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n,
       return;
     }
     //
-    const Acctype h1r = rheight * h2;
+    const Acctype h1r = linear_upsampling_compute_source_index<Acctype>(rheight, h2, align_corners);
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
     const Acctype h1lambda = h1r - h1;
     const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;

diff --git a/aten/src/THCUNN/TemporalUpSamplingLinear.cu b/aten/src/THCUNN/TemporalUpSamplingLinear.cu
@@ -2,6 +2,7 @@
 // Originally developed by George Papandreou
 #include "THCUNN.h"
 #include "common.h"
+#include "linear_upsampling.h"
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
@@ -11,7 +12,7 @@
 
 template<typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel(const int n,
-    const Acctype rwidth,
+    const Acctype rwidth, const bool align_corners,
     const THCDeviceTensor<Dtype, 3> data1, THCDeviceTensor<Dtype, 3> data2) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -33,7 +34,7 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
       return;
     }
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;
@@ -52,7 +53,7 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
 // Backward (adjoint) operation 1 <- 2 (accumulates)
 template <typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel_backward(const int n,
-    const Acctype rwidth,
+    const Acctype rwidth, const bool align_corners,
     THCDeviceTensor<Dtype, 3> data1, const THCDeviceTensor<Dtype, 3> data2){
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -73,7 +74,7 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n,
       return;
     }
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;

diff --git a/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu b/aten/src/THCUNN/VolumetricUpSamplingTrilinear.cu
@@ -2,6 +2,7 @@
 // Originally developed by George Papandreou
 #include "THCUNN.h"
 #include "common.h"
+#include "linear_upsampling.h"
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
@@ -12,7 +13,7 @@
 template<typename Dtype, typename Acctype>
 __launch_bounds__(1024)
 __global__ void caffe_gpu_interp2_kernel(const int n,
-    const Acctype rdepth, const Acctype rheight, const Acctype rwidth,
+    const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners,
     const THCDeviceTensor<Dtype, 5> data1, THCDeviceTensor<Dtype, 5> data2) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -42,31 +43,31 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
       return;
     }
     //
-    const Acctype t1r = rdepth * t2;
+    const Acctype t1r = linear_upsampling_compute_source_index<Acctype>(rdepth, t2, align_corners);
     const int t1 = t1r;
     const int t1p = (t1 < depth1 - 1) ? 1 : 0;
     const Acctype t1lambda = t1r - t1;
     const Acctype t0lambda = Acctype(1) - t1lambda;
     //
-    const Acctype h1r = rheight * h2;
+    const Acctype h1r = linear_upsampling_compute_source_index<Acctype>(rheight, h2, align_corners);
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
     const Acctype h1lambda = h1r - h1;
     const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;
     const Acctype w0lambda = Acctype(1) - w1lambda;
     //
     for (int n = 0; n < batchsize ; n++){
         for (int c = 0; c < channels; ++c) {
-        const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1] 
+        const Acctype val = t0lambda * (h0lambda * (w0lambda * data1[n][c][t1][h1][w1]
                                                   + w1lambda * data1[n][c][t1][h1][w1+w1p])
                                       + h1lambda * (w0lambda * data1[n][c][t1][h1+h1p][w1]
                                                   + w1lambda * data1[n][c][t1][h1+h1p][w1+w1p]))
-                          + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1] 
+                          + t1lambda * (h0lambda * (w0lambda * data1[n][c][t1+t1p][h1][w1]
                                                   + w1lambda * data1[n][c][t1+t1p][h1][w1+w1p])
                                       + h1lambda * (w0lambda * data1[n][c][t1+t1p][h1+h1p][w1]
                                                   + w1lambda * data1[n][c][t1+t1p][h1+h1p][w1+w1p]));
@@ -80,7 +81,7 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
 template <typename Dtype, typename Acctype>
 __launch_bounds__(1024)
 __global__ void caffe_gpu_interp2_kernel_backward(const int n,
-    const Acctype rdepth, const Acctype rheight, const Acctype rwidth,
+    const Acctype rdepth, const Acctype rheight, const Acctype rwidth, const bool align_corners,
     THCDeviceTensor<Dtype, 5> data1, const THCDeviceTensor<Dtype, 5> data2){
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
@@ -109,19 +110,19 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n,
       return;
     }
     //
-    const Acctype t1r = rdepth * t2;
+    const Acctype t1r = linear_upsampling_compute_source_index<Acctype>(rdepth, t2, align_corners);
     const int t1 = t1r;
     const int t1p = (t1 < depth1 - 1) ? 1 : 0;
     const Acctype t1lambda = t1r - t1;
     const Acctype t0lambda = Acctype(1) - t1lambda;
     //
-    const Acctype h1r = rheight * h2;
+    const Acctype h1r = linear_upsampling_compute_source_index<Acctype>(rheight, h2, align_corners);
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
     const Acctype h1lambda = h1r - h1;
     const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const Acctype w1r = rwidth * w2;
+    const Acctype w1r = linear_upsampling_compute_source_index<Acctype>(rwidth, w2, align_corners);
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
     const Acctype w1lambda = w1r - w1;

diff --git a/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu b/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu
@@ -2,6 +2,8 @@
 #define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
 #else
 
+#include "../linear_upsampling.h"
+
 static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
                         (THCState *state,
                          THCTensor *input, THCTensor *gradOutput,
@@ -31,7 +33,8 @@ void THNN_(SpatialUpSamplingBilinear_updateOutput)(
            THCTensor *input,
            THCTensor *output,
            int outputHeight,
-           int outputWidth)
+           int outputWidth,
+           bool align_corners)
 {
   int nbatch = THCTensor_(size)(state, input, 0);
   int channels = THCTensor_(size)(state, input, 1);
@@ -52,14 +55,14 @@ void THNN_(SpatialUpSamplingBilinear_updateOutput)(
   THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
   THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, output);
   THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
-  const accreal rheight= (outputHeight > 1) ? (accreal)(inputHeight - 1)/(outputHeight - 1) : accreal(0);
-  const accreal rwidth = (outputWidth > 1) ? (accreal)(inputWidth - 1)/(outputWidth - 1) : accreal(0);
+  const accreal rheight = linear_upsampling_compute_scale<accreal>(inputHeight, outputHeight, align_corners);
+  const accreal rwidth = linear_upsampling_compute_scale<accreal>(inputWidth, outputWidth, align_corners);
   const int num_kernels = outputHeight * outputWidth;
   const int num_threads =
     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
   cudaStream_t stream = THCState_getCurrentStream(state);
   caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads ,
-   0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
+   0 , stream>>>(num_kernels, rheight, rwidth, align_corners, idata, odata);
   THCudaCheck(cudaGetLastError());
   THCTensor_(free)(state, input);
 }
@@ -74,7 +77,8 @@ void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
            int inputHeight,
            int inputWidth,
            int outputHeight,
-           int outputWidth)
+           int outputWidth,
+           bool align_corners)
 {
   THNN_(SpatialUpSamplingBilinear_shapeCheck)
        (state, NULL, gradOutput,
@@ -88,19 +92,14 @@ void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
   THCTensor_(zero)(state, gradInput);
   THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInput);
   THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
-  int height1 = data1.getSize(2);
-  int width1 = data1.getSize(3);
-  int height2 = data2.getSize(2);
-  int width2 = data2.getSize(3);
-  assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
-  const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
-  const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
-  const int num_kernels = height2 * width2;
+  const accreal rheight = linear_upsampling_compute_scale<accreal>(inputHeight, outputHeight, align_corners);
+  const accreal rwidth = linear_upsampling_compute_scale<accreal>(inputWidth, outputWidth, align_corners);
+  const int num_kernels = outputHeight * outputWidth;
   const int num_threads =
     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
   cudaStream_t stream = THCState_getCurrentStream(state);
   caffe_gpu_interp2_kernel_backward<real ,accreal> <<<THCCeilDiv(num_kernels, num_threads),
-  num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
+  num_threads, 0, stream>>>(num_kernels, rheight, rwidth, align_corners, data1, data2);
   THCudaCheck(cudaGetLastError());
   THCTensor_(free)(state, gradInput);
   THCTensor_(free)(state, gradOutput);

diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h
@@ -1042,7 +1042,8 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
                   THCTensor *input,
                   THCTensor *output,
                   int outputHeight,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
                   THCState *state,
@@ -1053,7 +1054,8 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
                   int inputHeight,
                   int inputWidth,
                   int outputHeight,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
                   THCState *state,
@@ -1336,7 +1338,8 @@ TH_API void THNN_(TemporalUpSamplingLinear_updateOutput)(
                   THCState *state,
                   THCTensor *input,
                   THCTensor *output,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)(
                   THCState *state,
@@ -1345,7 +1348,8 @@ TH_API void THNN_(TemporalUpSamplingLinear_updateGradInput)(
                   int nbatch,
                   int nchannels,
                   int inputWidth,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 TH_API void THNN_(TemporalUpSamplingNearest_updateGradInput)(
                   THCState *state,
@@ -1701,7 +1705,8 @@ TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
                   THCTensor *output,
                   int outputDepth,
                   int outputHeight,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
                   THCState *state,
@@ -1714,6 +1719,7 @@ TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
                   int inputWidth,
                   int outputDepth,
                   int outputHeight,
-                  int outputWidth);
+                  int outputWidth,
+                  bool align_corners);
 
 #endif