Skip to content

Commit 63d6afd

Browse files
ngimel authored and soumith committed
improve performance of maxpooling backwards (#4106)
1 parent 8c3e1b7 commit 63d6afd

File tree

2 files changed

+63
-24
lines changed

2 files changed

+63
-24
lines changed

torch/lib/THCUNN/SpatialDilatedMaxPooling.cu

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -41,41 +41,71 @@ __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
4141
}
4242
}
4343

44+
const int BACKWARD_THREADS = 256;
4445

4546
template <typename Dtype, typename AccType>
47+
__launch_bounds__(BACKWARD_THREADS,2048/BACKWARD_THREADS)
4648
__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
4749
const int64_t* top_mask, const int num, const int channels,
4850
const int height, const int width, const int pooled_height,
4951
const int pooled_width, const int kernel_h, const int kernel_w,
5052
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
5153
const int dilation_h, const int dilation_w,
5254
Dtype* bottom_diff) {
53-
CUDA_KERNEL_LOOP(index, nthreads) {
54-
// find out the local index
55-
// find out the local offset
56-
int w = index % width;
57-
int h = (index / width) % height;
58-
int c = (index / width / height) % channels;
59-
int n = index / width / height / channels;
60-
int phstart =
61-
(h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1;
62-
int phend = min((h + pad_h) / stride_h + 1, pooled_height);
63-
int pwstart =
55+
CUDA_KERNEL_LOOP(index, height*width) {
56+
int h = index/width;
57+
int w = index - h * width;
58+
//get some templating performance benefits without actually templating
59+
int phstart, phend, pwstart, pwend;
60+
if (stride_h == 1) {
61+
phstart =
62+
(h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) + 1;
63+
phend = min((h + pad_h) + 1, pooled_height);
64+
} else if (stride_h == 2) {
65+
phstart =
66+
(h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / 2 + 1;
67+
phend = min((h + pad_h) / 2 + 1, pooled_height);
68+
} else {
69+
phstart =
70+
(h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1;
71+
phend = min((h + pad_h) / stride_h + 1, pooled_height);
72+
}
73+
if (stride_w == 1) {
74+
pwstart =
75+
(w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) + 1;
76+
pwend = min((w + pad_w) + 1, pooled_width);
77+
} else if (stride_w == 2) {
78+
pwstart =
79+
(w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / 2 + 1;
80+
pwend = min((w + pad_w) / 2 + 1, pooled_width);
81+
} else {
82+
pwstart =
6483
(w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1;
65-
int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
84+
pwend = min((w + pad_w) / stride_w + 1, pooled_width);
85+
}
86+
for (int n = blockIdx.y; n < num; n += gridDim.y)
87+
for (int c = blockIdx.z; c < channels; c+= gridDim.z) {
6688

67-
AccType gradient = AccType(0);
68-
int offset = (n * channels + c) * pooled_height * pooled_width;
69-
top_diff += offset;
70-
top_mask += offset;
71-
for (int ph = phstart; ph < phend; ++ph) {
72-
for (int pw = pwstart; pw < pwend; ++pw) {
73-
if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
74-
gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]);
89+
AccType gradient = AccType(0);
90+
int offset = (n * channels + c) * pooled_height * pooled_width;
91+
top_diff += offset;
92+
top_mask += offset;
93+
//get some templating performance benefits without actually templating
94+
if ((phstart + 1 != phend) || (pwstart + 1 != pwend)) {
95+
for (int ph = phstart; ph < phend; ++ph) {
96+
for (int pw = pwstart; pw < pwend; ++pw) {
97+
if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
98+
gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]);
99+
}
100+
}
75101
}
102+
} else {
103+
if (top_mask[phstart * pooled_width + pwstart] - TH_INDEX_BASE == h * width + w) {
104+
gradient += ScalarConvert<Dtype, AccType>::to(top_diff[phstart * pooled_width + pwstart]);
105+
}
106+
}
107+
bottom_diff[(n*channels+c)*height*width+index] = ScalarConvert<AccType, Dtype>::to(gradient);
76108
}
77-
}
78-
bottom_diff[index] = ScalarConvert<AccType, Dtype>::to(gradient);
79109
}
80110
}
81111

torch/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,17 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
217217
THCTensor_(resizeAs)(state, gradInput, input);
218218

219219
int count = THCTensor_(nElement)(state, input);
220-
221-
MaxPoolBackward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
220+
dim3 grid;
221+
int imgcount = nInputCols * nInputRows;
222+
const int blocks = (imgcount + BACKWARD_THREADS - 1) / BACKWARD_THREADS;
223+
grid.x = blocks;
224+
grid.y = batchSize;
225+
grid.z = nInputPlane;
226+
uint64_t maxGridY = THCState_getCurrentDeviceProperties(state)->maxGridSize[1];
227+
uint64_t maxGridZ = THCState_getCurrentDeviceProperties(state)->maxGridSize[2];
228+
if (maxGridY < grid.y) grid.y = maxGridY;
229+
if (maxGridZ < grid.z) grid.z = maxGridZ;
230+
MaxPoolBackward<real, accreal> <<< grid, BACKWARD_THREADS, 0, THCState_getCurrentStream(state) >>>
222231
(count,
223232
THCTensor_(data)(state, gradOutput),
224233
THCIndexTensor_(data)(state, indices),

0 commit comments

Comments (0)