pytorch
diff --git a/‎aten/src/THCUNN/VolumetricDilatedMaxPooling.cu‎
Lines changed: 63 additions & 71 deletions b/‎aten/src/THCUNN/VolumetricDilatedMaxPooling.cu‎
Lines changed: 63 additions & 71 deletions
diff --git a/‎aten/src/THCUNN/VolumetricMaxUnpooling.cu‎
Lines changed: 8 additions & 24 deletions b/‎aten/src/THCUNN/VolumetricMaxUnpooling.cu‎
Lines changed: 8 additions & 24 deletions
diff --git a/‎aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu‎
Lines changed: 18 additions & 10 deletions b/‎aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu‎
Lines changed: 18 additions & 10 deletions
@@ -11,7 +11,7 @@
 
 template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
-  THCDeviceTensor<Dtype, 4> input,
+  Dtype* inputData, int inputT, int inputH, int inputW,
   THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> output,
   int kT, int kH, int kW,
@@ -27,56 +27,53 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
 
   if (oRow < output.getSize(2) && oColumn < output.getSize(3))
   {
-    int iColumn = oColumn * dW - padW;
-    int iRow    = oRow    * dH - padH;
-    int iFrame  = oFrame  * dT - padT;
-
-    int maxColumn = 0;
-    int maxRow = 0;
-    int maxFrame = 0;
+    int tStart = oFrame  * dT - padT;
+    int hStart = oRow    * dH - padH;
+    int wStart = oColumn * dW - padW;
+    int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT);
+    int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH);
+    int wEnd = fminf(wStart + (kW - 1) * dilationW + 1, inputW);
+
+    while(tStart < 0)
+      tStart += dilationT;
+    while(hStart < 0)
+      hStart += dilationH;
+    while(wStart < 0)
+      wStart += dilationW;
+
+    int index = 0;
+    int maxIndex = -1;
+    inputData += slice * inputT * inputH * inputW;
 
     Dtype max = THCNumerics<Dtype>::min();
 
-    for (int frame = 0; frame < kT; ++frame)
+    for (int t = tStart; t < tEnd; t += dilationT)
     {
-      if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
+      for (int h = hStart; h < hEnd; h += dilationH)
       {
-        for (int row = 0; row < kH; ++row)
+        for (int w = wStart; w < wEnd; w += dilationW)
         {
-          if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
+          index = t * inputH * inputW + h * inputW + w;
+          Dtype val = inputData[index];
+
+          if (max < val)
           {
-            for (int column = 0; column < kW; ++column)
-            {
-              if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
-              {
-                Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
-
-                if (max < val)
-                {
-                  max = val;
-                  maxColumn = column;
-                  maxRow    = row;
-                  maxFrame  = frame;
-                }
-              }
-            }
+            max = val;
+            maxIndex = index;
           }
         }
       }
     }
 
     output[slice][oFrame][oRow][oColumn] = max;
-    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
-    ((unsigned char*)(idx))[0] = maxFrame;
-    ((unsigned char*)(idx))[1] = maxRow;
-    ((unsigned char*)(idx))[2] = maxColumn;
-    ((unsigned char*)(idx))[3] = 0;
+    indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE;
   }
 }
 
 template <int KERNEL_WIDTH, typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
-  THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
+  Dtype* inputData, int inputT, int inputH, int inputW,
+  THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> output,
   int kT, int kH,
   int dT, int dH, int dW,
@@ -91,58 +88,54 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
 
   if (oRow < output.getSize(2) && oColumn < output.getSize(3))
   {
-    int iColumn = oColumn * dW - padW;
-    int iRow    = oRow    * dH - padH;
-    int iFrame  = oFrame  * dT - padT;
-
-    int maxColumn = 0;
-    int maxRow = 0;
-    int maxFrame;
+    int tStart = oFrame  * dT - padT;
+    int hStart = oRow    * dH - padH;
+    int wStart = oColumn * dW - padW;
+    int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT);
+    int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH);
+    int wEnd = fminf(wStart + (KERNEL_WIDTH - 1) * dilationW + 1, inputW);
+
+    while(tStart < 0)
+      tStart += dilationT;
+    while(hStart < 0)
+      hStart += dilationH;
+    while(wStart < 0)
+      wStart += dilationW;
+
+    int index = 0;
+    int maxIndex = -1;
 
     Dtype max = THCNumerics<Dtype>::min();
 
-    for (int frame = 0; frame < kT; ++frame)
+    for (int t = tStart; t < tEnd; t += dilationT)
     {
-      if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
+      for (int h = hStart; h < hEnd; h += dilationH)
       {
-        for (int row = 0; row < kH; ++row)
+        for (int w = wStart; w < wEnd; w += dilationW)
         {
-          if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
+          index = t * inputH * inputW + h * inputW + w;
+          Dtype val = inputData[slice * inputT * inputH * inputW + index];
+
+          if (max < val)
           {
-            for (int column = 0; column < KERNEL_WIDTH; ++column)
-            {
-              if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
-              {
-                Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
-
-                if (max < val)
-                {
-                  max = val;
-                  maxColumn = column;
-                  maxRow    = row;
-                  maxFrame  = frame;
-                }
-              }
-            }
+            max = val;
+            maxIndex = index;
           }
         }
       }
     }
 
     output[slice][oFrame][oRow][oColumn] = max;
-    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
-    ((unsigned char*)(idx))[0] = maxFrame;
-    ((unsigned char*)(idx))[1] = maxRow;
-    ((unsigned char*)(idx))[2] = maxColumn;
-    ((unsigned char*)(idx))[3] = 0;
+    indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE;
   }
 }
 
 template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
   THCDeviceTensor<Dtype, 4> gradOutput,
   THCDeviceTensor<THCIndex_t, 4> indices,
-  THCDeviceTensor<Dtype, 4> gradInput,
+  Dtype* gradInputData,
+  int inputT, int inputH, int inputW,
   int dT, int dH, int dW,
   int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
@@ -155,12 +148,11 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
 
   if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
   {
-    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
-    int iFrame  = ((unsigned char*)(idx))[0] * dilationT + oFrame  * dT - padT;
-    int iRow    = ((unsigned char*)(idx))[1] * dilationH + oRow    * dH - padH;
-    int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
-    atomicAdd(&gradInput[slice][iFrame][iRow][iColumn],
-              gradOutput[slice][oFrame][oRow][oColumn]);
+    int maxIndex = indices[slice][oFrame][oRow][oColumn] - TH_INDEX_BASE;
+    if (maxIndex != -1) {
+      atomicAdd(&gradInputData[slice * inputT * inputH * inputW + maxIndex],
+                gradOutput[slice][oFrame][oRow][oColumn]);
+    }
   }
 }
 
 
@@ -12,7 +12,8 @@ template <typename Dtype>
 __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
   THCDeviceTensor<Dtype, 4> input,
   THCDeviceTensor<THCIndex_t, 4> indices,
-  THCDeviceTensor<Dtype, 4> output,
+  Dtype* outputData,
+  int oT, int oH, int oW,
   int dT, int dH, int dW,
   int padT, int padH, int padW, int offsetZ)
 {
@@ -23,23 +24,16 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
 
   if (iRow < input.getSize(2) && iColumn < input.getSize(3))
   {
-    int64_t start_t = iFrame * dT - padT;
-    int64_t start_h = iRow * dH - padH;
-    int64_t start_w = iColumn * dW - padW;
-
     Dtype val = input[slice][iFrame][iRow][iColumn];
-
-    THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
-    int64_t maxz = ((unsigned char*)(idx))[0];
-    int64_t maxy = ((unsigned char*)(idx))[1];
-    int64_t maxx = ((unsigned char*)(idx))[2];
-    output[slice][start_t + maxz][start_h + maxy][start_w + maxx] = val;
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    outputData[slice*oT*oH*oW + index] = val;
   }
 }
 
 template <typename Dtype>
 __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
-  THCDeviceTensor<Dtype, 4> gradOutput,
+  Dtype* gradOutputData,
+  int oT, int oH, int oW,
   THCDeviceTensor<THCIndex_t, 4> indices,
   THCDeviceTensor<Dtype, 4> gradInput,
   int dT, int dH, int dW,
@@ -52,18 +46,8 @@ __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
 
   if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
   {
-
-    int64_t start_t = iFrame * dT - padT;
-    int64_t start_h = iRow * dH - padH;
-    int64_t start_w = iColumn * dW - padW;
-
-    THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
-    int64_t maxz = ((unsigned char*)(idx))[0];
-    int64_t maxy = ((unsigned char*)(idx))[1];
-    int64_t maxx = ((unsigned char*)(idx))[2];
-
-    Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
-
+    int64_t index = indices[slice][iFrame][iRow][iColumn];
+    Dtype grad_val = gradOutputData[slice*oT*oH*oW + index];
     gradInput[slice][iFrame][iRow][iColumn] = grad_val;
   }
 }
 
@@ -5,7 +5,8 @@
 #define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW:                         \
   cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block,             \
     0, THCState_getCurrentStream(state)>>>(                             \
-    cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
+    inputData, inputTime, inputHeight, inputWidth, \
+    cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
     dilationT, dilationH, dilationW, offsetZ); \
     break
 
@@ -233,10 +234,10 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
   } else {
     THCTensor_(retain)(state, output);
   }
+  
+  real* inputData = THCTensor_(data)(state, input);
 
-  THCDeviceTensor<real, 4> cudaInput;
   THCDeviceTensor<real, 4> cudaOutput;
-  cudaInput  = toDeviceTensor<real, 4>(state, input);
   cudaOutput = toDeviceTensor<real, 4>(state, output);
 
   THLongStorage *indicesSize = THLongStorage_newWithSize(4);
@@ -275,7 +276,8 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
       default:
         cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
           0, THCState_getCurrentStream(state)>>>(
-                             cudaInput, cudaIndices, cudaOutput,
+                             inputData, inputTime, inputHeight, inputWidth,
+                             cudaIndices, cudaOutput,
                              kT, kH, kW, dT, dH, dW,
                              padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
       }
@@ -306,14 +308,14 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
   // TODO: gradOutput shape check
   // Resize and initialize result tensor.
   THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(newContiguous)(state, gradInput);
   THCTensor_(zero)(state, gradInput);
 
   int batchSize;
   int inputSlices;
 
-  int outputTime;
-  int outputHeight;
-  int outputWidth;
+  int outputTime, outputHeight, outputWidth;
+  int inputTime, inputHeight, inputWidth;
 
   int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5;
 
@@ -331,6 +333,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
     outputTime   = THCTensor_(size)(state, gradOutput, 1);
     outputHeight = THCTensor_(size)(state, gradOutput, 2);
     outputWidth  = THCTensor_(size)(state, gradOutput, 3);
+    inputTime   = THCTensor_(size)(state, gradInput, 1);
+    inputHeight = THCTensor_(size)(state, gradInput, 2);
+    inputWidth  = THCTensor_(size)(state, gradInput, 3);
   }
   else
   {
@@ -340,6 +345,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
     outputTime   = THCTensor_(size)(state, gradOutput, 2);
     outputHeight = THCTensor_(size)(state, gradOutput, 3);
     outputWidth  = THCTensor_(size)(state, gradOutput, 4);
+    inputTime   = THCTensor_(size)(state, gradInput, 2);
+    inputHeight = THCTensor_(size)(state, gradInput, 3);
+    inputWidth  = THCTensor_(size)(state, gradInput, 4);
   }
 
   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
@@ -354,10 +362,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
     THCTensor_(retain)(state, gradInput);
   }
 
-  THCDeviceTensor<real, 4> cudaGradInput;
   THCDeviceTensor<real, 4> cudaGradOutput;
-  cudaGradInput  = toDeviceTensor<real, 4>(state, gradInput);
   cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+  real* gradInputData = THCTensor_(data)(state, gradInput);
 
   THLongStorage *indicesSize = THLongStorage_newWithSize(4);
   int64_t indicesSizeRaw[4] = { batchSize * inputSlices,
@@ -384,7 +391,8 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
       0, THCState_getCurrentStream(state)>>>(
                                              cudaGradOutput,
                                              cudaIndices,
-                                             cudaGradInput,
+                                             gradInputData,
+                                             inputTime, inputHeight, inputWidth,
                                              dT, dH, dW,
                                              padT, padH, padW,
                                              dilationT, dilationH, dilationW, offsetZ);