@@ -141,7 +141,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
141141 int dimh = 2 ;
142142 int dimw = 3 ;
143143
144- if (input->nDimension == 5 )
144+ int fiveDimensionalInput = THCTensor_ (nDimension)(state, input) == 5 ;
145+
146+ if (fiveDimensionalInput)
145147 {
146148 dimt++;
147149 dimh++;
@@ -163,7 +165,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
163165 inputHeight = THCTensor_ (size)(state, input, 2 );
164166 inputWidth = THCTensor_ (size)(state, input, 3 );
165167 }
166- else if (THCTensor_ (nDimension)(state, input) == 5 )
168+ else if (fiveDimensionalInput )
167169 {
168170 /* sizes */
169171 batchSize = THCTensor_ (size)(state, input, 0 );
@@ -200,7 +202,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
200202 --outputWidth;
201203 }
202204
203- if (input-> nDimension == 4 ) /* 4D */
205+ if (!fiveDimensionalInput ) /* 4D */
204206 {
205207 /* resize output */
206208 THCTensor_ (resize4d)(state, output, inputSlices,
@@ -217,23 +219,25 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
217219 // Index tensor packs index offsets as uchars into floats
218220 THCIndexTensor_ (resize5d)(state, indices, batchSize, inputSlices,
219221 outputTime, outputHeight, outputWidth);
222+ fiveDimensionalInput = 1 ;
220223 }
221224
222225 input = THCTensor_ (newContiguous)(state, input);
226+ if (fiveDimensionalInput) {
227+ // Collapse batch and feature dimensions
228+ output = THCTensor_ (newFoldBatchDim)(state, output);
229+
230+ THCTensor *old_input = input;
231+ input = THCTensor_ (newFoldBatchDim)(state, input);
232+ THCTensor_ (free )(state, old_input);
233+ } else {
234+ THCTensor_ (retain)(state, output);
235+ }
223236
224- // Collapse batch and feature dimensions
225237 THCDeviceTensor<real, 4 > cudaInput;
226238 THCDeviceTensor<real, 4 > cudaOutput;
227- if (THCTensor_ (nDimension)(state, input) == 4 )
228- {
229- cudaInput = toDeviceTensor<real, 4 >(state, input);
230- cudaOutput = toDeviceTensor<real, 4 >(state, output);
231- }
232- else
233- {
234- cudaInput = toDeviceTensor<real, 5 >(state, input).downcastOuter <4 >();
235- cudaOutput = toDeviceTensor<real, 5 >(state, output).downcastOuter <4 >();
236- }
239+ cudaInput = toDeviceTensor<real, 4 >(state, input);
240+ cudaOutput = toDeviceTensor<real, 4 >(state, output);
237241
238242 THLongStorage *indicesSize = THLongStorage_newWithSize (4 );
239243 int64_t indicesSizeRaw[4 ] = { batchSize * inputSlices,
@@ -281,6 +285,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
281285 }
282286
283287 THCTensor_ (free )(state, input);
288+ THCTensor_ (free )(state, output);
284289 THCIndexTensor_ (free )(state, indices1);
285290}
286291
@@ -310,13 +315,15 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
310315 int outputHeight;
311316 int outputWidth;
312317
318+ int fiveDimensionalInput = THCTensor_ (nDimension)(state, input) == 5 ;
319+
313320 THCUNN_assertSameGPU (state, 4 , input, indices, gradOutput, gradInput);
314321 THNN_ (VolumetricDilatedMaxPooling_shapeCheck)(
315322 state, input, gradOutput, indices, kT , kW , kH ,
316323 dT, dW, dH, padT, padW, padH,
317324 dilationT, dilationW, dilationH, ceilMode);
318325
319- if (THCTensor_ (nDimension)(state, input) == 4 ) /* 4D */
326+ if (!fiveDimensionalInput ) /* 4D */
320327 {
321328 batchSize = 1 ;
322329 inputSlices = THCTensor_ (size)(state, input, 0 );
@@ -336,22 +343,21 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
336343 }
337344
338345 gradOutput = THCTensor_ (newContiguous)(state, gradOutput);
346+ if (fiveDimensionalInput) {
347+ // Collapse batch and feature dimensions
348+ gradInput = THCTensor_ (newFoldBatchDim)(state, gradInput);
349+
350+ THCTensor *old_gradOutput = gradOutput;
351+ gradOutput = THCTensor_ (newFoldBatchDim)(state, gradOutput);
352+ THCTensor_ (free )(state, old_gradOutput);
353+ } else {
354+ THCTensor_ (retain)(state, gradInput);
355+ }
339356
340- // Collapse batch and feature dimensions
341357 THCDeviceTensor<real, 4 > cudaGradInput;
342358 THCDeviceTensor<real, 4 > cudaGradOutput;
343- if (THCTensor_ (nDimension)(state, input) == 4 )
344- {
345- cudaGradInput = toDeviceTensor<real, 4 >(state, gradInput);
346- cudaGradOutput = toDeviceTensor<real, 4 >(state, gradOutput);
347- }
348- else
349- {
350- cudaGradInput =
351- toDeviceTensor<real, 5 >(state, gradInput).downcastOuter <4 >();
352- cudaGradOutput =
353- toDeviceTensor<real, 5 >(state, gradOutput).downcastOuter <4 >();
354- }
359+ cudaGradInput = toDeviceTensor<real, 4 >(state, gradInput);
360+ cudaGradOutput = toDeviceTensor<real, 4 >(state, gradOutput);
355361
356362 THLongStorage *indicesSize = THLongStorage_newWithSize (4 );
357363 int64_t indicesSizeRaw[4 ] = { batchSize * inputSlices,
@@ -388,6 +394,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
388394 }
389395
390396 // cleanup
397+ THCTensor_ (free )(state, gradInput);
391398 THCTensor_ (free )(state, gradOutput);
392399 THCIndexTensor_ (free )(state, indices1);
393400}