Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 63 additions & 71 deletions aten/src/THCUNN/VolumetricDilatedMaxPooling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
THCDeviceTensor<Dtype, 4> input,
Dtype* inputData, int inputT, int inputH, int inputW,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> output,
int kT, int kH, int kW,
Expand All @@ -27,56 +27,53 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(

if (oRow < output.getSize(2) && oColumn < output.getSize(3))
{
int iColumn = oColumn * dW - padW;
int iRow = oRow * dH - padH;
int iFrame = oFrame * dT - padT;

int maxColumn = 0;
int maxRow = 0;
int maxFrame = 0;
int tStart = oFrame * dT - padT;
int hStart = oRow * dH - padH;
int wStart = oColumn * dW - padW;
int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT);
int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH);
int wEnd = fminf(wStart + (kW - 1) * dilationW + 1, inputW);

while(tStart < 0)
tStart += dilationT;
while(hStart < 0)
hStart += dilationH;
while(wStart < 0)
wStart += dilationW;

int index = 0;
int maxIndex = -1;
inputData += slice * inputT * inputH * inputW;

Dtype max = THCNumerics<Dtype>::min();

for (int frame = 0; frame < kT; ++frame)
for (int t = tStart; t < tEnd; t += dilationT)
{
if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
for (int h = hStart; h < hEnd; h += dilationH)
{
for (int row = 0; row < kH; ++row)
for (int w = wStart; w < wEnd; w += dilationW)
{
if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
index = t * inputH * inputW + h * inputW + w;
Dtype val = inputData[index];

if (max < val)
{
for (int column = 0; column < kW; ++column)
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

if (max < val)
{
max = val;
maxColumn = column;
maxRow = row;
maxFrame = frame;
}
}
}
max = val;
maxIndex = index;
}
}
}
}

output[slice][oFrame][oRow][oColumn] = max;
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
((unsigned char*)(idx))[3] = 0;
indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE;
}
}

template <int KERNEL_WIDTH, typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
Dtype* inputData, int inputT, int inputH, int inputW,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> output,
int kT, int kH,
int dT, int dH, int dW,
Expand All @@ -91,58 +88,54 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(

if (oRow < output.getSize(2) && oColumn < output.getSize(3))
{
int iColumn = oColumn * dW - padW;
int iRow = oRow * dH - padH;
int iFrame = oFrame * dT - padT;

int maxColumn = 0;
int maxRow = 0;
int maxFrame;
int tStart = oFrame * dT - padT;
int hStart = oRow * dH - padH;
int wStart = oColumn * dW - padW;
int tEnd = fminf(tStart + (kT - 1) * dilationT + 1, inputT);
int hEnd = fminf(hStart + (kH - 1) * dilationH + 1, inputH);
int wEnd = fminf(wStart + (KERNEL_WIDTH - 1) * dilationW + 1, inputW);

while(tStart < 0)
tStart += dilationT;
while(hStart < 0)
hStart += dilationH;
while(wStart < 0)
wStart += dilationW;

int index = 0;
int maxIndex = -1;

Dtype max = THCNumerics<Dtype>::min();

for (int frame = 0; frame < kT; ++frame)
for (int t = tStart; t < tEnd; t += dilationT)
{
if (iFrame + frame * dilationT < input.getSize(1) && iFrame + frame * dilationT >= 0)
for (int h = hStart; h < hEnd; h += dilationH)
{
for (int row = 0; row < kH; ++row)
for (int w = wStart; w < wEnd; w += dilationW)
{
if (iRow + row * dilationH < input.getSize(2) && iRow + row * dilationH >= 0)
index = t * inputH * inputW + h * inputW + w;
Dtype val = inputData[slice * inputT * inputH * inputW + index];

if (max < val)
{
for (int column = 0; column < KERNEL_WIDTH; ++column)
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];

if (max < val)
{
max = val;
maxColumn = column;
maxRow = row;
maxFrame = frame;
}
}
}
max = val;
maxIndex = index;
}
}
}
}

output[slice][oFrame][oRow][oColumn] = max;
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
((unsigned char*)(idx))[3] = 0;
indices[slice][oFrame][oRow][oColumn] = maxIndex + TH_INDEX_BASE;
}
}

template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
THCDeviceTensor<Dtype, 4> gradOutput,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> gradInput,
Dtype* gradInputData,
int inputT, int inputH, int inputW,
int dT, int dH, int dW,
int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
Expand All @@ -155,12 +148,11 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(

if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
{
THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
int iFrame = ((unsigned char*)(idx))[0] * dilationT + oFrame * dT - padT;
int iRow = ((unsigned char*)(idx))[1] * dilationH + oRow * dH - padH;
int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
atomicAdd(&gradInput[slice][iFrame][iRow][iColumn],
gradOutput[slice][oFrame][oRow][oColumn]);
int maxIndex = indices[slice][oFrame][oRow][oColumn] - TH_INDEX_BASE;
if (maxIndex != -1) {
atomicAdd(&gradInputData[slice * inputT * inputH * inputW + maxIndex],
gradOutput[slice][oFrame][oRow][oColumn]);
}
}
}

Expand Down
32 changes: 8 additions & 24 deletions aten/src/THCUNN/VolumetricMaxUnpooling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ template <typename Dtype>
__global__ void cuda_VolumetricMaxUnpooling_updateOutput(
THCDeviceTensor<Dtype, 4> input,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> output,
Dtype* outputData,
int oT, int oH, int oW,
int dT, int dH, int dW,
int padT, int padH, int padW, int offsetZ)
{
Expand All @@ -23,23 +24,16 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(

if (iRow < input.getSize(2) && iColumn < input.getSize(3))
{
int64_t start_t = iFrame * dT - padT;
int64_t start_h = iRow * dH - padH;
int64_t start_w = iColumn * dW - padW;

Dtype val = input[slice][iFrame][iRow][iColumn];

THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
int64_t maxz = ((unsigned char*)(idx))[0];
int64_t maxy = ((unsigned char*)(idx))[1];
int64_t maxx = ((unsigned char*)(idx))[2];
output[slice][start_t + maxz][start_h + maxy][start_w + maxx] = val;
int64_t index = indices[slice][iFrame][iRow][iColumn];
outputData[slice*oT*oH*oW + index] = val;
}
}

template <typename Dtype>
__global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
THCDeviceTensor<Dtype, 4> gradOutput,
Dtype* gradOutputData,
int oT, int oH, int oW,
THCDeviceTensor<THCIndex_t, 4> indices,
THCDeviceTensor<Dtype, 4> gradInput,
int dT, int dH, int dW,
Expand All @@ -52,18 +46,8 @@ __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(

if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
{

int64_t start_t = iFrame * dT - padT;
int64_t start_h = iRow * dH - padH;
int64_t start_w = iColumn * dW - padW;

THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
int64_t maxz = ((unsigned char*)(idx))[0];
int64_t maxy = ((unsigned char*)(idx))[1];
int64_t maxx = ((unsigned char*)(idx))[2];

Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];

int64_t index = indices[slice][iFrame][iRow][iColumn];
Dtype grad_val = gradOutputData[slice*oT*oH*oW + index];
gradInput[slice][iFrame][iRow][iColumn] = grad_val;
}
}
Expand Down
28 changes: 18 additions & 10 deletions aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \
cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block, \
0, THCState_getCurrentStream(state)>>>( \
cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
inputData, inputTime, inputHeight, inputWidth, \
cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
dilationT, dilationH, dilationW, offsetZ); \
break

Expand Down Expand Up @@ -233,10 +234,10 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
} else {
THCTensor_(retain)(state, output);
}

real* inputData = THCTensor_(data)(state, input);

THCDeviceTensor<real, 4> cudaInput;
THCDeviceTensor<real, 4> cudaOutput;
cudaInput = toDeviceTensor<real, 4>(state, input);
cudaOutput = toDeviceTensor<real, 4>(state, output);

THLongStorage *indicesSize = THLongStorage_newWithSize(4);
Expand Down Expand Up @@ -275,7 +276,8 @@ void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
default:
cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
0, THCState_getCurrentStream(state)>>>(
cudaInput, cudaIndices, cudaOutput,
inputData, inputTime, inputHeight, inputWidth,
cudaIndices, cudaOutput,
kT, kH, kW, dT, dH, dW,
padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
}
Expand Down Expand Up @@ -306,14 +308,14 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
// TODO: gradOutput shape check
// Resize and initialize result tensor.
THCTensor_(resizeAs)(state, gradInput, input);
THCTensor_(newContiguous)(state, gradInput);
THCTensor_(zero)(state, gradInput);

int batchSize;
int inputSlices;

int outputTime;
int outputHeight;
int outputWidth;
int outputTime, outputHeight, outputWidth;
int inputTime, inputHeight, inputWidth;

int fiveDimensionalInput = THCTensor_(nDimension)(state, input) == 5;

Expand All @@ -331,6 +333,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
outputTime = THCTensor_(size)(state, gradOutput, 1);
outputHeight = THCTensor_(size)(state, gradOutput, 2);
outputWidth = THCTensor_(size)(state, gradOutput, 3);
inputTime = THCTensor_(size)(state, gradInput, 1);
inputHeight = THCTensor_(size)(state, gradInput, 2);
inputWidth = THCTensor_(size)(state, gradInput, 3);
}
else
{
Expand All @@ -340,6 +345,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
outputTime = THCTensor_(size)(state, gradOutput, 2);
outputHeight = THCTensor_(size)(state, gradOutput, 3);
outputWidth = THCTensor_(size)(state, gradOutput, 4);
inputTime = THCTensor_(size)(state, gradInput, 2);
inputHeight = THCTensor_(size)(state, gradInput, 3);
inputWidth = THCTensor_(size)(state, gradInput, 4);
}

gradOutput = THCTensor_(newContiguous)(state, gradOutput);
Expand All @@ -354,10 +362,9 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
THCTensor_(retain)(state, gradInput);
}

THCDeviceTensor<real, 4> cudaGradInput;
THCDeviceTensor<real, 4> cudaGradOutput;
cudaGradInput = toDeviceTensor<real, 4>(state, gradInput);
cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
real* gradInputData = THCTensor_(data)(state, gradInput);

THLongStorage *indicesSize = THLongStorage_newWithSize(4);
int64_t indicesSizeRaw[4] = { batchSize * inputSlices,
Expand All @@ -384,7 +391,8 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
0, THCState_getCurrentStream(state)>>>(
cudaGradOutput,
cudaIndices,
cudaGradInput,
gradInputData,
inputTime, inputHeight, inputWidth,
dT, dH, dW,
padT, padH, padW,
dilationT, dilationH, dilationW, offsetZ);
Expand Down
Loading