132 changes: 66 additions & 66 deletions aten/src/THCUNN/VolumetricConvolution.cu
@@ -8,30 +8,30 @@
// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
template <typename Dtype>
__global__ void im3d2col_kernel(const int n, const Dtype* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
const int64_t height_col, const int64_t width_col, const int64_t depth_col,
Dtype* data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int d_out = index % depth_col;
int w_index = index / depth_col;
int w_out = w_index % width_col;
int h_index = w_index / width_col;
int h_out = h_index % height_col;
int64_t d_out = index % depth_col;
int64_t w_index = index / depth_col;
int64_t w_out = w_index % width_col;
int64_t h_index = w_index / width_col;
int64_t h_out = h_index % height_col;

int channel_in = h_index / height_col;
int64_t channel_in = h_index / height_col;
//channel_in = 1;

int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d;

int h_in = h_out * stride_h - pad_h;
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
int64_t h_in = h_out * stride_h - pad_h;
int64_t w_in = w_out * stride_w - pad_w;
int64_t d_in = d_out * stride_d - pad_d;

Dtype* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
@@ -41,15 +41,15 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
data_im_ptr += channel_in * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in;

for (int i = 0; i < kernel_h; ++i)
for (int64_t i = 0; i < kernel_h; ++i)
{
int h = h_in + i;
for (int j = 0; j < kernel_w; ++j)
int64_t h = h_in + i;
for (int64_t j = 0; j < kernel_w; ++j)
{
int w = w_in + j;
for (int k = 0; k < kernel_d; ++k)
int64_t w = w_in + j;
for (int64_t k = 0; k < kernel_d; ++k)
{
int d = d_in + k;
int64_t d = d_in + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
h < height && w < width && d < depth) ?
data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert<int, Dtype>::to(0);
@@ -61,19 +61,19 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
}

template <typename Dtype>
void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
void im3d2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
Dtype* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int num_kernels = channels * height_col * width_col * depth_col;
int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
int64_t num_kernels = channels * height_col * width_col * depth_col;
im3d2col_kernel<<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_im,
height, width, depth,
@@ -86,42 +86,42 @@ void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
}

template <typename Dtype, typename Acctype>
__global__ void col2im3d_kernel(const int n, const Dtype* data_col,
const int height, const int width, const int depth,
const int channels,
const int patch_h, const int patch_w, const int patch_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t channels,
const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
const int64_t height_col, const int64_t width_col, const int64_t depth_col,
Dtype* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
Acctype val = 0;
int d = index % depth + pad_d;
int w_index = index / depth;
int w = w_index % width + pad_w;
int h_index = w_index / width;
int h = h_index % height + pad_h;
int c = h_index / height;
int64_t d = index % depth + pad_d;
int64_t w_index = index / depth;
int64_t w = w_index % width + pad_w;
int64_t h_index = w_index / width;
int64_t h = h_index % height + pad_h;
int64_t c = h_index / height;

// compute the start and end of the output
int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
int d_col_end = min(d / stride_d + 1, depth_col);
int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, width_col);
int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, height_col);
int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
int64_t d_col_end = min(d / stride_d + 1, depth_col);
int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
int64_t w_col_end = min(w / stride_w + 1, width_col);
int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
int64_t h_col_end = min(h / stride_h + 1, height_col);

int offset =
int64_t offset =
(c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;

int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col)
for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
@@ -130,17 +130,17 @@ __global__ void col2im3d_kernel(const int n, const Dtype* data_col,
}

template <typename Dtype, typename Acctype>
void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
void col2im3d(cudaStream_t stream, const Dtype* data_col, const int64_t channels,
const int64_t height, const int64_t width, const int64_t depth,
const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
Dtype* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int num_kernels = channels * height * width * depth;
int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
int64_t num_kernels = channels * height * width * depth;

// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
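The comment above is the key design choice in col2im3d_kernel: rather than letting every output column scatter-add its contribution into the image (which would require atomicAdd), each thread owns a single input voxel, computes the range of columns whose window covers it, and gathers their values. A minimal 1-D sketch of the same gather, using hypothetical names (col2im_1d_gather is not part of this codebase), just to illustrate the start/end bookkeeping:

    // 1-D col2im done as a gather: data_col holds one value per
    // (kernel offset k, output column col); each input position sums the
    // entries that cover it instead of having columns scatter-add into it.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    void col2im_1d_gather(const std::vector<float>& data_col,
                          int64_t width, int64_t kernel_w,
                          int64_t pad_w, int64_t stride_w,
                          int64_t width_col, std::vector<float>& data_im) {
      for (int64_t i = 0; i < width; ++i) {
        int64_t w = i + pad_w;  // shift into padded coordinates, as in the kernel
        // Output columns whose window contains padded position w.
        int64_t col_start = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
        int64_t col_end = std::min(w / stride_w + 1, width_col);
        float val = 0.f;
        for (int64_t col = col_start; col < col_end; ++col) {
          int64_t k = w - col * stride_w;        // kernel offset that hit w
          val += data_col[k * width_col + col];  // pure reads, no atomics
        }
        data_im[i] = val;
      }
    }

    int main() {
      // Tiny usage example: width 6, kernel 3, stride 2, padding 1 -> 3 columns.
      int64_t width = 6, kernel_w = 3, pad_w = 1, stride_w = 2;
      int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
      std::vector<float> data_col(kernel_w * width_col, 1.f);
      std::vector<float> data_im(width, 0.f);
      col2im_1d_gather(data_col, width, kernel_w, pad_w, stride_w, width_col, data_im);
      for (float v : data_im) std::printf("%g ", v);  // how many windows cover each pixel
      std::printf("\n");
      return 0;
    }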
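Most of the edits in this file replace int with int64_t in the index arithmetic. The motivation, as I read it, is that the flat data_col index built from channel_out * (height_col * width_col * depth_col) already folds the kernel volume into channel_out, so the product overflows a 32-bit int for quite ordinary 3-D workloads even though every individual dimension is small. A host-side sanity check with illustrative sizes (not taken from the PR):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    int main() {
      // Plausible volumetric workload: 64 channels, 3x3x3 kernel,
      // 128x128x128 output grid (illustrative numbers only).
      int64_t channels = 64, kh = 3, kw = 3, kd = 3;
      int64_t height_col = 128, width_col = 128, depth_col = 128;

      // Number of data_col elements = one entry per
      // (channel * kernel offset, output position).
      int64_t n_col = channels * kh * kw * kd * height_col * width_col * depth_col;

      std::printf("data_col elements: %lld\n", (long long)n_col);
      std::printf("fits in 32-bit int: %s\n",
                  n_col <= std::numeric_limits<int>::max() ? "yes" : "no");
      // Prints "no": 64 * 27 * 128^3 = 3,623,878,656 > 2,147,483,647, so the
      // same arithmetic carried out in int would wrap before indexing data_col.
      return 0;
    }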
2 changes: 0 additions & 2 deletions aten/src/THCUNN/generic/Im2Col.cu
@@ -31,8 +31,6 @@ static inline void THNN_(Im2Col_shapeCheck)(
int inputWidth = THCTensor_(size)(state, input, dim_batch + 3);
int outputHeight = (inputHeight + 2 * padH - (dH * (kH - 1) + 1)) / sH + 1;
int outputWidth = (inputWidth + 2 * padW - (dW * (kW - 1) + 1)) / sW + 1;
int nOutputPlane = nInputPlane * kW * kH;
int outputLength = outputHeight * outputWidth;

if (outputHeight < 1 || outputWidth < 1) {
THError("Given input with spatial size (%d, %d), kernel_size=(%d, %d), "
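For reference, the shape check above sizes the output with the dilated-kernel formula, where dH * (kH - 1) + 1 is the effective extent of a kernel with dilation dH (dH plays the dilation role here, as the formula suggests). A quick worked example with made-up numbers:

    #include <cstdio>

    int main() {
      // Illustrative values: 32-row input, 3x3 kernel, dilation 2,
      // padding 1, stride 2.
      int inputHeight = 32, kH = 3, dH = 2, padH = 1, sH = 2;

      int effectiveKH = dH * (kH - 1) + 1;  // 2 * 2 + 1 = 5
      // Same formula as Im2Col_shapeCheck: integer (floor) division, then + 1.
      int outputHeight = (inputHeight + 2 * padH - effectiveKH) / sH + 1;

      std::printf("effective kernel %d, outputHeight %d\n", effectiveKH, outputHeight);
      // prints: effective kernel 5, outputHeight 15
      return 0;
    }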
22 changes: 11 additions & 11 deletions aten/src/THCUNN/generic/VolumetricConvolution.cu
@@ -47,11 +47,11 @@ static inline void THNN_(VolumetricConvolution_shapeCheck)
if (weight == NULL) {
weight = gradWeight;
}
int nOutputPlane = (int)weight->size[0];
int nInputPlane = (int)weight->size[1];
int kT = (int)weight->size[2];
int kH = (int)weight->size[3];
int kW = (int)weight->size[4];
int64_t nOutputPlane = weight->size[0];
int64_t nInputPlane = weight->size[1];
int64_t kT = weight->size[2];
int64_t kH = weight->size[3];
int64_t kW = weight->size[4];

THArgCheck(kT > 0 && kW > 0 && kH > 0, 4,
"kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
@@ -267,11 +267,11 @@ void THNN_(VolumetricConvolution_updateGradInput)(
int padT, int padW, int padH)
{

int nOutputPlane = (int)weight->size[0];
int nInputPlane = (int)weight->size[1];
int kT = (int)weight->size[2];
int kH = (int)weight->size[3];
int kW = (int)weight->size[4];
int64_t nOutputPlane = weight->size[0];
int64_t nInputPlane = weight->size[1];
int64_t kT = weight->size[2];
int64_t kH = weight->size[3];
int64_t kW = weight->size[4];

THCTensor *gradColumns = finput;

@@ -507,7 +507,7 @@ void THNN_(VolumetricConvolution_accGradParameters)(
#endif
}
}

// Free
THCTensor_(free)(state, input_n);
THCTensor_(free)(state, gradOutput_n);
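The (int) casts removed in this file narrowed 64-bit tensor sizes the moment they were read out of the weight tensor; keeping them as int64_t preserves the values end to end. A standalone sketch of the 5-D weight layout the code reads (plain arrays and made-up sizes rather than the THC API):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Volumetric convolution weight layout read above:
      // [nOutputPlane][nInputPlane][kT][kH][kW] (illustrative sizes).
      int64_t size[5] = {64, 32, 3, 3, 3};

      int64_t nOutputPlane = size[0];
      int64_t nInputPlane  = size[1];
      int64_t kT = size[2], kH = size[3], kW = size[4];

      // With the old casts, a size above INT_MAX would have been silently
      // truncated before the shape check ever compared it against anything.
      std::printf("weight: %lld x %lld x %lld x %lld x %lld\n",
                  (long long)nOutputPlane, (long long)nInputPlane,
                  (long long)kT, (long long)kH, (long long)kW);
      return 0;
    }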
2 changes: 1 addition & 1 deletion aten/src/THCUNN/generic/VolumetricDilatedMaxPooling.cu
@@ -377,7 +377,7 @@ void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
THCDeviceTensor<THCIndex_t, 4> cudaIndices =
toDeviceTensor<THCIndex_t, 4>(state, indices1);

int totalZ = outputTime * inputSlices * batchSize;
int64_t totalZ = outputTime * inputSlices * batchSize;
int offsetZ = 0;
dim3 block(32, 8);

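totalZ above is the three-way product outputTime * inputSlices * batchSize, exactly the kind of value that can exceed a 32-bit int for long clips or large batches even when each factor is modest. The collapsed code below it consumes totalZ in chunks, which I believe follows the usual THCUNN pattern of capping each launch at 65535 blocks because that is the hardware limit on gridDim.z. A rough sketch of that pattern with a placeholder kernel (process_slice and the grid x/y sizes are made up, not the real updateGradInput kernel), assuming the hidden loop is structured this way:

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // Placeholder kernel: each block handles one z-slice, identified by
    // blockIdx.z + offsetZ (the real kernel also receives the tensors).
    __global__ void process_slice(int64_t offsetZ) {
      int64_t slice = (int64_t)blockIdx.z + offsetZ;
      if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0)
        printf("slice %lld\n", (long long)slice);
    }

    int main() {
      int64_t outputTime = 16, inputSlices = 8, batchSize = 4;  // illustrative
      int64_t totalZ = outputTime * inputSlices * batchSize;
      int64_t offsetZ = 0;
      dim3 block(32, 8);

      // gridDim.z is capped at 65535, so large totalZ is processed in chunks.
      while (totalZ > 0) {
        dim3 grid(4, 4, (unsigned)(totalZ > 65535 ? 65535 : totalZ));
        process_slice<<<grid, block>>>(offsetZ);
        totalZ -= 65535;
        offsetZ += 65535;
      }
      cudaDeviceSynchronize();
      return 0;
    }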
4 changes: 2 additions & 2 deletions aten/src/THNN/generic/Col2Im.c
@@ -70,8 +70,8 @@ static void THNN_(im2col)(const real* data_im, const int channels,
int h_offset = (c_col / kernel_w) % kernel_h;
int c_im = c_col / kernel_h / kernel_w;
for (int h_col = 0; h_col < height_col; ++h_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
for (int w_col = 0; w_col < width_col; ++w_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
data_col[(c_col * height_col + h_col) * width_col + w_col] =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
@@ -98,8 +98,8 @@ static void THNN_(col2im)(const real* data_col, const int channels,
int h_offset = (c_col / kernel_w) % kernel_h;
int c_im = c_col / kernel_h / kernel_w;
for (int h_col = 0; h_col < height_col; ++h_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
for (int w_col = 0; w_col < width_col; ++w_col) {
int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
data_im[(c_im * height + h_im) * width + w_im] +=
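In both hunks the relocated line computes the input row sampled by output column h_col, h_im = h_col * stride_h - pad_h + h_offset * dilation_h, and the guard just below it decides whether that row is real input or zero padding. A worked example with hypothetical numbers:

    #include <cstdio>

    int main() {
      // Hypothetical parameters: stride 2, padding 1, dilation 2,
      // output column h_col = 3, kernel row h_offset = 1, input height 8.
      int stride_h = 2, pad_h = 1, dilation_h = 2;
      int h_col = 3, h_offset = 1, height = 8;

      int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;  // 6 - 1 + 2 = 7

      // Same bounds check as im2col/col2im: outside [0, height) means padding.
      std::printf("h_im = %d (%s)\n", h_im,
                  (h_im >= 0 && h_im < height) ? "inside the input" : "zero padding");
      return 0;
    }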
8 changes: 4 additions & 4 deletions aten/src/THNN/generic/VolumetricDilatedConvolution.c
@@ -95,8 +95,8 @@ void THNN_(VolumetricDilatedConvolution_updateOutput)(
dilationT, dilationH, dilationW, 0);

// Params:
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int64_t nInputPlane = weight->size[1];
int64_t nOutputPlane = weight->size[0];

input = THTensor_(newContiguous)(input);
weight = THTensor_(newContiguous)(weight);
@@ -230,8 +230,8 @@ void THNN_(VolumetricDilatedConvolution_updateGradInput)(
dilationT, dilationH, dilationW, 0);

// Params
int nInputPlane = weight->size[1];
int nOutputPlane = weight->size[0];
int64_t nInputPlane = weight->size[1];
int64_t nOutputPlane = weight->size[0];

input = THTensor_(newContiguous)(input);
gradOutput = THTensor_(newContiguous)(gradOutput);