// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
1010template <typename Dtype>
11- __global__ void im3d2col_kernel (const int n, const Dtype* data_im,
12- const int height, const int width, const int depth,
13- const int kernel_h, const int kernel_w, const int kernel_d,
14- const int pad_h, const int pad_w, const int pad_d,
15- const int stride_h, const int stride_w, const int stride_d,
16- const int height_col, const int width_col, const int depth_col,
11+ __global__ void im3d2col_kernel (const int64_t n, const Dtype* data_im,
12+ const int64_t height, const int64_t width, const int64_t depth,
13+ const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
14+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
15+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
16+ const int64_t height_col, const int64_t width_col, const int64_t depth_col,
1717 Dtype* data_col)
1818{
1919 CUDA_KERNEL_LOOP (index, n)
2020 {
21- int d_out = index % depth_col;
22- int w_index = index / depth_col;
23- int w_out = w_index % width_col;
24- int h_index = w_index / width_col;
25- int h_out = h_index % height_col;
21+ int64_t d_out = index % depth_col;
22+ int64_t w_index = index / depth_col;
23+ int64_t w_out = w_index % width_col;
24+ int64_t h_index = w_index / width_col;
25+ int64_t h_out = h_index % height_col;
2626
27- int channel_in = h_index / height_col;
27+ int64_t channel_in = h_index / height_col;
2828 // channel_in = 1;
2929
30- int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
30+ int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d;
3131
32- int h_in = h_out * stride_h - pad_h;
33- int w_in = w_out * stride_w - pad_w;
34- int d_in = d_out * stride_d - pad_d;
32+ int64_t h_in = h_out * stride_h - pad_h;
33+ int64_t w_in = w_out * stride_w - pad_w;
34+ int64_t d_in = d_out * stride_d - pad_d;
3535
3636 Dtype* data_col_ptr = data_col;
3737 data_col_ptr += channel_out * (height_col * width_col * depth_col) +
@@ -41,15 +41,15 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
4141 data_im_ptr += channel_in * (height * width * depth) +
4242 h_in * (width * depth) + w_in * depth + d_in;
4343
44- for (int i = 0 ; i < kernel_h; ++i)
44+ for (int64_t i = 0 ; i < kernel_h; ++i)
4545 {
46- int h = h_in + i;
47- for (int j = 0 ; j < kernel_w; ++j)
46+ int64_t h = h_in + i;
47+ for (int64_t j = 0 ; j < kernel_w; ++j)
4848 {
49- int w = w_in + j;
50- for (int k = 0 ; k < kernel_d; ++k)
49+ int64_t w = w_in + j;
50+ for (int64_t k = 0 ; k < kernel_d; ++k)
5151 {
52- int d = d_in + k;
52+ int64_t d = d_in + k;
5353 *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
5454 h < height && w < width && d < depth) ?
5555 data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert<int , Dtype>::to (0 );
@@ -61,19 +61,19 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
6161}
6262
6363template <typename Dtype>
64- void im3d2col (cudaStream_t stream, const Dtype* data_im, const int channels,
65- const int height, const int width, const int depth,
66- const int kernel_h, const int kernel_w, const int kernel_d,
67- const int pad_h, const int pad_w, const int pad_d,
68- const int stride_h, const int stride_w, const int stride_d,
64+ void im3d2col (cudaStream_t stream, const Dtype* data_im, const int64_t channels,
65+ const int64_t height, const int64_t width, const int64_t depth,
66+ const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
67+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
68+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
6969 Dtype* data_col)
7070{
7171 // We are going to launch channels * height_col * width_col * depth_col kernels, each
7272 // kernel responsible for copying a single-channel grid.
73- int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1 ;
74- int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1 ;
75- int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1 ;
76- int num_kernels = channels * height_col * width_col * depth_col;
73+ int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1 ;
74+ int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1 ;
75+ int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1 ;
76+ int64_t num_kernels = channels * height_col * width_col * depth_col;
7777 im3d2col_kernel<<<GET_BLOCKS(num_kernels),
7878 CUDA_NUM_THREADS, 0 , stream>>> (num_kernels, data_im,
7979 height, width, depth,
@@ -86,42 +86,42 @@ void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
8686}
8787
8888template <typename Dtype, typename Acctype>
89- __global__ void col2im3d_kernel (const int n, const Dtype* data_col,
90- const int height, const int width, const int depth,
91- const int channels,
92- const int patch_h, const int patch_w, const int patch_d,
93- const int pad_h, const int pad_w, const int pad_d,
94- const int stride_h, const int stride_w, const int stride_d,
95- const int height_col, const int width_col, const int depth_col,
89+ __global__ void col2im3d_kernel (const int64_t n, const Dtype* data_col,
90+ const int64_t height, const int64_t width, const int64_t depth,
91+ const int64_t channels,
92+ const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
93+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
94+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
95+ const int64_t height_col, const int64_t width_col, const int64_t depth_col,
9696 Dtype* data_im)
9797{
9898 CUDA_KERNEL_LOOP (index, n)
9999 {
100100 Acctype val = 0 ;
101- int d = index % depth + pad_d;
102- int w_index = index / depth;
103- int w = w_index % width + pad_w;
104- int h_index = w_index / width;
105- int h = h_index % height + pad_h;
106- int c = h_index / height;
101+ int64_t d = index % depth + pad_d;
102+ int64_t w_index = index / depth;
103+ int64_t w = w_index % width + pad_w;
104+ int64_t h_index = w_index / width;
105+ int64_t h = h_index % height + pad_h;
106+ int64_t c = h_index / height;
107107
108108 // compute the start and end of the output
109- int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1 ;
110- int d_col_end = min (d / stride_d + 1 , depth_col);
111- int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1 ;
112- int w_col_end = min (w / stride_w + 1 , width_col);
113- int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1 ;
114- int h_col_end = min (h / stride_h + 1 , height_col);
109+ int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1 ;
110+ int64_t d_col_end = min (d / stride_d + 1 , depth_col);
111+ int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1 ;
112+ int64_t w_col_end = min (w / stride_w + 1 , width_col);
113+ int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1 ;
114+ int64_t h_col_end = min (h / stride_h + 1 , height_col);
115115
116- int offset =
116+ int64_t offset =
117117 (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;
118118
119- int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
120- int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
121- int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
122- for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
123- for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
124- for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
119+ int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
120+ int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
121+ int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
122+ for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col)
123+ for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) {
124+ for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) {
125125 val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
126126 }
127127 }
@@ -130,17 +130,17 @@ __global__ void col2im3d_kernel(const int n, const Dtype* data_col,
130130}
131131
132132template <typename Dtype, typename Acctype>
133- void col2im3d (cudaStream_t stream, const Dtype* data_col, const int channels,
134- const int height, const int width, const int depth,
135- const int patch_h, const int patch_w, const int patch_d,
136- const int pad_h, const int pad_w, const int pad_d,
137- const int stride_h, const int stride_w, const int stride_d,
133+ void col2im3d (cudaStream_t stream, const Dtype* data_col, const int64_t channels,
134+ const int64_t height, const int64_t width, const int64_t depth,
135+ const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
136+ const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
137+ const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
138138 Dtype* data_im)
139139{
140- int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1 ;
141- int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1 ;
142- int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1 ;
143- int num_kernels = channels * height * width * depth;
140+ int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1 ;
141+ int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1 ;
142+ int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1 ;
143+ int64_t num_kernels = channels * height * width * depth;
144144
145145 // To avoid involving atomic operations, we will launch one kernel per
146146 // bottom dimension, and then in the kernel add up the top dimensions.
0 commit comments