Skip to content

Commit db08656

Browse files
author
Marina Kolpakova
committed
resize area are fixed for scales that aren't divide 128
1 parent 0f01d8d commit db08656

File tree

3 files changed

+30
-22
lines changed

3 files changed

+30
-22
lines changed

modules/gpu/src/cuda/resize.cu

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -537,19 +537,22 @@ namespace cv { namespace gpu { namespace device
537537
}
538538

539539
template<typename T, typename W>
540-
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)
540+
__global__ void resise_scan_fast_x(const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride)
541541
{
542542
extern __shared__ W sbuf[];
543543

544544
const unsigned int tid = threadIdx. x;
545545

546546
// load line-block on shared memory
547547
int y = blockIdx.x / thred_lines;
548-
int input_stride = (blockIdx.x - y * thred_lines) * blockDim.x;
548+
int input_stride = (blockIdx.x % thred_lines) * stride;
549549
int x = input_stride + tid;
550550

551551
// store global data in shared memory
552-
sbuf[tid] = src(y, x);
552+
if (x < src.cols && y < src.rows)
553+
sbuf[tid] = src(y, x);
554+
else
555+
sbuf[tid] = 0;
553556
__syncthreads();
554557

555558
scan_block<inclusive, W>(sbuf);
@@ -575,7 +578,7 @@ namespace cv { namespace gpu { namespace device
575578
}
576579

577580
template<typename T, typename W>
578-
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)
581+
__global__ void resise_scan_fast_y(const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride)
579582
{
580583
extern __shared__ W sbuf[];
581584

@@ -584,13 +587,15 @@ namespace cv { namespace gpu { namespace device
584587
// load line-block on shared memory
585588
int x = blockIdx.x / thred_lines;
586589

587-
int global_stride = (blockIdx.x % thred_lines) * blockDim.x;
588-
if (!tid) printf("STRIDE : %d", global_stride);
590+
int global_stride = (blockIdx.x % thred_lines) * stride;
589591
int y = global_stride + tid;
590592

591593
// store global data in shared memory
594+
if (x < src.cols && y < src.rows)
595+
sbuf[tid] = src(y, x);
596+
else
597+
sbuf[tid] = 0;
592598

593-
sbuf[tid] = src(y, x);
594599
__syncthreads();
595600
scan_block<inclusive, W>(sbuf);
596601

@@ -623,28 +628,30 @@ namespace cv { namespace gpu { namespace device
623628
int iscale_x = round(fx);
624629
int iscale_y = round(fy);
625630

626-
const int warps = 4;
631+
int warps = 4;
627632
const int threads = 32 * warps;
633+
int input_stride = threads / iscale_x;
628634

629-
int thred_lines = divUp(src.cols, threads);
635+
int thred_lines = divUp(src.cols, input_stride * iscale_x);
630636
int blocks = src.rows * thred_lines;
631637

632-
printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
633-
src.cols, warps, threads, thred_lines, blocks);
638+
printf("device code executed for X coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d input strude %d\n",
639+
src.cols, warps, threads, thred_lines, blocks, input_stride * iscale_x);
634640

635641
typedef typename scan_traits<T>::scan_line_type smem_type;
636642

637643
resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
638-
(src, buffer, iscale_x, iscale_y, thred_lines);
644+
(src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x);
639645

640-
thred_lines = divUp(src.rows, threads);
646+
input_stride = threads / iscale_y;
647+
thred_lines = divUp(src.rows, input_stride * iscale_y);
641648
blocks = dst.cols * thred_lines;
642649

643650
printf("device code executed for Y coordinate with:\nsize %d warps %d, threads %d, thred_lines %d, blocks %d\n",
644651
dst.rows, warps, threads, thred_lines, blocks);
645652

646653
resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof(smem_type)>>>
647-
(buffer, dst, iscale_x, iscale_y, thred_lines);
654+
(buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y);
648655

649656
cudaSafeCall( cudaGetLastError() );
650657

modules/gpu/src/resize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, GpuMat& buffer, Size dsize,
9595
CV_Assert( (fx < 1.0) && (fy < 1.0));
9696
CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
9797
CV_Assert(src.cols >= 128 && src.rows >= 128);
98+
CV_Assert((fx - 128.0) <= 0 && (fy - 128.0) <= 0);
9899

99100
if (dsize == Size())
100101
dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));

modules/gpu/test/test_resize.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -201,23 +201,23 @@ TEST_P(ResizeArea, Accuracy)
201201
cv::Mat gpu;
202202
dst.download(gpu);
203203

204-
std::cout //<< src
204+
// std::cout // << src
205+
// // << std::endl << std::endl
206+
// // << gpu_buff
207+
// // << std::endl << std::endl
208+
// << gpu
205209
// << std::endl << std::endl
206-
// << gpu_buff
207-
// << std::endl << std::endl
208-
<< gpu
209-
<< std::endl << std::endl
210-
<< dst_cpu<< std::endl;
210+
// << dst_cpu<< std::endl;
211211

212212

213213
EXPECT_MAT_NEAR(dst_cpu, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
214214
}
215215

216216
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeArea, testing::Combine(
217217
ALL_DEVICES,
218-
testing::Values(cv::Size(640, 10 * 128)),//DIFFERENT_SIZES,
218+
testing::Values(cv::Size(640, 480)),//DIFFERENT_SIZES,
219219
testing::Values(MatType(CV_8UC1)/*MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
220-
testing::Values(0.1),
220+
testing::Values(0.05, 0.1),
221221
testing::Values(Interpolation(cv::INTER_AREA)),
222222
WHOLE_SUBMAT));
223223

0 commit comments

Comments
 (0)