@@ -537,19 +537,22 @@ namespace cv { namespace gpu { namespace device
537537 }
538538
539539 template <typename T, typename W>
540- __global__ void resise_scan_fast_x (const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines)
540+ __global__ void resise_scan_fast_x (const DevMem2D_<T> src, DevMem2D_<W> dst, int fx, int fy, int thred_lines, int stride )
541541 {
542542 extern __shared__ W sbuf[];
543543
544544 const unsigned int tid = threadIdx . x;
545545
546546 // load line-block on shared memory
547547 int y = blockIdx .x / thred_lines;
548- int input_stride = (blockIdx .x - y * thred_lines) * blockDim . x ;
548+ int input_stride = (blockIdx .x % thred_lines) * stride ;
549549 int x = input_stride + tid;
550550
551551 // store global data in shared memory
552- sbuf[tid] = src (y, x);
552+ if (x < src.cols && y < src.rows )
553+ sbuf[tid] = src (y, x);
554+ else
555+ sbuf[tid] = 0 ;
553556 __syncthreads ();
554557
555558 scan_block<inclusive, W>(sbuf);
@@ -575,7 +578,7 @@ namespace cv { namespace gpu { namespace device
575578 }
576579
577580 template <typename T, typename W>
578- __global__ void resise_scan_fast_y (const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines)
581+ __global__ void resise_scan_fast_y (const DevMem2D_<W> src, DevMem2D_<T> dst, int fx, int fy, int thred_lines, int stride )
579582 {
580583 extern __shared__ W sbuf[];
581584
@@ -584,13 +587,15 @@ namespace cv { namespace gpu { namespace device
584587 // load line-block on shared memory
585588 int x = blockIdx .x / thred_lines;
586589
587- int global_stride = (blockIdx .x % thred_lines) * blockDim .x ;
588- if (!tid) printf (" STRIDE : %d" , global_stride);
590+ int global_stride = (blockIdx .x % thred_lines) * stride;
589591 int y = global_stride + tid;
590592
591593 // store global data in shared memory
594+ if (x < src.cols && y < src.rows )
595+ sbuf[tid] = src (y, x);
596+ else
597+ sbuf[tid] = 0 ;
592598
593- sbuf[tid] = src (y, x);
594599 __syncthreads ();
595600 scan_block<inclusive, W>(sbuf);
596601
@@ -623,28 +628,30 @@ namespace cv { namespace gpu { namespace device
623628 int iscale_x = round (fx);
624629 int iscale_y = round (fy);
625630
626- const int warps = 4 ;
631+ int warps = 4 ;
627632 const int threads = 32 * warps;
633+ int input_stride = threads / iscale_x;
628634
629- int thred_lines = divUp (src.cols , threads );
635+ int thred_lines = divUp (src.cols , input_stride * iscale_x );
630636 int blocks = src.rows * thred_lines;
631637
632- printf (" device code executed for X coordinate with:\n size %d warps %d, threads %d, thred_lines %d, blocks %d\n " ,
633- src.cols , warps, threads, thred_lines, blocks);
638+ printf (" device code executed for X coordinate with:\n size %d warps %d, threads %d, thred_lines %d, blocks %d input strude %d \n " ,
639+ src.cols , warps, threads, thred_lines, blocks, input_stride * iscale_x );
634640
635641 typedef typename scan_traits<T>::scan_line_type smem_type;
636642
637643 resise_scan_fast_x<T, smem_type><<<blocks, threads, warps * 32 * sizeof (smem_type)>>>
638- (src, buffer, iscale_x, iscale_y, thred_lines);
644+ (src, buffer, iscale_x, iscale_y, thred_lines, input_stride * iscale_x );
639645
640- thred_lines = divUp (src.rows , threads);
646+ input_stride = threads / iscale_y;
647+ thred_lines = divUp (src.rows , input_stride * iscale_y);
641648 blocks = dst.cols * thred_lines;
642649
643650 printf (" device code executed for Y coordinate with:\n size %d warps %d, threads %d, thred_lines %d, blocks %d\n " ,
644651 dst.rows , warps, threads, thred_lines, blocks);
645652
646653 resise_scan_fast_y<T, smem_type><<<blocks, threads, warps * 32 * sizeof (smem_type)>>>
647- (buffer, dst, iscale_x, iscale_y, thred_lines);
654+ (buffer, dst, iscale_x, iscale_y, thred_lines, input_stride * iscale_y );
648655
649656 cudaSafeCall ( cudaGetLastError () );
650657
0 commit comments