Skip to content

Commit bbd4442

Browse files
committed
update prefixSum section
1 parent 85f7739 commit bbd4442

File tree

1 file changed

+57
-1
lines changed

1 file changed

+57
-1
lines changed

8.misc/gvdb_notes.md

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ GPU的构建逻辑:
7070
* Voxel点云排序
7171
* CUDA::FUNC_FIND_UNIQUE
7272
* CUDA::AUX_UNIQUE_CNT
73+
* CUDA::FUNC_PREFIXSUM
74+
* CUDA::FUNC_PREFIXFIXUP
7375
* CUDA::FUNC_COMPACT_UNIQUE
7476
![](images/gvdb_activate_bricks.png)
7577
![](images/gvdb_build_alloc_resources.png)
@@ -116,4 +118,58 @@ GPU的构建逻辑:
116118
}
117119
```
118120
119-
* 核心CUDA代码位于 `gvdb_library/src/cuda_gvdb_nodes.cuh`
121+
* 核心CUDA代码位于 `gvdb_library/src/cuda_gvdb_nodes.cuh`
122+
123+
124+
### PrefixSum
125+
126+
```cuda
127+
#define SCAN_BLOCKSIZE 512
128+
129+
extern "C" __global__ void prefixFixup ( uint *input, uint *aux, int len)
130+
{
131+
unsigned int t = threadIdx.x;
132+
unsigned int start = t + 2 * blockIdx.x * SCAN_BLOCKSIZE;
133+
if (start < len) input[start] += aux[blockIdx.x] ;
134+
if (start + SCAN_BLOCKSIZE < len) input[start + SCAN_BLOCKSIZE] += aux[blockIdx.x];
135+
}
136+
137+
extern "C" __global__ void prefixSum ( uint* input, uint* output, uint* aux, int len, int zeroff )
138+
{
139+
__shared__ uint scan_array[SCAN_BLOCKSIZE << 1];
140+
unsigned int t1 = threadIdx.x + 2 * blockIdx.x * SCAN_BLOCKSIZE;
141+
unsigned int t2 = t1 + SCAN_BLOCKSIZE;
142+
143+
// Pre-load into shared memory
144+
scan_array[threadIdx.x] = (t1<len) ? input[t1] : 0.0f;
145+
scan_array[threadIdx.x + SCAN_BLOCKSIZE] = (t2<len) ? input[t2] : 0.0f;
146+
__syncthreads();
147+
148+
// Reduction
149+
int stride;
150+
for (stride = 1; stride <= SCAN_BLOCKSIZE; stride <<= 1) {
151+
int index = (threadIdx.x + 1) * stride * 2 - 1;
152+
if (index < 2 * SCAN_BLOCKSIZE)
153+
scan_array[index] += scan_array[index - stride];
154+
__syncthreads();
155+
}
156+
157+
// Post reduction
158+
for (stride = SCAN_BLOCKSIZE >> 1; stride > 0; stride >>= 1) {
159+
int index = (threadIdx.x + 1) * stride * 2 - 1;
160+
if (index + stride < 2 * SCAN_BLOCKSIZE)
161+
scan_array[index + stride] += scan_array[index];
162+
__syncthreads();
163+
}
164+
__syncthreads();
165+
166+
// Output values & aux
167+
if (t1+zeroff < len) output[t1+zeroff] = scan_array[threadIdx.x];
168+
if (t2+zeroff < len) output[t2+zeroff] = (threadIdx.x==SCAN_BLOCKSIZE-1 && zeroff) ? 0 : scan_array[threadIdx.x + SCAN_BLOCKSIZE];
169+
if ( threadIdx.x == 0 ) {
170+
if ( zeroff ) output[0] = 0;
171+
if (aux) aux[blockIdx.x] = scan_array[2 * SCAN_BLOCKSIZE - 1];
172+
}
173+
}
174+
```
175+

0 commit comments

Comments
 (0)