@@ -70,6 +70,8 @@ GPU的构建逻辑:
7070 * Voxel点云排序
7171 * CUDA::FUNC_FIND_UNIQUE
7272 * CUDA::AUX_UNIQUE_CNT
73+ * CUDA::FUNC_PREFIXSUM
74+ * CUDA::FUNC_PREFIXFIXUP
7375 * CUDA::FUNC_COMPACT_UNIQUE
7476 
7577 
@@ -116,4 +118,58 @@ GPU的构建逻辑:
116118 }
117119 ```
118120
119- * 核心CUDA代码位于 `gvdb_library/src/cuda_gvdb_nodes.cuh`
121+ * 核心CUDA代码位于 `gvdb_library/src/cuda_gvdb_nodes.cuh`
122+
123+
124+ ### PrefixSum
125+
126+ ```cuda
127+ #define SCAN_BLOCKSIZE 512
128+
129+ extern "C" __global__ void prefixFixup ( uint *input, uint *aux, int len)
130+ {
131+ unsigned int t = threadIdx.x;
132+ unsigned int start = t + 2 * blockIdx.x * SCAN_BLOCKSIZE;
133+ if (start < len) input[start] += aux[blockIdx.x] ;
134+ if (start + SCAN_BLOCKSIZE < len) input[start + SCAN_BLOCKSIZE] += aux[blockIdx.x];
135+ }
136+
137+ extern "C" __global__ void prefixSum ( uint* input, uint* output, uint* aux, int len, int zeroff )
138+ {
139+ __shared__ uint scan_array[SCAN_BLOCKSIZE << 1];
140+ unsigned int t1 = threadIdx.x + 2 * blockIdx.x * SCAN_BLOCKSIZE;
141+ unsigned int t2 = t1 + SCAN_BLOCKSIZE;
142+
143+ // Pre-load into shared memory
144+ scan_array[threadIdx.x] = (t1<len) ? input[t1] : 0.0f;
145+ scan_array[threadIdx.x + SCAN_BLOCKSIZE] = (t2<len) ? input[t2] : 0.0f;
146+ __syncthreads();
147+
148+ // Reduction
149+ int stride;
150+ for (stride = 1; stride <= SCAN_BLOCKSIZE; stride <<= 1) {
151+ int index = (threadIdx.x + 1) * stride * 2 - 1;
152+ if (index < 2 * SCAN_BLOCKSIZE)
153+ scan_array[index] += scan_array[index - stride];
154+ __syncthreads();
155+ }
156+
157+ // Post reduction
158+ for (stride = SCAN_BLOCKSIZE >> 1; stride > 0; stride >>= 1) {
159+ int index = (threadIdx.x + 1) * stride * 2 - 1;
160+ if (index + stride < 2 * SCAN_BLOCKSIZE)
161+ scan_array[index + stride] += scan_array[index];
162+ __syncthreads();
163+ }
164+ __syncthreads();
165+
166+ // Output values & aux
167+ if (t1+zeroff < len) output[t1+zeroff] = scan_array[threadIdx.x];
168+ if (t2+zeroff < len) output[t2+zeroff] = (threadIdx.x==SCAN_BLOCKSIZE-1 && zeroff) ? 0 : scan_array[threadIdx.x + SCAN_BLOCKSIZE];
169+ if ( threadIdx.x == 0 ) {
170+ if ( zeroff ) output[0] = 0;
171+ if (aux) aux[blockIdx.x] = scan_array[2 * SCAN_BLOCKSIZE - 1];
172+ }
173+ }
174+ ```
175+
0 commit comments