Skip to content

Commit 2f162f4

Browse files
fweigdavidrohr
authored and committed
GPU: Improve gather kernel performance
- Buffer data in shared memory for more efficient DMA writes - Distribute work equally among all threads - Add flag to switch between kernels
1 parent 1dad6ed commit 2f162f4

File tree

9 files changed

+354
-90
lines changed

9 files changed

+354
-90
lines changed

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,16 @@ GPUdi() T work_group_broadcast_FUNC(T v, int i, S& smem)
363363
return retVal;
364364
}
365365

366+
// Convenience wrapper: forwards to work_group_reduce_add_FUNC and passes a variable named `smem`
// that must exist in the calling scope.
#define work_group_reduce_add(v) work_group_reduce_add_FUNC(v, smem)
367+
// Sums `v` via S::BlockReduce, then passes the result through work_group_broadcast(v, 0) and returns it.
//   v    - this thread's contribution to the sum
//   smem - storage object; must provide the nested type S::BlockReduce and a member cubReduceTmpMem
// NOTE(review): the intent appears to be that every thread of the work group receives the total - confirm
// against the definition of work_group_broadcast.
template <class T, class S>
368+
GPUdi() T work_group_reduce_add_FUNC(T v, S& smem)
369+
{
370+
// Reduction using cubReduceTmpMem as temporary storage.
// NOTE(review): if S::BlockReduce is CUB's BlockReduce, the returned sum is only defined in thread 0 - confirm.
v = typename S::BlockReduce(smem.cubReduceTmpMem).Sum(v);
371+
// Barrier between the reduction and the broadcast.
// NOTE(review): presumably needed because the broadcast reuses storage that aliases cubReduceTmpMem - verify
// against the layout of `smem`.
__syncthreads();
372+
// Replace the local value by the one selected with index 0.
v = work_group_broadcast(v, 0);
373+
return v;
374+
}
375+
366376
#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v, smem)
367377
template <class T, class S>
368378
GPUdi() T warp_scan_inclusive_add_FUNC(T v, S& smem)
@@ -380,6 +390,12 @@ GPUdi() T work_group_scan_inclusive_add(T v)
380390
return v;
381391
}
382392

393+
// Overload of work_group_reduce_add that takes no shared-memory argument: returns `v` unchanged.
// NOTE(review): presumably the variant for builds where a work group holds a single contribution, so the
// sum equals the input - the preprocessor condition selecting this overload is outside this hunk; confirm.
template <class T>
394+
GPUdi() T work_group_reduce_add(T v)
395+
{
396+
return v;
397+
}
398+
383399
template <class T>
384400
GPUdi() T work_group_broadcast(T v, int i)
385401
{

GPU/Common/GPUDefGPUParameters.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
#define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
7070
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 256
7171
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512
72-
#define GPUCA_LB_GPUTPCCompressionKernels_step2gather 128
72+
#define GPUCA_LB_COMPRESSION_GATHER 1024
7373
#define GPUCA_LB_CLUSTER_FINDER 512
7474
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
7575
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -122,7 +122,7 @@
122122
#define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
123123
#define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
124124
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
125-
#define GPUCA_LB_GPUTPCCompressionKernels_step2gather 1024
125+
#define GPUCA_LB_COMPRESSION_GATHER 1024
126126
#define GPUCA_LB_CLUSTER_FINDER 512
127127
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
128128
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -174,15 +174,15 @@
174174
#ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached
175175
#define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256
176176
#endif
177-
#ifndef GPUCA_LB_GPUTPCCompressionKernels_step2gather
178-
#define GPUCA_LB_GPUTPCCompressionKernels_step2gather 256
179-
#endif
180177
#ifndef GPUCA_LB_GPUTPCCFDecodeZS
181178
#define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
182179
#endif
183180
#ifndef GPUCA_LB_GPUTPCCFGather
184181
#define GPUCA_LB_GPUTPCCFGather 1024, 1
185182
#endif
183+
// Fallback value for GPUCA_LB_COMPRESSION_GATHER; takes effect only when the macro has not been defined earlier.
#ifndef GPUCA_LB_COMPRESSION_GATHER
184+
#define GPUCA_LB_COMPRESSION_GATHER 1024
185+
#endif
186186
#ifndef GPUCA_LB_CLUSTER_FINDER
187187
#define GPUCA_LB_CLUSTER_FINDER 128
188188
#endif
@@ -301,6 +301,10 @@
301301
#define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_THREAD_COUNT_SCAN
302302
#define GPUCA_LB_GPUTPCTrackletConstructor_singleSlice GPUCA_LB_GPUTPCTrackletConstructor
303303
#define GPUCA_LB_GPUTPCTrackletConstructor_allSlices GPUCA_LB_GPUTPCTrackletConstructor
304+
// The four GPUTPCCompressionGatherKernels variants (unbuffered, buffered32, buffered64, buffered128)
// all map to the single GPUCA_LB_COMPRESSION_GATHER setting.
#define GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered GPUCA_LB_COMPRESSION_GATHER
305+
#define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32 GPUCA_LB_COMPRESSION_GATHER
306+
#define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER
307+
#define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered128 GPUCA_LB_COMPRESSION_GATHER
304308

305309
#ifndef GPUCA_NEIGHBORSFINDER_REGS
306310
#define GPUCA_NEIGHBORSFINDER_REGS NONE, 0

GPU/GPUTracking/Base/GPUGeneralKernels.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,11 @@ class GPUKernelTemplate
5454
// Provides the shared memory resources for CUB collectives
5555
#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE)
5656
typedef GPUCA_CUB::BlockScan<T, I> BlockScan;
57+
typedef GPUCA_CUB::BlockReduce<T, I> BlockReduce;
5758
typedef GPUCA_CUB::WarpScan<T> WarpScan;
5859
union {
5960
typename BlockScan::TempStorage cubTmpMem;
61+
typename BlockReduce::TempStorage cubReduceTmpMem;
6062
typename WarpScan::TempStorage cubWarpTmpMem;
6163
int tmpBroadcast;
6264
};

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,9 @@ int GPUReconstruction::InitPhaseBeforeDevice()
232232
if (mProcessingSettings.tpcCompressionGatherMode < 0) {
233233
mProcessingSettings.tpcCompressionGatherMode = (mRecoStepsGPU & GPUDataTypes::RecoStep::TPCCompression) ? 2 : 0;
234234
}
235+
if (mProcessingSettings.tpcCompressionGatherModeKernel < 0) {
236+
mProcessingSettings.tpcCompressionGatherModeKernel = 0;
237+
}
235238
if (!(mRecoStepsGPU & GPUDataTypes::RecoStep::TPCMerging)) {
236239
mProcessingSettings.mergerSortTracks = false;
237240
}

GPU/GPUTracking/Base/GPUReconstructionKernels.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,16 @@ GPUCA_KRNL_LB((GPUTPCGMMergerFinalize, step2 ), (simple), (), ())
6161
GPUCA_KRNL_LB((GPUTRDTrackerKernels ), (simple), (), ())
6262
GPUCA_KRNL_LB((GPUITSFitterKernel ), (simple), (), ())
6363
GPUCA_KRNL_LB((GPUTPCConvertKernel ), (simple), (), ())
64-
GPUCA_KRNL_LB((GPUTPCCompressionKernels, step0attached ), (simple), (), ())
65-
GPUCA_KRNL_LB((GPUTPCCompressionKernels, step1unattached ), (simple), (), ())
66-
GPUCA_KRNL_LB((GPUTPCCompressionKernels, step2gather ), (simple), (), ())
67-
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, fillIndexMap ), (single), (), ())
68-
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, fillFromDigits ), (single), (), ())
69-
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, findFragmentStart), (single), (), ())
64+
GPUCA_KRNL_LB((GPUTPCCompressionKernels, step0attached ), (simple), (), ())
65+
GPUCA_KRNL_LB((GPUTPCCompressionKernels, step1unattached ), (simple), (), ())
66+
GPUCA_KRNL_LB((GPUTPCCompressionGatherKernels, unbuffered ), (simple), (), ())
67+
GPUCA_KRNL_LB((GPUTPCCompressionGatherKernels, buffered32 ), (simple), (), ())
68+
GPUCA_KRNL_LB((GPUTPCCompressionGatherKernels, buffered64 ), (simple), (), ())
69+
GPUCA_KRNL_LB((GPUTPCCompressionGatherKernels, buffered128 ), (simple), (), ())
70+
71+
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, fillIndexMap ), (single), (), ())
72+
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, fillFromDigits ), (single), (), ())
73+
GPUCA_KRNL_LB((GPUTPCCFChargeMapFiller, findFragmentStart ), (single), (), ())
7074
GPUCA_KRNL_LB((GPUTPCCFPeakFinder ), (single), (), ())
7175
GPUCA_KRNL_LB((GPUTPCCFNoiseSuppression, noiseSuppression ), (single), (), ())
7276
GPUCA_KRNL_LB((GPUTPCCFNoiseSuppression, updatePeaks ), (single), (), ())

GPU/GPUTracking/DataCompression/GPUTPCCompression.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class GPUTPCGMMerger;
4545
class GPUTPCCompression : public GPUProcessor
4646
{
4747
friend class GPUTPCCompressionKernels;
48+
friend class GPUTPCCompressionGatherKernels;
4849
friend class GPUChainTracking;
4950

5051
public:

0 commit comments

Comments
 (0)