Skip to content

Commit a6a4918

Browse files
davidrohrktf
authored andcommitted
GPU: Add GPUrestrict() keyword, for now deactivated, later to become __restrict__
1 parent caaa605 commit a6a4918

29 files changed

+135
-134
lines changed

GPU/Common/GPUCommonDefAPI.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@
177177
#define GPUconstexprref()
178178
#endif
179179

180+
#define GPUrestrict() // We don't use restrict at the moment, could try at a later time
181+
180182
// Macros for GRID dimension
181183
#if defined(__CUDACC__) || defined(__HIPCC__)
182184
#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x)

GPU/Common/GPUDef.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
#endif
3838

3939
#ifdef GPUCA_GPUCODE
40-
#define CA_MAKE_SHARED_REF(vartype, varname, varglobal, varshared) const GPUsharedref() MEM_LOCAL(vartype) &varname = varshared;
40+
#define CA_MAKE_SHARED_REF(vartype, varname, varglobal, varshared) const GPUsharedref() MEM_LOCAL(vartype) & __restrict__ varname = varshared;
4141
#define CA_SHARED_STORAGE(storage) storage
4242
#define CA_SHARED_CACHE(target, src, size) \
4343
static_assert((size) % sizeof(int) == 0, "Invalid shared cache size"); \
@@ -46,12 +46,12 @@
4646
}
4747
#define CA_SHARED_CACHE_REF(target, src, size, reftype, ref) \
4848
CA_SHARED_CACHE(target, src, size) \
49-
GPUsharedref() const reftype* ref = (target)
49+
GPUsharedref() const reftype* __restrict__ ref = (target)
5050
#else
51-
#define CA_MAKE_SHARED_REF(vartype, varname, varglobal, varshared) const GPUglobalref() MEM_GLOBAL(vartype) &varname = varglobal;
51+
#define CA_MAKE_SHARED_REF(vartype, varname, varglobal, varshared) const GPUglobalref() MEM_GLOBAL(vartype) & __restrict__ varname = varglobal;
5252
#define CA_SHARED_STORAGE(storage)
5353
#define CA_SHARED_CACHE(target, src, size)
54-
#define CA_SHARED_CACHE_REF(target, src, size, reftype, ref) GPUglobalref() const reftype* ref = src
54+
#define CA_SHARED_CACHE_REF(target, src, size, reftype, ref) GPUglobalref() const reftype* __restrict__ ref = src
5555
#endif
5656

5757
#ifdef GPUCA_TEXTURE_FETCH_CONSTRUCTOR

GPU/GPUTracking/Base/GPUGeneralKernels.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
using namespace GPUCA_NAMESPACE::gpu;
1616

1717
template <>
18-
GPUdii() void GPUMemClean16::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() MEM_LOCAL(GPUSharedMemory) & smem, processorType& processors, GPUglobalref() void* ptr, unsigned long size)
18+
GPUdii() void GPUMemClean16::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() MEM_LOCAL(GPUSharedMemory) & GPUrestrict() smem, processorType& GPUrestrict() processors, GPUglobalref() void* ptr, unsigned long size)
1919
{
2020
const unsigned long stride = get_global_size(0);
2121
int4 i0;

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,25 +26,25 @@ using namespace o2::tpc;
2626
template <>
2727
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
2828
{
29-
GPUTPCGMMerger& merger = processors.tpcMerger;
30-
const o2::tpc::ClusterNativeAccess* clusters = processors.tpcConverter.getClustersNative();
31-
GPUTPCCompression& compressor = processors.tpcCompressor;
32-
GPUParam& param = processors.param;
29+
const GPUTPCGMMerger& GPUrestrict() merger = processors.tpcMerger;
30+
const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.tpcConverter.getClustersNative();
31+
GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
32+
const GPUParam& GPUrestrict() param = processors.param;
3333

3434
char lastLeg = 0;
3535
int myTrack = 0;
3636
for (unsigned int i = get_global_id(0); i < (unsigned int)merger.NOutputTracks(); i += get_global_size(0)) {
37-
const GPUTPCGMMergedTrack& trk = merger.OutputTracks()[i];
37+
const GPUTPCGMMergedTrack& GPUrestrict() trk = merger.OutputTracks()[i];
3838
if (!trk.OK()) {
3939
continue;
4040
}
4141
bool rejectTrk = CAMath::Abs(trk.GetParam().GetQPt()) > processors.param.rec.tpcRejectQPt;
4242
int nClustersStored = 0;
43-
CompressedClustersPtrsOnly& c = compressor.mPtrs;
43+
CompressedClustersPtrsOnly& GPUrestrict() c = compressor.mPtrs;
4444
unsigned int lastRow = 0, lastSlice = 0; // BUG: These should be unsigned char, but then CUDA breaks
4545
GPUTPCCompressionTrackModel track;
4646
for (int k = trk.NClusters() - 1; k >= 0; k--) {
47-
const GPUTPCGMMergedTrackHit& hit = merger.Clusters()[trk.FirstClusterRef() + k];
47+
const GPUTPCGMMergedTrackHit& GPUrestrict() hit = merger.Clusters()[trk.FirstClusterRef() + k];
4848
if (hit.state & GPUTPCGMMergedTrackHit::flagReject) {
4949
continue;
5050
}
@@ -63,7 +63,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
6363
if (!(param.rec.tpcCompressionModes & GPUSettings::CompressionTrackModel)) {
6464
continue; // No track model compression
6565
}
66-
const ClusterNative& orgCl = clusters->clusters[hit.slice][hit.row][hit.num - clusters->clusterOffset[hit.slice][hit.row]];
66+
const ClusterNative& GPUrestrict() orgCl = clusters->clusters[hit.slice][hit.row][hit.num - clusters->clusterOffset[hit.slice][hit.row]];
6767
float x = param.tpcGeometry.Row2X(hit.row);
6868
float y = param.tpcGeometry.LinearPad2Y(hit.slice, hit.row, orgCl.getPad());
6969
float z = param.tpcGeometry.LinearTime2Z(hit.slice, orgCl.getTime());
@@ -168,12 +168,12 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::opera
168168
}
169169

170170
template <>
171-
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
171+
GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& GPUrestrict() smem, processorType& GPUrestrict() processors)
172172
{
173-
GPUTPCGMMerger& merger = processors.tpcMerger;
174-
const o2::tpc::ClusterNativeAccess* clusters = processors.tpcConverter.getClustersNative();
175-
GPUTPCCompression& compressor = processors.tpcCompressor;
176-
GPUParam& param = processors.param;
173+
const GPUTPCGMMerger& GPUrestrict() merger = processors.tpcMerger;
174+
const o2::tpc::ClusterNativeAccess* GPUrestrict() clusters = processors.tpcConverter.getClustersNative();
175+
GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
176+
GPUParam& GPUrestrict() param = processors.param;
177177
unsigned int* sortBuffer = compressor.mClusterSortBuffer + iBlock * compressor.mNMaxClusterSliceRow;
178178
for (int iSliceRow = iBlock; iSliceRow < GPUCA_NSLICES * GPUCA_ROW_COUNT; iSliceRow += nBlocks) {
179179
const int iSlice = iSliceRow / GPUCA_ROW_COUNT;
@@ -184,7 +184,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
184184
}
185185
GPUbarrier();
186186

187-
CompressedClustersPtrsOnly& c = compressor.mPtrs;
187+
CompressedClustersPtrsOnly& GPUrestrict() c = compressor.mPtrs;
188188
for (unsigned int i = get_local_id(0); i < clusters->nClusters[iSlice][iRow]; i += get_local_size(0)) {
189189
const int idx = idOffset + i;
190190
if (compressor.mClusterStatus[idx]) {
@@ -231,7 +231,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
231231
unsigned short lastPad = 0;
232232
for (unsigned int i = 0; i < smem.nCount; i++) {
233233
int cidx = idOffset + i;
234-
const ClusterNative& orgCl = clusters->clusters[iSlice][iRow][sortBuffer[i]];
234+
const ClusterNative& GPUrestrict() orgCl = clusters->clusters[iSlice][iRow][sortBuffer[i]];
235235
c.padDiffU[cidx] = orgCl.padPacked - lastPad;
236236
c.timeDiffU[cidx] = (orgCl.getTimePacked() - lastTime) & 0xFFFFFF;
237237
if (param.rec.tpcCompressionModes & GPUSettings::CompressionDifferences) {

GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class GPUTPCCompressionKernels : public GPUKernelTemplate
4343
};
4444

4545
template <int iKernel = defaultKernel>
46-
GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors);
46+
GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& GPUrestrict() smem, processorType& GPUrestrict() processors);
4747

4848
public:
4949
template <int I>

GPU/GPUTracking/DataCompression/GPUTPCCompressionTrackModel.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ using namespace GPUCA_NAMESPACE::gpu;
2222
// encoded with the old version!!!
2323

2424
#ifdef GPUCA_COMPRESSION_TRACK_MODEL_MERGER
25-
GPUd() void GPUTPCCompressionTrackModel::Init(float x, float y, float z, float alpha, unsigned char qPt, const GPUParam& param)
25+
GPUd() void GPUTPCCompressionTrackModel::Init(float x, float y, float z, float alpha, unsigned char qPt, const GPUParam& GPUrestrict() param)
2626
{
2727
static constexpr float kRho = 1.025e-3f; // 0.9e-3;
2828
static constexpr float kRadLen = 29.532f; // 28.94;

GPU/GPUTracking/ITS/GPUITSFitterKernels.cxx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@ using namespace GPUCA_NAMESPACE::gpu;
2929
using namespace o2;
3030
using namespace o2::its;
3131

32-
GPUdii() bool GPUITSFitterKernel::fitTrack(GPUITSFitter& Fitter, GPUTPCGMPropagator& prop, GPUITSTrack& track, int start, int end, int step)
32+
GPUdii() bool GPUITSFitterKernel::fitTrack(GPUITSFitter& GPUrestrict() Fitter, GPUTPCGMPropagator& GPUrestrict() prop, GPUITSTrack& GPUrestrict() track, int start, int end, int step)
3333
{
3434
for (int iLayer{start}; iLayer != end; iLayer += step) {
3535
if (track.mClusters[iLayer] == o2::its::constants::its::UnusedIndex) {
3636
continue;
3737
}
38-
const TrackingFrameInfo& trackingHit = Fitter.trackingFrame()[iLayer][track.mClusters[iLayer]];
38+
const TrackingFrameInfo& GPUrestrict() trackingHit = Fitter.trackingFrame()[iLayer][track.mClusters[iLayer]];
3939

4040
if (prop.PropagateToXAlpha(trackingHit.xTrackingFrame, trackingHit.alphaTrackingFrame, step > 0)) {
4141
return false;
@@ -55,7 +55,7 @@ GPUdii() bool GPUITSFitterKernel::fitTrack(GPUITSFitter& Fitter, GPUTPCGMPropaga
5555
}
5656

5757
template <>
58-
GPUdii() void GPUITSFitterKernel::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
58+
GPUdii() void GPUITSFitterKernel::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& GPUrestrict() smem, processorType& GPUrestrict() processors)
5959
{
6060
GPUITSFitter& Fitter = processors.itsFitter;
6161

GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
using namespace GPUCA_NAMESPACE::gpu;
2020

2121
template <>
22-
GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& merger)
22+
GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& GPUrestrict() smem, processorType& GPUrestrict() merger)
2323
{
2424
#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE)
2525
#pragma omp parallel for num_threads(merger.GetRec().GetDeviceProcessingSettings().nThreads)

GPU/GPUTracking/Merger/GPUTPCGMPhysicalTrackModel.cxx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
using namespace GPUCA_NAMESPACE::gpu;
1818

19-
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLight(float x, float Bz, float& dLp)
19+
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLight(float x, float Bz, float& GPUrestrict() dLp)
2020
{
2121
GPUTPCGMPhysicalTrackModel t = *this;
2222
if (CAMath::Abs(x - t.X()) < 1.e-8f) {
@@ -31,7 +31,7 @@ GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLight(float x, float Bz, fl
3131
return 0;
3232
}
3333

34-
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLightNoUpdate(float x, float Bz, float& dLp)
34+
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLightNoUpdate(float x, float Bz, float& GPUrestrict() dLp)
3535
{
3636
//
3737
// transport the track to X=x in magnetic field B = ( 0, 0, Bz[kG*0.000299792458] )
@@ -85,7 +85,7 @@ GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBzLightNoUpdate(float x, floa
8585
return 0;
8686
}
8787

88-
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBxByBz(float x, float Bx, float By, float Bz, float& dLp)
88+
GPUd() int GPUTPCGMPhysicalTrackModel::PropagateToXBxByBz(float x, float Bx, float By, float Bz, float& GPUrestrict() dLp)
8989
{
9090
//
9191
// transport the track to X=x in magnetic field B = ( Bx, By, Bz )[kG*0.000299792458]

GPU/GPUTracking/Merger/GPUTPCGMPhysicalTrackModel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class GPUTPCGMPhysicalTrackModel
164164

165165
GPUdi() GPUTPCGMPhysicalTrackModel::GPUTPCGMPhysicalTrackModel(const GPUTPCGMTrackParam& t) { Set(t); }
166166

167-
GPUdi() void GPUTPCGMPhysicalTrackModel::Set(const GPUTPCGMTrackParam& t)
167+
GPUdi() void GPUTPCGMPhysicalTrackModel::Set(const GPUTPCGMTrackParam& GPUrestrict() t)
168168
{
169169
float pti = CAMath::Abs(t.GetQPt());
170170
if (pti < 1.e-4f) {

0 commit comments

Comments
 (0)