-
Notifications
You must be signed in to change notification settings - Fork 496
Expand file tree
/
Copy pathGPUCommonDefAPI.h
More file actions
234 lines (222 loc) · 10.1 KB
/
GPUCommonDefAPI.h
File metadata and controls
234 lines (222 loc) · 10.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.
/// \file GPUCommonDefAPI.h
/// \author David Rohr
#ifndef GPUCOMMONDEFAPI_H
#define GPUCOMMONDEFAPI_H
// clang-format off
#ifndef GPUCOMMONDEF_H
#error Please include GPUCommonDef.h!
#endif
#ifndef GPUCA_GPUCODE_DEVICE
#include <cstdint>
#endif
//Define macros for GPU keywords. i-version defines inline functions.
//All host-functions in GPU code are automatically inlined, to avoid duplicate symbols.
//For non-inline host only functions, use no keyword at all!
#if !defined(GPUCA_GPUCODE) || defined(__OPENCL_HOST__) // For host / ROOT dictionary
#define GPUd() // device function
#define GPUdDefault() // default (constructor / operator) device function
#define GPUhdDefault() // default (constructor / operator) host device function
#define GPUdi() inline // to-be-inlined device function
#define GPUdii() // Only on GPU to-be-inlined (forced) device function
#define GPUdni() // Device function, not-to-be-inlined
#define GPUdnii() inline // Device function, not-to-be-inlined on device, inlined on host
#define GPUh() // Host-only function
// NOTE: All GPUd*() functions are also compiled on the host during host compilation.
// The GPUh*() macros are for the rare cases of functions that you want to compile for the host during GPU compilation.
// Usually, you do not need the GPUh*() versions. If in doubt, use GPUd*()!
#define GPUhi() inline // to-be-inlined host-only function
#define GPUhd() // Host and device function, inlined during GPU compilation to avoid symbol clashes in host code
#define GPUhdi() inline // Host and device function, to-be-inlined on host and device
#define GPUhdni() // Host and device function, not to-be-inlined automatically
#define GPUg() INVALID_TRIGGER_ERROR_NO_GPU_CODE // GPU kernel
#define GPUshared() // shared memory variable declaration
#define GPUglobal() // global memory variable declaration (only used for kernel input pointers)
#define GPUconstant() // constant memory variable declaraion
#define GPUconstexpr() static constexpr // constexpr on GPU that needs to be instantiated for dynamic access (e.g. arrays), becomes __constant on GPU
#define GPUprivate() // private memory variable declaration
#define GPUgeneric() // reference / ptr to generic address space
#define GPUbarrier() // synchronize all GPU threads in block
#define GPUbarrierWarp() // synchronize threads inside warp
#define GPUAtomic(type) type // atomic variable type
#define GPUsharedref() // reference / ptr to shared memory
#define GPUglobalref() // reference / ptr to global memory
#define GPUconstantref() // reference / ptr to constant memory
#define GPUconstexprref() // reference / ptr to variable declared as GPUconstexpr()
#ifndef __VECTOR_TYPES_H__ // FIXME: ROOT will pull in these CUDA definitions if built against CUDA, so we have to add an ugly protection here
struct float4 { float x, y, z, w; };
struct float3 { float x, y, z; };
struct float2 { float x; float y; };
struct uchar2 { uint8_t x, y; };
struct short2 { int16_t x, y; };
struct ushort2 { uint16_t x, y; };
struct int2 { int32_t x, y; };
struct int3 { int32_t x, y, z; };
struct int4 { int32_t x, y, z, w; };
struct uint1 { uint32_t x; };
struct uint2 { uint32_t x, y; };
struct uint3 { uint32_t x, y, z; };
struct uint4 { uint32_t x, y, z, w; };
struct dim3 { uint32_t x, y, z; };
#endif
#elif defined(__OPENCL__) // Defines for OpenCL
#define GPUd()
#define GPUdDefault()
#define GPUhdDefault()
#define GPUdi() inline
#define GPUdii() __attribute__((always_inline)) inline
#define GPUdni()
#define GPUdnii()
#define GPUh() INVALID_TRIGGER_ERROR_NO_HOST_CODE
#define GPUhi() INVALID_TRIGGER_ERROR_NO_HOST_CODE
#define GPUhd() inline
#define GPUhdi() inline
#define GPUhdni()
#define GPUg() __kernel
#define GPUshared() __local
#define GPUglobal() __global
#define GPUconstant() __constant // TODO: possibly add const __restrict where possible later!
#define GPUconstexpr() __constant
#define GPUprivate() __private
#define GPUgeneric() __generic
#define GPUconstexprref() GPUconstexpr()
#if defined(__OPENCL__) && !defined(__clang__)
#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)
#define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)
#define GPUAtomic(type) atomic<type>
static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");
#else
#define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
#define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
#if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)
namespace o2 { namespace gpu {
template <class T> struct oclAtomic;
template <> struct oclAtomic<uint32_t> {typedef atomic_uint t;};
static_assert(sizeof(oclAtomic<uint32_t>::t) == sizeof(uint32_t), "Invalid size of atomic type");
}}
#define GPUAtomic(type) o2::gpu::oclAtomic<type>::t
#else
#define GPUAtomic(type) volatile type
#endif
#endif
#if !defined(__OPENCL__) // Other special defines for OpenCL 1
#define GPUCA_USE_TEMPLATE_ADDRESS_SPACES // TODO: check if we can make this (partially, where it is already implemented) compatible with OpenCL CPP
#define GPUsharedref() GPUshared()
#define GPUglobalref() GPUglobal()
#undef GPUgeneric
#define GPUgeneric()
#endif
#if (!defined(__OPENCL__) || !defined(GPUCA_NO_CONSTANT_MEMORY))
#define GPUconstantref() GPUconstant()
#endif
#elif defined(__HIPCC__) //Defines for HIP
#define GPUd() __device__
#define GPUdDefault() __device__
#define GPUhdDefault() __host__ __device__
#define GPUdi() __device__ inline
#define GPUdii() __device__ __forceinline__
#define GPUdni() __device__ __attribute__((noinline))
#define GPUdnii() __device__ __attribute__((noinline))
#define GPUh() __host__ inline
#define GPUhi() __host__ inline
#define GPUhd() __host__ __device__ inline
#define GPUhdi() __host__ __device__ inline
#define GPUhdni() __host__ __device__
#define GPUg() __global__
#define GPUshared() __shared__
#if defined(GPUCA_GPUCODE_DEVICE) && 0 // TODO: Fix for HIP
#define GPUCA_USE_TEMPLATE_ADDRESS_SPACES
#define GPUglobal() __attribute__((address_space(1)))
#define GPUglobalref() GPUglobal()
#define GPUconstantref() __attribute__((address_space(4)))
#define GPUsharedref() __attribute__((address_space(3)))
#else
#define GPUglobal()
#endif
#define GPUconstant() __constant__
#define GPUconstexpr() constexpr __constant__
#define GPUprivate()
#define GPUgeneric()
#define GPUbarrier() __syncthreads()
#define GPUbarrierWarp()
#define GPUAtomic(type) type
#elif defined(__CUDACC__) //Defines for CUDA
#ifndef GPUCA_GPUCODE_DEVICE
#define GPUd() __device__ inline // FIXME: DR: Workaround: mark device function as inline such that nvcc does not create bogus host symbols
#else
#define GPUd() __device__
#endif
#define GPUdDefault()
#define GPUhdDefault()
#define GPUdi() __device__ inline
#define GPUdii() __device__ inline
#define GPUdni() __device__ __attribute__((noinline))
#define GPUdnii() __device__ __attribute__((noinline))
#define GPUh() __host__ inline
#define GPUhi() __host__ inline
#define GPUhd() __host__ __device__ inline
#define GPUhdi() __host__ __device__ inline
#define GPUhdni() __host__ __device__
#define GPUg() __global__
#define GPUshared() __shared__
#define GPUglobal()
#define GPUconstant() __constant__
#define GPUconstexpr() constexpr __constant__
#define GPUprivate()
#define GPUgeneric()
#define GPUbarrier() __syncthreads()
#define GPUbarrierWarp() __syncwarp()
#define GPUAtomic(type) type
#endif
#ifndef GPUdic // Takes different parameter for inlining: 0 = never, 1 = always, 2 = compiler-decision
#define GPUdic(...) GPUd()
#endif
#define GPUCA_GPUdic_select_0() GPUdni()
#define GPUCA_GPUdic_select_1() GPUdii()
#define GPUCA_GPUdic_select_2() GPUd()
#if defined(GPUCA_NO_CONSTANT_MEMORY)
#undef GPUconstant
#define GPUconstant() GPUglobal()
#endif
#ifndef GPUsharedref
#define GPUsharedref()
#endif
#ifndef GPUglobalref
#define GPUglobalref()
#endif
#ifndef GPUconstantref
#define GPUconstantref()
#endif
#ifndef GPUconstexprref
#define GPUconstexprref()
#endif
#define GPUrestrict() __restrict__
// Macros for GRID dimension
#if defined(__CUDACC__) || defined(__HIPCC__)
#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x)
#define get_global_size(dim) (blockDim.x * gridDim.x)
#define get_num_groups(dim) (gridDim.x)
#define get_local_id(dim) (threadIdx.x)
#define get_local_size(dim) (blockDim.x)
#define get_group_id(dim) (blockIdx.x)
#elif defined(__OPENCL__)
// Using OpenCL defaults
#else
#define get_global_id(dim) iBlock
#define get_global_size(dim) nBlocks
#define get_num_groups(dim) nBlocks
#define get_local_id(dim) 0
#define get_local_size(dim) 1
#define get_group_id(dim) iBlock
#endif
// clang-format on
#endif