AliceO2/GPU/Common/GPUCommonDefAPI.h at dev · AliceO2Group/AliceO2

History

234 lines (222 loc) · 10.1 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.

// All rights not expressly granted are reserved.

// This software is distributed under the terms of the GNU General Public

// License v3 (GPL Version 3), copied verbatim in the file "COPYING".

// In applying this license CERN does not waive the privileges and immunities

// granted to it by virtue of its status as an Intergovernmental Organization

// or submit itself to any jurisdiction.

/// \file GPUCommonDefAPI.h

/// \author David Rohr

#ifndef GPUCOMMONDEFAPI_H

#define GPUCOMMONDEFAPI_H

// clang-format off

#ifndef GPUCOMMONDEF_H

#error Please include GPUCommonDef.h!

#endif

#ifndef GPUCA_GPUCODE_DEVICE

#include <cstdint>

#endif

//Define macros for GPU keywords. i-version defines inline functions.

//All host-functions in GPU code are automatically inlined, to avoid duplicate symbols.

//For non-inline host only functions, use no keyword at all!

#if !defined(GPUCA_GPUCODE) || defined(__OPENCL_HOST__) // For host / ROOT dictionary

#define GPUd() // device function

#define GPUdDefault() // default (constructor / operator) device function

#define GPUhdDefault() // default (constructor / operator) host device function

#define GPUdi() inline // to-be-inlined device function

#define GPUdii() // Only on GPU to-be-inlined (forced) device function

#define GPUdni() // Device function, not-to-be-inlined

#define GPUdnii() inline // Device function, not-to-be-inlined on device, inlined on host

#define GPUh() // Host-only function

// NOTE: All GPUd*() functions are also compiled on the host during host compilation.

// The GPUh*() macros are for the rare cases of functions that you want to compile for the host during GPU compilation.

// Usually, you do not need the GPUh*() versions. If in doubt, use GPUd*()!

#define GPUhi() inline // to-be-inlined host-only function

#define GPUhd() // Host and device function, inlined during GPU compilation to avoid symbol clashes in host code

#define GPUhdi() inline // Host and device function, to-be-inlined on host and device

#define GPUhdni() // Host and device function, not to-be-inlined automatically

#define GPUg() INVALID_TRIGGER_ERROR_NO_GPU_CODE // GPU kernel

#define GPUshared() // shared memory variable declaration

#define GPUglobal() // global memory variable declaration (only used for kernel input pointers)

#define GPUconstant() // constant memory variable declaraion

#define GPUconstexpr() static constexpr // constexpr on GPU that needs to be instantiated for dynamic access (e.g. arrays), becomes __constant on GPU

#define GPUprivate() // private memory variable declaration

#define GPUgeneric() // reference / ptr to generic address space

#define GPUbarrier() // synchronize all GPU threads in block

#define GPUbarrierWarp() // synchronize threads inside warp

#define GPUAtomic(type) type // atomic variable type

#define GPUsharedref() // reference / ptr to shared memory

#define GPUglobalref() // reference / ptr to global memory

#define GPUconstantref() // reference / ptr to constant memory

#define GPUconstexprref() // reference / ptr to variable declared as GPUconstexpr()

#ifndef __VECTOR_TYPES_H__ // FIXME: ROOT will pull in these CUDA definitions if built against CUDA, so we have to add an ugly protection here

struct float4 { float x, y, z, w; };

struct float3 { float x, y, z; };

struct float2 { float x; float y; };

struct uchar2 { uint8_t x, y; };

struct short2 { int16_t x, y; };

struct ushort2 { uint16_t x, y; };

struct int2 { int32_t x, y; };

struct int3 { int32_t x, y, z; };

struct int4 { int32_t x, y, z, w; };

struct uint1 { uint32_t x; };

struct uint2 { uint32_t x, y; };

struct uint3 { uint32_t x, y, z; };

struct uint4 { uint32_t x, y, z, w; };

struct dim3 { uint32_t x, y, z; };

#endif

#elif defined(__OPENCL__) // Defines for OpenCL

#define GPUd()

#define GPUdDefault()

#define GPUhdDefault()

#define GPUdi() inline

#define GPUdii() __attribute__((always_inline)) inline

#define GPUdni()

#define GPUdnii()

#define GPUh() INVALID_TRIGGER_ERROR_NO_HOST_CODE

#define GPUhi() INVALID_TRIGGER_ERROR_NO_HOST_CODE

#define GPUhd() inline

#define GPUhdi() inline

#define GPUhdni()

#define GPUg() __kernel

#define GPUshared() __local

#define GPUglobal() __global

#define GPUconstant() __constant // TODO: possibly add const __restrict where possible later!

#define GPUconstexpr() __constant

#define GPUprivate() __private

#define GPUgeneric() __generic

#define GPUconstexprref() GPUconstexpr()

#if defined(__OPENCL__) && !defined(__clang__)

#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)

#define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)

#define GPUAtomic(type) atomic<type>

static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");

#else

#define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)

#define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)

#if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)

namespace o2 { namespace gpu {

template <class T> struct oclAtomic;

template <> struct oclAtomic<uint32_t> {typedef atomic_uint t;};

static_assert(sizeof(oclAtomic<uint32_t>::t) == sizeof(uint32_t), "Invalid size of atomic type");

}}

#define GPUAtomic(type) o2::gpu::oclAtomic<type>::t

#else

#define GPUAtomic(type) volatile type

#endif

#if !defined(__OPENCL__) // Other special defines for OpenCL 1

#define GPUCA_USE_TEMPLATE_ADDRESS_SPACES // TODO: check if we can make this (partially, where it is already implemented) compatible with OpenCL CPP

#define GPUsharedref() GPUshared()

#define GPUglobalref() GPUglobal()

#undef GPUgeneric

#define GPUgeneric()

#endif

#if (!defined(__OPENCL__) || !defined(GPUCA_NO_CONSTANT_MEMORY))

#define GPUconstantref() GPUconstant()

#endif

#elif defined(__HIPCC__) //Defines for HIP

#define GPUd() __device__

#define GPUdDefault() __device__

#define GPUhdDefault() __host__ __device__

#define GPUdi() __device__ inline

#define GPUdii() __device__ __forceinline__

#define GPUdni() __device__ __attribute__((noinline))

#define GPUdnii() __device__ __attribute__((noinline))

#define GPUh() __host__ inline

#define GPUhi() __host__ inline

#define GPUhd() __host__ __device__ inline

#define GPUhdi() __host__ __device__ inline

#define GPUhdni() __host__ __device__

#define GPUg() __global__

#define GPUshared() __shared__

#if defined(GPUCA_GPUCODE_DEVICE) && 0 // TODO: Fix for HIP

#define GPUCA_USE_TEMPLATE_ADDRESS_SPACES

#define GPUglobal() __attribute__((address_space(1)))

#define GPUglobalref() GPUglobal()

#define GPUconstantref() __attribute__((address_space(4)))

#define GPUsharedref() __attribute__((address_space(3)))

#else

#define GPUglobal()

#endif

#define GPUconstant() __constant__

#define GPUconstexpr() constexpr __constant__

#define GPUprivate()

#define GPUgeneric()

#define GPUbarrier() __syncthreads()

#define GPUbarrierWarp()

#define GPUAtomic(type) type

#elif defined(__CUDACC__) //Defines for CUDA

#ifndef GPUCA_GPUCODE_DEVICE

#define GPUd() __device__ inline // FIXME: DR: Workaround: mark device function as inline such that nvcc does not create bogus host symbols

#else

#define GPUd() __device__

#endif

#define GPUdDefault()

#define GPUhdDefault()

#define GPUdi() __device__ inline

#define GPUdii() __device__ inline

#define GPUdni() __device__ __attribute__((noinline))

#define GPUdnii() __device__ __attribute__((noinline))

#define GPUh() __host__ inline

#define GPUhi() __host__ inline

#define GPUhd() __host__ __device__ inline

#define GPUhdi() __host__ __device__ inline

#define GPUhdni() __host__ __device__

#define GPUg() __global__

#define GPUshared() __shared__

#define GPUglobal()

#define GPUconstant() __constant__

#define GPUconstexpr() constexpr __constant__

#define GPUprivate()

#define GPUgeneric()

#define GPUbarrier() __syncthreads()

#define GPUbarrierWarp() __syncwarp()

#define GPUAtomic(type) type

#endif

#ifndef GPUdic // Takes different parameter for inlining: 0 = never, 1 = always, 2 = compiler-decision

#define GPUdic(...) GPUd()

#endif

#define GPUCA_GPUdic_select_0() GPUdni()

#define GPUCA_GPUdic_select_1() GPUdii()

#define GPUCA_GPUdic_select_2() GPUd()

#if defined(GPUCA_NO_CONSTANT_MEMORY)

#undef GPUconstant

#define GPUconstant() GPUglobal()

#endif

#ifndef GPUsharedref

#define GPUsharedref()

#endif

#ifndef GPUglobalref

#define GPUglobalref()

#endif

#ifndef GPUconstantref

#define GPUconstantref()

#endif

#ifndef GPUconstexprref

#define GPUconstexprref()

#endif

#define GPUrestrict() __restrict__

// Macros for GRID dimension

#if defined(__CUDACC__) || defined(__HIPCC__)

#define get_global_id(dim) (blockIdx.x * blockDim.x + threadIdx.x)

#define get_global_size(dim) (blockDim.x * gridDim.x)

#define get_num_groups(dim) (gridDim.x)

#define get_local_id(dim) (threadIdx.x)

#define get_local_size(dim) (blockDim.x)

#define get_group_id(dim) (blockIdx.x)

#elif defined(__OPENCL__)

// Using OpenCL defaults

#else

#define get_global_id(dim) iBlock

#define get_global_size(dim) nBlocks

#define get_num_groups(dim) nBlocks

#define get_local_id(dim) 0

#define get_local_size(dim) 1

#define get_group_id(dim) iBlock

#endif

// clang-format on

#endif

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

GPUCommonDefAPI.h

Latest commit

History

GPUCommonDefAPI.h

File metadata and controls