2 changes: 2 additions & 0 deletions src/backend/cuda/device_manager.cpp
@@ -102,6 +102,7 @@ static const int jetsonComputeCapabilities[] = {
// clang-format off
static const cuNVRTCcompute Toolkit2MaxCompute[] = {
{12060, 9, 0, 0},
+{12050, 9, 0, 0},
{12040, 9, 0, 0},
{12030, 9, 0, 0},
{12020, 9, 0, 0},
@@ -144,6 +145,7 @@ struct ComputeCapabilityToStreamingProcessors {
static const ToolkitDriverVersions
CudaToDriverVersion[] = {
{12060, 525.60f, 528.33f},
+{12050, 525.60f, 528.33f},
{12040, 525.60f, 528.33f},
{12030, 525.60f, 528.33f},
{12020, 525.60f, 528.33f},
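Note: the two added rows simply register CUDA 12.5 with the same compute-capability ceiling and driver minimums as 12.6. A minimal sketch of how such a table is typically consulted (hypothetical helper; field names are assumptions inferred from the row layout above, not ArrayFire's actual lookup code):

```cpp
#include <cstddef>

// Assumed row layout: {toolkit version, min Linux driver, min Windows driver}.
struct ToolkitDriverVersions {
    int toolkit;       // e.g. 12050 for CUDA 12.5
    float unixMin;     // minimum Linux driver version
    float windowsMin;  // minimum Windows driver version
};

static const ToolkitDriverVersions CudaToDriverVersion[] = {
    {12060, 525.60f, 528.33f},
    {12050, 525.60f, 528.33f},  // row added by this change
    {12040, 525.60f, 528.33f},
};

// Returns the minimum Linux driver for a toolkit version, or -1 if unlisted.
inline float minUnixDriver(int toolkit) {
    for (size_t i = 0;
         i < sizeof(CudaToDriverVersion) / sizeof(CudaToDriverVersion[0]); ++i)
        if (CudaToDriverVersion[i].toolkit == toolkit)
            return CudaToDriverVersion[i].unixMin;
    return -1.0f;
}
```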
70 changes: 25 additions & 45 deletions src/backend/cuda/kernel/reduce_by_key.hpp
@@ -25,8 +25,6 @@

using std::unique_ptr;

-const static unsigned int FULL_MASK = 0xFFFFFFFF;
-
namespace arrayfire {
namespace cuda {
namespace kernel {
@@ -68,9 +66,9 @@ __global__ void test_needs_reduction(int *needs_another_reduction,

if (tid < n) { k = keys_in.ptr[tid]; }

-int update_key = (k == shfl_down_sync(FULL_MASK, k, 1)) &&
+int update_key = (k == shfl_down_sync(k, 1)) &&
(tid < (n - 1)) && ((threadIdx.x % 32) < 31);
-int remaining_updates = any_sync(FULL_MASK, update_key);
+int remaining_updates = any_sync(update_key);

__syncthreads();

@@ -83,7 +81,7 @@
&& (threadIdx.x < (blockDim.x - 1)) // not last thread in block
// next value valid and equal
&& ((tid + 1) < n) && (k == keys_in.ptr[tid + 1]));
-remaining_updates = any_sync(FULL_MASK, update_key);
+remaining_updates = any_sync(update_key);

// TODO: single per warp? change to assignment rather than atomicOr
if (remaining_updates) atomicOr(needs_another_reduction, remaining_updates);
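Note: dropping the explicit FULL_MASK argument is purely cosmetic here; the wrappers (see shfl_intrinsics.hpp below) now bake the full-warp mask in. A standalone sketch of the neighbor-key test this hunk touches, with the mask written out as the literal the wrapper supplies (hypothetical kernel, assuming CUDA 9.0+; not code from the PR):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_has_adjacent_duplicates(const int *keys, int n, int *flag) {
    int tid = threadIdx.x;
    int k   = (tid < n) ? keys[tid] : -1;

    // Compare each lane's key against the next lane's key.
    int next_k     = __shfl_down_sync(0xffffffffu, k, 1);
    int update_key = (k == next_k) && (tid < n - 1) && ((tid % 32) < 31);

    // __any_sync returns nonzero in every lane if any lane saw a match.
    if (__any_sync(0xffffffffu, update_key) && tid == 0) *flag = 1;
}

int main() {
    int h_keys[8] = {1, 2, 2, 3, 4, 5, 6, 7};
    int *d_keys, *d_flag, h_flag = 0;
    cudaMalloc(&d_keys, sizeof(h_keys));
    cudaMalloc(&d_flag, sizeof(int));
    cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice);
    warp_has_adjacent_duplicates<<<1, 32>>>(d_keys, 8, d_flag);
    cudaMemcpy(&h_flag, d_flag, sizeof(int), cudaMemcpyDeviceToHost);
    printf("adjacent duplicate keys: %d\n", h_flag);  // 1, from the 2,2 pair
    cudaFree(d_keys);
    cudaFree(d_flag);
    return 0;
}
```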
@@ -243,50 +241,41 @@ __global__ static void reduce_blocks_by_key(int *reduced_block_sizes,
v = common::Binary<compute_t<To>, op>::init();
}

-compute_t<Tk> eq_check = (k != shfl_up_sync(FULL_MASK, k, 1));
+compute_t<Tk> eq_check = (k != shfl_up_sync(k, 1));
// mark threads containing unique keys
char unique_flag = (eq_check || (laneid == 0)) && (tidx < n);

// scan unique flags to enumerate unique keys
char unique_id = unique_flag;
#pragma unroll
for (int offset = 1; offset < 32; offset <<= 1) {
-char y = shfl_up_sync(FULL_MASK, unique_id, offset);
+char y = shfl_up_sync(unique_id, offset);
if (laneid >= offset) unique_id += y;
}

//
// Reduce each warp by key
-char all_eq = (k == shfl_down_sync(FULL_MASK, k, 1));
-if (all_sync(FULL_MASK,
-all_eq)) { // check special case of single key per warp
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 1));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 2));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 4));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 8));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 16));
+char all_eq = (k == shfl_down_sync(k, 1));
+if (all_sync(all_eq)) { // check special case of single key per warp
+v = reduce(v, shfl_down_sync(v, 1));
+v = reduce(v, shfl_down_sync(v, 2));
+v = reduce(v, shfl_down_sync(v, 4));
+v = reduce(v, shfl_down_sync(v, 8));
+v = reduce(v, shfl_down_sync(v, 16));
} else {
compute_t<To> init = common::Binary<compute_t<To>, op>::init();
int eq_check, update_key;
-unsigned shflmask;
#pragma unroll
for (int delta = 1; delta < 32; delta <<= 1) {
eq_check =
-(unique_id == shfl_down_sync(FULL_MASK, unique_id, delta));
+(unique_id == shfl_down_sync(unique_id, delta));

// checks if this thread should perform a reduction
update_key =
eq_check && (laneid < (32 - delta)) && ((tidx + delta) < n);

-// obtains mask of all threads that should be reduced
-shflmask = ballot_sync(FULL_MASK, update_key);
-
-// shifts mask to include source threads that should participate in
-// _shfl
-shflmask |= (shflmask << delta);

// shfls data from neighboring threads
-compute_t<To> uval = shfl_down_sync(shflmask, v, delta);
+compute_t<To> uval = shfl_down_sync(v, delta);

// update if thread requires it
v = reduce(v, (update_key ? uval : init));
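Note: the deleted ballot_sync / mask-shift lines computed, per step, exactly which lanes take part in the shuffle. Under a full-warp mask every lane shuffles unconditionally, and correctness is preserved by the `update_key ? uval : init` select, since folding the identity element into a reduction is a no-op. A minimal sketch of one step under this reading (my paraphrase of the change, not code from the PR):

```cuda
// One step of the segmented warp reduction after the change. `init` must be
// the identity of `reduce`, so lanes with update_key == 0 keep their value.
template<typename T, typename Op>
__device__ T segmented_step(T v, T init, int update_key, int delta, Op reduce) {
    // All 32 lanes participate; no per-step source-lane mask is needed.
    T uval = __shfl_down_sync(0xffffffffu, v, delta);
    return reduce(v, update_key ? uval : init);
}
```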
@@ -479,50 +468,41 @@ __global__ static void reduce_blocks_dim_by_key(
v = init;
}

-Tk eq_check = (k != shfl_up_sync(FULL_MASK, k, 1));
+Tk eq_check = (k != shfl_up_sync(k, 1));
// mark threads containing unique keys
char unique_flag = (eq_check || (laneid == 0)) && (tidx < n);

// scan unique flags to enumerate unique keys
char unique_id = unique_flag;
#pragma unroll
for (int offset = 1; offset < 32; offset <<= 1) {
-char y = shfl_up_sync(FULL_MASK, unique_id, offset);
+char y = shfl_up_sync(unique_id, offset);
if (laneid >= offset) unique_id += y;
}

//
// Reduce each warp by key
-char all_eq = (k == shfl_down_sync(FULL_MASK, k, 1));
-if (all_sync(FULL_MASK,
-all_eq)) { // check special case of single key per warp
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 1));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 2));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 4));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 8));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 16));
+char all_eq = (k == shfl_down_sync(k, 1));
+if (all_sync(all_eq)) { // check special case of single key per warp
+v = reduce(v, shfl_down_sync(v, 1));
+v = reduce(v, shfl_down_sync(v, 2));
+v = reduce(v, shfl_down_sync(v, 4));
+v = reduce(v, shfl_down_sync(v, 8));
+v = reduce(v, shfl_down_sync(v, 16));
} else {
compute_t<To> init = common::Binary<compute_t<To>, op>::init();
int eq_check, update_key;
-unsigned shflmask;
#pragma unroll
for (int delta = 1; delta < 32; delta <<= 1) {
eq_check =
-(unique_id == shfl_down_sync(FULL_MASK, unique_id, delta));
+(unique_id == shfl_down_sync(unique_id, delta));

// checks if this thread should perform a reduction
update_key =
eq_check && (laneid < (32 - delta)) && ((tidx + delta) < n);

-// obtains mask of all threads that should be reduced
-shflmask = ballot_sync(FULL_MASK, update_key);
-
-// shifts mask to include source threads that should participate in
-// _shfl
-shflmask |= (shflmask << delta);

// shfls data from neighboring threads
-compute_t<To> uval = shfl_down_sync(shflmask, v, delta);
+compute_t<To> uval = shfl_down_sync(v, delta);

// update if thread requires it
v = reduce(v, (update_key ? uval : init));
46 changes: 24 additions & 22 deletions src/backend/cuda/kernel/shfl_intrinsics.hpp
@@ -11,63 +11,65 @@ namespace arrayfire {
namespace cuda {
namespace kernel {

+constexpr unsigned int FULL_MASK = 0xffffffff;
+
//__all_sync wrapper
template<typename T>
-__device__ T all_sync(unsigned mask, T var) {
+__device__ T all_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __all_sync(mask, var);
+return __all_sync(FULL_MASK, var);
#else
return __all(var);
#endif
}

//__any_sync wrapper
template<typename T>
-__device__ T any_sync(unsigned mask, T var) {
+__device__ T any_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __any_sync(mask, var);
+return __any_sync(FULL_MASK, var);
#else
return __any(var);
#endif
}

//__ballot_sync wrapper
template<typename T>
-__device__ T ballot_sync(unsigned mask, T var) {
+__device__ T ballot_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __ballot_sync(mask, var);
+return __ballot_sync(FULL_MASK, var);
#else
return __ballot(var);
#endif
}

//__shfl_down_sync wrapper
template<typename T>
-__device__ T shfl_down_sync(unsigned mask, T var, int delta) {
+__device__ T shfl_down_sync(T var, int delta) {
#if (CUDA_VERSION >= 9000)
-return __shfl_down_sync(mask, var, delta);
+return __shfl_down_sync(FULL_MASK, var, delta);
#else
return __shfl_down(var, delta);
#endif
}
// specialization for cfloat
template<>
-inline __device__ cfloat shfl_down_sync(unsigned mask, cfloat var, int delta) {
+inline __device__ cfloat shfl_down_sync(cfloat var, int delta) {
#if (CUDA_VERSION >= 9000)
-cfloat res = {__shfl_down_sync(mask, var.x, delta),
-__shfl_down_sync(mask, var.y, delta)};
+cfloat res = {__shfl_down_sync(FULL_MASK, var.x, delta),
+__shfl_down_sync(FULL_MASK, var.y, delta)};
#else
cfloat res = {__shfl_down(var.x, delta), __shfl_down(var.y, delta)};
#endif
return res;
}
// specialization for cdouble
template<>
-inline __device__ cdouble shfl_down_sync(unsigned mask, cdouble var,
+inline __device__ cdouble shfl_down_sync(cdouble var,
int delta) {
#if (CUDA_VERSION >= 9000)
-cdouble res = {__shfl_down_sync(mask, var.x, delta),
-__shfl_down_sync(mask, var.y, delta)};
+cdouble res = {__shfl_down_sync(FULL_MASK, var.x, delta),
+__shfl_down_sync(FULL_MASK, var.y, delta)};
#else
cdouble res = {__shfl_down(var.x, delta), __shfl_down(var.y, delta)};
#endif
@@ -76,30 +78,30 @@ inline __device__ cdouble shfl_down_sync(unsigned mask, cdouble var,

//__shfl_up_sync wrapper
template<typename T>
-__device__ T shfl_up_sync(unsigned mask, T var, int delta) {
+__device__ T shfl_up_sync(T var, int delta) {
#if (CUDA_VERSION >= 9000)
-return __shfl_up_sync(mask, var, delta);
+return __shfl_up_sync(FULL_MASK, var, delta);
#else
return __shfl_up(var, delta);
#endif
}
// specialization for cfloat
template<>
-inline __device__ cfloat shfl_up_sync(unsigned mask, cfloat var, int delta) {
+inline __device__ cfloat shfl_up_sync(cfloat var, int delta) {
#if (CUDA_VERSION >= 9000)
-cfloat res = {__shfl_up_sync(mask, var.x, delta),
-__shfl_up_sync(mask, var.y, delta)};
+cfloat res = {__shfl_up_sync(FULL_MASK, var.x, delta),
+__shfl_up_sync(FULL_MASK, var.y, delta)};
#else
cfloat res = {__shfl_up(var.x, delta), __shfl_up(var.y, delta)};
#endif
return res;
}
// specialization for cdouble
template<>
-inline __device__ cdouble shfl_up_sync(unsigned mask, cdouble var, int delta) {
+inline __device__ cdouble shfl_up_sync(cdouble var, int delta) {
#if (CUDA_VERSION >= 9000)
-cdouble res = {__shfl_up_sync(mask, var.x, delta),
-__shfl_up_sync(mask, var.y, delta)};
+cdouble res = {__shfl_up_sync(FULL_MASK, var.x, delta),
+__shfl_up_sync(FULL_MASK, var.y, delta)};
#else
cdouble res = {__shfl_up(var.x, delta), __shfl_up(var.y, delta)};
#endif
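Note: with the mask folded into the wrappers, every call site shrinks by one argument, but the hard-coded FULL_MASK also means callers must guarantee a fully converged warp. A usage sketch (hypothetical kernel, assuming this header is included and the arrayfire::cuda::kernel namespace is open; not part of the PR):

```cuda
// Full-warp sum written against the new maskless wrapper signatures.
// Launch with exactly 32 threads so all lanes are active under FULL_MASK.
__global__ void warp_sum(const float *in, float *out) {
    float v = in[threadIdx.x];
    // Down-shuffle tree reduction over one warp.
    for (int offset = 16; offset > 0; offset >>= 1)
        v += shfl_down_sync(v, offset);  // FULL_MASK supplied internally
    if (threadIdx.x == 0) *out = v;
}
```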
33 changes: 30 additions & 3 deletions src/backend/cuda/qr.cpp
@@ -67,6 +67,16 @@ struct mqr_func_def_t {
int, T *, int, int *);
};

+template<typename T>
+struct mqr_buf_func_def_t {
+using mqr_buf_func_def = cusolverStatus_t (*)(cusolverDnHandle_t,
+cublasSideMode_t,
+cublasOperation_t, int, int, int,
+const T *, int, const T *, T *,
+int, int *);
+};
+
+
#define QR_FUNC_DEF(FUNC) \
template<typename T> \
typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func(); \
@@ -94,15 +104,25 @@ QR_FUNC(geqrf, double, D)
QR_FUNC(geqrf, cfloat, C)
QR_FUNC(geqrf, cdouble, Z)

-#define MQR_FUNC_DEF(FUNC) \
-template<typename T> \
-typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func();
+#define MQR_FUNC_DEF(FUNC) \
+template<typename T> \
+typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func(); \
+\
+template<typename T> \
+typename FUNC##_buf_func_def_t<T>::FUNC##_buf_func_def FUNC##_buf_func();

#define MQR_FUNC(FUNC, TYPE, PREFIX) \
template<> \
typename FUNC##_func_def_t<TYPE>::FUNC##_func_def FUNC##_func<TYPE>() { \
return (FUNC##_func_def_t<TYPE>::FUNC##_func_def) & \
cusolverDn##PREFIX; \
-}
+} \
+\
+template<> \
+typename FUNC##_buf_func_def_t<TYPE>::FUNC##_buf_func_def \
+FUNC##_buf_func<TYPE>() { \
+return (FUNC##_buf_func_def_t<TYPE>::FUNC##_buf_func_def) & \
+cusolverDn##PREFIX##_bufferSize; \
+}

MQR_FUNC_DEF(mqr)
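Note: for a concrete instantiation — assuming the existing MQR_FUNC(mqr, float, Sormqr) call in the collapsed region below, which this diff does not show — the extended macro now emits roughly this pair of accessors (my expansion, whitespace abbreviated):

```cpp
// mqr_func<float>() returns the apply-Q routine itself...
template<>
typename mqr_func_def_t<float>::mqr_func_def mqr_func<float>() {
    return (mqr_func_def_t<float>::mqr_func_def)&cusolverDnSormqr;
}

// ...and the new mqr_buf_func<float>() returns its workspace-size query.
template<>
typename mqr_buf_func_def_t<float>::mqr_buf_func_def mqr_buf_func<float>() {
    return (mqr_buf_func_def_t<float>::mqr_buf_func_def)&cusolverDnSormqr_bufferSize;
}
```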
@@ -143,6 +163,13 @@ void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in) {
dim4 qdims(M, mn);
q = identity<T>(qdims);

+CUSOLVER_CHECK(mqr_buf_func<T>()(
+solverDnHandle(), CUBLAS_SIDE_LEFT, CUBLAS_OP_N, q.dims()[0],
+q.dims()[1], min(M, N), in_copy.get(), in_copy.strides()[1], t.get(),
+q.get(), q.strides()[1], &lwork));
+
+workspace = memAlloc<T>(lwork);
+
CUSOLVER_CHECK(mqr_func<T>()(
solverDnHandle(), CUBLAS_SIDE_LEFT, CUBLAS_OP_N, q.dims()[0],
q.dims()[1], min(M, N), in_copy.get(), in_copy.strides()[1], t.get(),
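Note: the new hunk sizes the ormqr workspace with the matching bufferSize query before allocating, the canonical cuSOLVER query-then-allocate pattern: the *_bufferSize call takes the exact same arguments as the subsequent call and returns the required lwork. Against the raw C API that pattern reads as follows (float-case sketch with error handling elided; not ArrayFire's macro-based wrappers):

```cpp
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Apply Q (from a prior geqrf of A, with Householder scalars tau) to C.
void apply_q(cusolverDnHandle_t handle, int m, int n, int k,
             const float *A, int lda, const float *tau, float *C, int ldc) {
    // 1. Query the workspace size with the same arguments as the real call.
    int lwork = 0;
    cusolverDnSormqr_bufferSize(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N,
                                m, n, k, A, lda, tau, C, ldc, &lwork);

    // 2. Allocate exactly what was requested, then run the computation.
    float *work = nullptr;
    int *devInfo = nullptr;
    cudaMalloc(&work, sizeof(float) * lwork);
    cudaMalloc(&devInfo, sizeof(int));
    cusolverDnSormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N,
                     m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
    cudaFree(work);
    cudaFree(devInfo);
}
```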