2 changes: 2 additions & 0 deletions src/backend/cuda/device_manager.cpp
@@ -102,6 +102,7 @@ static const int jetsonComputeCapabilities[] = {
// clang-format off
static const cuNVRTCcompute Toolkit2MaxCompute[] = {
{12060, 9, 0, 0},
+{12050, 9, 0, 0},
{12040, 9, 0, 0},
{12030, 9, 0, 0},
{12020, 9, 0, 0},
@@ -144,6 +145,7 @@ struct ComputeCapabilityToStreamingProcessors {
static const ToolkitDriverVersions
CudaToDriverVersion[] = {
{12060, 525.60f, 528.33f},
+{12050, 525.60f, 528.33f},
{12040, 525.60f, 528.33f},
{12030, 525.60f, 528.33f},
{12020, 525.60f, 528.33f},
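Note: the two added rows simply register CUDA 12.5 with the same compute-capability ceiling and driver minimums as 12.6. A minimal sketch of how such a table is typically consulted (hypothetical helper; field names are assumptions inferred from the row layout above, not ArrayFire's actual lookup code):

```cpp
#include <cstddef>

// Assumed row layout: {toolkit version, min Linux driver, min Windows driver}.
struct ToolkitDriverVersions {
    int toolkit;       // e.g. 12050 for CUDA 12.5
    float unixMin;     // minimum Linux driver version
    float windowsMin;  // minimum Windows driver version
};

static const ToolkitDriverVersions CudaToDriverVersion[] = {
    {12060, 525.60f, 528.33f},
    {12050, 525.60f, 528.33f},  // row added by this change
    {12040, 525.60f, 528.33f},
};

// Returns the minimum Linux driver for a toolkit version, or -1 if unlisted.
inline float minUnixDriver(int toolkit) {
    for (size_t i = 0;
         i < sizeof(CudaToDriverVersion) / sizeof(CudaToDriverVersion[0]); ++i)
        if (CudaToDriverVersion[i].toolkit == toolkit)
            return CudaToDriverVersion[i].unixMin;
    return -1.0f;
}
```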
70 changes: 25 additions & 45 deletions src/backend/cuda/kernel/reduce_by_key.hpp
@@ -25,8 +25,6 @@

using std::unique_ptr;

-const static unsigned int FULL_MASK = 0xFFFFFFFF;
-
namespace arrayfire {
namespace cuda {
namespace kernel {
@@ -68,9 +66,9 @@ __global__ void test_needs_reduction(int *needs_another_reduction,

if (tid < n) { k = keys_in.ptr[tid]; }

-int update_key = (k == shfl_down_sync(FULL_MASK, k, 1)) &&
+int update_key = (k == shfl_down_sync(k, 1)) &&
(tid < (n - 1)) && ((threadIdx.x % 32) < 31);
-int remaining_updates = any_sync(FULL_MASK, update_key);
+int remaining_updates = any_sync(update_key);

__syncthreads();

@@ -83,7 +81,7 @@
&& (threadIdx.x < (blockDim.x - 1)) // not last thread in block
// next value valid and equal
&& ((tid + 1) < n) && (k == keys_in.ptr[tid + 1]));
-remaining_updates = any_sync(FULL_MASK, update_key);
+remaining_updates = any_sync(update_key);

// TODO: single per warp? change to assignment rather than atomicOr
if (remaining_updates) atomicOr(needs_another_reduction, remaining_updates);
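Note: dropping the explicit FULL_MASK argument is purely cosmetic here; the wrappers (see shfl_intrinsics.hpp below) now bake the full-warp mask in. A standalone sketch of the neighbor-key test this hunk touches, with the mask written out as the literal the wrapper supplies (hypothetical kernel, assuming CUDA 9.0+; not code from the PR):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_has_adjacent_duplicates(const int *keys, int n, int *flag) {
    int tid = threadIdx.x;
    int k   = (tid < n) ? keys[tid] : -1;

    // Compare each lane's key against the next lane's key.
    int next_k     = __shfl_down_sync(0xffffffffu, k, 1);
    int update_key = (k == next_k) && (tid < n - 1) && ((tid % 32) < 31);

    // __any_sync returns nonzero in every lane if any lane saw a match.
    if (__any_sync(0xffffffffu, update_key) && tid == 0) *flag = 1;
}

int main() {
    int h_keys[8] = {1, 2, 2, 3, 4, 5, 6, 7};
    int *d_keys, *d_flag, h_flag = 0;
    cudaMalloc(&d_keys, sizeof(h_keys));
    cudaMalloc(&d_flag, sizeof(int));
    cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice);
    warp_has_adjacent_duplicates<<<1, 32>>>(d_keys, 8, d_flag);
    cudaMemcpy(&h_flag, d_flag, sizeof(int), cudaMemcpyDeviceToHost);
    printf("adjacent duplicate keys: %d\n", h_flag);  // 1, from the 2,2 pair
    cudaFree(d_keys);
    cudaFree(d_flag);
    return 0;
}
```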
@@ -243,50 +241,41 @@ __global__ static void reduce_blocks_by_key(int *reduced_block_sizes,
v = common::Binary<compute_t<To>, op>::init();
}

-compute_t<Tk> eq_check = (k != shfl_up_sync(FULL_MASK, k, 1));
+compute_t<Tk> eq_check = (k != shfl_up_sync(k, 1));
// mark threads containing unique keys
char unique_flag = (eq_check || (laneid == 0)) && (tidx < n);

// scan unique flags to enumerate unique keys
char unique_id = unique_flag;
#pragma unroll
for (int offset = 1; offset < 32; offset <<= 1) {
-char y = shfl_up_sync(FULL_MASK, unique_id, offset);
+char y = shfl_up_sync(unique_id, offset);
if (laneid >= offset) unique_id += y;
}

//
// Reduce each warp by key
-char all_eq = (k == shfl_down_sync(FULL_MASK, k, 1));
-if (all_sync(FULL_MASK,
-all_eq)) { // check special case of single key per warp
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 1));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 2));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 4));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 8));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 16));
+char all_eq = (k == shfl_down_sync(k, 1));
+if (all_sync(all_eq)) { // check special case of single key per warp
+v = reduce(v, shfl_down_sync(v, 1));
+v = reduce(v, shfl_down_sync(v, 2));
+v = reduce(v, shfl_down_sync(v, 4));
+v = reduce(v, shfl_down_sync(v, 8));
+v = reduce(v, shfl_down_sync(v, 16));
} else {
compute_t<To> init = common::Binary<compute_t<To>, op>::init();
int eq_check, update_key;
-unsigned shflmask;
#pragma unroll
for (int delta = 1; delta < 32; delta <<= 1) {
eq_check =
-(unique_id == shfl_down_sync(FULL_MASK, unique_id, delta));
+(unique_id == shfl_down_sync(unique_id, delta));

// checks if this thread should perform a reduction
update_key =
eq_check && (laneid < (32 - delta)) && ((tidx + delta) < n);

-// obtains mask of all threads that should be reduced
-shflmask = ballot_sync(FULL_MASK, update_key);
-
-// shifts mask to include source threads that should participate in
-// _shfl
-shflmask |= (shflmask << delta);

// shfls data from neighboring threads
-compute_t<To> uval = shfl_down_sync(shflmask, v, delta);
+compute_t<To> uval = shfl_down_sync(v, delta);

// update if thread requires it
v = reduce(v, (update_key ? uval : init));
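Note: the deleted ballot_sync / mask-shift lines computed, per step, exactly which lanes take part in the shuffle. Under a full-warp mask every lane shuffles unconditionally, and correctness is preserved by the `update_key ? uval : init` select, since folding the identity element into a reduction is a no-op. A minimal sketch of one step under this reading (my paraphrase of the change, not code from the PR):

```cuda
// One step of the segmented warp reduction after the change. `init` must be
// the identity of `reduce`, so lanes with update_key == 0 keep their value.
template<typename T, typename Op>
__device__ T segmented_step(T v, T init, int update_key, int delta, Op reduce) {
    // All 32 lanes participate; no per-step source-lane mask is needed.
    T uval = __shfl_down_sync(0xffffffffu, v, delta);
    return reduce(v, update_key ? uval : init);
}
```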
@@ -479,50 +468,41 @@ __global__ static void reduce_blocks_dim_by_key(
v = init;
}

-Tk eq_check = (k != shfl_up_sync(FULL_MASK, k, 1));
+Tk eq_check = (k != shfl_up_sync(k, 1));
// mark threads containing unique keys
char unique_flag = (eq_check || (laneid == 0)) && (tidx < n);

// scan unique flags to enumerate unique keys
char unique_id = unique_flag;
#pragma unroll
for (int offset = 1; offset < 32; offset <<= 1) {
-char y = shfl_up_sync(FULL_MASK, unique_id, offset);
+char y = shfl_up_sync(unique_id, offset);
if (laneid >= offset) unique_id += y;
}

//
// Reduce each warp by key
-char all_eq = (k == shfl_down_sync(FULL_MASK, k, 1));
-if (all_sync(FULL_MASK,
-all_eq)) { // check special case of single key per warp
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 1));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 2));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 4));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 8));
-v = reduce(v, shfl_down_sync(FULL_MASK, v, 16));
+char all_eq = (k == shfl_down_sync(k, 1));
+if (all_sync(all_eq)) { // check special case of single key per warp
+v = reduce(v, shfl_down_sync(v, 1));
+v = reduce(v, shfl_down_sync(v, 2));
+v = reduce(v, shfl_down_sync(v, 4));
+v = reduce(v, shfl_down_sync(v, 8));
+v = reduce(v, shfl_down_sync(v, 16));
} else {
compute_t<To> init = common::Binary<compute_t<To>, op>::init();
int eq_check, update_key;
-unsigned shflmask;
#pragma unroll
for (int delta = 1; delta < 32; delta <<= 1) {
eq_check =
-(unique_id == shfl_down_sync(FULL_MASK, unique_id, delta));
+(unique_id == shfl_down_sync(unique_id, delta));

// checks if this thread should perform a reduction
update_key =
eq_check && (laneid < (32 - delta)) && ((tidx + delta) < n);

-// obtains mask of all threads that should be reduced
-shflmask = ballot_sync(FULL_MASK, update_key);
-
-// shifts mask to include source threads that should participate in
-// _shfl
-shflmask |= (shflmask << delta);

// shfls data from neighboring threads
-compute_t<To> uval = shfl_down_sync(shflmask, v, delta);
+compute_t<To> uval = shfl_down_sync(v, delta);

// update if thread requires it
v = reduce(v, (update_key ? uval : init));
46 changes: 24 additions & 22 deletions src/backend/cuda/kernel/shfl_intrinsics.hpp
@@ -11,63 +11,65 @@ namespace arrayfire {
namespace cuda {
namespace kernel {

+constexpr unsigned int FULL_MASK = 0xffffffff;
+
//__all_sync wrapper
template<typename T>
-__device__ T all_sync(unsigned mask, T var) {
+__device__ T all_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __all_sync(mask, var);
+return __all_sync(FULL_MASK, var);
#else
return __all(var);
#endif
}

//__any_sync wrapper
template<typename T>
-__device__ T any_sync(unsigned mask, T var) {
+__device__ T any_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __any_sync(mask, var);
+return __any_sync(FULL_MASK, var);
#else
return __any(var);
#endif
}

//__ballot_sync wrapper
template<typename T>
-__device__ T ballot_sync(unsigned mask, T var) {
+__device__ T ballot_sync(T var) {
#if (CUDA_VERSION >= 9000)
-return __ballot_sync(mask, var);
+return __ballot_sync(FULL_MASK, var);
#else
return __ballot(var);
#endif
}

//__shfl_down_sync wrapper
template<typename T>
-__device__ T shfl_down_sync(unsigned mask, T var, int delta) {
+__device__ T shfl_down_sync(T var, int delta) {
#if (CUDA_VERSION >= 9000)
-return __shfl_down_sync(mask, var, delta);
+return __shfl_down_sync(FULL_MASK, var, delta);
#else
return __shfl_down(var, delta);
#endif
}
// specialization for cfloat
template<>
-inline __device__ cfloat shfl_down_sync(unsigned mask, cfloat var, int delta) {
+inline __device__ cfloat shfl_down_sync(cfloat var, int delta) {
#if (CUDA_VERSION >= 9000)
-cfloat res = {__shfl_down_sync(mask, var.x, delta),
-__shfl_down_sync(mask, var.y, delta)};
+cfloat res = {__shfl_down_sync(FULL_MASK, var.x, delta),
+__shfl_down_sync(FULL_MASK, var.y, delta)};
#else
cfloat res = {__shfl_down(var.x, delta), __shfl_down(var.y, delta)};
#endif
return res;
}
// specialization for cdouble
template<>
-inline __device__ cdouble shfl_down_sync(unsigned mask, cdouble var,
+inline __device__ cdouble shfl_down_sync(cdouble var,
int delta) {
#if (CUDA_VERSION >= 9000)
-cdouble res = {__shfl_down_sync(mask, var.x, delta),
-__shfl_down_sync(mask, var.y, delta)};
+cdouble res = {__shfl_down_sync(FULL_MASK, var.x, delta),
+__shfl_down_sync(FULL_MASK, var.y, delta)};
#else
cdouble res = {__shfl_down(var.x, delta), __shfl_down(var.y, delta)};
#endif
@@ -76,30 +78,30 @@ inline __device__ cdouble shfl_down_sync(unsigned mask, cdouble var,

//__shfl_up_sync wrapper
template<typename T>
-__device__ T shfl_up_sync(unsigned mask, T var, int delta) {
+__device__ T shfl_up_sync(T var, int delta) {
#if (CUDA_VERSION >= 9000)
-return __shfl_up_sync(mask, var, delta);
+return __shfl_up_sync(FULL_MASK, var, delta);
#else
return __shfl_up(var, delta);
#endif
}
// specialization for cfloat
template<>
-inline __device__ cfloat shfl_up_sync(unsigned mask, cfloat var, int delta) {
+inline __device__ cfloat shfl_up_sync(cfloat var, int delta) {
#if (CUDA_VERSION >= 9000)
-cfloat res = {__shfl_up_sync(mask, var.x, delta),
-__shfl_up_sync(mask, var.y, delta)};
+cfloat res = {__shfl_up_sync(FULL_MASK, var.x, delta),
+__shfl_up_sync(FULL_MASK, var.y, delta)};
#else
cfloat res = {__shfl_up(var.x, delta), __shfl_up(var.y, delta)};
#endif
return res;
}
// specialization for cdouble
template<>
-inline __device__ cdouble shfl_up_sync(unsigned mask, cdouble var, int delta) {
+inline __device__ cdouble shfl_up_sync(cdouble var, int delta) {
#if (CUDA_VERSION >= 9000)
-cdouble res = {__shfl_up_sync(mask, var.x, delta),
-__shfl_up_sync(mask, var.y, delta)};
+cdouble res = {__shfl_up_sync(FULL_MASK, var.x, delta),
+__shfl_up_sync(FULL_MASK, var.y, delta)};
#else
cdouble res = {__shfl_up(var.x, delta), __shfl_up(var.y, delta)};
#endif
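Note: with the mask folded into the wrappers, every call site shrinks by one argument, but the hard-coded FULL_MASK also means callers must guarantee a fully converged warp. A usage sketch (hypothetical kernel, assuming this header is included and the arrayfire::cuda::kernel namespace is open; not part of the PR):

```cuda
// Full-warp sum written against the new maskless wrapper signatures.
// Launch with exactly 32 threads so all lanes are active under FULL_MASK.
__global__ void warp_sum(const float *in, float *out) {
    float v = in[threadIdx.x];
    // Down-shuffle tree reduction over one warp.
    for (int offset = 16; offset > 0; offset >>= 1)
        v += shfl_down_sync(v, offset);  // FULL_MASK supplied internally
    if (threadIdx.x == 0) *out = v;
}
```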
33 changes: 30 additions & 3 deletions src/backend/cuda/qr.cpp
@@ -67,6 +67,16 @@ struct mqr_func_def_t {
int, T *, int, int *);
};

+template<typename T>
+struct mqr_buf_func_def_t {
+using mqr_buf_func_def = cusolverStatus_t (*)(cusolverDnHandle_t,
+cublasSideMode_t,
+cublasOperation_t, int, int, int,
+const T *, int, const T *, T *,
+int, int *);
+};
+
+
#define QR_FUNC_DEF(FUNC) \
template<typename T> \
typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func(); \
@@ -94,15 +104,25 @@ QR_FUNC(geqrf, double, D)
QR_FUNC(geqrf, cfloat, C)
QR_FUNC(geqrf, cdouble, Z)

-#define MQR_FUNC_DEF(FUNC) \
-template<typename T> \
-typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func();
+#define MQR_FUNC_DEF(FUNC) \
+template<typename T> \
+typename FUNC##_func_def_t<T>::FUNC##_func_def FUNC##_func(); \
+\
+template<typename T> \
+typename FUNC##_buf_func_def_t<T>::FUNC##_buf_func_def FUNC##_buf_func();

#define MQR_FUNC(FUNC, TYPE, PREFIX) \
template<> \
typename FUNC##_func_def_t<TYPE>::FUNC##_func_def FUNC##_func<TYPE>() { \
return (FUNC##_func_def_t<TYPE>::FUNC##_func_def) & \
cusolverDn##PREFIX; \
-}
+} \
+\
+template<> \
+typename FUNC##_buf_func_def_t<TYPE>::FUNC##_buf_func_def \
+FUNC##_buf_func<TYPE>() { \
+return (FUNC##_buf_func_def_t<TYPE>::FUNC##_buf_func_def) & \
+cusolverDn##PREFIX##_bufferSize; \
+}

MQR_FUNC_DEF(mqr)
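Note: for a concrete instantiation — assuming the existing MQR_FUNC(mqr, float, Sormqr) call in the collapsed region below, which this diff does not show — the extended macro now emits roughly this pair of accessors (my expansion, whitespace abbreviated):

```cpp
// mqr_func<float>() returns the apply-Q routine itself...
template<>
typename mqr_func_def_t<float>::mqr_func_def mqr_func<float>() {
    return (mqr_func_def_t<float>::mqr_func_def)&cusolverDnSormqr;
}

// ...and the new mqr_buf_func<float>() returns its workspace-size query.
template<>
typename mqr_buf_func_def_t<float>::mqr_buf_func_def mqr_buf_func<float>() {
    return (mqr_buf_func_def_t<float>::mqr_buf_func_def)&cusolverDnSormqr_bufferSize;
}
```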
@@ -143,6 +163,13 @@ void qr(Array<T> &q, Array<T> &r, Array<T> &t, const Array<T> &in) {
dim4 qdims(M, mn);
q = identity<T>(qdims);

+CUSOLVER_CHECK(mqr_buf_func<T>()(
+solverDnHandle(), CUBLAS_SIDE_LEFT, CUBLAS_OP_N, q.dims()[0],
+q.dims()[1], min(M, N), in_copy.get(), in_copy.strides()[1], t.get(),
+q.get(), q.strides()[1], &lwork));
+
+workspace = memAlloc<T>(lwork);
+
CUSOLVER_CHECK(mqr_func<T>()(
solverDnHandle(), CUBLAS_SIDE_LEFT, CUBLAS_OP_N, q.dims()[0],
q.dims()[1], min(M, N), in_copy.get(), in_copy.strides()[1], t.get(),
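Note: the new hunk sizes the ormqr workspace with the matching bufferSize query before allocating, the canonical cuSOLVER query-then-allocate pattern: the *_bufferSize call takes the exact same arguments as the subsequent call and returns the required lwork. Against the raw C API that pattern reads as follows (float-case sketch with error handling elided; not ArrayFire's macro-based wrappers):

```cpp
#include <cuda_runtime.h>
#include <cusolverDn.h>

// Apply Q (from a prior geqrf of A, with Householder scalars tau) to C.
void apply_q(cusolverDnHandle_t handle, int m, int n, int k,
             const float *A, int lda, const float *tau, float *C, int ldc) {
    // 1. Query the workspace size with the same arguments as the real call.
    int lwork = 0;
    cusolverDnSormqr_bufferSize(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N,
                                m, n, k, A, lda, tau, C, ldc, &lwork);

    // 2. Allocate exactly what was requested, then run the computation.
    float *work = nullptr;
    int *devInfo = nullptr;
    cudaMalloc(&work, sizeof(float) * lwork);
    cudaMalloc(&devInfo, sizeof(int));
    cusolverDnSormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N,
                     m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
    cudaFree(work);
    cudaFree(devInfo);
}
```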