Include support for the scatter gather cuda kernels to allow for comp… (#124809)

ZelboK · pytorchmergebot · commit 9e24c263f998 · 2024-05-01T23:58:35.000Z
Fixes #121965 This PR hopes to add support complex numbers in the scatter/gather related kernels. For brevity, I will only include `complex<float>` for now as `complex<double>`, for example, will be more complicated. C++ unit tests are currently passing alongside tests in `test_scatter_gather_ops.py`. Python test suites also seem to be passing. Please keep the following in mind: 1) I think this is my first time using Pytorch. 2) This is my first contribution to Pytorch. Environment: 3080 & WSL 2. `nvcc` is at 12.4. Pull Request resolved: #124809 Approved by: https://github.com/mikaylagawarecki
diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h
@@ -38,7 +38,11 @@ inline C10_HOST_DEVICE bool _isnan(T val) {
 
 template <typename T, std::enable_if_t<c10::is_complex<T>::value, int> = 0>
 inline C10_HOST_DEVICE bool _isnan(T val) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return ::isnan(val.real()) || ::isnan(val.imag());
+#else
   return std::isnan(val.real()) || std::isnan(val.imag());
+#endif
 }
 
 template <typename T, std::enable_if_t<std::is_same_v<T, at::Half>, int> = 0>
diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh
@@ -35,6 +35,26 @@ struct AtomicFPOp<at::Half> {
   }
 };
 
+template <>
+struct AtomicFPOp<c10::complex<float>> {
+  template <typename func_t>
+  inline __device__ c10::complex<float> operator() (c10::complex<float> *address, c10::complex<float> val, const func_t& func) {
+    unsigned long long int* addr_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *addr_as_ull;
+    unsigned long long int assumed, new_val;
+
+    c10::complex<float> csum;
+    do {
+        assumed = old;
+        csum = func(csum, val);
+        new_val = *reinterpret_cast<unsigned long long*>(&csum);
+        old = atomicCAS(addr_as_ull, assumed, new_val);
+    } while (assumed != old);
+
+    return *reinterpret_cast<c10::complex<float>*>(&addr_as_ull);
+  }
+};
+
 template <>
 struct AtomicFPOp<at::BFloat16> {
   template <typename func_t>
@@ -348,6 +368,14 @@ GPU_ATOMIC_INTEGER(Mul, a * b, int16_t)
 GPU_ATOMIC_INTEGER(Mul, a * b, int32_t)
 GPU_ATOMIC_INTEGER(Mul, a * b, int64_t)
 
+inline __device__ c10::complex<float> gpuAtomicMul(c10::complex<float> *address, c10::complex<float> val){
+  return AtomicFPOp<c10::complex<float>>()(address, val,
+                                [](c10::complex<float> bsum, c10::complex<float> val) {
+                                  bsum*=(val);
+                                  return bsum;
+                                });
+}
+
 inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) {
   return AtomicFPOp<at::Half>()(address, val,
                                 [](at::Half bsum, at::Half val) {
@@ -369,7 +397,7 @@ inline __device__ double gpuAtomicMul(double * address, double val) {
                               });
 }
 
-// Dont use a templated function for this since the addition function defaults to the CUDA built-in.
+// Don't use a templated function for this since the addition function defaults to the CUDA built-in.
 inline __device__ float gpuAtomicMul (float * address, float val) {
   unsigned int* address_as_ull = (unsigned int*)address;
   unsigned int old = *address_as_ull;
@@ -402,6 +430,29 @@ __host__ __device__ T safe_max(T a, T b) {
   return max;
 }
 
+__inline__ __device__ c10::complex<float> complex_max(c10::complex<float> a, c10::complex<float> b) {
+  if(at::_isnan(b)) {
+    return b;
+  } else {
+    // Compute the magnitude of the complex numbers and compare each to see which one is greater.
+    float a_magnitude = __fsqrt_rn(
+      (
+        __fmul_rn(a.real(), a.real()) +
+        __fmul_rn(a.imag(),a.imag())
+      )
+    );
+    float b_magnitude = __fsqrt_rn(
+      (
+        __fmul_rn(b.real(), b.real()) +
+        __fmul_rn(b.imag(),b.imag())
+      )
+    );
+    return std::max<float>(a_magnitude, b_magnitude);
+  }
+
+}
+
+
 ATOMIC_INTEGER_IMPL(Max)
 GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t)
 GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t)
@@ -416,6 +467,13 @@ inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) {
                                 });
 }
 
+inline __device__ c10::complex<float> gpuAtomicMax(c10::complex<float> * address, c10::complex<float> val) {
+  return AtomicFPOp<c10::complex<float>>()(address, val,
+                                [](c10::complex<float> bsum, c10::complex<float> val) {
+                                  return complex_max(bsum, val);
+                                });
+}
+
 inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) {
   return AtomicFPOp<at::BFloat16>()(address, val,
                                     [](at::BFloat16 bsum, at::BFloat16 val) {
@@ -462,6 +520,27 @@ __host__ __device__ T safe_min(T a, T b) {
   return min;
 }
 
+__inline__ __device__ c10::complex<float> complex_min(c10::complex<float> a, c10::complex<float> b) {
+   if(at::_isnan(b)) {
+    return b;
+  } else {
+    // Compute the magnitude of the complex numbers and compare each to see which one is smaller.
+    float a_magnitude = __fsqrt_rn(
+      (
+        __fmul_rn(a.real(), a.real()) +
+        __fmul_rn(a.imag(),a.imag())
+      )
+    );
+    float b_magnitude = __fsqrt_rn(
+      (
+        __fmul_rn(b.real(), b.real()) +
+        __fmul_rn(b.imag(),b.imag())
+      )
+    );
+    return std::min<float>(a_magnitude, b_magnitude);
+  }
+}
+
 ATOMIC_INTEGER_IMPL(Min)
 GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t)
 GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t)
@@ -476,6 +555,13 @@ inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) {
                                 });
 }
 
+inline __device__ c10::complex<float> gpuAtomicMin(c10::complex<float> * address, c10::complex<float> val) {
+  return AtomicFPOp<c10::complex<float>>()(address, val,
+                                [](c10::complex<float> bsum, c10::complex<float> val) {
+                                  return complex_min(bsum, val);
+                                });
+}
+
 inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) {
   return AtomicFPOp<at::BFloat16>()(address, val,
                                     [](at::BFloat16 bsum, at::BFloat16 val) {
diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
@@ -4,7 +4,6 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <ATen/MemoryOverlap.h>
-
 #include <ATen/native/ScatterGatherChecks.h>
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/TensorIterator.h>
@@ -201,7 +200,6 @@ struct cuda_scatter_gather_base_kernel {
     auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
     auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
 
-
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
       at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
       iter.dtype(),
@@ -259,7 +257,6 @@ struct cuda_scatter_gather_base_kernel {
     auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
     auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
 
-
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
       at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
       iter.dtype(),
@@ -318,9 +315,9 @@ struct cuda_scatter_gather_base_kernel {
     auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
     auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
 
-
-    AT_DISPATCH_ALL_TYPES_AND2(
+    AT_DISPATCH_ALL_TYPES_AND3(
       at::ScalarType::Half, at::ScalarType::BFloat16,
+      at::ScalarType::ComplexFloat,
       iter.dtype(),
       "cuda_scatter_gather_base_kernel_func", [&] {
         using dtype = typename std::conditional<cast_to_opaque,
@@ -450,8 +447,9 @@ struct cuda_scatter_fill_base_kernel {
     auto index_size = ensure_nonempty_size(self, dim);
     auto index_stride = ensure_nonempty_stride(self, dim);
 
-    AT_DISPATCH_ALL_TYPES_AND2(
+    AT_DISPATCH_ALL_TYPES_AND3(
       at::ScalarType::Half, at::ScalarType::BFloat16,
+      at::ScalarType::ComplexFloat,
       iter.dtype(),
       "cuda_scatter_fill_base_kernel_reduce_multiply", [&] {
         using dtype = typename std::conditional<cast_to_opaque,
diff --git a/test/test_scatter_gather_ops.py b/test/test_scatter_gather_ops.py
@@ -221,15 +221,17 @@ def test_scatter_reduce_sum(self, device, dtype):
                                             include_self=include_self)
 
     @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True))
-    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
+                                  include_complex=False, include_bool=False))
     def test_scatter_reduce_prod(self, device, dtype):
         for include_self in (True, False):
             self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,
                                     is_scalar=False, reduction='prod', unique_indices=False,
                                     include_self=include_self)
 
     @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False))
-    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
+                                  include_complex=False, include_bool=False))
     def test_scatter_reduce_mean(self, device, dtype):
         for include_self in (True, False):
             for deterministic in [False, True]:
@@ -239,7 +241,8 @@ def test_scatter_reduce_mean(self, device, dtype):
                                             include_self=include_self)
 
     @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
-    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
+                                  include_complex=False, include_bool=False))
     def test_scatter_reduce_amax(self, device, dtype):
         for include_self in (True, False):
             self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,
@@ -258,7 +261,8 @@ def test_scatter_reduce_amax(self, device, dtype):
 
 
     @dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
-    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
+    @dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
+                                  include_complex=False, include_bool=False))
     def test_scatter_reduce_amin(self, device, dtype):
         for include_self in (True, False):
             self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,
diff --git a/test/test_torch.py b/test/test_torch.py
@@ -57,8 +57,8 @@
     _create_scaling_case, _create_scaling_models_optimizers)
 from torch.testing._internal.common_mkldnn import bf32_on_and_off
 from torch.testing._internal.common_dtype import (
-    floating_types_and, get_all_math_dtypes, all_types_and_complex_and, complex_types,
-    all_types_and, floating_types, floating_and_complex_types, integral_types_and,
+    floating_types_and, get_all_math_dtypes, all_types_and_complex_and, all_types_and, floating_types,
+    floating_and_complex_types, integral_types_and,
     get_all_qint_dtypes,
 )
 from torch.testing._internal.two_tensor import TwoTensor
@@ -3837,7 +3837,7 @@ def test_scatter_reduce_non_unique_index(self, device, dtype):
             self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}")
 
     @onlyCUDA
-    @dtypes(*complex_types())
+    @dtypes(torch.cdouble)
     def test_scatter_reduce_multiply_unsupported_dtypes(self, device, dtype):
         height = 2
         width = 2