Skip to content

Commit a1fe323

Browse files
ZelboK authored and pytorchmergebot committed
Add initial support for complex<float> in the scatter/gather CUDA kernels (initial foundation).
1 parent 07d3af8 commit a1fe323

File tree

5 files changed

+140
-11
lines changed

5 files changed

+140
-11
lines changed

CMakeCache.txt

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# This is the CMakeCache file.
2+
# For build in directory: /home/ksm/pytorch
3+
# It was generated by CMake: /home/ksm/anaconda3/envs/pyt_dev/bin/cmake
4+
# You can edit this file to change values found and used by cmake.
5+
# If you do not want to change any of the values, simply exit the editor.
6+
# If you do want to change a value, simply edit, save, and exit the editor.
7+
# The syntax for the file is as follows:
8+
# KEY:TYPE=VALUE
9+
# KEY is the name of a variable in the cache.
10+
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
11+
# VALUE is the current value for the KEY.
12+
13+
########################
14+
# EXTERNAL cache entries
15+
########################
16+
17+
18+
########################
19+
# INTERNAL cache entries
20+
########################
21+
22+
//This is the directory where this CMakeCache.txt was created
23+
CMAKE_CACHEFILE_DIR:INTERNAL=/home/ksm/pytorch
24+
//Major version of cmake used to create the current loaded cache
25+
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
26+
//Minor version of cmake used to create the current loaded cache
27+
CMAKE_CACHE_MINOR_VERSION:INTERNAL=26
28+
//Patch version of cmake used to create the current loaded cache
29+
CMAKE_CACHE_PATCH_VERSION:INTERNAL=4
30+
//Path to CMake executable.
31+
CMAKE_COMMAND:INTERNAL=/home/ksm/anaconda3/envs/pyt_dev/bin/cmake
32+
//Path to cpack program executable.
33+
CMAKE_CPACK_COMMAND:INTERNAL=/home/ksm/anaconda3/envs/pyt_dev/bin/cpack
34+
//Path to ctest program executable.
35+
CMAKE_CTEST_COMMAND:INTERNAL=/home/ksm/anaconda3/envs/pyt_dev/bin/ctest
36+
//Path to CMake installation.
37+
CMAKE_ROOT:INTERNAL=/home/ksm/anaconda3/envs/pyt_dev/share/cmake-3.26
38+

CMakeFiles/cmake.check_cache

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# This file is generated by cmake for dependency checking of the CMakeCache.txt file

aten/src/ATen/cuda/Atomic.cuh

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,26 @@ struct AtomicFPOp<at::Half> {
3535
}
3636
};
3737

38+
template <>
39+
struct AtomicFPOp<c10::complex<float>> {
40+
template <typename func_t>
41+
inline __device__ c10::complex<float> operator() (c10::complex<float> *address, c10::complex<float> val, const func_t& func) {
42+
unsigned long long int* addr_as_ull = (unsigned long long int*)address;
43+
unsigned long long int old = *addr_as_ull;
44+
unsigned long long int assumed, new_val;
45+
46+
c10::complex<float> csum;
47+
do {
48+
assumed = old;
49+
csum = func(csum, val);
50+
new_val = *reinterpret_cast<unsigned long long*>(&csum);
51+
old = atomicCAS(addr_as_ull, assumed, new_val);
52+
} while (assumed != old);
53+
54+
return *reinterpret_cast<c10::complex<float>*>(&addr_as_ull);
55+
}
56+
};
57+
3858
template <>
3959
struct AtomicFPOp<at::BFloat16> {
4060
template <typename func_t>
@@ -348,6 +368,14 @@ GPU_ATOMIC_INTEGER(Mul, a * b, int16_t)
348368
GPU_ATOMIC_INTEGER(Mul, a * b, int32_t)
349369
GPU_ATOMIC_INTEGER(Mul, a * b, int64_t)
350370

371+
inline __device__ c10::complex<float> gpuAtomicMul(c10::complex<float> *address, c10::complex<float> val){
372+
return AtomicFPOp<c10::complex<float>>()(address, val,
373+
[](c10::complex<float> bsum, c10::complex<float> val) {
374+
bsum*=(val);
375+
return bsum;
376+
});
377+
}
378+
351379
inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) {
352380
return AtomicFPOp<at::Half>()(address, val,
353381
[](at::Half bsum, at::Half val) {
@@ -369,7 +397,7 @@ inline __device__ double gpuAtomicMul(double * address, double val) {
369397
});
370398
}
371399

372-
// Dont use a templated function for this since the addition function defaults to the CUDA built-in.
400+
// Don't use a templated function for this since the addition function defaults to the CUDA built-in.
373401
inline __device__ float gpuAtomicMul (float * address, float val) {
374402
unsigned int* address_as_ull = (unsigned int*)address;
375403
unsigned int old = *address_as_ull;
@@ -402,6 +430,29 @@ __host__ __device__ T safe_max(T a, T b) {
402430
return max;
403431
}
404432

433+
__inline__ __device__ c10::complex<float> complex_max(c10::complex<float> a, c10::complex<float> b) {
434+
if(at::_isnan(b)) {
435+
return b;
436+
} else {
437+
// Compute the magnitude of the complex numbers and compare each to see which one is greater.
438+
float a_magnitude = __fsqrt_rn(
439+
(
440+
__fmul_rn(a.real(), a.real()) +
441+
__fmul_rn(a.imag(),a.imag())
442+
)
443+
);
444+
float b_magnitude = __fsqrt_rn(
445+
(
446+
__fmul_rn(b.real(), b.real()) +
447+
__fmul_rn(b.imag(),b.imag())
448+
)
449+
);
450+
return std::max<float>(a_magnitude, b_magnitude);
451+
}
452+
453+
}
454+
455+
405456
ATOMIC_INTEGER_IMPL(Max)
406457
GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t)
407458
GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t)
@@ -416,6 +467,13 @@ inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) {
416467
});
417468
}
418469

470+
inline __device__ c10::complex<float> gpuAtomicMax(c10::complex<float> * address, c10::complex<float> val) {
471+
return AtomicFPOp<c10::complex<float>>()(address, val,
472+
[](c10::complex<float> bsum, c10::complex<float> val) {
473+
return complex_max(bsum, val);
474+
});
475+
}
476+
419477
inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) {
420478
return AtomicFPOp<at::BFloat16>()(address, val,
421479
[](at::BFloat16 bsum, at::BFloat16 val) {
@@ -462,6 +520,27 @@ __host__ __device__ T safe_min(T a, T b) {
462520
return min;
463521
}
464522

523+
__inline__ __device__ c10::complex<float> complex_min(c10::complex<float> a, c10::complex<float> b) {
524+
if(at::_isnan(b)) {
525+
return b;
526+
} else {
527+
// Compute the magnitude of the complex numbers and compare each to see which one is smaller.
528+
float a_magnitude = __fsqrt_rn(
529+
(
530+
__fmul_rn(a.real(), a.real()) +
531+
__fmul_rn(a.imag(),a.imag())
532+
)
533+
);
534+
float b_magnitude = __fsqrt_rn(
535+
(
536+
__fmul_rn(b.real(), b.real()) +
537+
__fmul_rn(b.imag(),b.imag())
538+
)
539+
);
540+
return std::min<float>(a_magnitude, b_magnitude);
541+
}
542+
}
543+
465544
ATOMIC_INTEGER_IMPL(Min)
466545
GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t)
467546
GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t)
@@ -476,6 +555,13 @@ inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) {
476555
});
477556
}
478557

558+
inline __device__ c10::complex<float> gpuAtomicMin(c10::complex<float> * address, c10::complex<float> val) {
559+
return AtomicFPOp<c10::complex<float>>()(address, val,
560+
[](c10::complex<float> bsum, c10::complex<float> val) {
561+
return complex_min(bsum, val);
562+
});
563+
}
564+
479565
inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) {
480566
return AtomicFPOp<at::BFloat16>()(address, val,
481567
[](at::BFloat16 bsum, at::BFloat16 val) {

aten/src/ATen/native/cuda/ScatterGatherKernel.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include <ATen/core/Tensor.h>
55
#include <ATen/Dispatch.h>
66
#include <ATen/MemoryOverlap.h>
7-
7+
#include <iostream>
88
#include <ATen/native/ScatterGatherChecks.h>
99
#include <ATen/native/ReduceOpsUtils.h>
1010
#include <ATen/native/TensorIterator.h>
@@ -201,7 +201,6 @@ struct cuda_scatter_gather_base_kernel {
201201
auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
202202
auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
203203

204-
205204
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
206205
at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
207206
iter.dtype(),
@@ -259,7 +258,6 @@ struct cuda_scatter_gather_base_kernel {
259258
auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
260259
auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
261260

262-
263261
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
264262
at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
265263
iter.dtype(),
@@ -318,9 +316,10 @@ struct cuda_scatter_gather_base_kernel {
318316
auto index_size = is_scatter_like ? self_dim_size : src_dim_size;
319317
auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride;
320318

321-
322-
AT_DISPATCH_ALL_TYPES_AND2(
319+
// this should have complex in it
320+
AT_DISPATCH_ALL_TYPES_AND3(
323321
at::ScalarType::Half, at::ScalarType::BFloat16,
322+
at::ScalarType::ComplexFloat,
324323
iter.dtype(),
325324
"cuda_scatter_gather_base_kernel_func", [&] {
326325
using dtype = typename std::conditional<cast_to_opaque,
@@ -450,8 +449,9 @@ struct cuda_scatter_fill_base_kernel {
450449
auto index_size = ensure_nonempty_size(self, dim);
451450
auto index_stride = ensure_nonempty_stride(self, dim);
452451

453-
AT_DISPATCH_ALL_TYPES_AND2(
452+
AT_DISPATCH_ALL_TYPES_AND3(
454453
at::ScalarType::Half, at::ScalarType::BFloat16,
454+
at::ScalarType::ComplexFloat,
455455
iter.dtype(),
456456
"cuda_scatter_fill_base_kernel_reduce_multiply", [&] {
457457
using dtype = typename std::conditional<cast_to_opaque,

test/test_scatter_gather_ops.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,17 @@ def test_scatter_reduce_sum(self, device, dtype):
221221
include_self=include_self)
222222

223223
@dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True))
224-
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
224+
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
225+
include_complex=False, include_bool=False))
225226
def test_scatter_reduce_prod(self, device, dtype):
226227
for include_self in (True, False):
227228
self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,
228229
is_scalar=False, reduction='prod', unique_indices=False,
229230
include_self=include_self)
230231

231232
@dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_bool=False))
232-
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
233+
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
234+
include_complex=False, include_bool=False))
233235
def test_scatter_reduce_mean(self, device, dtype):
234236
for include_self in (True, False):
235237
for deterministic in [False, True]:
@@ -239,7 +241,8 @@ def test_scatter_reduce_mean(self, device, dtype):
239241
include_self=include_self)
240242

241243
@dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
242-
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
244+
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
245+
include_complex=False, include_bool=False))
243246
def test_scatter_reduce_amax(self, device, dtype):
244247
for include_self in (True, False):
245248
self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,
@@ -258,7 +261,8 @@ def test_scatter_reduce_amax(self, device, dtype):
258261

259262

260263
@dtypes(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False))
261-
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False, include_bool=False))
264+
@dtypesIfCUDA(*get_all_dtypes(include_half=True, include_bfloat16=True, include_complex32=True,
265+
include_complex=False, include_bool=False))
262266
def test_scatter_reduce_amin(self, device, dtype):
263267
for include_self in (True, False):
264268
self._test_scatter_base(torch.Tensor.scatter_reduce_, device=device, dtype=dtype,

0 commit comments

Comments
 (0)