-
Notifications
You must be signed in to change notification settings - Fork 26.3k
Include support for the scatter gather cuda kernels to allow for comp… #124809
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a1fe323
4eda126
7302719
84dc4ba
5520517
ae3f022
6c5e77c
1aac787
17013ad
53e7bb1
55c1d76
2d75ec3
6bdcc8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -35,6 +35,26 @@ struct AtomicFPOp<at::Half> { | |||||
| } | ||||||
| }; | ||||||
|
|
||||||
template <>
struct AtomicFPOp<c10::complex<float>> {
  // Generic atomic read-modify-write for an 8-byte complex<float> via a
  // 64-bit compare-and-swap loop. `func(current, val)` computes the new
  // value from the value currently observed in memory.
  // Returns the value that was stored at `address` BEFORE the update,
  // matching the convention of CUDA atomic RMW operations.
  template <typename func_t>
  inline __device__ c10::complex<float> operator() (c10::complex<float> *address, c10::complex<float> val, const func_t& func) {
    static_assert(sizeof(c10::complex<float>) == sizeof(unsigned long long int),
                  "c10::complex<float> must be 8 bytes wide for a 64-bit CAS");
    unsigned long long int* addr_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *addr_as_ull;
    unsigned long long int assumed, new_val;

    c10::complex<float> csum;
    do {
      assumed = old;
      // Recompute from the word actually observed in memory on every
      // iteration: bit-cast `assumed` back to a complex before combining.
      // (Using a stale/uninitialized accumulator here would be wrong.)
      csum = func(*reinterpret_cast<c10::complex<float>*>(&assumed), val);
      new_val = *reinterpret_cast<unsigned long long int*>(&csum);
      old = atomicCAS(addr_as_ull, assumed, new_val);
    } while (assumed != old);

    // `old` now holds the pre-update bits; bit-cast them back to complex.
    return *reinterpret_cast<c10::complex<float>*>(&old);
  }
};
|
|
||||||
| template <> | ||||||
| struct AtomicFPOp<at::BFloat16> { | ||||||
| template <typename func_t> | ||||||
|
|
@@ -348,6 +368,14 @@ GPU_ATOMIC_INTEGER(Mul, a * b, int16_t) | |||||
| GPU_ATOMIC_INTEGER(Mul, a * b, int32_t) | ||||||
| GPU_ATOMIC_INTEGER(Mul, a * b, int64_t) | ||||||
|
|
||||||
// Atomically multiplies the c10::complex<float> at `address` by `val`,
// delegating the 64-bit CAS loop to the AtomicFPOp specialization.
inline __device__ c10::complex<float> gpuAtomicMul(c10::complex<float> *address, c10::complex<float> val){
  return AtomicFPOp<c10::complex<float>>()(
      address, val,
      [](c10::complex<float> current, c10::complex<float> operand) {
        return current * operand;
      });
}
|
|
||||||
| inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) { | ||||||
| return AtomicFPOp<at::Half>()(address, val, | ||||||
| [](at::Half bsum, at::Half val) { | ||||||
|
|
@@ -369,7 +397,7 @@ inline __device__ double gpuAtomicMul(double * address, double val) { | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
| // Dont use a templated function for this since the addition function defaults to the CUDA built-in. | ||||||
| // Don't use a templated function for this since the addition function defaults to the CUDA built-in. | ||||||
| inline __device__ float gpuAtomicMul (float * address, float val) { | ||||||
| unsigned int* address_as_ull = (unsigned int*)address; | ||||||
| unsigned int old = *address_as_ull; | ||||||
|
|
@@ -402,6 +430,28 @@ __host__ __device__ T safe_max(T a, T b) { | |||||
| return max; | ||||||
| } | ||||||
|
|
||||||
// Returns whichever of `a`, `b` has the larger magnitude (MATLAB-style
// `max` for complex values — complex numbers are not an ordered field, so
// magnitude is used as the ordering key).
// NaN in either operand propagates, consistent with the safe_max pattern:
// `a` is the value read from memory (the running accumulator), so a NaN
// already stored must not be silently discarded.
// The comparison uses squared magnitudes: sqrt is monotonic, so the result
// of the comparison is unchanged, and skipping __fsqrt_rn is cheaper and
// avoids its rounding step. (The sum real^2 + imag^2 is computed either
// way, so overflow behavior is identical to the sqrt-based form.)
__inline__ __device__ c10::complex<float> complex_max(c10::complex<float> a, c10::complex<float> b) {
  if (at::_isnan(a)) {
    return a;
  }
  if (at::_isnan(b)) {
    return b;
  }
  const float a_mag_sq = __fmaf_rn(a.real(), a.real(), __fmul_rn(a.imag(), a.imag()));
  const float b_mag_sq = __fmaf_rn(b.real(), b.real(), __fmul_rn(b.imag(), b.imag()));
  return (a_mag_sq > b_mag_sq) ? a : b;
}
|
|
||||||
|
|
||||||
| ATOMIC_INTEGER_IMPL(Max) | ||||||
| GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t) | ||||||
| GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t) | ||||||
|
|
@@ -416,6 +466,13 @@ inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) { | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
// Atomic max for c10::complex<float>, ordered by magnitude (see
// complex_max); implemented on top of the AtomicFPOp CAS loop.
inline __device__ c10::complex<float> gpuAtomicMax(c10::complex<float> * address, c10::complex<float> val) {
  return AtomicFPOp<c10::complex<float>>()(
      address, val,
      [](c10::complex<float> current, c10::complex<float> operand) {
        return complex_max(current, operand);
      });
}
|
|
||||||
| inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) { | ||||||
| return AtomicFPOp<at::BFloat16>()(address, val, | ||||||
| [](at::BFloat16 bsum, at::BFloat16 val) { | ||||||
|
|
@@ -462,6 +519,27 @@ __host__ __device__ T safe_min(T a, T b) { | |||||
| return min; | ||||||
| } | ||||||
|
|
||||||
// Returns whichever of `a`, `b` has the smaller magnitude (MATLAB-style
// `min` for complex values — magnitude is the ordering key since complex
// numbers have no natural total order).
// NaN in either operand propagates, consistent with the safe_min pattern:
// `a` is the accumulator read from memory, so a stored NaN must not be
// silently discarded.
// Squared magnitudes are compared directly: sqrt is monotonic, so the
// comparison result is unchanged, and dropping __fsqrt_rn is cheaper and
// avoids its rounding step. (The sum real^2 + imag^2 is computed either
// way, so overflow behavior matches the sqrt-based form.)
__inline__ __device__ c10::complex<float> complex_min(c10::complex<float> a, c10::complex<float> b) {
  if (at::_isnan(a)) {
    return a;
  }
  if (at::_isnan(b)) {
    return b;
  }
  const float a_mag_sq = __fmaf_rn(a.real(), a.real(), __fmul_rn(a.imag(), a.imag()));
  const float b_mag_sq = __fmaf_rn(b.real(), b.real(), __fmul_rn(b.imag(), b.imag()));
  return (a_mag_sq < b_mag_sq) ? a : b;
}
|
|
||||||
| ATOMIC_INTEGER_IMPL(Min) | ||||||
| GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t) | ||||||
| GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t) | ||||||
|
|
@@ -476,6 +554,13 @@ inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) { | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
// Atomic min for c10::complex<float>, ordered by magnitude (see
// complex_min); implemented on top of the AtomicFPOp CAS loop.
inline __device__ c10::complex<float> gpuAtomicMin(c10::complex<float> * address, c10::complex<float> val) {
  return AtomicFPOp<c10::complex<float>>()(
      address, val,
      [](c10::complex<float> current, c10::complex<float> operand) {
        return complex_min(current, operand);
      });
}
|
|
||||||
| inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) { | ||||||
| return AtomicFPOp<at::BFloat16>()(address, val, | ||||||
| [](at::BFloat16 bsum, at::BFloat16 val) { | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.