
Commit d975609

gchanan authored and facebook-github-bot committed
Split BinaryCompareKernel.cu into a file-per-kernel to speed up compilation. (#33871)
Summary:
Pull Request resolved: #33871

Test Plan: Imported from OSS

Differential Revision: D20140862

Pulled By: gchanan

fbshipit-source-id: a4fde38c1c7c5905e3855fa490ea2e87bb24c703
1 parent 5eacdfb commit d975609

File tree

7 files changed: +142 -57 lines
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/zmath.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void eq_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBool, iter.common_dtype(), "eq_cuda", [&]() {
+    using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> bool {
+      return a == b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(eq_stub, &eq_kernel_cuda);
+
+}} // namespace at::native
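Context for the NOTE comment and the out-of-line kernel functions above: gpu_kernel_with_scalars (from Loops.cuh) launches the GPU_LAMBDA elementwise over the iterator's operands. Below is a rough, self-contained sketch of that pattern, not the ATen implementation; the names elementwise_compare_kernel and launch_eq_example are hypothetical, it assumes nvcc with extended lambdas enabled (--expt-extended-lambda), and it omits the broadcasting, type promotion, and scalar handling that TensorIterator provides.

#include <cuda_runtime.h>
#include <cstdint>

// Hypothetical sketch: apply a comparison functor elementwise over two
// device arrays and write bool results.
template <typename scalar_t, typename func_t>
__global__ void elementwise_compare_kernel(const scalar_t* a, const scalar_t* b,
                                           bool* out, int64_t n, func_t op) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = op(a[i], b[i]);
  }
}

// The enclosing function deliberately has external linkage (no `static`,
// not in an anonymous namespace), mirroring the NOTE in the files above
// about __device__ lambdas on Windows.
void launch_eq_example(const float* a, const float* b, bool* out, int64_t n) {
  auto op = [] __device__ (float x, float y) -> bool { return x == y; };
  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  elementwise_compare_kernel<<<blocks, threads>>>(a, b, out, n, op);
}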
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void ge_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "ge_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a >= b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(ge_stub, &ge_kernel_cuda);
+
+}} // namespace at::native
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void gt_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "gt_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a > b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(gt_stub, &gt_kernel_cuda);
+
+}} // namespace at::native
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void le_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "le_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a <= b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(le_stub, &le_kernel_cuda);
+
+}} // namespace at::native
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void lt_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "lt_cuda", [&]() {
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a < b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(lt_stub, &lt_kernel_cuda);
+
+}} // namespace at::native
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+#include <ATen/Dispatch.h>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/zmath.cuh>
+
+
+// NOTE: CUDA on Windows requires that the enclosing function
+// of a __device__ lambda not have internal linkage.
+
+namespace at { namespace native {
+
+void ne_kernel_cuda(TensorIterator& iter) {
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBool, iter.common_dtype(), "ne_cuda", [&]() {
+    using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> bool {
+      return a != b;
+    });
+  });
+}
+
+REGISTER_DISPATCH(ne_stub, &ne_kernel_cuda);
+
+}} // namespace at::native

aten/src/ATen/native/cuda/BinaryCompareKernel.cu renamed to aten/src/ATen/native/cuda/MaxMinElementwiseKernel.cu

Lines changed: 0 additions & 57 deletions
@@ -11,56 +11,6 @@
 
 namespace at { namespace native {
 
-void lt_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "lt_cuda", [&]() {
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-      return a < b;
-    });
-  });
-}
-
-void le_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "le_cuda", [&]() {
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-      return a <= b;
-    });
-  });
-}
-
-void gt_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "gt_cuda", [&]() {
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-      return a > b;
-    });
-  });
-}
-
-void ge_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "ge_cuda", [&]() {
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-      return a >= b;
-    });
-  });
-}
-
-void eq_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBool, iter.common_dtype(), "eq_cuda", [&]() {
-    using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> bool {
-      return a == b;
-    });
-  });
-}
-
-void ne_kernel_cuda(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBool, iter.common_dtype(), "ne_cuda", [&]() {
-    using thrust_t = typename ztype_cuda<scalar_t>::thrust_t;
-    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(thrust_t a, thrust_t b) -> bool {
-      return a != b;
-    });
-  });
-}
-
 void max_elementwise_kernel_cuda(TensorIterator& iter) {
   if (iter.dtype() == ScalarType::Bool) {
     gpu_kernel(iter, []GPU_LAMBDA(bool a, bool b) -> bool {
@@ -119,13 +69,6 @@ void min_elementwise_kernel_cuda(TensorIterator& iter) {
   }
 }
 
-
-REGISTER_DISPATCH(lt_stub, &lt_kernel_cuda);
-REGISTER_DISPATCH(le_stub, &le_kernel_cuda);
-REGISTER_DISPATCH(gt_stub, &gt_kernel_cuda);
-REGISTER_DISPATCH(ge_stub, &ge_kernel_cuda);
-REGISTER_DISPATCH(eq_stub, &eq_kernel_cuda);
-REGISTER_DISPATCH(ne_stub, &ne_kernel_cuda);
 REGISTER_DISPATCH(max_elementwise_stub, &max_elementwise_kernel_cuda);
 REGISTER_DISPATCH(min_elementwise_stub, &min_elementwise_kernel_cuda);

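Why moving the REGISTER_DISPATCH lines into separate files is safe: each stub (eq_stub, lt_stub, and so on) is a per-operator dispatch slot declared in a shared header (the new files include ATen/native/BinaryOps.h for it), and each kernel file fills in its slot at static-initialization time, so the split is purely a build-time change. The following is a simplified, self-contained C++ sketch of that registration pattern; all types and names here are hypothetical stand-ins, and the real mechanism in ATen/native/DispatchStub.h also keys on device type and CPU capability.

#include <cstdio>

// Hypothetical stand-ins; not the actual ATen types.
struct TensorIteratorStub {};
using binary_fn = void (*)(TensorIteratorStub&);

// Simplified dispatch stub: a slot that a backend kernel file fills in.
struct DispatchStubSketch {
  binary_fn cuda_impl = nullptr;
  void operator()(TensorIteratorStub& iter) const {
    if (cuda_impl) cuda_impl(iter);  // forward to whatever kernel registered itself
  }
};

DispatchStubSketch eq_stub;  // shared declaration, analogous to the stub in BinaryOps.h

// What a per-kernel .cu file does conceptually: define the kernel and
// register its address during static initialization, analogous to
// REGISTER_DISPATCH(eq_stub, &eq_kernel_cuda).
void eq_kernel_cuda(TensorIteratorStub& iter) { std::puts("eq kernel ran"); }
static const bool eq_registered = (eq_stub.cuda_impl = &eq_kernel_cuda, true);

int main() {
  TensorIteratorStub iter;
  eq_stub(iter);  // operator code calls the stub, never the kernel directly
  return 0;
}

At the call site the operator implementation goes through the stub (using the device type as a key), which is why no callers change when the kernels move between translation units.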