Skip to content

Commit b30f178

Browse files
hongxiayang and pytorchmergebot
authored and committed
Replace assert with CUDA_KERNEL_ASSERT in Reduce.cuh for consistency (#113098)
Related to Fixes #94891 **Problem:** We are trying to disable `printf` in kernels for Pytorch build on ROCm to fix the `torch.sum()` issues for certain community users by disabling `CUDA_KERNEL_ASSERT`, but found that there are still hostcall printf happening in `ReduceSumProdKernel` used by `torch.sum`. **Reason:** The reason is that there are `assert` function calls inside `Reduce.cuh`, ( defined as `__assert_fail` ) which caused `printf`. **Fix:** This pull request is to change `assert` to `CUDA_KERNEL_ASSERT` so that we can consistently disable assertion/printf in cuda/hip kernel code. Pull Request resolved: #113098 Approved by: https://github.com/ezyang
1 parent 77e8e8f commit b30f178

File tree

3 files changed

+6
-9
lines changed

3 files changed

+6
-9
lines changed

aten/src/ATen/native/cuda/PersistentSoftmax.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <assert.h>
43
#include <cfloat>
54
#include <limits>
65
#include <stdint.h>

aten/src/ATen/native/cuda/Reduce.cuh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#pragma once
22

3-
#include <assert.h>
43
#include <ATen/core/Array.h>
54
#include <ATen/cuda/CUDAContext.h>
65
#include <ATen/cuda/DeviceUtils.cuh>
@@ -483,7 +482,7 @@ struct ReduceOp {
483482
template <int output_vec_size>
484483
C10_DEVICE at::detail::Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
485484
if (config.vectorize_input) {
486-
assert(output_vec_size == 1);
485+
CUDA_KERNEL_ASSERT(output_vec_size == 1);
487486
// reduce at the header of input_slice where memory is not aligned,
488487
// so that thread_reduce will have an aligned memory to work on.
489488
return {input_vectorized_thread_reduce_impl(data)};
@@ -720,7 +719,7 @@ struct ReduceOp {
720719
out_scalar_t* out, arg_t value,
721720
typename std::enable_if<can_acc>::type* = nullptr
722721
) const {
723-
assert(!final_output);
722+
CUDA_KERNEL_ASSERT(!final_output);
724723
return (out_scalar_t)value;
725724
}
726725

@@ -733,7 +732,7 @@ struct ReduceOp {
733732
at::detail::Array<arg_t, output_vec_size>,
734733
typename std::enable_if<!can_acc>::type* = nullptr
735734
) const {
736-
assert(false); // can't use AT_ASSERT in Cuda.
735+
CUDA_KERNEL_ASSERT(false);
737736
return arg_t {};
738737
}
739738

@@ -745,13 +744,13 @@ struct ReduceOp {
745744
out_scalar_t* out, arg_t value,
746745
typename std::enable_if<!can_acc>::type* = nullptr
747746
) const {
748-
assert(false);
747+
CUDA_KERNEL_ASSERT(false);
749748
return *out;
750749
}
751750

752751
template<class T>
753752
C10_DEVICE void set_results(const T x, const index_t base_offset) const {
754-
assert(noutputs == 1);
753+
CUDA_KERNEL_ASSERT(noutputs == 1);
755754
auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
756755
*res = x;
757756
}
@@ -773,7 +772,7 @@ struct ReduceOp {
773772

774773
template <int output_vec_size>
775774
C10_DEVICE void set_results_to_output(at::detail::Array<arg_t, output_vec_size> value, at::detail::Array<index_t, output_vec_size> base_offset) const {
776-
assert(final_output);
775+
CUDA_KERNEL_ASSERT(final_output);
777776
#pragma unroll
778777
for (int i = 0; i < output_vec_size; i++) {
779778
set_results(ops.project(value[i]), base_offset[i]);

aten/src/ATen/native/cuda/SortingCommon.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
#include <ATen/core/TensorBase.h>
33
#include <ATen/ceil_div.h>
44
#include <ATen/NumericUtils.h>
5-
#include <assert.h>
65
#include <c10/macros/Macros.h>
76
#include <stdlib.h>
87
#include <ATen/cuda/detail/IndexUtils.cuh>

0 commit comments

Comments (0)