6 changes: 4 additions & 2 deletions aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -78,7 +78,7 @@ constexpr auto sum_of_sizes(args_t args, std::index_sequence<Is...>) {
}
}

#ifdef USE_ROCM
#if defined(USE_ROCM)
template <int io_sizes>
constexpr auto elems_per_thread(){
if constexpr (io_sizes == 1) {
@@ -219,7 +219,7 @@ static inline void launch_vectorized_kernel(
constexpr auto io_size = calc_io_size<func_t>();
int64_t grid = (N + io_block_work_size<io_size>() - 1) / io_block_work_size<io_size>();
auto stream = at::cuda::getCurrentCUDAStream();
#ifdef USE_ROCM
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && CUDA_VERSION < 12080)
int vec_size = memory::can_vectorize_up_to<func_t>(data);
#else
using cpp_type = typename function_traits<func_t>::result_type;
@@ -241,11 +241,13 @@ static inline void launch_vectorized_kernel(
C10_CUDA_KERNEL_LAUNCH_CHECK();
break;
#endif
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
case 8:
vectorized_elementwise_kernel<8, func_t, array_t>
<<<grid, num_threads(), 0, stream>>>(N, f, data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
break;
#endif
case 4:
vectorized_elementwise_kernel<4, func_t, array_t>
<<<grid, num_threads(), 0, stream>>>(N, f, data);
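Note: a minimal, self-contained sketch of the dispatch pattern this hunk extends — the launcher picks the widest vector size the data allows and switches to the matching kernel instantiation, with the 8-wide case compiled only for ROCm or CUDA 12.8+. The kernel body and helper names below are illustrative stand-ins, not the actual ATen implementation.

```cuda
// Hedged sketch (not the ATen code): a width-specialized kernel plus the
// runtime switch on vec_size that this hunk extends. The 8-wide case is only
// compiled for ROCm or CUDA 12.8+, mirroring the guards in the diff.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>

template <int vec_size>
__global__ void vectorized_elementwise_kernel(int64_t N, float* out, const float* in) {
  // Each thread handles vec_size contiguous elements; the real ATen kernels use
  // aligned vector loads/stores instead of this scalar loop.
  int64_t base = (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * vec_size;
#pragma unroll
  for (int i = 0; i < vec_size; ++i) {
    if (base + i < N) out[base + i] = in[base + i] + 1.0f;
  }
}

void launch_vectorized(int64_t N, float* out, const float* in, int vec_size,
                       cudaStream_t stream) {
  const int threads = 128;  // stand-in for num_threads()
  auto grid_for = [&](int v) {
    return static_cast<unsigned>((N + int64_t(threads) * v - 1) / (int64_t(threads) * v));
  };
  switch (vec_size) {
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
    case 8:  // new widest case, available only where the guard above holds
      vectorized_elementwise_kernel<8><<<grid_for(8), threads, 0, stream>>>(N, out, in);
      break;
#endif
    case 4:
      vectorized_elementwise_kernel<4><<<grid_for(4), threads, 0, stream>>>(N, out, in);
      break;
    case 2:
      vectorized_elementwise_kernel<2><<<grid_for(2), threads, 0, stream>>>(N, out, in);
      break;
    default:
      vectorized_elementwise_kernel<1><<<grid_for(1), threads, 0, stream>>>(N, out, in);
      break;
  }
}
```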
5 changes: 4 additions & 1 deletion aten/src/ATen/native/cuda/Dropout.cu
@@ -216,12 +216,15 @@ int get_vector_size(at::Tensor self, at::Tensor ret, at::Tensor mask) {
#ifdef USE_ROCM
// make sure we don't break assumption that we can't have > 16 elements / thread
TORCH_INTERNAL_ASSERT(vec_size <= 16, "Value of VEC must be in [2, 4, 8, 16]");
#else
#elif (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
const int optimal_vec_size = 16 / static_cast<int>(sizeof(scalar_t));
vec_size = std::min<int>(optimal_vec_size, vec_size);

// make sure we don't break assumption that we can't have > 8 elements / thread
TORCH_INTERNAL_ASSERT(vec_size <= 8, "Value of VEC must be in [2, 4, 8]");
#else
// make sure we don't break assumption that we can't have > 4 elements / thread
TORCH_INTERNAL_ASSERT(vec_size <= 4, "Value of VEC must be in [2, 4]");
#endif
}

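Note: a small sketch of the cap this hunk introduces for CUDA 12.8+ — the widest load is 16 bytes, so the per-thread vector width is bounded by 16 / sizeof(scalar_t) before being clamped to whatever the alignment check already allowed. The helper below is hypothetical and uses int16_t as a stand-in for half.

```cuda
// Hedged sketch of the CUDA 12.8+ cap added in this hunk: the widest load is
// 16 bytes, so the per-thread vector width is bounded by 16 / sizeof(scalar_t)
// and then clamped to what the alignment check already allowed.
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <typename scalar_t>
int capped_vec_size(int alignment_vec_size /* e.g. from can_vectorize_up_to */) {
  const int optimal_vec_size = 16 / static_cast<int>(sizeof(scalar_t));
  return std::min<int>(optimal_vec_size, alignment_vec_size);
}

int main() {
  // With a fully aligned pointer (alignment alone would allow 8):
  std::printf("4-byte type (float)         -> %d\n", capped_vec_size<float>(8));   // 4
  std::printf("2-byte type (half stand-in) -> %d\n", capped_vec_size<int16_t>(8)); // 8
  std::printf("8-byte type (double)        -> %d\n", capped_vec_size<double>(8));  // 2
  return 0;
}
```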
4 changes: 3 additions & 1 deletion aten/src/ATen/native/cuda/MemoryAccess.cuh
@@ -486,7 +486,9 @@ inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) {
uint64_t address = reinterpret_cast<uint64_t>(pointer);
constexpr int vec2_alignment = std::alignment_of_v<aligned_vector<scalar_t, 2>>;
constexpr int vec4_alignment = std::alignment_of_v<aligned_vector<scalar_t, 4>>;
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
Collaborator: should USE_ROCM here also be inverted if the CUDA_VERSION condition is >= 12080?

Contributor Author: No, I don't think so. Before #145746, vec8_alignment was only available under USE_ROCM; after it, it was enabled unconditionally, and I want it enabled for either ROCm or CUDA newer than 12.6.

constexpr int vec8_alignment = std::alignment_of_v<aligned_vector<scalar_t, 8>>;
#endif
#ifdef USE_ROCM
constexpr int vec16_alignment = std::alignment_of_v<aligned_vector<scalar_t, 16>>;
constexpr int type_size = sizeof(scalar_t);
@@ -495,7 +497,7 @@ inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) {
} else if (type_size <= 2 && (address % vec8_alignment == 0)) {
return 8;
} else
#else
#elif defined(CUDA_VERSION) && CUDA_VERSION >= 12080
Contributor: Shouldn't there be some logic to handle the case when CUDA_VERSION < 12080?

Contributor: Hi @ZainRizvi this is basically redoing #145746, only if CUDA >= 12.8.

Contributor (@atalman, Apr 4, 2025): Hence this code should not be applied by default but only for CUDA 12.8+

if (address % vec8_alignment == 0) {
  return 8;
} else

if (address % vec8_alignment == 0) {
return 8;
} else
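Note: a simplified, hedged sketch of the alignment logic around this hunk — can_vectorize_up_to returns the widest vector size whose aligned_vector alignment divides the pointer's address, and the vec8 branch now exists for ROCm or CUDA 12.8+. aligned_vector below is a stand-in for the ATen helper, and the ROCm-only vec16 and type-size checks are omitted.

```cuda
// Hedged sketch (simplified): return the widest vector size whose
// aligned_vector alignment divides the pointer's address. The vec8 branch
// mirrors the new ROCm / CUDA 12.8+ path in the diff.
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};

template <typename scalar_t>
int can_vectorize_up_to(const char* pointer) {
  uint64_t address = reinterpret_cast<uint64_t>(pointer);
  constexpr int vec2_alignment = std::alignment_of_v<aligned_vector<scalar_t, 2>>;
  constexpr int vec4_alignment = std::alignment_of_v<aligned_vector<scalar_t, 4>>;
  constexpr int vec8_alignment = std::alignment_of_v<aligned_vector<scalar_t, 8>>;
  if (address % vec8_alignment == 0) {
    return 8;
  } else if (address % vec4_alignment == 0) {
    return 4;
  } else if (address % vec2_alignment == 0) {
    return 2;
  }
  return 1;
}

int main() {
  alignas(64) static char buffer[64];
  std::printf("%d\n", can_vectorize_up_to<float>(buffer));      // 8: 32-byte aligned
  std::printf("%d\n", can_vectorize_up_to<float>(buffer + 4));  // 1: only 4-byte aligned
  return 0;
}
```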
5 changes: 4 additions & 1 deletion aten/src/ATen/native/cuda/thread_constants.h
@@ -18,8 +18,11 @@ constexpr int thread_work_size() { return 4; }
constexpr uint32_t num_threads() {
return C10_WARP_SIZE * 4;
}

#if defined(CUDA_VERSION) && CUDA_VERSION < 12080
constexpr int thread_work_size() { return 4; }
#else
constexpr int thread_work_size() { return 8; }
#endif
#endif

constexpr int block_work_size() { return thread_work_size() * num_threads(); }
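Note: a quick sanity check of what this change implies for per-block work, assuming C10_WARP_SIZE == 32 on NVIDIA; the helpers below are illustrative, not the real constants.

```cuda
// Hedged arithmetic check, assuming C10_WARP_SIZE == 32 on NVIDIA:
// num_threads() = 32 * 4 = 128, so block_work_size() grows from
// 4 * 128 = 512 elements (CUDA < 12.8) to 8 * 128 = 1024 (CUDA >= 12.8).
#include <cstdio>

constexpr int kWarpSize = 32;               // assumption for NVIDIA hardware
constexpr int kNumThreads = kWarpSize * 4;  // 128
constexpr int block_work(int thread_work) { return thread_work * kNumThreads; }

int main() {
  std::printf("CUDA <  12.8: %d elements per block\n", block_work(4));  // 512
  std::printf("CUDA >= 12.8: %d elements per block\n", block_work(8));  // 1024
  return 0;
}
```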
19 changes: 12 additions & 7 deletions aten/src/ATen/test/cuda_vectorized_test.cu
@@ -46,12 +46,17 @@ TEST(TestLoops, HasSameArgTypes) {

TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
char *ptr = reinterpret_cast<char *>(buffer1);
#if defined(CUDA_VERSION) && CUDA_VERSION < 12080
constexpr auto vectorize_limit = 4;
#else
constexpr auto vectorize_limit = 8;
#endif

ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr), 8);
ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr), 8);
ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr), 8);
ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr), 8);
ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr), 8);
ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr), vectorize_limit);

ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 1), 1);
ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 1), 1);
@@ -65,8 +70,8 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr + 4), 2);
ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr + 4), 1);

ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 8), 8);
ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 8), 8);
ASSERT_EQ(memory::can_vectorize_up_to<bool>(ptr + 8), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int8_t>(ptr + 8), vectorize_limit);
ASSERT_EQ(memory::can_vectorize_up_to<int16_t>(ptr + 8), 4);
ASSERT_EQ(memory::can_vectorize_up_to<int>(ptr + 8), 2);
ASSERT_EQ(memory::can_vectorize_up_to<int64_t>(ptr + 8), 1);
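Note: the updated expectations follow from alignment arithmetic — at a given pointer offset, the usable width per type is the largest one whose byte footprint divides that offset's alignment, now capped at 8 instead of 4 on CUDA 12.8+. A hedged sketch of that arithmetic (expected_width is an illustrative helper, not test code):

```cuda
// Hedged sketch of the alignment arithmetic behind the updated expectations:
// the usable width is the largest one whose byte footprint
// (sizeof(type) * width) divides the pointer's alignment, capped at the limit.
#include <cstdio>

int expected_width(int type_size, int alignment, int limit) {
  int width = limit;
  while (width > 1 && (alignment % (type_size * width)) != 0) {
    width /= 2;
  }
  return width;
}

int main() {
  // A pointer that is exactly 8-byte aligned (e.g. buffer + 8 over an over-aligned buffer):
  std::printf("bool    -> %d\n", expected_width(1, 8, 8));  // 8
  std::printf("int16_t -> %d\n", expected_width(2, 8, 8));  // 4
  std::printf("int     -> %d\n", expected_width(4, 8, 8));  // 2
  std::printf("int64_t -> %d\n", expected_width(8, 8, 8));  // 1
  return 0;
}
```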