pytorch · Nicoshev · Oct 28, 2025
diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@@ -21,61 +21,109 @@ inline void convertImpl(
   }
 }
 
+template <typename to_type>
+inline void convertFromBool(
+    const bool* __restrict src,
+    to_type* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
+  }
+}
+
+template <typename from_type>
+inline void convertToBool(
+    const from_type* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
+  }
+}
+
 #define CONVERT_TEMPLATE(from_type, to_type)                           \
   template <>                                                          \
   inline void convert(const from_type* src, to_type* dst, int64_t n) { \
     return convertImpl<from_type, to_type>(src, dst, n);               \
   }
 
+#define CONVERT_FROM_BOOL_TEMPLATE(to_type)                       \
+  inline void convert(const bool* src, to_type* dst, int64_t n) { \
+    return convertFromBool<to_type>(src, dst, n);                 \
+  }
+
+#define CONVERT_TO_BOOL_TEMPLATE(from_type)                         \
+  inline void convert(const from_type* src, bool* dst, int64_t n) { \
+    return convertToBool<from_type>(src, dst, n);                   \
+  }
+
 CONVERT_TEMPLATE(uint8_t, uint8_t)
 CONVERT_TEMPLATE(uint8_t, int8_t)
 CONVERT_TEMPLATE(uint8_t, int16_t)
 CONVERT_TEMPLATE(uint8_t, int32_t)
 CONVERT_TEMPLATE(uint8_t, int64_t)
 CONVERT_TEMPLATE(uint8_t, float)
 CONVERT_TEMPLATE(uint8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(uint8_t)
 CONVERT_TEMPLATE(int8_t, uint8_t)
 CONVERT_TEMPLATE(int8_t, int8_t)
 CONVERT_TEMPLATE(int8_t, int16_t)
 CONVERT_TEMPLATE(int8_t, int32_t)
 CONVERT_TEMPLATE(int8_t, int64_t)
 CONVERT_TEMPLATE(int8_t, float)
 CONVERT_TEMPLATE(int8_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int8_t)
 CONVERT_TEMPLATE(int16_t, uint8_t)
 CONVERT_TEMPLATE(int16_t, int8_t)
 CONVERT_TEMPLATE(int16_t, int16_t)
 CONVERT_TEMPLATE(int16_t, int32_t)
 CONVERT_TEMPLATE(int16_t, int64_t)
 CONVERT_TEMPLATE(int16_t, float)
 CONVERT_TEMPLATE(int16_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int16_t)
 CONVERT_TEMPLATE(int32_t, uint8_t)
 CONVERT_TEMPLATE(int32_t, int8_t)
 CONVERT_TEMPLATE(int32_t, int16_t)
 CONVERT_TEMPLATE(int32_t, int32_t)
 CONVERT_TEMPLATE(int32_t, int64_t)
 CONVERT_TEMPLATE(int32_t, float)
 CONVERT_TEMPLATE(int32_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int32_t)
 CONVERT_TEMPLATE(int64_t, uint8_t)
 CONVERT_TEMPLATE(int64_t, int8_t)
 CONVERT_TEMPLATE(int64_t, int16_t)
 CONVERT_TEMPLATE(int64_t, int32_t)
 CONVERT_TEMPLATE(int64_t, int64_t)
 CONVERT_TEMPLATE(int64_t, float)
 CONVERT_TEMPLATE(int64_t, double)
+CONVERT_TO_BOOL_TEMPLATE(int64_t)
 CONVERT_TEMPLATE(float, uint8_t)
 CONVERT_TEMPLATE(float, int8_t)
 CONVERT_TEMPLATE(float, int16_t)
 CONVERT_TEMPLATE(float, int32_t)
 CONVERT_TEMPLATE(float, int64_t)
 CONVERT_TEMPLATE(float, float)
 CONVERT_TEMPLATE(float, double)
+CONVERT_TO_BOOL_TEMPLATE(float)
 CONVERT_TEMPLATE(double, uint8_t)
 CONVERT_TEMPLATE(double, int8_t)
 CONVERT_TEMPLATE(double, int16_t)
 CONVERT_TEMPLATE(double, int32_t)
 CONVERT_TEMPLATE(double, int64_t)
 CONVERT_TEMPLATE(double, float)
 CONVERT_TEMPLATE(double, double)
+CONVERT_TO_BOOL_TEMPLATE(double)
+CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int8_t)
+CONVERT_FROM_BOOL_TEMPLATE(int16_t)
+CONVERT_FROM_BOOL_TEMPLATE(int32_t)
+CONVERT_FROM_BOOL_TEMPLATE(int64_t)
+CONVERT_FROM_BOOL_TEMPLATE(float)
+CONVERT_FROM_BOOL_TEMPLATE(double)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #define CONVERT_FROM_FP16_TEMPLATE(to_type)                            \
@@ -107,6 +155,41 @@ CONVERT_TO_FP16_TEMPLATE(int32_t)
 CONVERT_TO_FP16_TEMPLATE(int64_t)
 CONVERT_TO_FP16_TEMPLATE(float)
 CONVERT_TO_FP16_TEMPLATE(double)
+
+inline void convertBoolToFp16Impl(
+    const bool* __restrict src,
+    at::Half* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
+  }
+}
+
+template <>
+inline void convert(const bool* src, at::Half* dst, int64_t n) {
+  return convertBoolToFp16Impl(src, dst, n);
+}
+
+inline void convertFp16ToBoolImpl(
+    const at::Half* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
+  }
+}
+
+template <>
+inline void convert(const at::Half* src, bool* dst, int64_t n) {
+  return convertFp16ToBoolImpl(src, dst, n);
+}
+
 #endif
 #ifdef __ARM_FEATURE_BF16
 CONVERT_TEMPLATE(bfloat16_t, uint8_t)
@@ -124,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
 CONVERT_TEMPLATE(int64_t, bfloat16_t)
 CONVERT_TEMPLATE(float, bfloat16_t)
 CONVERT_TEMPLATE(double, bfloat16_t)
+
+inline void convertBoolToBfloat16Impl(
+    const bool* __restrict src,
+    c10::BFloat16* __restrict dst,
+    int64_t n) {
+  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+  uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
+  uint64_t len = static_cast<uint64_t>(n);
+  constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
+  for (uint64_t i = 0; i < len; i++) {
+    dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
+  }
+}
+
+template <>
+inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
+  return convertBoolToBfloat16Impl(src, dst, n);
+}
+
+inline void convertBfloat16ToBoolImpl(
+    const c10::BFloat16* __restrict src,
+    bool* __restrict dst,
+    int64_t n) {
+  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+  const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
+  uint64_t len = static_cast<uint64_t>(n);
+  for (uint64_t i = 0; i < len; i++) {
+    // Check if all non-sign bits are 0
+    bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
+    dstPtr[i] = isBf16Zero ? 0 : 1;
+  }
+}
+
+template <>
+inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
+  return convertBfloat16ToBoolImpl(src, dst, n);
+}
+
 #endif
 
 #endif