Skip to content

Commit fd98546

Browse files
committed
Add NEON support (#621)
Results of Reckless-neon vs Reckless-main (4+0.04, 1t, 16MB, UHO_Lichess_4852_v1.epd): Elo: 18.01 +/- 7.37, nElo: 35.87 +/- 14.65 LOS: 100.00 %, DrawRatio: 54.12 %, PairsRatio: 1.53 Games: 2162, Wins: 623, Losses: 511, Draws: 1028, Points: 1137.0 (52.59 %) Ptnml(0-2): [6, 190, 585, 286, 14], WL/DD Ratio: 1.12 LLR: 2.96 (100.4%) (-2.94, 2.94) [0.00, 5.00] Bench: 3016642
1 parent 0dd5b9a commit fd98546

File tree

3 files changed

+211
-49
lines changed

3 files changed

+211
-49
lines changed

src/nnue.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ use crate::{
1313
use accumulator::{AccumulatorCache, PstAccumulator};
1414

1515
mod forward {
16-
#[cfg(target_feature = "avx2")]
16+
#[cfg(any(target_feature = "avx2", target_feature = "neon"))]
1717
mod vectorized;
18-
#[cfg(target_feature = "avx2")]
18+
#[cfg(any(target_feature = "avx2", target_feature = "neon"))]
1919
pub use vectorized::*;
2020

21-
#[cfg(not(target_feature = "avx2"))]
21+
#[cfg(not(any(target_feature = "avx2", target_feature = "neon")))]
2222
mod scalar;
23-
#[cfg(not(target_feature = "avx2"))]
23+
#[cfg(not(any(target_feature = "avx2", target_feature = "neon")))]
2424
pub use scalar::*;
2525
}
2626

@@ -35,9 +35,14 @@ mod simd {
3535
#[cfg(all(target_feature = "avx2", not(target_feature = "avx512f")))]
3636
pub use avx2::*;
3737

38-
#[cfg(all(not(target_feature = "avx2"), not(target_feature = "avx512f")))]
38+
#[cfg(all(target_feature = "neon", not(any(target_feature = "avx2", target_feature = "avx512f"))))]
39+
mod neon;
40+
#[cfg(all(target_feature = "neon", not(any(target_feature = "avx2", target_feature = "avx512f"))))]
41+
pub use neon::*;
42+
43+
#[cfg(not(any(target_feature = "avx512f", target_feature = "avx2", target_feature = "neon")))]
3944
mod scalar;
40-
#[cfg(all(not(target_feature = "avx2"), not(target_feature = "avx512f")))]
45+
#[cfg(not(any(target_feature = "avx512f", target_feature = "avx2", target_feature = "neon")))]
4146
pub use scalar::*;
4247
}
4348

src/nnue/forward/vectorized.rs

Lines changed: 75 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
use std::arch::x86_64::*;
2-
31
use crate::{
42
nnue::{
53
accumulator::{PstAccumulator, ThreatAccumulator},
@@ -11,7 +9,7 @@ use crate::{
119
pub unsafe fn activate_ft(pst: &PstAccumulator, threat: &ThreatAccumulator, stm: Color) -> Aligned<[u8; L1_SIZE]> {
1210
let mut output = Aligned::new([0; L1_SIZE]);
1311

14-
let zero = simd::zeroed();
12+
let zero = simd::splat_i16(0);
1513
let one = simd::splat_i16(FT_QUANT as i16);
1614

1715
for flip in [0, 1] {
@@ -53,33 +51,6 @@ pub unsafe fn activate_ft(pst: &PstAccumulator, threat: &ThreatAccumulator, stm:
5351
output
5452
}
5553

56-
pub unsafe fn find_nnz(
57-
ft_out: &Aligned<[u8; L1_SIZE]>, nnz_table: &[SparseEntry],
58-
) -> (Aligned<[u16; L1_SIZE / 4]>, usize) {
59-
let mut indexes = Aligned::new([0; L1_SIZE / 4]);
60-
let mut count = 0;
61-
62-
let increment = _mm_set1_epi16(8);
63-
let mut base = _mm_setzero_si128();
64-
65-
for i in (0..L1_SIZE).step_by(2 * simd::I16_LANES) {
66-
let mask = simd::nnz_bitmask(*ft_out.as_ptr().add(i).cast());
67-
68-
for offset in (0..simd::I32_LANES).step_by(8) {
69-
let slice = (mask >> offset) & 0xFF;
70-
let entry = nnz_table.get_unchecked(slice as usize);
71-
72-
let store = indexes.as_mut_ptr().add(count).cast();
73-
_mm_storeu_si128(store, _mm_add_epi16(base, *entry.indexes.as_ptr().cast()));
74-
75-
count += entry.count;
76-
base = _mm_add_epi16(base, increment);
77-
}
78-
}
79-
80-
(indexes, count)
81-
}
82-
8354
pub unsafe fn propagate_l1(ft_out: Aligned<[u8; L1_SIZE]>, nnz: &[u16]) -> Aligned<[f32; L2_SIZE]> {
8455
const CHUNKS: usize = 4;
8556

@@ -100,23 +71,23 @@ pub unsafe fn propagate_l1(ft_out: Aligned<[u8; L1_SIZE]>, nnz: &[u16]) -> Align
10071
let weights2 = PARAMETERS.l1_weights.as_ptr().add(index2 * L2_SIZE * CHUNKS);
10172

10273
for j in (0..L2_SIZE).step_by(simd::F32_LANES) {
103-
let weights1 = weights1.add(j * CHUNKS).cast();
104-
let weights2 = weights2.add(j * CHUNKS).cast();
74+
let weights1 = *weights1.add(j * CHUNKS).cast();
75+
let weights2 = *weights2.add(j * CHUNKS).cast();
10576

10677
let vector = &mut pre_activations[j / simd::F32_LANES];
107-
*vector = simd::double_dpbusd(*vector, input1, *weights1, input2, *weights2);
78+
*vector = simd::double_dpbusd(*vector, input1, weights1, input2, weights2);
10879
}
10980
}
11081

11182
if let Some(last) = pairs.remainder().first() {
11283
let index = *last as usize;
113-
let pst_input = simd::splat_i32(*packed.get_unchecked(index));
84+
let input = simd::splat_i32(*packed.get_unchecked(index));
11485
let weights = PARAMETERS.l1_weights.as_ptr().add(index * L2_SIZE * CHUNKS);
11586

11687
for j in (0..L2_SIZE).step_by(simd::F32_LANES) {
117-
let weights = weights.add(j * CHUNKS).cast();
88+
let weights = *weights.add(j * CHUNKS).cast();
11889
let vector = &mut pre_activations[j / simd::F32_LANES];
119-
*vector = simd::dpbusd(*vector, pst_input, *weights);
90+
*vector = simd::dpbusd(*vector, input, weights);
12091
}
12192
}
12293

@@ -139,13 +110,13 @@ pub unsafe fn propagate_l2(l1_out: Aligned<[f32; L2_SIZE]>) -> Aligned<[f32; L3_
139110
let mut output = PARAMETERS.l2_biases.clone();
140111

141112
for i in 0..L2_SIZE {
142-
let pst_input = simd::splat_f32(l1_out[i]);
113+
let input = simd::splat_f32(l1_out[i]);
143114
let weights = PARAMETERS.l2_weights[i].as_ptr();
144115

145116
for j in (0..L3_SIZE).step_by(simd::F32_LANES) {
146-
let weights = weights.add(j).cast();
117+
let weights = *weights.add(j).cast();
147118
let vector = output.as_mut_ptr().add(j).cast();
148-
*vector = simd::mul_add_f32(*weights, pst_input, *vector);
119+
*vector = simd::mul_add_f32(weights, input, *vector);
149120
}
150121
}
151122

@@ -163,19 +134,80 @@ pub unsafe fn propagate_l2(l1_out: Aligned<[f32; L2_SIZE]>) -> Aligned<[f32; L3_
163134
pub unsafe fn propagate_l3(l2_out: Aligned<[f32; L3_SIZE]>) -> f32 {
164135
const LANES: usize = 16 / simd::F32_LANES;
165136

166-
let pst_input = l2_out.as_ptr();
137+
let input = l2_out.as_ptr();
167138
let weights = PARAMETERS.l3_weights.as_ptr();
168139

169140
let mut output = [simd::zero_f32(); LANES];
170141

171142
for (lane, result) in output.iter_mut().enumerate() {
172143
for i in (0..L3_SIZE).step_by(LANES * simd::F32_LANES) {
173-
let a = weights.add(i + lane * simd::F32_LANES).cast();
174-
let b = pst_input.add(i + lane * simd::F32_LANES).cast();
144+
let a = *weights.add(i + lane * simd::F32_LANES).cast();
145+
let b = *input.add(i + lane * simd::F32_LANES).cast();
175146

176-
*result = simd::mul_add_f32(*a, *b, *result);
147+
*result = simd::mul_add_f32(a, b, *result);
177148
}
178149
}
179150

180151
simd::horizontal_sum(output) + PARAMETERS.l3_biases
181152
}
153+
154+
/// Collects the indexes of non-zero 4-byte activation groups in the
/// feature-transformer output (SSE path), letting `propagate_l1` visit
/// only the sparse non-zero inputs.
///
/// Returns the index buffer plus the number of valid entries in it.
#[cfg(not(target_feature = "neon"))]
pub unsafe fn find_nnz(
    ft_out: &Aligned<[u8; L1_SIZE]>, nnz_table: &[SparseEntry],
) -> (Aligned<[u16; L1_SIZE / 4]>, usize) {
    use std::arch::x86_64::*;

    let mut indexes = Aligned::new([0; L1_SIZE / 4]);
    let mut count = 0;

    // Each table entry decodes 8 mask bits, i.e. 8 groups of 4 bytes, so
    // the running base index advances by 8 per entry.
    let increment = _mm_set1_epi16(8);
    let mut base = _mm_setzero_si128();

    for i in (0..L1_SIZE).step_by(2 * simd::I16_LANES) {
        let mask = simd::nnz_bitmask(*ft_out.as_ptr().add(i).cast());

        for offset in (0..simd::I32_LANES).step_by(8) {
            let slice = (mask >> offset) & 0xFF;
            let entry = nnz_table.get_unchecked(slice as usize);

            // NOTE(review): `*entry.indexes.as_ptr().cast()` dereferences a
            // pointer cast to a 128-bit vector type; unlike `_mm_loadu_si128`
            // this formally requires 16-byte alignment — confirm that
            // `SparseEntry::indexes` is suitably aligned.
            let store = indexes.as_mut_ptr().add(count).cast();
            _mm_storeu_si128(store, _mm_add_epi16(base, *entry.indexes.as_ptr().cast()));

            count += entry.count;
            base = _mm_add_epi16(base, increment);
        }
    }

    (indexes, count)
}
183+
184+
/// Collects the indexes of non-zero 4-byte activation groups in the
/// feature-transformer output (NEON path).
///
/// Two 16-byte vectors are scanned per iteration; their 4-bit masks are
/// combined into one 8-bit table key, so each iteration covers eight
/// 4-byte groups (32 bytes).
#[cfg(target_feature = "neon")]
pub unsafe fn find_nnz(
    ft_out: &Aligned<[u8; L1_SIZE]>, nnz_table: &[SparseEntry],
) -> (Aligned<[u16; L1_SIZE / 4]>, usize) {
    use std::arch::aarch64::*;

    let mut indexes = Aligned::new([0; L1_SIZE / 4]);
    let mut count = 0;

    // Eight groups are consumed per iteration, so group indexes advance by 8.
    let increment = vdupq_n_s16(8);
    let mut base = vdupq_n_s16(0);

    for i in (0..L1_SIZE).step_by(32) {
        let v0 = *ft_out.as_ptr().add(i).cast();
        let v1 = *ft_out.as_ptr().add(i + 16).cast();

        let mask = (simd::nnz_bitmask(v0) | (simd::nnz_bitmask(v1) << 4)) as usize;
        let entry = nnz_table.get_unchecked(mask);

        // NEON loads/stores have no alignment requirement, so the u16
        // table entries can be loaded directly.
        let store = indexes.as_mut_ptr().add(count).cast();
        let indexed = vaddq_s16(base, vld1q_s16(entry.indexes.as_ptr().cast()));

        vst1q_s16(store, indexed);

        count += entry.count;
        base = vaddq_s16(base, increment);
    }

    (indexes, count)
}

src/nnue/simd/neon.rs

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
use std::{arch::aarch64::*, mem::size_of};
2+
3+
// Number of f32 lanes in one 128-bit NEON vector (4).
pub const F32_LANES: usize = size_of::<float32x4_t>() / size_of::<f32>();
// Number of i16 lanes in one 128-bit NEON vector (8).
pub const I16_LANES: usize = size_of::<int16x8_t>() / size_of::<i16>();
5+
6+
/// Lane-wise addition of two i16x8 vectors.
///
/// NOTE(review): unlike the rest of this module, these two wrappers are
/// safe `fn`s with an internal `unsafe` block. That is sound only because
/// this module is compiled exclusively when the `neon` target feature is
/// enabled — confirm the cfg gate stays in sync.
pub fn add_i16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    unsafe { vaddq_s16(a, b) }
}

/// Lane-wise subtraction (`a - b`) of two i16x8 vectors.
pub fn sub_i16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    unsafe { vsubq_s16(a, b) }
}
13+
14+
/// Returns an all-zero i32x4 vector.
pub unsafe fn zeroed() -> int32x4_t {
    vdupq_n_s32(0)
}
17+
18+
/// Broadcasts `a` into every lane of an i16x8 vector.
pub unsafe fn splat_i16(a: i16) -> int16x8_t {
    vdupq_n_s16(a)
}
21+
22+
pub unsafe fn clamp_i16(x: int16x8_t, min: int16x8_t, max: int16x8_t) -> int16x8_t {
23+
vmaxq_s16(vminq_s16(x, max), min)
24+
}
25+
26+
/// Lane-wise minimum of two i16x8 vectors.
pub unsafe fn min_i16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    vminq_s16(a, b)
}
29+
30+
/// Shifts every i16 lane of `a` left by the compile-time constant `SHIFT` bits.
pub unsafe fn shift_left_i16<const SHIFT: i32>(a: int16x8_t) -> int16x8_t {
    vshlq_n_s16::<SHIFT>(a)
}
33+
34+
/// Lane-wise i16 multiply keeping only the high 16 bits of each 32-bit
/// product — the NEON counterpart of x86 `_mm_mulhi_epi16`.
pub unsafe fn mul_high_i16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    // Widen each half to full 32-bit products...
    let low = vmull_s16(vget_low_s16(a), vget_low_s16(b));
    let high = vmull_s16(vget_high_s16(a), vget_high_s16(b));

    // ...then shift-narrow by 16 to keep only the high half of every product.
    let low_hi = vshrn_n_s32::<16>(low);
    let high_hi = vshrn_n_s32::<16>(high);

    vcombine_s16(low_hi, high_hi)
}
43+
44+
/// Sign-extends eight i8 lanes to i16 lanes.
pub unsafe fn convert_i8_i16(a: int8x8_t) -> int16x8_t {
    vmovl_s8(a)
}
47+
48+
/// Packs two i16x8 vectors into one 16-byte vector with unsigned
/// saturation (negative lanes become 0, lanes above 255 become 255),
/// mirroring x86 `_mm_packus_epi16`. The unsigned result is
/// reinterpreted as i8x16 to match the signature shared with the other
/// SIMD backends.
pub unsafe fn packus(a: int16x8_t, b: int16x8_t) -> int8x16_t {
    let a_u8 = vqmovun_s16(a);
    let b_u8 = vqmovun_s16(b);
    vreinterpretq_s8_u8(vcombine_u8(a_u8, b_u8))
}
53+
54+
/// Identity on NEON. The 128-bit `packus` above already yields bytes in
/// order, so no cross-lane fix-up is needed here.
/// NOTE(review): presumably the AVX2 backend's `permute` undoes 256-bit
/// lane interleaving — confirm against that implementation.
pub unsafe fn permute(a: int8x16_t) -> int8x16_t {
    a
}
57+
58+
/// Broadcasts `a` into every lane of an i32x4 vector.
pub unsafe fn splat_i32(a: i32) -> int32x4_t {
    vdupq_n_s32(a)
}
61+
62+
/// Returns an all-zero f32x4 vector.
pub unsafe fn zero_f32() -> float32x4_t {
    vdupq_n_f32(0.0)
}
65+
66+
/// Broadcasts `a` into every lane of an f32x4 vector.
pub unsafe fn splat_f32(a: f32) -> float32x4_t {
    vdupq_n_f32(a)
}
69+
70+
/// Fused multiply-add per f32 lane: `a * b + c` (note the argument order
/// of `vfmaq_f32` is accumulator-first).
pub unsafe fn mul_add_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
    vfmaq_f32(c, a, b)
}
73+
74+
/// Converts each i32 lane to f32.
pub unsafe fn convert_to_f32(a: int32x4_t) -> float32x4_t {
    vcvtq_f32_s32(a)
}
77+
78+
pub unsafe fn clamp_f32(x: float32x4_t, min: float32x4_t, max: float32x4_t) -> float32x4_t {
79+
vmaxq_f32(vminq_f32(x, max), min)
80+
}
81+
82+
/// Emulates an x86 `dpbusd` step without the accumulate: multiplies 16
/// unsigned bytes (packed in `u8s`) by 16 signed bytes and sums each
/// group of four adjacent products into one i32 lane.
unsafe fn dot_bytes(u8s: int32x4_t, i8s: int8x16_t) -> int32x4_t {
    let u8s = vreinterpretq_u8_s32(u8s);

    // Widen both operands to i16 and multiply; u8 * i8 products fit in
    // i16 (255 * 127 = 32385, 255 * -128 = -32640).
    let products_low = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(u8s))), vmovl_s8(vget_low_s8(i8s)));
    let products_high = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(u8s))), vmovl_s8(vget_high_s8(i8s)));

    // Pairwise widen-add i16 -> i32, then pairwise add the two halves so
    // each output lane holds the sum of four adjacent byte products.
    let sums_low = vpaddlq_s16(products_low);
    let sums_high = vpaddlq_s16(products_high);

    vpaddq_s32(sums_low, sums_high)
}
93+
94+
pub unsafe fn dpbusd(i32s: int32x4_t, u8s: int32x4_t, i8s: int8x16_t) -> int32x4_t {
95+
vaddq_s32(i32s, dot_bytes(u8s, i8s))
96+
}
97+
98+
pub unsafe fn double_dpbusd(
99+
i32s: int32x4_t, u8s1: int32x4_t, i8s1: int8x16_t, u8s2: int32x4_t, i8s2: int8x16_t,
100+
) -> int32x4_t {
101+
let accum = vaddq_s32(dot_bytes(u8s1, i8s1), dot_bytes(u8s2, i8s2));
102+
vaddq_s32(i32s, accum)
103+
}
104+
105+
/// Sums all 16 f32 lanes of the four accumulator vectors into one scalar.
/// The pairwise reduction order is part of the result (float addition is
/// not associative), so it must not be reordered.
pub unsafe fn horizontal_sum(x: [float32x4_t; 4]) -> f32 {
    let sum01 = vaddq_f32(x[0], x[1]);
    let sum23 = vaddq_f32(x[2], x[3]);
    let sum = vaddq_f32(sum01, sum23);

    // Reduce the remaining four lanes: pair = (s0+s1, s2+s3); lane 0 of
    // the final pairwise add is (s0+s1) + (s2+s3).
    let pair = vpadd_f32(vget_low_f32(sum), vget_high_f32(sum));
    let final_sum = vpadd_f32(pair, pair);

    vget_lane_f32::<0>(final_sum)
}
115+
116+
/// Builds a 4-bit mask with bit i set when i32 lane i of `x` is strictly
/// positive.
///
/// NOTE(review): this tests `> 0`, not `!= 0`. Each lane packs four
/// activation bytes, so the two are equivalent only while the top byte
/// stays below 0x80 (activations never reach 128) — confirm against
/// FT_QUANT and the feature-transformer clamp.
pub unsafe fn nnz_bitmask(x: int32x4_t) -> u16 {
    let cmp = vcgtq_s32(x, vdupq_n_s32(0));

    // True comparison lanes are all-ones; extract each lane's sign bit
    // and place it at the lane's position in the mask.
    let mask0 = (vgetq_lane_u32::<0>(cmp) >> 31) & 1;
    let mask1 = ((vgetq_lane_u32::<1>(cmp) >> 31) & 1) << 1;
    let mask2 = ((vgetq_lane_u32::<2>(cmp) >> 31) & 1) << 2;
    let mask3 = ((vgetq_lane_u32::<3>(cmp) >> 31) & 1) << 3;

    (mask0 | mask1 | mask2 | mask3) as u16
}

0 commit comments

Comments
 (0)