@@ -78,48 +78,6 @@ namespace Stockfish::Eval::NNUE::Layers {
7878 i / PaddedInputDimensions * 4 +
7979 i % 4
8080 ] = read_little_endian<WeightType>(stream);
81-
82- // Determine whether each group of eight weight-input products can be summed in 16 bits
83- // without saturation. We assume worst case combinations of 0 and 127 for all inputs.
84- if (OutputDimensions > 1 && !stream.fail ())
85- {
86- canSaturate16.count = 0 ;
87- #if !defined(USE_VNNI)
88- for (IndexType i = 0 ; i < PaddedInputDimensions; i += 16 )
89- for (IndexType j = 0 ; j < OutputDimensions; ++j)
90- for (int x = 0 ; x < 2 ; ++x)
91- {
92- WeightType* w = &weights[i * OutputDimensions + j * 4 + x * 2 ];
93- int sum[2 ] = {0 , 0 };
94- for (int k = 0 ; k < 8 ; ++k)
95- {
96- IndexType idx = k / 2 * OutputDimensions * 4 + k % 2 ;
97- sum[w[idx] < 0 ] += w[idx];
98- }
99- for (int sign : { -1 , 1 })
100- while (sign * sum[sign == -1 ] > 258 )
101- {
102- int maxK = 0 , maxW = 0 ;
103- for (int k = 0 ; k < 8 ; ++k)
104- {
105- IndexType idx = k / 2 * OutputDimensions * 4 + k % 2 ;
106- if (maxW < sign * w[idx])
107- maxK = k, maxW = sign * w[idx];
108- }
109-
110- IndexType idx = maxK / 2 * OutputDimensions * 4 + maxK % 2 ;
111- sum[sign == -1 ] -= w[idx];
112- canSaturate16.add (j, i + maxK / 2 * 4 + maxK % 2 + x * 2 , w[idx]);
113- w[idx] = 0 ;
114- }
115- }
116-
117- // Non-functional optimization: sort the entries for faster, more linear access
118- std::sort (canSaturate16.ids , canSaturate16.ids + canSaturate16.count ,
119- [](const typename CanSaturate::Entry& e1 , const typename CanSaturate::Entry& e2 )
120- { return e1 .in == e2 .in ? e1 .out < e2 .out : e1 .in < e2 .in ; });
121- #endif
122- }
12381#endif
12482
12583 return !stream.fail ();
@@ -162,10 +120,10 @@ namespace Stockfish::Eval::NNUE::Layers {
162120 __m512i product2 = _mm512_maddubs_epi16 (a2, b2);
163121 __m512i product3 = _mm512_maddubs_epi16 (a3, b3);
164122 product0 = _mm512_add_epi16 (product0, product1);
165- product2 = _mm512_add_epi16 (product2, product3);
166- product0 = _mm512_add_epi16 (product0, product2);
167123 product0 = _mm512_madd_epi16 (product0, Ones512);
168- acc = _mm512_add_epi32 (acc, product0);
124+ product2 = _mm512_add_epi16 (product2, product3);
125+ product2 = _mm512_madd_epi16 (product2, Ones512);
126+ acc = _mm512_add_epi32 (acc, _mm512_add_epi32 (product0, product2));
169127#endif
170128 };
171129
@@ -204,10 +162,10 @@ namespace Stockfish::Eval::NNUE::Layers {
204162 __m256i product2 = _mm256_maddubs_epi16 (a2, b2);
205163 __m256i product3 = _mm256_maddubs_epi16 (a3, b3);
206164 product0 = _mm256_add_epi16 (product0, product1);
207- product2 = _mm256_add_epi16 (product2, product3);
208- product0 = _mm256_add_epi16 (product0, product2);
209165 product0 = _mm256_madd_epi16 (product0, Ones256);
210- acc = _mm256_add_epi32 (acc, product0);
166+ product2 = _mm256_add_epi16 (product2, product3);
167+ product2 = _mm256_madd_epi16 (product2, Ones256);
168+ acc = _mm256_add_epi32 (acc, _mm256_add_epi32 (product0, product2));
211169#endif
212170 };
213171
@@ -235,10 +193,10 @@ namespace Stockfish::Eval::NNUE::Layers {
235193 __m128i product2 = _mm_maddubs_epi16 (a2, b2);
236194 __m128i product3 = _mm_maddubs_epi16 (a3, b3);
237195 product0 = _mm_add_epi16 (product0, product1);
238- product2 = _mm_add_epi16 (product2, product3);
239- product0 = _mm_add_epi16 (product0, product2);
240196 product0 = _mm_madd_epi16 (product0, Ones128);
241- acc = _mm_add_epi32 (acc, product0);
197+ product2 = _mm_add_epi16 (product2, product3);
198+ product2 = _mm_madd_epi16 (product2, Ones128);
199+ acc = _mm_add_epi32 (acc, _mm_add_epi32 (product0, product2));
242200 };
243201
244202#endif
@@ -298,8 +256,6 @@ namespace Stockfish::Eval::NNUE::Layers {
298256 for (int j = 0 ; j * OutputSimdWidth < OutputDimensions; ++j)
299257 vec_add_dpbusd_32x4 (outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
300258 }
301- for (int i = 0 ; i < canSaturate16.count ; ++i)
302- output[canSaturate16.ids [i].out ] += input[canSaturate16.ids [i].in ] * canSaturate16.ids [i].w ;
303259 }
304260 else if constexpr (OutputDimensions == 1 )
305261 {
@@ -446,23 +402,6 @@ namespace Stockfish::Eval::NNUE::Layers {
446402
447403 alignas (CacheLineSize) BiasType biases[OutputDimensions];
448404 alignas (CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
449- #if defined (USE_SSSE3)
450- struct CanSaturate {
451- int count;
452- struct Entry {
453- uint16_t out;
454- uint16_t in;
455- int8_t w;
456- } ids[PaddedInputDimensions * OutputDimensions * 3 / 4 ];
457-
458- void add (int i, int j, int8_t w) {
459- ids[count].out = i;
460- ids[count].in = j;
461- ids[count].w = w;
462- ++count;
463- }
464- } canSaturate16;
465- #endif
466405 };
467406
468407} // namespace Stockfish::Eval::NNUE::Layers
0 commit comments