3838# include < arm_neon.h>
3939#endif
4040
41- // The inline asm is only safe for GCC, where it is necessary to get good codegen.
42- // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101693
43- // Clang does fine without it.
44- // Play around here: https://godbolt.org/z/7EWqrYq51
45- #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER))
46- #define USE_INLINE_ASM
47- #endif
48-
49- // Use either the AVX512 or AVX-VNNI version of the VNNI instructions.
50- #if defined(USE_AVXVNNI)
51- #define VNNI_PREFIX " %{vex%} "
52- #else
53- #define VNNI_PREFIX " "
54- #endif
55-
5641namespace Stockfish ::Simd {
5742
5843#if defined (USE_AVX512)
@@ -117,29 +102,11 @@ namespace Stockfish::Simd {
117102 __m512i b) {
118103
119104# if defined (USE_VNNI)
120- # if defined (USE_INLINE_ASM)
121- asm (
122- " vpdpbusd %[b], %[a], %[acc]\n\t "
123- : [acc]" +v" (acc)
124- : [a]" v" (a), [b]" vm" (b)
125- );
126- # else
127105 acc = _mm512_dpbusd_epi32 (acc, a, b);
128- # endif
129106# else
130- # if defined (USE_INLINE_ASM)
131- __m512i tmp = _mm512_maddubs_epi16 (a, b);
132- asm (
133- " vpmaddwd %[tmp], %[ones], %[tmp]\n\t "
134- " vpaddd %[acc], %[tmp], %[acc]\n\t "
135- : [acc]" +v" (acc), [tmp]" +&v" (tmp)
136- : [ones]" v" (_mm512_set1_epi16 (1 ))
137- );
138- # else
139107 __m512i product0 = _mm512_maddubs_epi16 (a, b);
140108 product0 = _mm512_madd_epi16 (product0, _mm512_set1_epi16 (1 ));
141109 acc = _mm512_add_epi32 (acc, product0);
142- # endif
143110# endif
144111 }
145112
@@ -149,36 +116,14 @@ namespace Stockfish::Simd {
149116 __m512i a1, __m512i b1) {
150117
151118# if defined (USE_VNNI)
152- # if defined (USE_INLINE_ASM)
153- asm (
154- " vpdpbusd %[b0], %[a0], %[acc]\n\t "
155- " vpdpbusd %[b1], %[a1], %[acc]\n\t "
156- : [acc]" +&v" (acc)
157- : [a0]" v" (a0), [b0]" vm" (b0), [a1]" v" (a1), [b1]" vm" (b1)
158- );
159- # else
160119 acc = _mm512_dpbusd_epi32 (acc, a0, b0);
161120 acc = _mm512_dpbusd_epi32 (acc, a1, b1);
162- # endif
163121# else
164- # if defined (USE_INLINE_ASM)
165- __m512i tmp0 = _mm512_maddubs_epi16 (a0, b0);
166- __m512i tmp1 = _mm512_maddubs_epi16 (a1, b1);
167- asm (
168- " vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t "
169- " vpmaddwd %[tmp1], %[ones], %[tmp1]\n\t "
170- " vpaddd %[tmp0], %[tmp1], %[tmp0]\n\t "
171- " vpaddd %[acc], %[tmp0], %[acc]\n\t "
172- : [acc]" +v" (acc), [tmp0]" +&v" (tmp0), [tmp1]" +&v" (tmp1)
173- : [ones]" v" (_mm512_set1_epi16 (1 ))
174- );
175- # else
176122 __m512i product0 = _mm512_maddubs_epi16 (a0, b0);
177123 __m512i product1 = _mm512_maddubs_epi16 (a1, b1);
178124 product0 = _mm512_madd_epi16 (product0, _mm512_set1_epi16 (1 ));
179125 product1 = _mm512_madd_epi16 (product1, _mm512_set1_epi16 (1 ));
180126 acc = _mm512_add_epi32 (acc, _mm512_add_epi32 (product0, product1));
181- # endif
182127# endif
183128 }
184129
@@ -214,29 +159,11 @@ namespace Stockfish::Simd {
214159 __m256i b) {
215160
216161# if defined (USE_VNNI)
217- # if defined (USE_INLINE_ASM)
218- asm (
219- VNNI_PREFIX " vpdpbusd %[b], %[a], %[acc]\n\t "
220- : [acc]" +v" (acc)
221- : [a]" v" (a), [b]" vm" (b)
222- );
223- # else
224162 acc = _mm256_dpbusd_epi32 (acc, a, b);
225- # endif
226163# else
227- # if defined (USE_INLINE_ASM)
228- __m256i tmp = _mm256_maddubs_epi16 (a, b);
229- asm (
230- " vpmaddwd %[tmp], %[ones], %[tmp]\n\t "
231- " vpaddd %[acc], %[tmp], %[acc]\n\t "
232- : [acc]" +v" (acc), [tmp]" +&v" (tmp)
233- : [ones]" v" (_mm256_set1_epi16 (1 ))
234- );
235- # else
236164 __m256i product0 = _mm256_maddubs_epi16 (a, b);
237165 product0 = _mm256_madd_epi16 (product0, _mm256_set1_epi16 (1 ));
238166 acc = _mm256_add_epi32 (acc, product0);
239- # endif
240167# endif
241168 }
242169
@@ -246,36 +173,14 @@ namespace Stockfish::Simd {
246173 __m256i a1, __m256i b1) {
247174
248175# if defined (USE_VNNI)
249- # if defined (USE_INLINE_ASM)
250- asm (
251- VNNI_PREFIX " vpdpbusd %[b0], %[a0], %[acc]\n\t "
252- VNNI_PREFIX " vpdpbusd %[b1], %[a1], %[acc]\n\t "
253- : [acc]" +&v" (acc)
254- : [a0]" v" (a0), [b0]" vm" (b0), [a1]" v" (a1), [b1]" vm" (b1)
255- );
256- # else
257176 acc = _mm256_dpbusd_epi32 (acc, a0, b0);
258177 acc = _mm256_dpbusd_epi32 (acc, a1, b1);
259- # endif
260178# else
261- # if defined (USE_INLINE_ASM)
262- __m256i tmp0 = _mm256_maddubs_epi16 (a0, b0);
263- __m256i tmp1 = _mm256_maddubs_epi16 (a1, b1);
264- asm (
265- " vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t "
266- " vpmaddwd %[tmp1], %[ones], %[tmp1]\n\t "
267- " vpaddd %[tmp0], %[tmp1], %[tmp0]\n\t "
268- " vpaddd %[acc], %[tmp0], %[acc]\n\t "
269- : [acc]" +v" (acc), [tmp0]" +&v" (tmp0), [tmp1]" +&v" (tmp1)
270- : [ones]" v" (_mm256_set1_epi16 (1 ))
271- );
272- # else
273179 __m256i product0 = _mm256_maddubs_epi16 (a0, b0);
274180 __m256i product1 = _mm256_maddubs_epi16 (a1, b1);
275181 product0 = _mm256_madd_epi16 (product0, _mm256_set1_epi16 (1 ));
276182 product1 = _mm256_madd_epi16 (product1, _mm256_set1_epi16 (1 ));
277183 acc = _mm256_add_epi32 (acc, _mm256_add_epi32 (product0, product1));
278- # endif
279184# endif
280185 }
281186
@@ -304,44 +209,21 @@ namespace Stockfish::Simd {
304209 __m128i a,
305210 __m128i b) {
306211
307- # if defined (USE_INLINE_ASM)
308- __m128i tmp = _mm_maddubs_epi16 (a, b);
309- asm (
310- " pmaddwd %[ones], %[tmp]\n\t "
311- " paddd %[tmp], %[acc]\n\t "
312- : [acc]" +v" (acc), [tmp]" +&v" (tmp)
313- : [ones]" v" (_mm_set1_epi16 (1 ))
314- );
315- # else
316212 __m128i product0 = _mm_maddubs_epi16 (a, b);
317213 product0 = _mm_madd_epi16 (product0, _mm_set1_epi16 (1 ));
318214 acc = _mm_add_epi32 (acc, product0);
319- # endif
320215 }
321216
322217 [[maybe_unused]] static void m128_add_dpbusd_epi32x2 (
323218 __m128i& acc,
324219 __m128i a0, __m128i b0,
325220 __m128i a1, __m128i b1) {
326221
327- # if defined (USE_INLINE_ASM)
328- __m128i tmp0 = _mm_maddubs_epi16 (a0, b0);
329- __m128i tmp1 = _mm_maddubs_epi16 (a1, b1);
330- asm (
331- " pmaddwd %[ones], %[tmp0]\n\t "
332- " pmaddwd %[ones], %[tmp1]\n\t "
333- " paddd %[tmp1], %[tmp0]\n\t "
334- " paddd %[tmp0], %[acc]\n\t "
335- : [acc]" +v" (acc), [tmp0]" +&v" (tmp0), [tmp1]" +&v" (tmp1)
336- : [ones]" v" (_mm_set1_epi16 (1 ))
337- );
338- # else
339222 __m128i product0 = _mm_maddubs_epi16 (a0, b0);
340223 __m128i product1 = _mm_maddubs_epi16 (a1, b1);
341224 product0 = _mm_madd_epi16 (product0, _mm_set1_epi16 (1 ));
342225 product1 = _mm_madd_epi16 (product1, _mm_set1_epi16 (1 ));
343226 acc = _mm_add_epi32 (acc, _mm_add_epi32 (product0, product1));
344- # endif
345227 }
346228
347229#endif
0 commit comments