Commit 61485e2

Update NNUE architecture to SFNNv5. Update network to nn-3c0aa92af1da.nnue.
1 parent 9eb7b60 commit 61485e2

File tree

  src/evaluate.h
  src/nnue/layers/sqr_clipped_relu.h
  src/nnue/nnue_architecture.h

3 files changed: +128 −3 lines

src/evaluate.h

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ namespace Eval {
 // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
 // for the build process (profile-build and fishtest) to work. Do not change the
 // name of the macro, as it is used in the Makefile.
-#define EvalFileDefaultName "nn-d0b74ce1e5eb.nnue"
+#define EvalFileDefaultName "nn-3c0aa92af1da.nnue"

 namespace NNUE {
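The naming rule in the comment above is load-bearing for the build: the default net is downloaded by this name, and the first 12 hex digits are expected to match the file's SHA-256. As a rough illustration of the required shape only, here is a hypothetical check; the helper name and regex are made up for this note and are not part of Stockfish:

#include <regex>
#include <string>

// Hypothetical helper, not part of Stockfish: true if a net file name
// follows the nn-[12 lowercase hex digits].nnue convention described above.
bool follows_default_net_naming(const std::string& name) {
    static const std::regex pattern("^nn-[0-9a-f]{12}\\.nnue$");
    return std::regex_match(name, pattern);
}

// follows_default_net_naming("nn-3c0aa92af1da.nnue") -> true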

src/nnue/layers/sqr_clipped_relu.h

Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

// Definition of layer SqrClippedReLU of NNUE evaluation function

#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
#define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED

#include "../nnue_common.h"

namespace Stockfish::Eval::NNUE::Layers {

  // Squared clipped ReLU
  template <IndexType InDims>
  class SqrClippedReLU {
   public:
    // Input/output type
    using InputType = std::int32_t;
    using OutputType = std::uint8_t;

    // Number of input/output dimensions
    static constexpr IndexType InputDimensions = InDims;
    static constexpr IndexType OutputDimensions = InputDimensions;
    static constexpr IndexType PaddedOutputDimensions =
        ceil_to_multiple<IndexType>(OutputDimensions, 32);

    using OutputBuffer = OutputType[PaddedOutputDimensions];

    // Hash value embedded in the evaluation file
    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
      std::uint32_t hashValue = 0x538D24C7u;
      hashValue += prevHash;
      return hashValue;
    }

    // Read network parameters
    bool read_parameters(std::istream&) {
      return true;
    }

    // Write network parameters
    bool write_parameters(std::ostream&) const {
      return true;
    }

    // Forward propagation
    const OutputType* propagate(
        const InputType* input, OutputType* output) const {

#if defined(USE_SSE2)
      constexpr IndexType NumChunks = InputDimensions / 16;

#ifdef USE_SSE41
      const __m128i Zero = _mm_setzero_si128();
#else
      const __m128i k0x80s = _mm_set1_epi8(-128);
#endif

      static_assert(WeightScaleBits == 6);
      const auto in  = reinterpret_cast<const __m128i*>(input);
      const auto out = reinterpret_cast<__m128i*>(output);
      for (IndexType i = 0; i < NumChunks; ++i) {
        __m128i words0 = _mm_packs_epi32(
            _mm_load_si128(&in[i * 4 + 0]),
            _mm_load_si128(&in[i * 4 + 1]));
        __m128i words1 = _mm_packs_epi32(
            _mm_load_si128(&in[i * 4 + 2]),
            _mm_load_si128(&in[i * 4 + 3]));

        // The scalar path below shifts the square right by 2 * WeightScaleBits
        // and then divides by 128, a total shift of 19. _mm_mulhi_epi16 already
        // drops the low 16 bits of the square, so shifting 3 more bits matches it.
        words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
        words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);

        const __m128i packedbytes = _mm_packs_epi16(words0, words1);

        _mm_store_si128(&out[i],

#ifdef USE_SSE41
          _mm_max_epi8(packedbytes, Zero)
#else
          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
#endif

        );
      }
      constexpr IndexType Start = NumChunks * 16;

#else
      constexpr IndexType Start = 0;
#endif

      for (IndexType i = Start; i < InputDimensions; ++i) {
        output[i] = static_cast<OutputType>(
            // really should be / 127, but we need it to be fast, so / 128 is used;
            // this needs to be accounted for in the trainer
            std::max(0ll, std::min(127ll, (((long long)input[i] * input[i]) >> (2 * WeightScaleBits)) / 128)));
      }

      return output;
    }
  };

}  // namespace Stockfish::Eval::NNUE::Layers

#endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
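For reference, this is the activation the new layer computes, written out as a standalone scalar sketch; the function name is made up for this note and is not engine code. With WeightScaleBits = 6 the output is min(127, (x * x >> 12) / 128), a squared-then-clipped value on the same 0..127 scale that ClippedReLU uses, and the SSE2 path reaches the same total shift of 19 bits because _mm_mulhi_epi16 discards the low 16 bits of the square and the extra shift by 3 supplies the rest.

#include <algorithm>
#include <cstdint>

// Illustrative reference for the scalar fallback above (not engine code).
// x is the int32 pre-activation from the preceding AffineTransform,
// scaled by 2^WeightScaleBits = 64.
std::uint8_t sqr_clipped_relu_ref(std::int32_t x) {
    constexpr int WeightScaleBits = 6;
    long long sq = (long long)x * x;                    // the square is never negative
    long long y  = (sq >> (2 * WeightScaleBits)) / 128; // total scaling by 2^19
    return static_cast<std::uint8_t>(std::min(127LL, y));
}

// Example: x = 8192 (a nominal activation of 8192 / 64 = 128) gives
// (8192 * 8192 >> 12) / 128 = 16384 / 128 = 128, which is clipped to 127.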

src/nnue/nnue_architecture.h

Lines changed: 7 additions & 2 deletions
@@ -29,6 +29,7 @@

 #include "layers/affine_transform.h"
 #include "layers/clipped_relu.h"
+#include "layers/sqr_clipped_relu.h"

 #include "../misc.h"

@@ -48,8 +49,9 @@ struct Network
   static constexpr int FC_1_OUTPUTS = 32;

   Layers::AffineTransform<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
+  Layers::SqrClippedReLU<FC_0_OUTPUTS + 1> ac_sqr_0;
   Layers::ClippedReLU<FC_0_OUTPUTS + 1> ac_0;
-  Layers::AffineTransform<FC_0_OUTPUTS, FC_1_OUTPUTS> fc_1;
+  Layers::AffineTransform<FC_0_OUTPUTS * 2, FC_1_OUTPUTS> fc_1;
   Layers::ClippedReLU<FC_1_OUTPUTS> ac_1;
   Layers::AffineTransform<FC_1_OUTPUTS, 1> fc_2;

@@ -93,6 +95,7 @@ struct Network
     struct alignas(CacheLineSize) Buffer
     {
       alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out;
+      alignas(CacheLineSize) decltype(ac_sqr_0)::OutputType ac_sqr_0_out[ceil_to_multiple<IndexType>(FC_0_OUTPUTS * 2, 32)];
       alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out;
       alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out;
       alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out;

@@ -114,8 +117,10 @@ struct Network
 #endif

     fc_0.propagate(transformedFeatures, buffer.fc_0_out);
+    ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_sqr_0_out);
     ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out);
-    fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out);
+    std::memcpy(buffer.ac_sqr_0_out + FC_0_OUTPUTS, buffer.ac_0_out, FC_0_OUTPUTS * sizeof(decltype(ac_0)::OutputType));
+    fc_1.propagate(buffer.ac_sqr_0_out, buffer.fc_1_out);
     ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out);
     fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out);

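Taken together, the nnue_architecture.h changes are the SFNNv5 part of this commit: fc_1's input is now twice as wide. The squared-clipped activations of fc_0 occupy the first FC_0_OUTPUTS slots of ac_sqr_0_out, the ordinary clipped activations are copied into the next FC_0_OUTPUTS slots, and fc_1 propagates over the combined vector (the buffer is sized ceil_to_multiple<IndexType>(FC_0_OUTPUTS * 2, 32) so the affine transform can read aligned SIMD chunks; the extra +1 output of fc_0 is not part of this concatenation). A minimal sketch of just the concatenation step, with made-up names and N standing in for FC_0_OUTPUTS:

#include <cstdint>
#include <cstring>

// Illustrative only: ac_sqr_0 has already written the squared-clipped
// activations into the first N slots of a 2 * N wide buffer; the plain
// clipped activations are then copied into the upper half, and fc_1
// reads the whole 2 * N wide vector.
template <int N>
void concat_activations(std::uint8_t (&sqr_then_clipped)[2 * N],  // slots [0, N) already filled
                        const std::uint8_t (&clipped)[N]) {
    std::memcpy(sqr_then_clipped + N, clipped, N * sizeof(std::uint8_t));
}

Copying into the upper half of the same buffer keeps fc_1's input contiguous, so the existing AffineTransform code can be reused unchanged.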