 #include "argparse.hpp"
-#include "chess/chess.h"
-#include "dataset/batchloader.h"
-#include "dataset/dataset.h"
-#include "dataset/io.h"
-#include "dataset/process.h"
-#include "misc/csv.h"
-#include "misc/timer.h"
-#include "nn/nn.h"
-#include "operations/operations.h"
+#include "models/berserk.h"
 
 #include <fstream>
 #include <limits>
 
-namespace fs = std::filesystem;
-
 using namespace nn;
 using namespace data;
-
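-// Base class for NNUE-style chess models. Derived models fill in the sparse
-// inputs and targets; `lambda` blends the search score against the game
-// result when building the training target (see setup_inputs_and_outputs).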
-struct ChessModel : nn::Model {
-    float lambda;
-
-    ChessModel(float lambda_)
-        : lambda(lambda_) {}
-
-    // derived models translate a batch of positions into inputs and loss targets
-    virtual void setup_inputs_and_outputs(dataset::DataSet<chess::Position>* positions) = 0;
-
-    // generic training loop: streams batches from the loader and reports the running loss
-    void train(dataset::BatchLoader<chess::Position>& loader,
-               int epochs     = 1500,
-               int epoch_size = 1e8) {
-        this->compile(loader.batch_size);
-
-        Timer t {};
-        for (int i = 1; i <= epochs; i++) {
-            t.start();
-
-            uint64_t prev_print_tm    = 0;
-            float    total_epoch_loss = 0;
-
-            for (int b = 1; b <= epoch_size / loader.batch_size; b++) {
-                auto* ds = loader.next();
-                setup_inputs_and_outputs(ds);
-
-                float batch_loss = batch();
-                total_epoch_loss += batch_loss;
-                float epoch_loss = total_epoch_loss / b;
-
-                t.stop();
-                uint64_t elapsed = t.elapsed();
-                if (elapsed - prev_print_tm > 1000 || b == epoch_size / loader.batch_size) {
-                    prev_print_tm = elapsed;
-
-                    printf("\rep = [%4d], epoch_loss = [%1.8f], batch = [%5d], batch_loss = [%1.8f], "
-                           "speed = [%7.2f it/s], time = [%3ds]",
-                           i,
-                           epoch_loss,
-                           b,
-                           batch_loss,
-                           1000.0f * b / elapsed,
-                           (int) (elapsed / 1000.0f));
-                    std::cout << std::flush;
-                }
-            }
-
-            std::cout << std::endl;
-
-            float epoch_loss = total_epoch_loss / (epoch_size / loader.batch_size);
-            next_epoch(epoch_loss, 0.0);
-        }
-    }
-
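-    // evaluates a single FEN and dumps the first 16 activations of every layer,
-    // e.g. to cross-check the trainer's forward pass against an engine's inference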
-    void test_fen(const std::string& fen) {
-        this->compile(1);
-
-        chess::Position pos = chess::parse_fen(fen);
-        dataset::DataSet<chess::Position> ds {};
-        ds.positions.push_back(pos);
-        ds.header.entry_count = 1;
-
-        // set up the network inputs
-        setup_inputs_and_outputs(&ds);
-
-        // forward pass
-        this->upload_inputs();
-        this->forward();
-
-        // go through the layers and download values
-        std::cout
-            << "==================================================================================\n";
-        std::cout << "testing fen: " << fen << std::endl;
-
-        int idx = 0;
-        for (auto layer : m_layers) {
-            layer->dense_output.values >> CPU;
-
-            std::cout << "LAYER " << ++idx << std::endl;
-            for (int i = 0; i < std::min((size_t) 16, layer->size); i++) {
-                std::cout << std::setw(10) << layer->dense_output.values(i, 0);
-            }
-            if (layer->size > 16) {
-                std::cout << " ......... " << layer->dense_output.values(layer->size - 1, 0);
-            }
-            std::cout << "\n";
-        }
-    }
-
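-    // pushes a number of batches through the network and prints per-layer
-    // activation statistics: per-neuron min/max, dead neurons, the fraction
-    // of positive activations, and parameter bounds (useful when picking
-    // quantization ranges)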
-    void distribution(dataset::BatchLoader<chess::Position>& loader, int batches = 32) {
-        this->compile(loader.batch_size);
-
-        std::vector<DenseMatrix<float>>            max_values {};
-        std::vector<DenseMatrix<float>>            min_values {};
-        std::vector<std::pair<uint64_t, uint64_t>> sparsity {};
-
-        for (auto l : m_layers) {
-            max_values.emplace_back(l->dense_output.values.m, 1);
-            min_values.emplace_back(l->dense_output.values.m, 1);
-            max_values.back().malloc<data::CPU>();
-            min_values.back().malloc<data::CPU>();
-
-            math::fill<float>(max_values.back(), -std::numeric_limits<float>::max());
-            math::fill<float>(min_values.back(), std::numeric_limits<float>::max());
-
-            sparsity.push_back(std::pair(0, 0));
-        }
-
-        for (int b = 0; b < batches; b++) {
-            auto* ds = loader.next();
-            setup_inputs_and_outputs(ds);
-            this->upload_inputs();
-            this->forward();
-            std::cout << "\r" << b << " / " << batches << std::flush;
-
-            // track the minimum and maximum value seen for every neuron
-            for (int i = 0; i < m_layers.size(); i++) {
-                auto layer = m_layers[i].get();
-                layer->dense_output.values >> data::CPU;
-                for (int m = 0; m < layer->dense_output.values.m; m++) {
-                    for (int n = 0; n < layer->dense_output.values.n; n++) {
-                        max_values[i](m, 0) =
-                            std::max(max_values[i](m, 0), layer->dense_output.values(m, n));
-                        min_values[i](m, 0) =
-                            std::min(min_values[i](m, 0), layer->dense_output.values(m, n));
-
-                        sparsity[i].first++;
-                        sparsity[i].second += (layer->dense_output.values(m, n) > 0);
-                    }
-                }
-            }
-        }
-        std::cout << std::endl;
-
-        for (int i = 0; i < m_layers.size(); i++) {
-            std::cout << "------------ LAYER " << i + 1 << " --------------------" << std::endl;
-            std::cout << "min: ";
-            for (int j = 0; j < std::min((size_t) 16, min_values[i].size()); j++) {
-                std::cout << std::setw(10) << min_values[i](j);
-            }
-            if (min_values[i].size() > 16) {
-                std::cout << " ......... " << min_values[i](min_values[i].size() - 1);
-            }
-            std::cout << "\n";
-
-            std::cout << "max: ";
-            for (int j = 0; j < std::min((size_t) 16, max_values[i].size()); j++) {
-                std::cout << std::setw(10) << max_values[i](j);
-            }
-            if (max_values[i].size() > 16) {
-                std::cout << " ......... " << max_values[i](max_values[i].size() - 1);
-            }
-            std::cout << "\n";
-
-            float min = std::numeric_limits<float>::max();
-            float max = -std::numeric_limits<float>::max();
-            for (int m = 0; m < min_values[i].size(); m++) {
-                min = std::min(min, min_values[i](m));
-                max = std::max(max, max_values[i](m));
-            }
-            std::cout << "output bounds: [" << min << " ; " << max << "]\n";
-
-            int died = 0;
-            for (int j = 0; j < max_values[i].size(); j++) {
-                if (std::abs(max_values[i](j) - min_values[i](j)) < 1e-8) {
-                    died++;
-                }
-            }
-            std::cout << "died: " << died << " / " << max_values[i].size();
-            std::cout << "\n";
-
-            float sparsity_total  = sparsity[i].first;
-            float sparsity_active = sparsity[i].second;
-
-            std::cout << "sparsity: " << sparsity_active / sparsity_total;
-            std::cout << "\n";
-
-            for (auto p : m_layers[i]->params()) {
-                float min = std::numeric_limits<float>::max();
-                float max = -std::numeric_limits<float>::max();
-                for (int m = 0; m < p->values.m; m++) {
-                    for (int n = 0; n < p->values.n; n++) {
-                        min = std::min(min, p->values(m, n));
-                        max = std::max(max, p->values(m, n));
-                    }
-                }
-
-                std::cout << "param bounds: [" << min << " ; " << max << "]\n";
-            }
-        }
-    }
-};
-
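-// Berserk's network: two perspective sparse inputs into a shared feature
-// transformer (clipped to [0, 127]), followed by two small ReLU layers and a
-// sigmoid output head on a 1/160 score scale.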
-struct BerserkModel : ChessModel {
-    SparseInput* in1;
-    SparseInput* in2;
-
-    const float sigmoid_scale = 1.0 / 160.0;
-    const float quant_one     = 32.0;
-    const float quant_two     = 32.0;
-
-    const size_t n_features = 16 * 12 * 64;
-    const size_t n_l1       = 16;
-    const size_t n_l2       = 32;
-    const size_t n_out      = 1;
-
-    BerserkModel(size_t n_ft, float lambda, size_t save_rate)
-        : ChessModel(lambda) {
-
-        in1 = add<SparseInput>(n_features, 32);
-        in2 = add<SparseInput>(n_features, 32);
-
-        auto ft  = add<FeatureTransformer>(in1, in2, n_ft);
-        auto fta = add<ClippedRelu>(ft);
-        ft->ft_regularization = 1.0 / 16384.0 / 4194304.0;
-        fta->max              = 127.0;
-
-        auto l1  = add<Affine>(fta, n_l1);
-        auto l1a = add<ReLU>(l1);
-
-        auto l2  = add<Affine>(l1a, n_l2);
-        auto l2a = add<ReLU>(l2);
-
-        auto pos_eval = add<Affine>(l2a, n_out);
-        auto sigmoid  = add<Sigmoid>(pos_eval, sigmoid_scale);
-
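-        // clamp L1 weights so they stay representable in int8 once scaled by
-        // quant_two during quantization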
-        const float hidden_max = 127.0 / quant_two;
-        add_optimizer(AdamWarmup({{OptimizerEntry {&ft->weights}},
-                                  {OptimizerEntry {&ft->bias}},
-                                  {OptimizerEntry {&l1->weights}.clamp(-hidden_max, hidden_max)},
-                                  {OptimizerEntry {&l1->bias}},
-                                  {OptimizerEntry {&l2->weights}},
-                                  {OptimizerEntry {&l2->bias}},
-                                  {OptimizerEntry {&pos_eval->weights}},
-                                  {OptimizerEntry {&pos_eval->bias}}},
-                                 0.95,
-                                 0.999,
-                                 1e-8,
-                                 5 * 16384));
-
-        set_save_frequency(save_rate);
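-        // export format: feature transformer in int16, L1 in int8/int32,
-        // later layers kept in float with biases scaled to match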
-        add_quantization(Quantizer {
-            "quant",
-            save_rate,
-            QuantizerEntry<int16_t>(&ft->weights.values, quant_one, true),
-            QuantizerEntry<int16_t>(&ft->bias.values, quant_one),
-            QuantizerEntry<int8_t>(&l1->weights.values, quant_two),
-            QuantizerEntry<int32_t>(&l1->bias.values, quant_two),
-            QuantizerEntry<float>(&l2->weights.values, 1.0),
-            QuantizerEntry<float>(&l2->bias.values, quant_two),
-            QuantizerEntry<float>(&pos_eval->weights.values, 1.0),
-            QuantizerEntry<float>(&pos_eval->bias.values, quant_two),
-        });
-    }
-
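-    // maps the (mirrored) king square to one of 16 input buckets; the -1
-    // entries can never be hit because index() mirrors the king onto the
-    // e- to h-files before the lookup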
-    inline int king_square_index(int relative_king_square) {
-        constexpr int indices[64] {
-            -1, -1, -1, -1, 14, 14, 15, 15,    //
-            -1, -1, -1, -1, 14, 14, 15, 15,    //
-            -1, -1, -1, -1, 12, 12, 13, 13,    //
-            -1, -1, -1, -1, 12, 12, 13, 13,    //
-            -1, -1, -1, -1,  8,  9, 10, 11,    //
-            -1, -1, -1, -1,  8,  9, 10, 11,    //
-            -1, -1, -1, -1,  4,  5,  6,  7,    //
-            -1, -1, -1, -1,  0,  1,  2,  3,    //
-        };
-
-        return indices[relative_king_square];
-    }
-
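-    // king-bucketed piece-square feature index from `view`'s perspective:
-    // bucket * (12 * 64) + piece * 64 + square. The XOR terms flip the board
-    // vertically for the black perspective and mirror it horizontally so the
-    // king always ends up on the e- to h-files.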
-    inline int index(chess::Square piece_square,
-                     chess::Piece  piece,
-                     chess::Square king_square,
-                     chess::Color  view) {
-
-        const chess::PieceType piece_type  = chess::type_of(piece);
-        const chess::Color     piece_color = chess::color_of(piece);
-
-        piece_square ^= 56;
-        king_square ^= 56;
-
-        const int oP  = piece_type + 6 * (piece_color != view);
-        const int oK  = (7 * !(king_square & 4)) ^ (56 * view) ^ king_square;
-        const int oSq = (7 * !(king_square & 4)) ^ (56 * view) ^ piece_square;
-
-        return king_square_index(oK) * 12 * 64 + oP * 64 + oSq;
-    }
-
-    void setup_inputs_and_outputs(dataset::DataSet<chess::Position>* positions) {
-        in1->sparse_output.clear();
-        in2->sparse_output.clear();
-
-        auto& target = m_loss->target;
-
-#pragma omp parallel for schedule(static) num_threads(16)
-        for (int b = 0; b < positions->header.entry_count; b++) {
-            chess::Position* pos = &positions->positions[b];
-            // fill in the inputs and target values
-
-            chess::Square wKingSq = pos->get_king_square<chess::WHITE>();
-            chess::Square bKingSq = pos->get_king_square<chess::BLACK>();
-
-            chess::BB bb {pos->m_occupancy};
-            int       idx = 0;
-
-            while (bb) {
-                chess::Square sq = chess::lsb(bb);
-                chess::Piece  pc = pos->m_pieces.get_piece(idx);
-
-                auto piece_index_white_pov = index(sq, pc, wKingSq, chess::WHITE);
-                auto piece_index_black_pov = index(sq, pc, bKingSq, chess::BLACK);
-
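-                // in1 always receives the side-to-move perspective, in2 the opponent's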
-                if (pos->m_meta.stm() == chess::WHITE) {
-                    in1->sparse_output.set(b, piece_index_white_pov);
-                    in2->sparse_output.set(b, piece_index_black_pov);
-                } else {
-                    in2->sparse_output.set(b, piece_index_white_pov);
-                    in1->sparse_output.set(b, piece_index_black_pov);
-                }
-
-                bb = chess::lsb_reset(bb);
-                idx++;
-            }
-
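-            // build the target: squash the search score through the same
-            // sigmoid scale the network uses, then blend with the WDL result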
-            float p_value = pos->m_result.score;
-            float w_value = pos->m_result.wdl;
-
-            // flip if black is to move -> relative network style
-            if (pos->m_meta.stm() == chess::BLACK) {
-                p_value = -p_value;
-                w_value = -w_value;
-            }
-
-            float p_target = 1 / (1 + expf(-p_value * sigmoid_scale));
-            float w_target = (w_value + 1) / 2.0f;
-
-            target(b) = lambda * p_target + (1.0 - lambda) * w_target;
-
-            // layer_selector->dense_output.values(b, 0) =
-            //     (int) ((chess::popcount(pos->m_occupancy) - 1) / 4);
-        }
-    }
-};
+namespace fs = std::filesystem;
 
 int main(int argc, char* argv[]) {
     argparse::ArgumentParser program("Grapheus");
@@ -456,7 +101,7 @@ int main(int argc, char* argv[]) {
     dataset::BatchLoader<chess::Position> loader {files, batch_size};
     loader.start();
 
-    BerserkModel model {static_cast<size_t>(ft_size), lambda, static_cast<size_t>(save_rate)};
+    model::BerserkModel model {static_cast<size_t>(ft_size), lambda, static_cast<size_t>(save_rate)};
     model.set_loss(MPE {2.5, true});
     model.set_lr_schedule(StepDecayLRSchedule {lr, lr_drop_ratio, lr_drop_epoch});
 