Commit 27f7a29

Refactor stuff
1 parent 0bbba2e

7 files changed: +554, -371 lines


Makefile
Lines changed: 5 additions & 3 deletions
@@ -3,9 +3,11 @@

 # Compiler options
 NVCC = nvcc
-CXX = g++
-CXXFLAGS = -std=c++17 -fopenmp -stdlib=libc++
-NVCCFLAGS = -use_fast_math -O3 -DNDEBUG --compiler-options -std=c++17
+CXXFLAGS = -std=c++17 -fopenmp -march=native
+NVCCFLAGS = -use_fast_math -O3 -DNDEBUG
+
+# Combine CXXFLAGS into NVCCFLAGS
+NVCCFLAGS += $(addprefix --compiler-options ,$(CXXFLAGS))

 # Libraries
 LIBS = -lcublas
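
Note on the new last line of the hunk: GNU Make's addprefix prepends "--compiler-options " to every word of CXXFLAGS, so instead of hard-coding a single forwarded -std=c++17 as before, the combined variable expands to

NVCCFLAGS = -use_fast_math -O3 -DNDEBUG --compiler-options -std=c++17 --compiler-options -fopenmp --compiler-options -march=native

and any flag added to CXXFLAGS later is automatically forwarded to nvcc's host compiler as well.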

src/main.cu
Lines changed: 3 additions & 358 deletions

@@ -1,367 +1,12 @@
 #include "argparse.hpp"
-#include "chess/chess.h"
-#include "dataset/batchloader.h"
-#include "dataset/dataset.h"
-#include "dataset/io.h"
-#include "dataset/process.h"
-#include "misc/csv.h"
-#include "misc/timer.h"
-#include "nn/nn.h"
-#include "operations/operations.h"
+#include "models/berserk.h"

 #include <fstream>
 #include <limits>

-namespace fs = std::filesystem;
-
 using namespace nn;
 using namespace data;
-
-struct ChessModel : nn::Model {
-    float lambda;
-
-    ChessModel(float lambda_)
-        : lambda(lambda_) {}
-
-    // seting inputs
-    virtual void setup_inputs_and_outputs(dataset::DataSet<chess::Position>* positions) = 0;
-
-    // train function
-    void train(dataset::BatchLoader<chess::Position>& loader,
-               int epochs = 1500,
-               int epoch_size = 1e8) {
-        this->compile(loader.batch_size);
-
-        Timer t {};
-        for (int i = 1; i <= epochs; i++) {
-            t.start();
-
-            uint64_t prev_print_tm = 0;
-            float total_epoch_loss = 0;
-
-            for (int b = 1; b <= epoch_size / loader.batch_size; b++) {
-                auto* ds = loader.next();
-                setup_inputs_and_outputs(ds);
-
-                float batch_loss = batch();
-                total_epoch_loss += batch_loss;
-                float epoch_loss = total_epoch_loss / b;
-
-                t.stop();
-                uint64_t elapsed = t.elapsed();
-                if (elapsed - prev_print_tm > 1000 || b == epoch_size / loader.batch_size) {
-                    prev_print_tm = elapsed;
-
-                    printf("\rep = [%4d], epoch_loss = [%1.8f], batch = [%5d], batch_loss = [%1.8f], "
-                           "speed = [%7.2f it/s], time = [%3ds]",
-                           i,
-                           epoch_loss,
-                           b,
-                           batch_loss,
-                           1000.0f * b / elapsed,
-                           (int) (elapsed / 1000.0f));
-                    std::cout << std::flush;
-                }
-            }
-
-            std::cout << std::endl;
-
-            float epoch_loss = total_epoch_loss / (epoch_size / loader.batch_size);
-            next_epoch(epoch_loss, 0.0);
-        }
-    }
-
-    void test_fen(const std::string& fen) {
-        this->compile(1);
-
-        chess::Position pos = chess::parse_fen(fen);
-        dataset::DataSet<chess::Position> ds {};
-        ds.positions.push_back(pos);
-        ds.header.entry_count = 1;
-
-        // setup inputs of network
-        setup_inputs_and_outputs(&ds);
-
-        // forward pass
-        this->upload_inputs();
-        this->forward();
-
-        // go through the layers and download values
-
-        std::cout
-            << "==================================================================================\n";
-        std::cout << "testing fen: " << fen << std::endl;
-
-        int idx = 0;
-        for (auto layer : m_layers) {
-            layer->dense_output.values >> CPU;
-
-            std::cout << "LAYER " << ++idx << std::endl;
-            for (int i = 0; i < std::min((size_t) 16, layer->size); i++) {
-                std::cout << std::setw(10) << layer->dense_output.values(i, 0);
-            }
-            if (layer->size > 16) {
-                std::cout << " ......... " << layer->dense_output.values(layer->size - 1, 0);
-            }
-            std::cout << "\n";
-        }
-    }
-
-    void distribution(dataset::BatchLoader<chess::Position>& loader, int batches = 32) {
-        this->compile(loader.batch_size);
-
-        std::vector<DenseMatrix<float>> max_values {};
-        std::vector<DenseMatrix<float>> min_values {};
-        std::vector<std::pair<uint64_t, uint64_t>> sparsity {};
-
-        for (auto l : m_layers) {
-            max_values.emplace_back(l->dense_output.values.m, 1);
-            min_values.emplace_back(l->dense_output.values.m, 1);
-            max_values.back().malloc<data::CPU>();
-            min_values.back().malloc<data::CPU>();
-
-            math::fill<float>(max_values.back(), -std::numeric_limits<float>::max());
-            math::fill<float>(min_values.back(), std::numeric_limits<float>::max());
-
-            sparsity.push_back(std::pair(0, 0));
-        }
-
-        for (int b = 0; b < batches; b++) {
-            auto* ds = loader.next();
-            setup_inputs_and_outputs(ds);
-            this->upload_inputs();
-            this->forward();
-            std::cout << "\r" << b << " / " << batches << std::flush;
-
-            // get minimum and maximum values
-            for (int i = 0; i < m_layers.size(); i++) {
-                auto layer = m_layers[i].get();
-                layer->dense_output.values >> data::CPU;
-                for (int m = 0; m < layer->dense_output.values.m; m++) {
-                    for (int n = 0; n < layer->dense_output.values.n; n++) {
-                        max_values[i](m, 0) =
-                            std::max(max_values[i](m, 0), layer->dense_output.values(m, n));
-                        min_values[i](m, 0) =
-                            std::min(min_values[i](m, 0), layer->dense_output.values(m, n));
-
-                        sparsity[i].first++;
-                        sparsity[i].second += (layer->dense_output.values(m, n) > 0);
-                    }
-                }
-            }
-        }
-        std::cout << std::endl;
-
-        for (int i = 0; i < m_layers.size(); i++) {
-            std::cout << "------------ LAYER " << i + 1 << " --------------------" << std::endl;
-            std::cout << "min: ";
-            for (int j = 0; j < std::min((size_t) 16, min_values[i].size()); j++) {
-                std::cout << std::setw(10) << min_values[i](j);
-            }
-            if (min_values[i].size() > 16) {
-                std::cout << " ......... " << min_values[i](min_values.size() - 1);
-            }
-            std::cout << "\n";
-
-            std::cout << "max: ";
-            for (int j = 0; j < std::min((size_t) 16, max_values[i].size()); j++) {
-                std::cout << std::setw(10) << max_values[i](j);
-            }
-            if (max_values[i].size() > 16) {
-                std::cout << " ......... " << max_values[i](max_values.size() - 1);
-            }
-
-            std::cout << "\n";
-            float min = 10000000;
-            float max = -10000000;
-            for (int m = 0; m < min_values.size(); m++) {
-                min = std::min(min, min_values[i](m));
-                max = std::max(max, max_values[i](m));
-            }
-            std::cout << "output bounds: [" << min << " ; " << max << "]\n";
-
-            int died = 0;
-            for (int j = 0; j < max_values[i].size(); j++) {
-                if (std::abs(max_values[i](j) - min_values[i](j)) < 1e-8) {
-                    died++;
-                }
-            }
-
-            std::cout << "died: " << died << " / " << max_values[i].size();
-            std::cout << "\n";
-
-            float sparsity_total = sparsity[i].first;
-            float sparsity_active = sparsity[i].second;
-
-            std::cout << "sparsity: " << sparsity_active / sparsity_total;
-            std::cout << "\n";
-
-            for (auto p : m_layers[i]->params()) {
-                float min = 10000000;
-                float max = -10000000;
-                for (int m = 0; m < p->values.m; m++) {
-                    for (int n = 0; n < p->values.n; n++) {
-                        min = std::min(min, p->values(m, n));
-                        max = std::max(max, p->values(m, n));
-                    }
-                }
-
-                std::cout << "param bounds: [" << min << " ; " << max << "]\n";
-            }
-        }
-    }
-};
-
-struct BerserkModel : ChessModel {
-    SparseInput* in1;
-    SparseInput* in2;
-
-    const float sigmoid_scale = 1.0 / 160.0;
-    const float quant_one = 32.0;
-    const float quant_two = 32.0;
-
-    const size_t n_features = 16 * 12 * 64;
-    const size_t n_l1 = 16;
-    const size_t n_l2 = 32;
-    const size_t n_out = 1;
-
-    BerserkModel(size_t n_ft, float lambda, size_t save_rate)
-        : ChessModel(lambda) {
-
-        in1 = add<SparseInput>(n_features, 32);
-        in2 = add<SparseInput>(n_features, 32);
-
-        auto ft = add<FeatureTransformer>(in1, in2, n_ft);
-        auto fta = add<ClippedRelu>(ft);
-        ft->ft_regularization = 1.0 / 16384.0 / 4194304.0;
-        fta->max = 127.0;
-
-        auto l1 = add<Affine>(fta, n_l1);
-        auto l1a = add<ReLU>(l1);
-
-        auto l2 = add<Affine>(l1a, n_l2);
-        auto l2a = add<ReLU>(l2);
-
-        auto pos_eval = add<Affine>(l2a, n_out);
-        auto sigmoid = add<Sigmoid>(pos_eval, sigmoid_scale);
-
-        const float hidden_max = 127.0 / quant_two;
-        add_optimizer(AdamWarmup({{OptimizerEntry {&ft->weights}},
-                                  {OptimizerEntry {&ft->bias}},
-                                  {OptimizerEntry {&l1->weights}.clamp(-hidden_max, hidden_max)},
-                                  {OptimizerEntry {&l1->bias}},
-                                  {OptimizerEntry {&l2->weights}},
-                                  {OptimizerEntry {&l2->bias}},
-                                  {OptimizerEntry {&pos_eval->weights}},
-                                  {OptimizerEntry {&pos_eval->bias}}},
-                                 0.95,
-                                 0.999,
-                                 1e-8,
-                                 5 * 16384));
-
-        set_save_frequency(save_rate);
-        add_quantization(Quantizer {
-            "quant",
-            save_rate,
-            QuantizerEntry<int16_t>(&ft->weights.values, quant_one, true),
-            QuantizerEntry<int16_t>(&ft->bias.values, quant_one),
-            QuantizerEntry<int8_t>(&l1->weights.values, quant_two),
-            QuantizerEntry<int32_t>(&l1->bias.values, quant_two),
-            QuantizerEntry<float>(&l2->weights.values, 1.0),
-            QuantizerEntry<float>(&l2->bias.values, quant_two),
-            QuantizerEntry<float>(&pos_eval->weights.values, 1.0),
-            QuantizerEntry<float>(&pos_eval->bias.values, quant_two),
-        });
-    }
-
-    inline int king_square_index(int relative_king_square) {
-        constexpr int indices[64] {
-            -1, -1, -1, -1, 14, 14, 15, 15, //
-            -1, -1, -1, -1, 14, 14, 15, 15, //
-            -1, -1, -1, -1, 12, 12, 13, 13, //
-            -1, -1, -1, -1, 12, 12, 13, 13, //
-            -1, -1, -1, -1, 8, 9, 10, 11, //
-            -1, -1, -1, -1, 8, 9, 10, 11, //
-            -1, -1, -1, -1, 4, 5, 6, 7, //
-            -1, -1, -1, -1, 0, 1, 2, 3, //
-        };
-
-        return indices[relative_king_square];
-    }
-
-    inline int index(chess::Square piece_square,
-                     chess::Piece piece,
-                     chess::Square king_square,
-                     chess::Color view) {
-
-        const chess::PieceType piece_type = chess::type_of(piece);
-        const chess::Color piece_color = chess::color_of(piece);
-
-        piece_square ^= 56;
-        king_square ^= 56;
-
-        const int oP = piece_type + 6 * (piece_color != view);
-        const int oK = (7 * !(king_square & 4)) ^ (56 * view) ^ king_square;
-        const int oSq = (7 * !(king_square & 4)) ^ (56 * view) ^ piece_square;
-
-        return king_square_index(oK) * 12 * 64 + oP * 64 + oSq;
-    }
-
-    void setup_inputs_and_outputs(dataset::DataSet<chess::Position>* positions) {
-        in1->sparse_output.clear();
-        in2->sparse_output.clear();
-
-        auto& target = m_loss->target;
-
-#pragma omp parallel for schedule(static) num_threads(16)
-        for (int b = 0; b < positions->header.entry_count; b++) {
-            chess::Position* pos = &positions->positions[b];
-            // fill in the inputs and target values
-
-            chess::Square wKingSq = pos->get_king_square<chess::WHITE>();
-            chess::Square bKingSq = pos->get_king_square<chess::BLACK>();
-
-            chess::BB bb {pos->m_occupancy};
-            int idx = 0;
-
-            while (bb) {
-                chess::Square sq = chess::lsb(bb);
-                chess::Piece pc = pos->m_pieces.get_piece(idx);
-
-                auto piece_index_white_pov = index(sq, pc, wKingSq, chess::WHITE);
-                auto piece_index_black_pov = index(sq, pc, bKingSq, chess::BLACK);
-
-                if (pos->m_meta.stm() == chess::WHITE) {
-                    in1->sparse_output.set(b, piece_index_white_pov);
-                    in2->sparse_output.set(b, piece_index_black_pov);
-                } else {
-                    in2->sparse_output.set(b, piece_index_white_pov);
-                    in1->sparse_output.set(b, piece_index_black_pov);
-                }
-
-                bb = chess::lsb_reset(bb);
-                idx++;
-            }
-
-            float p_value = pos->m_result.score;
-            float w_value = pos->m_result.wdl;
-
-            // flip if black is to move -> relative network style
-            if (pos->m_meta.stm() == chess::BLACK) {
-                p_value = -p_value;
-                w_value = -w_value;
-            }
-
-            float p_target = 1 / (1 + expf(-p_value * sigmoid_scale));
-            float w_target = (w_value + 1) / 2.0f;
-
-            target(b) = lambda * p_target + (1.0 - lambda) * w_target;
-
-            // layer_selector->dense_output.values(b, 0) =
-            //     (int) ((chess::popcount(pos->m_occupancy) - 1) / 4);
-        }
-    }
-};
+namespace fs = std::filesystem;

 int main(int argc, char* argv[]) {
     argparse::ArgumentParser program("Grapheus");
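
The removed king_square_index() and index() (relocated to models/berserk.h by this commit) implement a mirrored 16-bucket king-relative input scheme: each feature encodes a (king bucket, piece, square) triple, giving 16 * 12 * 64 = 12288 features per perspective. Below is a minimal stand-alone sketch of that mapping with the chess::* types replaced by plain ints (squares a1 = 0 .. h8 = 63, white = 0, black = 1) and piece types assumed to run pawn = 0 .. king = 5; feature_index is a hypothetical name for this illustration, not the library API.

#include <cstdio>

// King buckets from the removed code. The -1 entries (files a-d) are
// unreachable: the mirror term below always maps the king onto files e-h.
constexpr int KING_INDICES[64] {
    -1, -1, -1, -1, 14, 14, 15, 15,
    -1, -1, -1, -1, 14, 14, 15, 15,
    -1, -1, -1, -1, 12, 12, 13, 13,
    -1, -1, -1, -1, 12, 12, 13, 13,
    -1, -1, -1, -1,  8,  9, 10, 11,
    -1, -1, -1, -1,  8,  9, 10, 11,
    -1, -1, -1, -1,  4,  5,  6,  7,
    -1, -1, -1, -1,  0,  1,  2,  3,
};

int feature_index(int piece_square, int piece_type, int piece_color,
                  int king_square, int view) {
    piece_square ^= 56;    // rank flip, as in the removed index()
    king_square  ^= 56;

    const int oP  = piece_type + 6 * (piece_color != view);           // 0..11
    // (7 * !(king_square & 4)) mirrors kings on files a-d onto e-h;
    // (56 * view) flips ranks for the black perspective.
    const int oK  = (7 * !(king_square & 4)) ^ (56 * view) ^ king_square;
    const int oSq = (7 * !(king_square & 4)) ^ (56 * view) ^ piece_square;

    return KING_INDICES[oK] * 12 * 64 + oP * 64 + oSq;                // 0..12287
}

int main() {
    // White pawn on e2 (square 12), white king on g1 (square 6), white view:
    // the king maps to bucket 2, so the feature is 2 * 768 + 0 * 64 + 52 = 1588.
    std::printf("%d\n", feature_index(12, 0, 0, 6, 0));
    return 0;
}
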
@@ -456,7 +101,7 @@ int main(int argc, char* argv[]) {
     dataset::BatchLoader<chess::Position> loader {files, batch_size};
     loader.start();

-    BerserkModel model {static_cast<size_t>(ft_size), lambda, static_cast<size_t>(save_rate)};
+    model::BerserkModel model {static_cast<size_t>(ft_size), lambda, static_cast<size_t>(save_rate)};
     model.set_loss(MPE {2.5, true});
     model.set_lr_schedule(StepDecayLRSchedule {lr, lr_drop_ratio, lr_drop_epoch});
