@@ -217,8 +217,7 @@ struct BerserkModel : ChessModel {
217217 SparseInput* in2;
218218
219219 const float sigmoid_scale = 1.0 / 160.0 ;
220- const float quant_one = 32.0 ;
221- const float quant_two = 32.0 ;
220+ const float quant = 32.0 ;
222221
223222 const size_t n_features = 16 * 12 * 64 ;
224223 const size_t n_l1 = 16 ;
@@ -236,24 +235,26 @@ struct BerserkModel : ChessModel {
236235 ft->ft_regularization = 1.0 / 16384.0 / 4194304.0 ;
237236 fta->max = 127.0 ;
238237
239- auto l1 = add<Affine>(fta, n_l1);
240- auto l1a = add<ReLU>(l1);
238+ auto l1 = add<Affine>(fta, n_l1);
239+ auto l1a = add<ClippedRelu>(l1);
240+ l1a->max = 127.0 ;
241241
242- auto l2 = add<Affine>(l1a, n_l2);
243- auto l2a = add<ReLU>(l2);
242+ auto l2 = add<Affine>(l1a, n_l2);
243+ auto l2a = add<ClippedRelu>(l2);
244+ l2a->max = 127.0 ;
244245
245- auto pos_eval = add<Affine>(l2a, n_out);
246- auto sigmoid = add<Sigmoid>(pos_eval , sigmoid_scale);
246+ auto cp_eval = add<Affine>(l2a, n_out);
247+ auto sigmoid = add<Sigmoid>(cp_eval , sigmoid_scale);
247248
248- const float hidden_max = 127.0 / quant_two ;
249+ const float hidden_max = 127.0 / quant ;
249250 add_optimizer (AdamWarmup ({{OptimizerEntry {&ft->weights }},
250251 {OptimizerEntry {&ft->bias }},
251252 {OptimizerEntry {&l1->weights }.clamp (-hidden_max, hidden_max)},
252253 {OptimizerEntry {&l1->bias }},
253- {OptimizerEntry {&l2->weights }},
254+ {OptimizerEntry {&l2->weights }. clamp (-hidden_max, hidden_max) },
254255 {OptimizerEntry {&l2->bias }},
255- {OptimizerEntry {&pos_eval ->weights }},
256- {OptimizerEntry {&pos_eval ->bias }}},
256+ {OptimizerEntry {&cp_eval ->weights }. clamp (-hidden_max, hidden_max) },
257+ {OptimizerEntry {&cp_eval ->bias }}},
257258 0.95 ,
258259 0.999 ,
259260 1e-8 ,
@@ -263,14 +264,14 @@ struct BerserkModel : ChessModel {
263264 add_quantization (Quantizer {
264265 " quant" ,
265266 save_rate,
266- QuantizerEntry<int16_t >(&ft->weights .values , quant_one , true ),
267- QuantizerEntry<int16_t >(&ft->bias .values , quant_one ),
268- QuantizerEntry<int8_t >(&l1->weights .values , quant_two ),
269- QuantizerEntry<int32_t >(&l1->bias .values , quant_two ),
270- QuantizerEntry<float >(&l2->weights .values , 1.0 ),
271- QuantizerEntry<float >(&l2->bias .values , quant_two ),
272- QuantizerEntry<float >(&pos_eval ->weights .values , 1.0 ),
273- QuantizerEntry<float >(&pos_eval ->bias .values , quant_two ),
267+ QuantizerEntry<int16_t >(&ft->weights .values , quant , true ),
268+ QuantizerEntry<int16_t >(&ft->bias .values , quant ),
269+ QuantizerEntry<int8_t >(&l1->weights .values , quant ),
270+ QuantizerEntry<int32_t >(&l1->bias .values , quant ),
271+ QuantizerEntry<int8_t >(&l2->weights .values , quant ),
272+ QuantizerEntry<int32_t >(&l2->bias .values , quant ),
273+ QuantizerEntry<int8_t >(&cp_eval ->weights .values , quant ),
274+ QuantizerEntry<int32_t >(&cp_eval ->bias .values , quant ),
274275 });
275276 }
276277
@@ -313,7 +314,7 @@ struct BerserkModel : ChessModel {
313314
314315 auto & target = m_loss->target ;
315316
316- #pragma omp parallel for schedule(static) num_threads(8 )
317+ #pragma omp parallel for schedule(static) num_threads(16 )
317318 for (int b = 0 ; b < positions->header .entry_count ; b++) {
318319 chess::Position* pos = &positions->positions [b];
319320 // fill in the inputs and target values
@@ -356,9 +357,6 @@ struct BerserkModel : ChessModel {
356357 float w_target = (w_value + 1 ) / 2 .0f ;
357358
358359 target (b) = lambda * p_target + (1.0 - lambda) * w_target;
359-
360- // layer_selector->dense_output.values(b, 0) =
361- // (int) ((chess::popcount(pos->m_occupancy) - 1) / 4);
362360 }
363361 }
364362};
0 commit comments