@@ -217,7 +217,8 @@ struct BerserkModel : ChessModel {
217217 SparseInput* in2;
218218
219219 const float sigmoid_scale = 1.0 / 160.0 ;
220- const float quant = 32.0 ;
220+ const float quant_one = 32.0 ;
221+ const float quant_two = 32.0 ;
221222
222223 const size_t n_features = 16 * 12 * 64 ;
223224 const size_t n_l1 = 16 ;
@@ -235,26 +236,24 @@ struct BerserkModel : ChessModel {
235236 ft->ft_regularization = 1.0 / 16384.0 / 4194304.0 ;
236237 fta->max = 127.0 ;
237238
238- auto l1 = add<Affine>(fta, n_l1);
239- auto l1a = add<ClippedRelu>(l1);
240- l1a->max = 127.0 ;
239+ auto l1 = add<Affine>(fta, n_l1);
240+ auto l1a = add<ReLU>(l1);
241241
242- auto l2 = add<Affine>(l1a, n_l2);
243- auto l2a = add<ClippedRelu>(l2);
244- l2a->max = 127.0 ;
242+ auto l2 = add<Affine>(l1a, n_l2);
243+ auto l2a = add<ReLU>(l2);
245244
246- auto cp_eval = add<Affine>(l2a, n_out);
247- auto sigmoid = add<Sigmoid>(cp_eval , sigmoid_scale);
245+ auto pos_eval = add<Affine>(l2a, n_out);
246+ auto sigmoid = add<Sigmoid>(pos_eval , sigmoid_scale);
248247
249- const float hidden_max = 127.0 / quant ;
248+ const float hidden_max = 127.0 / quant_two ;
250249 add_optimizer (AdamWarmup ({{OptimizerEntry {&ft->weights }},
251250 {OptimizerEntry {&ft->bias }},
252251 {OptimizerEntry {&l1->weights }.clamp (-hidden_max, hidden_max)},
253252 {OptimizerEntry {&l1->bias }},
254- {OptimizerEntry {&l2->weights }. clamp (-hidden_max, hidden_max) },
253+ {OptimizerEntry {&l2->weights }},
255254 {OptimizerEntry {&l2->bias }},
256- {OptimizerEntry {&cp_eval ->weights }. clamp (-hidden_max, hidden_max) },
257- {OptimizerEntry {&cp_eval ->bias }}},
255+ {OptimizerEntry {&pos_eval ->weights }},
256+ {OptimizerEntry {&pos_eval ->bias }}},
258257 0.95 ,
259258 0.999 ,
260259 1e-8 ,
@@ -264,14 +263,14 @@ struct BerserkModel : ChessModel {
264263 add_quantization (Quantizer {
265264 " quant" ,
266265 save_rate,
267- QuantizerEntry<int16_t >(&ft->weights .values , quant , true ),
268- QuantizerEntry<int16_t >(&ft->bias .values , quant ),
269- QuantizerEntry<int8_t >(&l1->weights .values , quant ),
270- QuantizerEntry<int32_t >(&l1->bias .values , quant ),
271- QuantizerEntry<int8_t >(&l2->weights .values , quant ),
272- QuantizerEntry<int32_t >(&l2->bias .values , quant ),
273- QuantizerEntry<int8_t >(&cp_eval ->weights .values , quant ),
274- QuantizerEntry<int32_t >(&cp_eval ->bias .values , quant ),
266+ QuantizerEntry<int16_t >(&ft->weights .values , quant_one , true ),
267+ QuantizerEntry<int16_t >(&ft->bias .values , quant_one ),
268+ QuantizerEntry<int8_t >(&l1->weights .values , quant_two ),
269+ QuantizerEntry<int32_t >(&l1->bias .values , quant_two ),
270+ QuantizerEntry<float >(&l2->weights .values , 1.0 ),
271+ QuantizerEntry<float >(&l2->bias .values , quant_two ),
272+ QuantizerEntry<float >(&pos_eval ->weights .values , 1.0 ),
273+ QuantizerEntry<float >(&pos_eval ->bias .values , quant_two ),
275274 });
276275 }
277276
@@ -314,7 +313,7 @@ struct BerserkModel : ChessModel {
314313
315314 auto & target = m_loss->target ;
316315
317- #pragma omp parallel for schedule(static) num_threads(16 )
316+ #pragma omp parallel for schedule(static) num_threads(8 )
318317 for (int b = 0 ; b < positions->header .entry_count ; b++) {
319318 chess::Position* pos = &positions->positions [b];
320319 // fill in the inputs and target values
@@ -357,6 +356,9 @@ struct BerserkModel : ChessModel {
357356 float w_target = (w_value + 1 ) / 2 .0f ;
358357
359358 target (b) = lambda * p_target + (1.0 - lambda) * w_target;
359+
360+ // layer_selector->dense_output.values(b, 0) =
361+ // (int) ((chess::popcount(pos->m_occupancy) - 1) / 4);
360362 }
361363 }
362364};
0 commit comments