official-stockfish · vondele · Aug 15, 2021 · Jun 14, 2021 · Jun 30, 2021 · Aug 9, 2021
diff --git a/README.md b/README.md
@@ -54,11 +54,11 @@ python train.py --resume_from_checkpoint <path> ...
 python train.py --gpus 1 ...
 ```
 ## Feature set selection
-By default the trainer uses a factorized HalfKAv2 feature set (named "HalfKAv2^")
+By default the trainer uses a factorized HalfKAv2_hm feature set (named "HalfKAv2_hm^").
 If you wish to change the feature set used then you can use the `--features=NAME` option. For the list of available features see `--help`
 The default is:
 ```
-python train.py ... --features="HalfKAv2^"
+python train.py ... --features="HalfKAv2_hm^"
 ```
 
 ## Skipping certain fens in the training
@@ -69,7 +69,7 @@ python train.py ... --features="HalfKAv2^"
 ## Current recommended training invocation
 
 ```
-python train.py --smart-fen-skipping --random-fen-skipping 3 --batch-size 16384 --threads 8 --num-workers 8 --gpus 1 trainingdata validationdata
+python train.py --smart-fen-skipping --random-fen-skipping 3 --batch-size 16384 --threads 2 --num-workers 2 --gpus 1 trainingdata validationdata
 ```
 best nets have been trained with 16B d9-scored nets, training runs >200 epochs
 
@@ -96,13 +96,13 @@ python serialize.py nn.nnue converted.pt
 Visualize a network from either a checkpoint (`.ckpt`), a serialized model (`.pt`)
 or a SF NNUE file (`.nnue`).
 ```
-python visualize.py nn.nnue --features="HalfKAv2"
+python visualize.py nn.nnue --features="HalfKAv2_hm"
 ```
 
 Visualize the difference between two networks from either a checkpoint (`.ckpt`), a serialized model (`.pt`)
 or a SF NNUE file (`.nnue`).
 ```
-python visualize.py nn.nnue  --features="HalfKAv2" --ref-model nn.cpkt --ref-features="HalfKAv2^"
+python visualize.py nn.nnue  --features="HalfKAv2_hm" --ref-model nn.ckpt --ref-features="HalfKAv2_hm^"
 ```
 
 # Logging

diff --git a/features.py b/features.py
@@ -12,8 +12,9 @@
 import halfkp
 import halfka
 import halfka_v2
+import halfka_v2_hm
 
-_feature_modules = [halfkp, halfka, halfka_v2]
+_feature_modules = [halfkp, halfka, halfka_v2, halfka_v2_hm]
 
 _feature_blocks_by_name = dict()
 
@@ -41,7 +42,7 @@ def get_available_feature_blocks_names():
     return list(iter(_feature_blocks_by_name))
 
 def add_argparse_args(parser):
-    _default_feature_set_name = 'HalfKAv2^'
+    _default_feature_set_name = 'HalfKAv2_hm^'
     parser.add_argument("--features", dest='features', default=_default_feature_set_name, help="The feature set to use. Can be a union of feature blocks (for example P+HalfKP). \"^\" denotes a factorized block. Currently available feature blocks are: " + ', '.join(get_available_feature_blocks_names()))
 
 def _init():

diff --git a/halfka_v2_hm.py b/halfka_v2_hm.py
@@ -0,0 +1,95 @@
+import chess
+import torch
+import feature_block
+from collections import OrderedDict
+from feature_block import *
+
+NUM_SQ = 64
+NUM_PT_REAL = 11
+NUM_PT_VIRTUAL = 12
+NUM_PLANES_REAL = NUM_SQ * NUM_PT_REAL
+NUM_PLANES_VIRTUAL = NUM_SQ * NUM_PT_VIRTUAL
+NUM_INPUTS = NUM_PLANES_REAL * NUM_SQ // 2
+
+KingBuckets = [
+  -1, -1, -1, -1, 31, 30, 29, 28,
+  -1, -1, -1, -1, 27, 26, 25, 24,
+  -1, -1, -1, -1, 23, 22, 21, 20,
+  -1, -1, -1, -1, 19, 18, 17, 16,
+  -1, -1, -1, -1, 15, 14, 13, 12,
+  -1, -1, -1, -1, 11, 10, 9, 8,
+  -1, -1, -1, -1, 7, 6, 5, 4,
+  -1, -1, -1, -1, 3, 2, 1, 0
+]
+
+def orient(is_white_pov: bool, sq: int, ksq: int):
+  # ksq must not be oriented
+  kfile = (ksq % 8)
+  return (7 * (kfile < 4)) ^ (56 * (not is_white_pov)) ^ sq
+
+def halfka_idx(is_white_pov: bool, king_sq: int, sq: int, p: chess.Piece):
+  p_idx = (p.piece_type - 1) * 2 + (p.color != is_white_pov)
+  o_ksq = orient(is_white_pov, king_sq, king_sq)
+  if p_idx == 11:
+    p_idx -= 1
+  return orient(is_white_pov, sq, king_sq) + p_idx * NUM_SQ + KingBuckets[o_ksq] * NUM_PLANES_REAL
+
+def halfka_psqts():
+  # values copied from stockfish, in stockfish internal units
+  piece_values = {
+    chess.PAWN : 126,
+    chess.KNIGHT : 781,
+    chess.BISHOP : 825,
+    chess.ROOK : 1276,
+    chess.QUEEN : 2538
+  }
+
+  values = [0] * NUM_INPUTS
+
+  for ksq in range(64):
+    for s in range(64):
+      for pt, val in piece_values.items():
+        idxw = halfka_idx(True, ksq, s, chess.Piece(pt, chess.WHITE))
+        idxb = halfka_idx(True, ksq, s, chess.Piece(pt, chess.BLACK))
+        values[idxw] = val
+        values[idxb] = -val
+
+  return values
+
+class Features(FeatureBlock):
+  def __init__(self):
+    super(Features, self).__init__('HalfKAv2_hm', 0x7f234cb8, OrderedDict([('HalfKAv2_hm', NUM_INPUTS)]))
+
+  def get_active_features(self, board: chess.Board):
+    raise Exception('Not supported yet, you must use the c++ data loader for support during training')
+
+  def get_initial_psqt_features(self):
+    return halfka_psqts()
+
+class FactorizedFeatures(FeatureBlock):
+  def __init__(self):
+    super(FactorizedFeatures, self).__init__('HalfKAv2_hm^', 0x7f234cb8, OrderedDict([('HalfKAv2_hm', NUM_INPUTS), ('A', NUM_PLANES_VIRTUAL)]))
+
+  def get_active_features(self, board: chess.Board):
+    raise Exception('Not supported yet, you must use the c++ data loader for factorizer support during training')
+
+  def get_feature_factors(self, idx):
+    if idx >= self.num_real_features:
+      raise Exception('Feature must be real')
+
+    a_idx = idx % NUM_PLANES_REAL
+    k_idx = idx // NUM_PLANES_REAL
+
+    if a_idx // NUM_SQ == 10 and k_idx != KingBuckets[a_idx % NUM_SQ]:
+      a_idx += NUM_SQ
+
+    return [idx, self.get_factor_base_feature('A') + a_idx]
+
+  def get_initial_psqt_features(self):
+    return halfka_psqts() + [0] * NUM_PLANES_VIRTUAL
+
+'''
+This is used by the features module for discovery of feature blocks.
+'''
+def get_feature_block_clss():
+  return [Features, FactorizedFeatures]
diff --git a/model.py b/model.py
@@ -7,8 +7,8 @@
 from feature_transformer import DoubleFeatureTransformerSlice
 
 # 3 layer fully connected network
-L1 = 512
-L2 = 16
+L1 = 1024
+L2 = 8
 L3 = 32
 
 def coalesce_ft_weights(model, layer):
@@ -271,9 +271,9 @@ def step_(self, batch, batch_idx, loss_type):
     t = outcome
     p = (score / in_scaling).sigmoid()
 
-    loss_eval = (p - q).square().mean()
-    loss_result = (q - t).square().mean()
-    loss = self.lambda_ * loss_eval + (1.0 - self.lambda_) * loss_result
+    pt = p * self.lambda_ + t * (1.0 - self.lambda_)
+
+    loss = torch.pow(torch.abs(pt - q), 2.6).mean()
 
     self.log(loss_type, loss)
 
@@ -295,19 +295,19 @@ def test_step(self, batch, batch_idx):
 
   def configure_optimizers(self):
     # Train with a lower LR on the output layer
-    LR = 1.5e-3
+    LR = 8.75e-4
     train_params = [
       {'params' : get_parameters([self.input]), 'lr' : LR, 'gc_dim' : 0 },
       {'params' : [self.layer_stacks.l1_fact.weight], 'lr' : LR },
       {'params' : [self.layer_stacks.l1.weight], 'lr' : LR },
       {'params' : [self.layer_stacks.l1.bias], 'lr' : LR },
       {'params' : [self.layer_stacks.l2.weight], 'lr' : LR },
       {'params' : [self.layer_stacks.l2.bias], 'lr' : LR },
-      {'params' : [self.layer_stacks.output.weight], 'lr' : LR / 10 },
-      {'params' : [self.layer_stacks.output.bias], 'lr' : LR / 10 },
+      {'params' : [self.layer_stacks.output.weight], 'lr' : LR },
+      {'params' : [self.layer_stacks.output.bias], 'lr' : LR },
     ]
     # increasing the eps leads to less saturated nets with a few dead neurons
     optimizer = ranger.Ranger(train_params, betas=(.9, 0.999), eps=1.0e-7, gc_loc=False, use_gc=False)
     # Drop learning rate after 75 epochs
-    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.987)
+    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.992)
     return [optimizer], [scheduler]
diff --git a/training_data_loader.cpp b/training_data_loader.cpp
@@ -258,6 +258,93 @@ struct HalfKAv2Factorized {
     }
 };
 
+// ksq must not be oriented
+static Square orient_flip_2(Color color, Square sq, Square ksq)
+{
+    bool h = ksq.file() < fileE;
+    if (color == Color::Black)
+        sq = sq.flippedVertically();
+    if (h)
+        sq = sq.flippedHorizontally();
+    return sq;
+}
+
+struct HalfKAv2_hm {
+    static constexpr int NUM_SQ = 64;
+    static constexpr int NUM_PT = 11;
+    static constexpr int NUM_PLANES = NUM_SQ * NUM_PT;
+    static constexpr int INPUTS = NUM_PLANES * NUM_SQ / 2;
+
+    static constexpr int MAX_ACTIVE_FEATURES = 32;
+
+    static constexpr int KingBuckets[64] = {
+      -1, -1, -1, -1, 31, 30, 29, 28,
+      -1, -1, -1, -1, 27, 26, 25, 24,
+      -1, -1, -1, -1, 23, 22, 21, 20,
+      -1, -1, -1, -1, 19, 18, 17, 16,
+      -1, -1, -1, -1, 15, 14, 13, 12,
+      -1, -1, -1, -1, 11, 10, 9, 8,
+      -1, -1, -1, -1, 7, 6, 5, 4,
+      -1, -1, -1, -1, 3, 2, 1, 0
+    };
+
+    static int feature_index(Color color, Square ksq, Square sq, Piece p)
+    {
+        Square o_ksq = orient_flip_2(color, ksq, ksq);
+        auto p_idx = static_cast<int>(p.type()) * 2 + (p.color() != color);
+        if (p_idx == 11)
+            --p_idx; // pack the opposite king into the same NUM_SQ * NUM_SQ
+        return static_cast<int>(orient_flip_2(color, sq, ksq)) + p_idx * NUM_SQ + KingBuckets[static_cast<int>(o_ksq)] * NUM_PLANES;
+    }
+
+    static std::pair<int, int> fill_features_sparse(const TrainingDataEntry& e, int* features, float* values, Color color)
+    {
+        auto& pos = e.pos;
+        auto pieces = pos.piecesBB();
+        auto ksq = pos.kingSquare(color);
+
+        int j = 0;
+        for(Square sq : pieces)
+        {
+            auto p = pos.pieceAt(sq);
+            values[j] = 1.0f;
+            features[j] = feature_index(color, ksq, sq, p);
+            ++j;
+        }
+
+        return { j, INPUTS };
+    }
+};
+
+struct HalfKAv2_hmFactorized {
+    // Factorized features
+    static constexpr int PIECE_INPUTS = HalfKAv2_hm::NUM_SQ * HalfKAv2_hm::NUM_PT;
+    static constexpr int INPUTS = HalfKAv2_hm::INPUTS + PIECE_INPUTS;
+
+    static constexpr int MAX_PIECE_FEATURES = 32;
+    static constexpr int MAX_ACTIVE_FEATURES = HalfKAv2_hm::MAX_ACTIVE_FEATURES + MAX_PIECE_FEATURES;
+
+    static std::pair<int, int> fill_features_sparse(const TrainingDataEntry& e, int* features, float* values, Color color)
+    {
+        const auto [start_j, offset] = HalfKAv2_hm::fill_features_sparse(e, features, values, color);
+        auto& pos = e.pos;
+        auto pieces = pos.piecesBB();
+        auto ksq = pos.kingSquare(color);
+
+        int j = start_j;
+        for(Square sq : pieces)
+        {
+            auto p = pos.pieceAt(sq);
+            auto p_idx = static_cast<int>(p.type()) * 2 + (p.color() != color);
+            values[j] = 1.0f;
+            features[j] = offset + (p_idx * HalfKAv2_hm::NUM_SQ) + static_cast<int>(orient_flip_2(color, sq, ksq));
+            ++j;
+        }
+
+        return { j, INPUTS };
+    }
+};
+
 template <typename T, typename... Ts>
 struct FeatureSet
 {
@@ -797,6 +884,14 @@ extern "C" {
         {
             return new SparseBatch(FeatureSet<HalfKAv2Factorized>{}, entries);
         }
+        else if (feature_set == "HalfKAv2_hm")
+        {
+            return new SparseBatch(FeatureSet<HalfKAv2_hm>{}, entries);
+        }
+        else if (feature_set == "HalfKAv2_hm^")
+        {
+            return new SparseBatch(FeatureSet<HalfKAv2_hmFactorized>{}, entries);
+        }
         fprintf(stderr, "Unknown feature_set %s\n", feature_set_c);
         return nullptr;
     }
@@ -842,6 +937,14 @@ extern "C" {
         {
             return new FeaturedBatchStream<FeatureSet<HalfKAv2Factorized>, SparseBatch>(concurrency, filename, batch_size, cyclic, skipPredicate);
         }
+        else if (feature_set == "HalfKAv2_hm")
+        {
+            return new FeaturedBatchStream<FeatureSet<HalfKAv2_hm>, SparseBatch>(concurrency, filename, batch_size, cyclic, skipPredicate);
+        }
+        else if (feature_set == "HalfKAv2_hm^")
+        {
+            return new FeaturedBatchStream<FeatureSet<HalfKAv2_hmFactorized>, SparseBatch>(concurrency, filename, batch_size, cyclic, skipPredicate);
+        }
         fprintf(stderr, "Unknown feature_set %s\n", feature_set_c);
         return nullptr;
     }