 import lightning as L
 from dataclasses import dataclass
 from features.feature_set import FeatureSet
-from typing import List, Tuple

 # 3 layer fully connected network
 L1 = 3072
@@ -43,10 +42,6 @@ def __init__(self, count: int):
         self.l2 = nn.Linear(L2 * 2, L3 * count)
         self.output = nn.Linear(L3, 1 * count)

-        # Cached helper tensor for choosing outputs by bucket indices.
-        # Initialized lazily in forward.
-        self.idx_offset = None
-
         self._init_layers()

     def _init_layers(self):
@@ -83,9 +78,14 @@ def _init_layers(self):
         self.output.bias = nn.Parameter(output_bias)

     def forward(self, x: Tensor, ls_indices: Tensor):
-        assert self.idx_offset is not None and self.idx_offset.shape[0] == x.shape[0]
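+        # Per-sample offsets into the flattened (batch * count) bucket
+        # dimension: sample i's chosen stack lives at i * count + ls_indices[i].
+        # Built on the fly on x's device instead of being cached on the module.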
+        idx_offset = torch.arange(
+            0,
+            x.shape[0] * self.count,
+            self.count,
+            device=x.device,
+        )

-        indices = ls_indices.flatten() + self.idx_offset
+        indices = ls_indices.flatten() + idx_offset

         l1s_ = self.l1(x).reshape((-1, self.count, L2 + 1))
         l1f_ = self.l1_fact(x)
@@ -135,45 +135,22 @@ def get_coalesced_layer_stacks(self):
                 yield l1, l2, output


-class NNUE(L.LightningModule):
-    """
-    feature_set - an instance of FeatureSet defining the input features
-
-    lambda_ = 0.0 - purely based on game results
-    0.0 < lambda_ < 1.0 - interpolated score and result
-    lambda_ = 1.0 - purely based on search scores
-
-    gamma - the multiplicative factor applied to the learning rate after each epoch
-
-    lr - the initial learning rate
-    """
-
+class NNUEModel(nn.Module):
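+    # The network itself as a plain nn.Module; training concerns (loss,
+    # optimizer, LR schedule) move to the NNUE LightningModule below.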
     def __init__(
         self,
         feature_set: FeatureSet,
-        max_epoch=800,
-        num_batches_per_epoch=int(100_000_000 / 16384),
-        gamma=0.992,
-        lr=8.75e-4,
-        param_index=0,
-        num_psqt_buckets=8,
-        num_ls_buckets=8,
-        loss_params=LossParams(),
+        num_psqt_buckets: int = 8,
+        num_ls_buckets: int = 8,
     ):
-        super(NNUE, self).__init__()
+        super().__init__()
         self.num_psqt_buckets = num_psqt_buckets
         self.num_ls_buckets = num_ls_buckets
+
         self.input = DoubleFeatureTransformerSlice(
             feature_set.num_features, L1 + self.num_psqt_buckets
         )
         self.feature_set = feature_set
         self.layer_stacks = LayerStacks(self.num_ls_buckets)
-        self.loss_params = loss_params
-        self.max_epoch = max_epoch
-        self.num_batches_per_epoch = num_batches_per_epoch
-        self.gamma = gamma
-        self.lr = lr
-        self.param_index = param_index

         self.nnue2score = 600.0
         self.weight_scale_hidden = 64.0
@@ -205,22 +182,21 @@ def __init__(

         self._init_layers()

208- """
209- We zero all virtual feature weights because there's not need for them
210- to be initialized; they only aid the training of correlated features.
211- """
185+ def _init_layers (self ):
186+ self ._zero_virtual_feature_weights ()
187+ self ._init_psqt ()

     def _zero_virtual_feature_weights(self):
+        """
+        We zero all virtual feature weights because there's no need for them
+        to be initialized; they only aid the training of correlated features.
+        """
         weights = self.input.weight
         with torch.no_grad():
             for a, b in self.feature_set.get_virtual_feature_ranges():
                 weights[a:b, :] = 0.0
         self.input.weight = nn.Parameter(weights)

-    def _init_layers(self):
-        self._zero_virtual_feature_weights()
-        self._init_psqt()
-
     def _init_psqt(self):
         input_weights = self.input.weight
         input_bias = self.input.bias
@@ -251,12 +227,11 @@ def _init_psqt(self):
         self.input.weight = nn.Parameter(input_weights)
         self.input.bias = nn.Parameter(input_bias)

-    """
-    Clips the weights of the model based on the min/max values allowed
-    by the quantization scheme.
-    """
-
     def _clip_weights(self):
+        """
+        Clips the weights of the model based on the min/max values allowed
+        by the quantization scheme.
+        """
         for group in self.weight_clipping:
             for p in group["params"]:
                 if "min_weight" in group or "max_weight" in group:
@@ -287,12 +262,11 @@ def _clip_weights(self):
                     raise Exception("Not supported.")
                 p.data.copy_(p_data_fp32)

-    """
-    This method attempts to convert the model from using the self.feature_set
-    to new_feature_set. Currently only works for adding virtual features.
-    """
-
     def set_feature_set(self, new_feature_set: FeatureSet):
+        """
+        This method attempts to convert the model from using self.feature_set
+        to new_feature_set. Currently only works for adding virtual features.
+        """
         if self.feature_set.name == new_feature_set.name:
             return

@@ -370,13 +344,51 @@ def forward(

         return x

-    def step_(self, batch: Tuple[Tensor, ...], batch_idx, loss_type):
+
+class NNUE(L.LightningModule):
+    """
+    feature_set - an instance of FeatureSet defining the input features
+
+    lambda_ = 0.0 - purely based on game results
+    0.0 < lambda_ < 1.0 - interpolated score and result
+    lambda_ = 1.0 - purely based on search scores
+
+    gamma - the multiplicative factor applied to the learning rate after each epoch
+
+    lr - the initial learning rate
+    """
+
+    def __init__(
+        self,
+        feature_set: FeatureSet,
+        max_epoch=800,
+        num_batches_per_epoch=int(100_000_000 / 16384),
+        gamma=0.992,
+        lr=8.75e-4,
+        param_index=0,
+        num_psqt_buckets=8,
+        num_ls_buckets=8,
+        loss_params=LossParams(),
+    ):
+        super().__init__()
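+        # The wrapper owns the network as a submodule and keeps only
+        # training configuration on the LightningModule itself.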
+        self.model: NNUEModel = NNUEModel(feature_set, num_psqt_buckets, num_ls_buckets)
+        self.loss_params = loss_params
+        self.max_epoch = max_epoch
+        self.num_batches_per_epoch = num_batches_per_epoch
+        self.gamma = gamma
+        self.lr = lr
+        self.param_index = param_index
+
+    def forward(self, *args, **kwargs):
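+        # Delegate straight to the wrapped NNUEModel.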
+        return self.model(*args, **kwargs)
+
+    def step_(self, batch: tuple[Tensor, ...], batch_idx, loss_type):
         _ = batch_idx  # unused, but required by pytorch-lightning

         # We clip weights at the start of each step. This means that after
         # the last step the weights might be outside of the desired range.
         # They should also be clipped accordingly in the serializer.
-        self._clip_weights()
+        self.model._clip_weights()

         (
             us,
@@ -392,7 +404,7 @@ def step_(self, batch: Tuple[Tensor, ...], batch_idx, loss_type):
         ) = batch

         scorenet = (
-            self(
+            self.model(
                 us,
                 them,
                 white_indices,
@@ -402,7 +414,7 @@ def step_(self, batch: Tuple[Tensor, ...], batch_idx, loss_type):
                 psqt_indices,
                 layer_stack_indices,
             )
-            * self.nnue2score
+            * self.model.nnue2score
         )

         p = self.loss_params
@@ -445,15 +457,15 @@ def test_step(self, batch, batch_idx):
     def configure_optimizers(self):
         LR = self.lr
         train_params = [
-            {"params": get_parameters([self.input]), "lr": LR, "gc_dim": 0},
-            {"params": [self.layer_stacks.l1_fact.weight], "lr": LR},
-            {"params": [self.layer_stacks.l1_fact.bias], "lr": LR},
-            {"params": [self.layer_stacks.l1.weight], "lr": LR},
-            {"params": [self.layer_stacks.l1.bias], "lr": LR},
-            {"params": [self.layer_stacks.l2.weight], "lr": LR},
-            {"params": [self.layer_stacks.l2.bias], "lr": LR},
-            {"params": [self.layer_stacks.output.weight], "lr": LR},
-            {"params": [self.layer_stacks.output.bias], "lr": LR},
+            {"params": get_parameters([self.model.input]), "lr": LR, "gc_dim": 0},
+            {"params": [self.model.layer_stacks.l1_fact.weight], "lr": LR},
+            {"params": [self.model.layer_stacks.l1_fact.bias], "lr": LR},
+            {"params": [self.model.layer_stacks.l1.weight], "lr": LR},
+            {"params": [self.model.layer_stacks.l1.bias], "lr": LR},
+            {"params": [self.model.layer_stacks.l2.weight], "lr": LR},
+            {"params": [self.model.layer_stacks.l2.bias], "lr": LR},
+            {"params": [self.model.layer_stacks.output.weight], "lr": LR},
+            {"params": [self.model.layer_stacks.output.bias], "lr": LR},
         ]

         optimizer = ranger21.Ranger21(
@@ -479,7 +491,7 @@ def configure_optimizers(self):
         return [optimizer], [scheduler]


-def coalesce_ft_weights(model: NNUE, layer: BaseFeatureTransformerSlice):
+def coalesce_ft_weights(model: NNUEModel, layer: BaseFeatureTransformerSlice):
     weight = layer.weight.data
     indices = model.feature_set.get_virtual_to_real_features_gather_indices()
     weight_coalesced = weight.new_zeros(
@@ -492,5 +504,5 @@ def coalesce_ft_weights(model: NNUE, layer: BaseFeatureTransformerSlice):
     return weight_coalesced


-def get_parameters(layers: List[nn.Module]):
+def get_parameters(layers: list[nn.Module]):
     return [p for layer in layers for p in layer.parameters()]
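
A minimal sketch of how the split classes compose after this change. The feature-set lookup helper and the feature-set name below are illustrative assumptions, not part of this diff; the class names and constructor signatures follow the code above.

```python
# Hypothetical usage sketch (names marked "assumed" are not from this PR).
from features import get_feature_set_from_name  # assumed helper

feature_set = get_feature_set_from_name("HalfKAv2_hm^")  # assumed name

# Plain nn.Module: just the network, usable without Lightning
# (e.g. for serialization or standalone evaluation).
net = NNUEModel(feature_set, num_psqt_buckets=8, num_ls_buckets=8)

# LightningModule wrapper: owns an NNUEModel plus training hyperparameters.
module = NNUE(feature_set, lr=8.75e-4, gamma=0.992)
assert isinstance(module.model, NNUEModel)
```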