Merge Factorizer on LayerStacks forward (official-stockfish#364)

xu-shawn · web-flow · commit bc0cfcab3154 · 2025-10-02T20:24:21.000+02:00
* merge factorizer weights and biases into the layer-stack linear layer before the forward pass

* ?

* ruff format
diff --git a/model/model.py b/model/model.py
@@ -2,6 +2,7 @@
 
 import torch
 from torch import nn, Tensor
+import torch.nn.functional as F
 
 from .config import ModelConfig
 from .feature_transformer import DoubleFeatureTransformerSlice
@@ -34,10 +35,17 @@ def _init_uniformly(self) -> None:
 
     def forward(self, x: Tensor, ls_indices: Tensor) -> Tensor:
         stacked_output = self.linear(x)
+
+        return self.select_output(stacked_output, ls_indices)
+
+    def select_output(self, stacked_output: Tensor, ls_indices: Tensor) -> Tensor:
         reshaped_output = stacked_output.reshape(-1, self.out_features)
 
         idx_offset = torch.arange(
-            0, x.shape[0] * self.count, self.count, device=x.device
+            0,
+            ls_indices.shape[0] * self.count,
+            self.count,
+            device=stacked_output.device,
         )
         indices = ls_indices.flatten() + idx_offset
 
@@ -69,10 +77,14 @@ def __init__(self, in_features: int, out_features: int, count: int):
             self.factorized_linear.bias.zero_()
 
     def forward(self, x: Tensor, ls_indices: Tensor) -> Tensor:
-        stacked_output = super().forward(x, ls_indices)
-        factorized_output = self.factorized_linear(x)
+        merged_weight = self.linear.weight + self.factorized_linear.weight.repeat(
+            self.count, 1
+        )
+        merged_bias = self.linear.bias + self.factorized_linear.bias.repeat(self.count)
+
+        stacked_output = F.linear(x, merged_weight, merged_bias)
 
-        return stacked_output + factorized_output
+        return self.select_output(stacked_output, ls_indices)
 
     @torch.no_grad()
     def at_index(self, index: int) -> nn.Linear: