 import torch


-def _find_nearest_divisor(value, target):
+def _find_nearest_divisor(value: int, target: int) -> int:
    divisors = []
    for i in range(1, value + 1):
        if value % i == 0:
@@ -11,10 +11,10 @@ def _find_nearest_divisor(value, target):
    return divisors[0][0]


-_num_threads_forward_cache = dict()
+_num_threads_forward_cache: dict[int, int] = dict()


-def _get_num_threads_for_forward(output_size):
+def _get_num_threads_for_forward(output_size: int) -> int:
    optimal_num_threads = 512
    if output_size not in _num_threads_forward_cache:
        _num_threads_forward_cache[output_size] = _find_nearest_divisor(
@@ -24,10 +24,10 @@ def _get_num_threads_for_forward(output_size):
    return _num_threads_forward_cache[output_size]


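A quick illustration of the helper above. The body of `_find_nearest_divisor` is elided by this hunk; the examples assume it returns the divisor of `value` closest to `target`:

# Illustrative only: thread count is the divisor of output_size nearest 512,
# so the kernel's N * output_thread_slice_size == output_size invariant holds.
assert _get_num_threads_for_forward(2048) == 512  # 512 divides 2048 exactly
assert _get_num_threads_for_forward(768) == 384   # 384 is 768's divisor nearest 512
assert _get_num_threads_for_forward(768) == 384   # second call hits the cache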
-_num_threads_backward_cache = dict()
+_num_threads_backward_cache: dict[int, int] = dict()


-def _get_num_threads_for_backward(output_size):
+def _get_num_threads_for_backward(output_size: int) -> int:
    optimal_num_threads = 512
    if output_size not in _num_threads_backward_cache:
        _num_threads_backward_cache[output_size] = _find_nearest_divisor(
@@ -44,15 +44,15 @@ def f(grid, args):
    return f


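The `_kernel_with_threads` helper is mostly elided by the hunk above; a minimal sketch of what the visible `f(grid, args)` closure suggests (an assumption, not the verbatim source):

def _kernel_with_threads(kernel, threads):
    # Presumably binds a fixed thread-block shape to a compiled cp.RawKernel,
    # so call sites only supply the grid and the argument tuple.
    def f(grid, args):
        kernel(grid, threads, args)
    return f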
-_feature_transformer_slice_forward_kernel_cache = dict()
+_sparse_input_linear_forward_kernel_cache = dict()


@torch.compiler.disable(recursive=False)
-def make_feature_transformer_slice_forward_kernel(max_active_features, output_size):
+def make_sparse_input_linear_forward_kernel(max_active_indices: int, output_size: int):
    """
-    @param: max_active_features
-        The maximum number of features that are active
-        (non-zero) for a single position. This value determines
+    @param: max_active_indices
+        The maximum number of indices that are non-zero
+        for a single position. This value determines
        the shape of the inputs.
        This value is of type uint32_t.

@@ -63,8 +63,8 @@ def make_feature_transformer_slice_forward_kernel(max_active_features, output_si
    """
    num_threads = _get_num_threads_for_forward(output_size)
    output_thread_slice_size = output_size // num_threads
-    key = (max_active_features, output_size, num_threads)
-    if key not in _feature_transformer_slice_forward_kernel_cache:
+    key = (max_active_indices, output_size, num_threads)
+    if key not in _sparse_input_linear_forward_kernel_cache:
        kernel = cp.RawKernel(
            r"""

@@ -79,23 +79,23 @@ def make_feature_transformer_slice_forward_kernel(max_active_features, output_si
    The threads must have dimensionality (N,), where
    N * output_thread_slice_size == output_size.

-    @param: feature_indices
-        A matrix of shape (BATCH_SIZE, max_active_features)
-        containing indices of active features for each position
-        in a batch. Feature index of -1 means that the slot is empty
+    @param: input_indices
+        A matrix of shape (BATCH_SIZE, max_active_indices)
+        containing the indices of the non-zero inputs for each position
+        in a batch. An input index of -1 means that the slot is empty
        and the weights will not be accumulated for it. Moreover,
        no further indices from this block will be considered.
        The indices form an implicit matrix of shape
        (BATCH_SIZE, NUM_INPUTS), where the first dimension index is
        inferred from the memory location (BATCH_SIZE), and the
-        second dimension index is stored in the feature_indices matrix.
-        The type for feature indices is int32_t.
+        second dimension index is stored in the input_indices matrix.
+        The type for input indices is int32_t.

-    @param: feature_values
-        A matrix of shape (BATCH_SIZE, max_active_features)
+    @param: input_values
+        A matrix of shape (BATCH_SIZE, max_active_indices)
        containing the values (arity) of the corresponding
-        feature index in feature_indices.
-        The type for the feature value (arity) is float32.
+        input index in input_indices.
+        The type for the input value (arity) is float32.

    @param: weight
        The weight matrix of shape (NUM_INPUTS, output_size).
@@ -111,9 +111,9 @@ def make_feature_transformer_slice_forward_kernel(max_active_features, output_si
        to the output first.
        Output values must have type float32.
*/
-void feature_transformer_slice_forward(
-    const int32_t* const feature_indices,
-    const float* const feature_values,
+void sparse_input_linear_forward(
+    const int32_t* const input_indices,
+    const float* const input_values,
    const float* const weight,
    const float* const bias,
    float* const output
@@ -128,26 +128,26 @@ def make_feature_transformer_slice_forward_kernel(max_active_features, output_si
    const float* const bias_slice = bias + slice_offset;
    float* shared_output_slice = shared_output + slice_offset;

-    const int32_t* const feature_index_row = feature_indices + block_idx * {max_active_features};
-    const float* const feature_value_row = feature_values + block_idx * {max_active_features};
+    const int32_t* const input_index_row = input_indices + block_idx * {max_active_indices};
+    const float* const input_value_row = input_values + block_idx * {max_active_indices};

    #pragma unroll
    for (uint32_t s = 0; s < {output_thread_slice_size}; ++s)
    {{
        shared_output_slice[s] = bias_slice[s];
    }}

-    for (uint32_t k = 0; k < {max_active_features}; ++k)
+    for (uint32_t k = 0; k < {max_active_indices}; ++k)
    {{
-        const int32_t feature_index = feature_index_row[k];
-        const float feature_value = feature_value_row[k];
-        if (feature_index != -1)
+        const int32_t input_index = input_index_row[k];
+        const float input_value = input_value_row[k];
+        if (input_index != -1)
        {{
-            const float* const weight_slice = weight + feature_index * {output_size} + slice_offset;
+            const float* const weight_slice = weight + input_index * {output_size} + slice_offset;
            #pragma unroll
            for (uint32_t s = 0; s < {output_thread_slice_size}; ++s)
            {{
-                shared_output_slice[s] += weight_slice[s] * feature_value;
+                shared_output_slice[s] += weight_slice[s] * input_value;
            }}
        }} else break;
    }}
@@ -160,29 +160,29 @@ def make_feature_transformer_slice_forward_kernel(max_active_features, output_si
}}

""".format(
-                max_active_features=max_active_features,
+                max_active_indices=max_active_indices,
                output_thread_slice_size=output_thread_slice_size,
                output_size=output_size,
            ),
-            "feature_transformer_slice_forward",
+            "sparse_input_linear_forward",
        )
        kernel.compile()
-        _feature_transformer_slice_forward_kernel_cache[key] = _kernel_with_threads(
+        _sparse_input_linear_forward_kernel_cache[key] = _kernel_with_threads(
            kernel, (num_threads,)
        )
-    return _feature_transformer_slice_forward_kernel_cache[key]
+    return _sparse_input_linear_forward_kernel_cache[key]


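A hypothetical end-to-end launch of the forward kernel (the shapes and variable names below are illustrative, not from this diff), showing the padded sparse-input layout the docstring describes and the `f(grid, args)` call convention of the `_kernel_with_threads` wrapper:

import cupy as cp

batch_size, max_active_indices, num_inputs, output_size = 4, 32, 1000, 256
# Active slots come first in each row; -1 marks the end of the active prefix.
input_indices = cp.full((batch_size, max_active_indices), -1, dtype=cp.int32)
input_values = cp.zeros((batch_size, max_active_indices), dtype=cp.float32)
input_indices[0, :3] = cp.asarray([5, 17, 42], dtype=cp.int32)
input_values[0, :3] = 1.0
weight = cp.random.randn(num_inputs, output_size).astype(cp.float32)
bias = cp.zeros(output_size, dtype=cp.float32)
output = cp.empty((batch_size, output_size), dtype=cp.float32)

forward = make_sparse_input_linear_forward_kernel(max_active_indices, output_size)
# One block per batch entry; the thread shape is baked in by the wrapper.
forward((batch_size,), (input_indices, input_values, weight, bias, output))
# Each output row is bias plus the value-weighted sum of selected weight rows:
# output[b] = bias + sum_k input_values[b, k] * weight[input_indices[b, k]]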
-_feature_transformer_slice_backward_kernel_cache = dict()
+_sparse_input_linear_backward_kernel_cache = dict()


@torch.compiler.disable(recursive=False)
-def make_feature_transformer_slice_backward_kernel(max_active_features, output_size):
+def make_sparse_input_linear_backward_kernel(max_active_indices: int, output_size: int):
    """
-    @param: max_active_features
-        The maximum number of features that are active
-        (non-zero) for a single position. This value determines
-        the shape of the inputs.
+    @param: max_active_indices
+        The maximum number of indices that are non-zero for
+        a single position. This value determines the shape
+        of the inputs.
        This value is of type uint32_t.

    @param: output_size
@@ -192,8 +192,8 @@ def make_feature_transformer_slice_backward_kernel(max_active_features, output_s
    """
    num_threads = _get_num_threads_for_backward(output_size)
    output_thread_slice_size = output_size // num_threads
-    key = (max_active_features, output_size, num_threads)
-    if key not in _feature_transformer_slice_backward_kernel_cache:
+    key = (max_active_indices, output_size, num_threads)
+    if key not in _sparse_input_linear_backward_kernel_cache:
        kernel = cp.RawKernel(
            r"""

@@ -207,23 +207,23 @@ def make_feature_transformer_slice_backward_kernel(max_active_features, output_s
    The threads must have dimensionality (N,), where
    N * output_thread_slice_size == output_size.

-    @param: feature_indices
-        A matrix of shape (BATCH_SIZE, max_active_features)
-        containing indices of active features for each position
-        in a batch. Feature index of -1 means that the slot is empty
+    @param: input_indices
+        A matrix of shape (BATCH_SIZE, max_active_indices)
+        containing the indices of the non-zero inputs for each position
+        in a batch. An input index of -1 means that the slot is empty
        and the weights will not be accumulated for it. Moreover,
        no further indices from this block will be considered.
        The indices form an implicit matrix of shape
        (BATCH_SIZE, NUM_INPUTS), where the first dimension index is
        inferred from the memory location (BATCH_SIZE), and the
-        second dimension index is stored in the feature_indices matrix.
-        The type for feature indices is int32_t.
+        second dimension index is stored in the input_indices matrix.
+        The type for input indices is int32_t.

-    @param: feature_values
-        A matrix of shape (BATCH_SIZE, max_active_features)
+    @param: input_values
+        A matrix of shape (BATCH_SIZE, max_active_indices)
        containing the values (arity) of the corresponding
-        feature index in feature_indices.
-        The type for the feature value (arity) is float32.
+        input index in input_indices.
+        The type for the input value (arity) is float32.

    @param: weight_grad
        The weight gradient matrix of shape (NUM_INPUTS, output_size).
@@ -241,9 +241,9 @@ def make_feature_transformer_slice_backward_kernel(max_active_features, output_s
        An output gradient matrix of shape (BATCH_SIZE, output_size).
        Output values must have type float32.
*/
-void feature_transformer_slice_backward(
-    const int32_t* const feature_indices,
-    const float* const feature_values,
+void sparse_input_linear_backward(
+    const int32_t* const input_indices,
+    const float* const input_values,
    float* const weight_grad,
    float* const bias_grad,
    const float* const output_grad
@@ -258,8 +258,8 @@ def make_feature_transformer_slice_backward_kernel(max_active_features, output_s
    float* const bias_grad_slice = bias_grad + slice_offset;
    float* shared_output_grad_slice = shared_output_grad + slice_offset;

-    const int32_t* const feature_index_row = feature_indices + block_idx * {max_active_features};
-    const float* const feature_value_row = feature_values + block_idx * {max_active_features};
+    const int32_t* const input_index_row = input_indices + block_idx * {max_active_indices};
+    const float* const input_value_row = input_values + block_idx * {max_active_indices};

    #pragma unroll
    for (uint32_t s = 0; s < {output_thread_slice_size}; ++s)
@@ -277,35 +277,35 @@ def make_feature_transformer_slice_backward_kernel(max_active_features, output_s
        }}
    }}

-    for (uint32_t k = 0; k < {max_active_features}; ++k)
+    for (uint32_t k = 0; k < {max_active_indices}; ++k)
    {{
-        const int32_t feature_index = feature_index_row[k];
-        const float feature_value = feature_value_row[k];
-        if (feature_index != -1)
+        const int32_t input_index = input_index_row[k];
+        const float input_value = input_value_row[k];
+        if (input_index != -1)
        {{
-            float* const weight_grad_slice = weight_grad + feature_index * {output_size} + slice_offset;
+            float* const weight_grad_slice = weight_grad + input_index * {output_size} + slice_offset;
            #pragma unroll
            for (int s = 0; s < {output_thread_slice_size}; ++s)
            {{
                const float sog = shared_output_grad_slice[s];
                if (sog != 0.0f)
                {{
-                    atomicAdd(&weight_grad_slice[s], sog * feature_value);
+                    atomicAdd(&weight_grad_slice[s], sog * input_value);
                }}
            }}
        }} else break;
    }}
}}

""".format(
-                max_active_features=max_active_features,
+                max_active_indices=max_active_indices,
                output_thread_slice_size=output_thread_slice_size,
                output_size=output_size,
            ),
-            "feature_transformer_slice_backward",
+            "sparse_input_linear_backward",
        )
        kernel.compile()
-        _feature_transformer_slice_backward_kernel_cache[key] = _kernel_with_threads(
+        _sparse_input_linear_backward_kernel_cache[key] = _kernel_with_threads(
            kernel, (num_threads,)
        )
-    return _feature_transformer_slice_backward_kernel_cache[key]
+    return _sparse_input_linear_backward_kernel_cache[key]
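For reference, a hedged NumPy-style sketch (illustrative, not part of this diff) of the gradients the backward kernel accumulates: bias_grad receives the column-sums of output_grad, and each active weight row receives the value-scaled output gradient.

import numpy as np

def sparse_input_linear_backward_reference(input_indices, input_values,
                                           weight_grad, bias_grad, output_grad):
    # Mirrors the kernel's semantics: accumulate into the provided buffers.
    batch_size, max_active_indices = input_indices.shape
    bias_grad += output_grad.sum(axis=0)
    for b in range(batch_size):
        for k in range(max_active_indices):
            idx = input_indices[b, k]
            if idx == -1:
                break  # -1 ends the active prefix, as in the kernel
            weight_grad[idx] += input_values[b, k] * output_grad[b]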