[stateless] add weight tying support

samdow · samdow · commit a190722b8b4c · 2022-12-14T19:03:13.000Z
ghstack-source-id: 87031f7 Pull Request resolved: #90477
diff --git a/test/test_stateless.py b/test/test_stateless.py
@@ -21,6 +21,17 @@ def __init__(self):
     def forward(self, x):
         return self.l1(x) + self.buffer
 
+class MockTiedModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = torch.nn.Linear(1, 1)
+        self.tied_bias = self.l1.bias
+        self.register_buffer('buffer', torch.ones(1))
+        self.register_buffer('tied_buffer', self.buffer)
+
+    def forward(self, x):
+        return self.l1(x) + self.tied_bias + self.buffer + self.tied_buffer
+
 
 class TestStatelessFunctionalAPI(TestCase):
     def _run_call_with_mock_module(self, module, device='cpu', prefix=''):
@@ -130,7 +141,7 @@ def test_circular_references(self):
                       'l1.m.buffer': buffer}
         prev_weight = module.l1.weight.clone()
         prev_buffer = module.buffer.clone()
-        res = stateless.functional_call(module, parameters, x)
+        res = stateless.functional_call(module, parameters, x, tie_weights=False)
         self.assertEqual(x, res)
         # check that the weights remain unmodified and were correctly accesed
         cur_weight = module.l1.weight
@@ -176,10 +187,32 @@ def test_reparamertize_module_fail_reset_to_original(self):
         self.assertEqual(orig_sn_weight, module.l1.weight)
 
 
-    def test_tied_weights_warns(self):
-        module = MockModule()
-        module.tied_bias = module.l1.bias
-        module.register_buffer("tied_buffer", module.buffer)
+    def test_reparamertize_tie_weights(self):
+        module = MockTiedModule()
+        weight = torch.tensor([[2.0]],)
+        bias = torch.tensor([5.0])
+        buffer = torch.tensor([3.0])
+
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        out = stateless.functional_call(module, parameters, x, tie_weights=True)
+        self.assertEqual(out, x * weight + bias + bias + buffer + buffer)
+
+    def test_reparamertize_tie_some_weights(self):
+        module = MockTiedModule()
+        weight = torch.tensor([[2.0]],)
+        buffer = torch.tensor([3.0])
+
+        parameters = {'l1.weight': weight,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        out = stateless.functional_call(module, parameters, x, tie_weights=True)
+        self.assertEqual(out, x * 2. + module.l1.bias + module.tied_bias + buffer + buffer)
+
+    def test_tied_weights_errors(self):
+        module = MockTiedModule()
         weight = torch.tensor([[1.0]],)
         bias = torch.tensor([0.0])
         buffer = torch.tensor([0.0])
@@ -188,23 +221,41 @@ def test_tied_weights_warns(self):
                       'l1.bias': bias,
                       'buffer': buffer}
         x = torch.randn(1, 1)
-        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x))
+        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x, tie_weights=True))
 
         # if tied values are the same tensors, shouldn't warn
         parameters['tied_bias'] = bias
         parameters['tied_buffer'] = buffer
-        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x))
+        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x, tie_weights=True))
         del parameters['tied_bias']
         del parameters['tied_buffer']
 
-        with self.assertWarnsOnceRegex(UserWarning, "functional_call was passed multiple values"):
+        with self.assertRaisesRegex(ValueError, "functional_call got values for both (l1.bias|tied_bias)"):
             parameters['tied_bias'] = torch.tensor([5.0])
-            stateless.functional_call(module, parameters, x)
+            stateless.functional_call(module, parameters, x, tie_weights=True)
         del parameters['tied_bias']
 
-        with self.assertWarnsOnceRegex(UserWarning, "functional_call was passed multiple values"):
+        with self.assertRaisesRegex(ValueError, "functional_call got values for both (buffer|tied_buffer)"):
             parameters['tied_buffer'] = torch.tensor([5.0])
-            stateless.functional_call(module, parameters, x)
+            stateless.functional_call(module, parameters, x, tie_weights=True)
+
+
+    def test_tied_weights_no_error_without_kwarg(self):
+        module = MockTiedModule()
+        weight = torch.tensor([[1.0]],)
+        bias = torch.tensor([0.0])
+        buffer = torch.tensor([0.0])
+
+        parameters = {'l1.weight': weight,
+                      'l1.bias': bias,
+                      'buffer': buffer}
+        x = torch.randn(1, 1)
+        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x, tie_weights=False))
+        parameters['tied_bias'] = torch.tensor([5.0])
+        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x, tie_weights=False))
+        del parameters['tied_bias']
+        parameters['tied_buffer'] = torch.tensor([5.0])
+        self.assertNotWarn(lambda: stateless.functional_call(module, parameters, x, tie_weights=False))
 
 
     def test_setattr(self):
diff --git a/torch/nn/utils/stateless.py b/torch/nn/utils/stateless.py
@@ -1,6 +1,5 @@
-import warnings
 import contextlib
-from typing import Any, Callable, Dict, Iterator, List, Tuple, Union
+from typing import Any, Callable, Dict, Iterator, List, Tuple, Union, Set, Optional
 
 import torch
 from torch import Tensor
@@ -37,21 +36,52 @@ def _setattr(self, name: str, value: Any) -> None:
     module._orig_class = cls
 
 
-def _check_tied_val_already_replaced(old_val, new_val, replaced_tensors_map):
-    if old_val not in replaced_tensors_map:
-        replaced_tensors_map[old_val] = new_val
-    elif replaced_tensors_map[old_val] is not new_val:
-        warnings.warn("functional_call was passed multiple values for tied weights. "
-                      "This behavior is deprecated and will be an error in future versions")
+def _create_tied_weights_map(module, params_and_buffers):
+    # creates a map of {tied_name: name_given_by_user} for every weight that is tied to a weight the user passed in
+    #
+    # The basic algorithm looks like:
+    #   - index all weights by their original tensor value to find tied weights
+    #     - when we encounter a weight not used by the user, we save it in a set (second element in the tuple)
+    #     - when we run into a weight used by the user, we save that separate from the set as the first element in the tuple
+    #     - ending map looks like {tensor: (name_given_by_user, set(all_tied_names))}
+    #   - then loop through the values of this map (name_given_by_user and set(all_tied_names))
+    #     - for each element of all_tied_names, add {tied_name: name_given_by_user} to a new map
+
+    names = params_and_buffers.keys()
+    weight_to_name_and_tied_names: Dict[torch.Tensor, Tuple[Optional[str], Set[str]]] = {}
+
+    def add_to_name_map(name, t):
+        if t in weight_to_name_and_tied_names:
+            first_seen_name = weight_to_name_and_tied_names[t][0]
+            if name in names and first_seen_name and params_and_buffers[name] is not params_and_buffers[first_seen_name]:
+                raise ValueError(f"functional_call got values for both {name} and {first_seen_name}, which are tied.")
+            elif name in names:
+                weight_to_name_and_tied_names[t] = (name, weight_to_name_and_tied_names[t][1])
+            else:
+                weight_to_name_and_tied_names[t][1].add(name)
+        else:
+            weight_to_name_and_tied_names[t] = (name, set()) if name in names else (None, {name})
+
+    for name, t in module.named_parameters(remove_duplicate=False):
+        add_to_name_map(name, t)
+
+    for name, t in module.named_buffers(remove_duplicate=False):
+        add_to_name_map(name, t)
 
+    # make {tied_name: name_given_by_user} from pairs of (name_given_by_user, set(all_tied_names))
+    tied_weights_to_given_name = {}
+    for name, tied_names in weight_to_name_and_tied_names.values():
+        if name is None:  # no mapping was passed for this tensor, use original tensor
+            continue
+        for tied_name in tied_names:
+            tied_weights_to_given_name[tied_name] = name
+    return tied_weights_to_given_name
 
-def _create_swap_params(params_and_buffers, replaced_tensors_map):
-    def _swap_parameters(module, tensor_name: str, full_path: str, tensor: Tensor) -> None:
+
+def _create_swap_params(params_and_buffers):
+    def _swap_parameters(module, tensor_name: str, full_path: str, tensor: Optional[Tensor]) -> None:
         # Changes the module class to get a new __getattr__ dunder method
         # that looks for the reparametrized tensor
-        if hasattr(module, tensor_name):
-            old_val = getattr(module, tensor_name)
-            _check_tied_val_already_replaced(old_val, tensor, replaced_tensors_map)
         if hasattr(module, "_attr_to_path"):
             module._attr_to_path[tensor_name] = full_path
         else:
@@ -72,12 +102,17 @@ def _remove_swap(module, name: str, full_path: str) -> None:
 def _reparametrize_module(
     module: 'torch.nn.Module',
     parameters_and_buffers: Dict[str, Tensor],
+    tie_weights: bool = False,
 ) -> Iterator[None]:
-    orig_tensors_to_replacements: Dict[Tensor, Tensor] = {}
+    tied_weights_map = _create_tied_weights_map(module, parameters_and_buffers) if tie_weights else {}
     for name, tensor in parameters_and_buffers.items():
         _apply_func_submodules(
-            _create_swap_params(parameters_and_buffers, orig_tensors_to_replacements),
+            _create_swap_params(parameters_and_buffers),
             module, name.split("."), name, (tensor,))
+    for tied_name, user_given_name in tied_weights_map.items():
+        _apply_func_submodules(
+            _create_swap_params(parameters_and_buffers),
+            module, tied_name.split("."), user_given_name, (None,))
     try:
         yield
     finally:
@@ -105,6 +140,7 @@ def functional_call(
     parameters_and_buffers: Dict[str, Tensor],
     args: Union[Any, Tuple],
     kwargs: Dict[str, Any] = None,
+    tie_weights: bool = False,
 ):
     r"""Performs a functional call on the module by replacing the module parameters
     and buffers with the provided ones.
@@ -151,7 +187,7 @@ def functional_call(
         raise RuntimeError("The stateless API can't be used with Jitted modules")
     if kwargs is None:
         kwargs = {}
-    with _reparametrize_module(module, parameters_and_buffers):
+    with _reparametrize_module(module, parameters_and_buffers, tie_weights):
         if isinstance(args, tuple):
             out = module(*args, **kwargs)
         else: