 
 __all__: List[str] = []
 
-# WeakTensorKeyDictionary to store relevant meta-data for the Tensor/Parameter
-# without changing its lifetime.
-# NOTE: Alternative is to add the meta-data as an attribute to the tensor,
-# but that will serialize the meta-data if Tensor is serialized.
-param_to_optim_hook_handle_map = torch.utils.weak.WeakTensorKeyDictionary()
-param_to_acc_grad_map = torch.utils.weak.WeakTensorKeyDictionary()
 
 @no_type_check
 def _apply_optimizer_in_backward(
@@ -50,12 +44,19 @@ def _apply_optimizer_in_backward_to_param(param: torch.nn.Parameter) -> None:
         # this parameter is ready (has been accumulated into .grad field)
 
         # Don't create a new acc_grad if we already have one
-        # i.e. for shared parameters or attaching multiple optimizers to a param.
-        if param not in param_to_acc_grad_map:
-            param_to_acc_grad_map[param] = param.view_as(param).grad_fn.next_functions[0][0]
+        # i.e. for shared parameters or attaching multiple optimizers to a param.
+        if not hasattr(param, "_acc_grad"):
+            acc_grad = param.view_as(param).grad_fn.next_functions[0][0]
+        else:
+            acc_grad = param._acc_grad
 
         optimizer = optimizer_class([param], **optimizer_kwargs)
 
+        # Keep the grad accumulator around for the lifetime of the Tensor,
+        # store it on the param to avoid an uncollectable ref-cycle
+        if not hasattr(param, "_acc_grad"):
+            param._acc_grad = acc_grad  # type: ignore[attr-defined]
+
         if not hasattr(param, "_in_backward_optimizers"):
             param._in_backward_optimizers = []  # type: ignore[attr-defined]
             # TODO: investigate whether we really need these attributes.
@@ -72,10 +73,10 @@ def optimizer_hook(*_unused) -> None:
 
             param.grad = None
 
-        handle = param_to_acc_grad_map[param].register_hook(optimizer_hook)  # type: ignore[attr-defined]
-        if param not in param_to_optim_hook_handle_map:
-            param_to_optim_hook_handle_map[param] = []
-        param_to_optim_hook_handle_map[param].append(handle)
+        handle = param._acc_grad.register_hook(optimizer_hook)  # type: ignore[attr-defined]
+        if not hasattr(param, '_optimizer_hook_handles'):
+            param._optimizer_hook_handles = []  # type: ignore[attr-defined]
+        param._optimizer_hook_handles.append(handle)  # type: ignore[attr-defined]
 
     for param in params:
         _apply_optimizer_in_backward_to_param(param)
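
For context, here is a minimal usage sketch of the function modified above. It is not part of the diff: the import path and keyword names are taken from the signature shown here and from torch.distributed.optim, and may differ across PyTorch versions. Each parameter gets its own optimizer instance, stepped from a hook on that parameter's AccumulateGrad node during backward.

import torch
from torch.distributed.optim import _apply_optimizer_in_backward

model = torch.nn.Linear(10, 10)

# Attach a per-parameter SGD; the registered hook steps the optimizer as soon
# as each gradient has been accumulated and then sets param.grad back to None.
_apply_optimizer_in_backward(
    optimizer_class=torch.optim.SGD,
    params=model.parameters(),
    optimizer_kwargs={"lr": 0.01},
)

loss = model(torch.randn(4, 10)).sum()
loss.backward()  # optimizers run inside backward; no separate optimizer.step()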