Commit 95d23a8

[FSDP][state_dict] Return tensors instead of FlatParameters to avoid pickling errors
Pull Request resolved: #94637

After #88913, user-defined parameter states will be pickled. For a FlatParameter, this means `_local_shard` will also be pickled. Since state_dict and load_state_dict only require the tensor, returning the full FlatParameter does not give us any extra benefit. This PR changes the behavior to simply return a view of the FlatParameter.

ghstack-source-id: 179983735
Differential Revision: [D43205127](https://our.internmc.facebook.com/intern/diff/D43205127/)
1 parent: 4c6a7fa
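For context, here is a minimal usage sketch of the save/load pattern this change targets. It is not code from this commit: it assumes an already-initialized process group, an FSDP-wrapped `model`, and a hypothetical per-rank `checkpoint_path`.

```python
# Sketch only: save and reload a sharded (LOCAL_STATE_DICT) checkpoint.
# Assumes torch.distributed is initialized and `model` is FSDP-wrapped;
# `checkpoint_path` is a hypothetical per-rank path.
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import StateDictType


def save_local_checkpoint(model: FSDP, checkpoint_path: str) -> None:
    with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
        # With this change, each rank's state_dict holds plain tensors /
        # ShardedTensors, so it can be torch.save'd and later torch.load'ed.
        torch.save(model.state_dict(), checkpoint_path)


def load_local_checkpoint(model: FSDP, checkpoint_path: str) -> None:
    with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
        model.load_state_dict(torch.load(checkpoint_path))
```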

File tree

test/distributed/fsdp/test_fsdp_state_dict.py
torch/distributed/fsdp/_state_dict_utils.py

2 files changed: +24 -2 lines changed

test/distributed/fsdp/test_fsdp_state_dict.py

Lines changed: 19 additions & 0 deletions
@@ -1,5 +1,6 @@
 # Owner(s): ["oncall: distributed"]
 
+import io
 import itertools
 import sys
 from contextlib import suppress
@@ -10,6 +11,7 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
+from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     apply_activation_checkpointing,
     checkpoint_wrapper,
@@ -1067,6 +1069,23 @@ def forward(self, x):
         with FSDP.summon_full_params(model):
             self.assertEqual(model.my_parameter.item(), 3.1415926)
 
+    @skip_if_lt_x_gpu(2)
+    def test_torch_save_load(self):
+        model = Model(wrap_fsdp=True).cuda()
+        with FSDP.state_dict_type(model, StateDictType.LOCAL_STATE_DICT):
+            state_dict = model.state_dict()
+            checkpoint = io.BytesIO()
+            torch.save(state_dict, checkpoint)
+            checkpoint.seek(0)
+            state_dict_saved = torch.load(checkpoint)
+            for k, v in state_dict_saved.items():
+                if isinstance(v, ShardedTensor):
+                    self.assertEqual(
+                        v._local_shards[0].tensor, state_dict[k]._local_shards[0].tensor
+                    )
+                else:
+                    self.assertEqual(v, state_dict[k])
+
 
 instantiate_parametrized_tests(TestFSDPStateDict)
 
torch/distributed/fsdp/_state_dict_utils.py

Lines changed: 5 additions & 2 deletions
@@ -393,8 +393,11 @@ def _local_post_state_dict_hook(
     shard_offset = flat_param.numel() * fsdp_state.rank
     valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
     if valid_data_size > 0:
-        if flat_param._shard_numel_padded > 0:
-            flat_param = flat_param.narrow(0, 0, valid_data_size)
+        # If the FlatParameter is returned, FlatParameter._local_shard causes a
+        # pickling issue (it can be saved with torch.save but not loaded with
+        # torch.load). Since there is no benefit to returning the actual
+        # FlatParameter class, return a view (a plain tensor) of the FlatParameter.
+        flat_param = flat_param[:valid_data_size].view(valid_data_size)
     local_shards = [
         Shard.from_tensor_and_offsets(flat_param, [shard_offset], fsdp_state.rank)
     ]
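As additional context (a sketch, not code from this commit, using `nn.Parameter` as a stand-in for the `FlatParameter` subclass and made-up sizes): the old path only took a view when the shard was padded, so an unpadded shard kept its parameter type all the way into the `ShardedTensor`; the new path always hands a plain-tensor view to `Shard.from_tensor_and_offsets`.

```python
# Sketch only: nn.Parameter stands in for FSDP's FlatParameter subclass, and
# the sizes below are hypothetical. It contrasts the old conditional narrow()
# with the new unconditional slice-and-view.
import torch
import torch.nn as nn

flat_param = nn.Parameter(torch.ones(6))      # pretend: an unpadded local shard
shard_numel_padded = 0
valid_data_size = flat_param.numel() - shard_numel_padded

# Old behavior: without padding, the parameter object itself flowed onward.
old = (
    flat_param.narrow(0, 0, valid_data_size)
    if shard_numel_padded > 0
    else flat_param
)
print(type(old))   # <class 'torch.nn.parameter.Parameter'>

# New behavior: always a plain torch.Tensor view of the valid data.
new = flat_param[:valid_data_size].view(valid_data_size)
print(type(new))   # <class 'torch.Tensor'>
```

Because `nn.Parameter` disables `__torch_function__`, slicing and `.view()` hand back plain `torch.Tensor` views rather than new parameter objects, which is exactly what the local state_dict needs to pickle cleanly.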
