Commit aa1cb74

[FSDP] Consolidate FSDP state_dict offload_to_cpu settings
Consolidate FSDP state_dict offload_to_cpu settings: all state_dict_types now have an offload_to_cpu option.

Differential Revision: [D40065969](https://our.internmc.facebook.com/intern/diff/D40065969/)
ghstack-source-id: 169306202
Pull Request resolved: #86211
1 parent 0b0ce72 commit aa1cb74
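
What this enables, as a minimal usage sketch (assumes an initialized process group and an FSDP-wrapped model; `model` is a placeholder):

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardedStateDictConfig, StateDictType

# offload_to_cpu is now honored for sharded (and local) state dicts,
# not only for FULL_STATE_DICT:
with FSDP.state_dict_type(
    model,
    StateDictType.SHARDED_STATE_DICT,
    ShardedStateDictConfig(offload_to_cpu=True),
):
    sharded_sd = model.state_dict()  # ShardedTensor shards land on CPU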

3 files changed: 29 additions, 9 deletions
test/distributed/fsdp/test_fsdp_state_dict.py

Lines changed: 17 additions & 4 deletions
@@ -13,7 +13,12 @@
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     checkpoint_wrapper,
 )
-from torch.distributed.fsdp import CPUOffload, FullStateDictConfig
+from torch.distributed.fsdp import (
+    CPUOffload,
+    FullStateDictConfig,
+    LocalStateDictConfig,
+    ShardedStateDictConfig,
+)
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp import (
     LocalStateDictConfig,
@@ -73,8 +78,8 @@
 
 NON_ROOT_FSDP_PREFIX = 'non_fsdp_lin'
 
-_UNFLATTENED_STATE_DICT_IMPLS = ["state_dict", "sharded_state_dict"]
-_FLATTENED_STATE_DICT_IMPLS = ["local_state_dict"]
+_UNFLATTENED_STATE_DICT_IMPLS = ["state_dict"]
+_FLATTENED_STATE_DICT_IMPLS = []
 _SUPPORTED_STATE_DICT_IMPLS = (
     _UNFLATTENED_STATE_DICT_IMPLS + _FLATTENED_STATE_DICT_IMPLS
 )
@@ -180,8 +185,16 @@ def _get_state_dict_mgr(
             rank0_only=state_dict_rank0_and_offload,
             offload_to_cpu=state_dict_rank0_and_offload,
         )
+    elif state_dict_type == "local_state_dict":
+        config = LocalStateDictConfig(
+            offload_to_cpu=state_dict_rank0_and_offload,
+        )
+    elif state_dict_type == "sharded_state_dict":
+        config = ShardedStateDictConfig(
+            offload_to_cpu=state_dict_rank0_and_offload,
+        )
     else:
-        config = None
+        raise ValueError("Unsupported state_dict_type")
     return FSDP.state_dict_type(model, _state_dict_type, config)
 
 def _validate_state_dict_contents(
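
The same manager pattern covers all three implementations; a hedged sketch of what the helper now supports (config classes and flags from this diff, `model` assumed to be FSDP-wrapped):

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import (
    FullStateDictConfig,
    LocalStateDictConfig,
    ShardedStateDictConfig,
    StateDictType,
)

for sd_type, cfg in (
    (StateDictType.FULL_STATE_DICT, FullStateDictConfig(offload_to_cpu=True, rank0_only=True)),
    (StateDictType.LOCAL_STATE_DICT, LocalStateDictConfig(offload_to_cpu=True)),
    (StateDictType.SHARDED_STATE_DICT, ShardedStateDictConfig(offload_to_cpu=True)),
):
    with FSDP.state_dict_type(model, sd_type, cfg):
        sd = model.state_dict()  # offloaded to CPU in every case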

torch/distributed/fsdp/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
     LocalStateDictConfig,
     MixedPrecision,
     OptimStateKeyType,
+    ShardedStateDictConfig,
     ShardingStrategy,
     StateDictType,
 )
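
With this re-export, the new config is importable from the package root; a quick check (assumes a build containing this commit):

from torch.distributed.fsdp import ShardedStateDictConfig

cfg = ShardedStateDictConfig(offload_to_cpu=True)
assert cfg.offload_to_cpu  # field now inherited from StateDictConfig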

torch/distributed/fsdp/fully_sharded_data_parallel.py

Lines changed: 11 additions & 5 deletions
@@ -315,7 +315,7 @@ class StateDictConfig:
     order to configure settings for the particular type of ``state_dict``
     implementation FSDP will use.
     """
-    pass
+    offload_to_cpu: bool = False
 
 @dataclass
 class FullStateDictConfig(StateDictConfig):
@@ -345,9 +345,9 @@ class FullStateDictConfig(StateDictConfig):
         >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
         >>> # After this point, all ranks have FSDP model with loaded checkpoint.
     """
-    offload_to_cpu: bool = False
     rank0_only: bool = False
 
+
 @dataclass
 class LocalStateDictConfig(StateDictConfig):
     pass
@@ -2251,10 +2251,12 @@ def _local_post_state_dict_hook(
         local_shards = [
             Shard.from_tensor_and_offsets(flat_param, [shard_offset], self.rank)
         ]
-        state_dict[f"{prefix}{FLAT_PARAM}"] = init_from_local_shards(
+        sharded_tensor = init_from_local_shards(
             local_shards, full_numel, process_group=self.process_group
         )  # type: ignore[assignment]
-
+        if self._state_dict_config.offload_to_cpu:
+            sharded_tensor = sharded_tensor.cpu()
+        state_dict[f"{prefix}{FLAT_PARAM}"] = sharded_tensor
         return state_dict
 
     @torch.no_grad()
@@ -2279,13 +2281,17 @@ def _sharded_post_state_dict_hook(
         for fqn, _, _ in self._param_fqns:
             # Create a ShardedTensor for the unflattened, non-sharded parameter.
             param = functools.reduce(getattr, fqn.split("."), self.module)
-            state_dict[f"{prefix}{fqn}"] = _ext_chunk_tensor(
+            sharded_tensor = _ext_chunk_tensor(
                 tensor=param,
                 rank=self.rank,
                 world_size=self.world_size,
                 num_devices_per_node=torch.cuda.device_count(),
                 pg=self.process_group
             )  # type: ignore[assignment]
+            if self._state_dict_config.offload_to_cpu:
+                sharded_tensor = sharded_tensor.cpu()
+            state_dict[f"{prefix}{fqn}"] = sharded_tensor
+
         state_dict.pop(f"{prefix}{FLAT_PARAM}")
         return state_dict