
Commit d5856ca

[Dynamo x FSDP][2/x] Small changes to distributed to make it dynamo friendly
ghstack-source-id: d45babf
Pull Request resolved: #106886

Dist nits
1 parent 1015556 commit d5856ca

8 files changed: +78 additions, -49 deletions


torch/distributed/_composable/contract.py

Lines changed: 7 additions & 2 deletions
@@ -190,5 +190,10 @@ def _get_registry(module: nn.Module) -> Dict[str, RegistryItem]:
     Get an ``OrderedDict`` of composable APIs that have been applied to the
     ``module``, indexed by the API name.
     """
-    default_registry: Dict[str, RegistryItem] = OrderedDict()
-    return module.__dict__.setdefault(REGISTRY_KEY, default_registry)  # type: ignore[call-overload]
+    registry = getattr(module, REGISTRY_KEY, None)
+    if registry is None:
+        default_registry: Dict[str, RegistryItem] = OrderedDict()
+        setattr(module, REGISTRY_KEY, default_registry)
+        return default_registry
+    else:
+        return registry
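
The rewritten registry lookup goes through plain getattr/setattr instead of poking module.__dict__ directly; the behavior is identical, but ordinary attribute access is presumably easier for Dynamo to trace. A minimal standalone sketch of the same pattern, using a hypothetical _DEMO_REGISTRY_KEY in place of the real REGISTRY_KEY:

    from collections import OrderedDict

    import torch.nn as nn

    _DEMO_REGISTRY_KEY = "_demo_registry"  # hypothetical stand-in for REGISTRY_KEY


    def get_registry(module: nn.Module) -> "OrderedDict[str, object]":
        # Same get-or-create semantics as __dict__.setdefault, spelled as
        # ordinary attribute access on the module.
        registry = getattr(module, _DEMO_REGISTRY_KEY, None)
        if registry is None:
            registry = OrderedDict()
            setattr(module, _DEMO_REGISTRY_KEY, registry)
        return registry


    print(get_registry(nn.Linear(2, 2)))  # OrderedDict()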

torch/distributed/_composable_state.py

Lines changed: 4 additions & 1 deletion
@@ -29,4 +29,7 @@ def _get_module_state(module: nn.Module) -> Optional[_State]:
     if isinstance(module, _State):
         return cast(_State, module)
     else:
-        return _module_state_mapping.get(module, None)
+        if module in _module_state_mapping:
+            return _module_state_mapping[module]
+        else:
+            return None

torch/distributed/distributed_c10d.py

Lines changed: 2 additions & 2 deletions
@@ -555,7 +555,7 @@ class group(metaclass=_WorldMeta):
     pass
 
 class GroupMember(metaclass=_WorldMeta):
-    NON_GROUP_MEMBER = object()
+    NON_GROUP_MEMBER = -100
 
 
 # Default process group state
@@ -982,7 +982,7 @@ def get_backend(group: Optional[ProcessGroup] = None) -> str:
     pg = group
     if _rank_not_in_group(pg):
         raise RuntimeError("Invalid process group specified")
-    pg_store = _world.pg_map.get(pg, None)
+    pg_store = _world.pg_map[pg] if pg in _world.pg_map else None
     assert pg_store is not None
     return pg_store[0]
 
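Both lookups above trade dict.get(key, None) for an explicit membership test, and the non-member sentinel becomes a plain int instead of an anonymous object(); behavior is unchanged, and the simpler constructs are presumably easier for Dynamo to handle. A standalone sketch of the two equivalent spellings, using a hypothetical pg_map dict rather than the real _world.pg_map:

    from typing import Dict, Optional, Tuple

    pg_map: Dict[str, Tuple[str, Optional[str]]] = {"default_pg": ("nccl", None)}


    def lookup_get(key: str) -> Optional[Tuple[str, Optional[str]]]:
        # Original spelling: a single .get() call with a default.
        return pg_map.get(key, None)


    def lookup_membership(key: str) -> Optional[Tuple[str, Optional[str]]]:
        # Spelling used by the commit: explicit membership check, then index.
        return pg_map[key] if key in pg_map else None


    assert lookup_get("default_pg") == lookup_membership("default_pg")
    assert lookup_get("missing") is None and lookup_membership("missing") is None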

torch/distributed/fsdp/_common_utils.py

Lines changed: 16 additions & 7 deletions
@@ -392,12 +392,16 @@ def f(module: torch.nn.Module, prefix: str, tree_level: int, *args, **kwargs):
             submodule_name == "_fsdp_wrapped_module"
             or submodule_name == "_dmp_wrapped_module"
         ):
-            warnings.warn(
-                "An unexpected prefix is detected. This case "
-                " should only happen when using DMP with FSDP. "
-                f"prefix = {prefix}, "
-                f"submodule_name = {submodule_name}"
-            )
+            if (
+                not torch.distributed._functional_collectives.is_torchdynamo_compiling()
+            ):
+                # TODO(voz): Don't graph break on this
+                warnings.warn(
+                    "An unexpected prefix is detected. This case "
+                    " should only happen when using DMP with FSDP. "
+                    f"prefix = {prefix}, "
+                    f"submodule_name = {submodule_name}"
+                )
             new_prefix = prefix
         elif submodule_name == "module":
             warnings.warn(
@@ -511,7 +515,12 @@ def _no_dispatch_record_stream(tensor: torch.Tensor, stream: torch.Stream) -> None:
     # FIXME record_stream doesn't work with non-cuda tensors
     if tensor.device.type not in ["cuda", torch._C._get_privateuse1_backend_name()]:
         return
-    with no_dispatch():
+
+    if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        # Don't no dispatch under torch compile like this
+        with no_dispatch():
+            tensor.record_stream(stream)
+    else:
         tensor.record_stream(stream)
 
 
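The recurring FSDP pattern in this and the following files is to guard side effects that Dynamo cannot trace cleanly (warnings, record_stream, CUDA event queues, cross-rank sanity checks) behind torch.distributed._functional_collectives.is_torchdynamo_compiling(), so eager behavior is unchanged while the compiled path simply skips them. A reduced sketch of that guard, assuming a PyTorch build with the distributed package available:

    import warnings

    import torch
    import torch.distributed._functional_collectives as funcol


    def maybe_warn(msg: str) -> None:
        # Emit the warning in eager mode; skip it while Dynamo is tracing so the
        # call does not force a graph break.
        if not funcol.is_torchdynamo_compiling():
            warnings.warn(msg)


    @torch.compile(backend="eager")
    def f(x: torch.Tensor) -> torch.Tensor:
        maybe_warn("eager-only diagnostic")  # becomes a no-op under compilation
        return x + 1


    print(f(torch.ones(2)))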

torch/distributed/fsdp/_exec_order_utils.py

Lines changed: 39 additions & 31 deletions
@@ -217,18 +217,21 @@ def _check_order(self, handle: FlatParamHandle, is_training: bool) -> None:
             # TODO (awgu): Since every module has at most one handle in the
             # current implementation, this should never raise the error.
             assert self.world_size is not None  # mypy
-            for (r1, n1), (r2, n2) in itertools.combinations(
-                (
-                    (rank, world_num_valid_indices[rank])
-                    for rank in range(self.world_size)
-                ),
-                2,
-            ):
-                if n1 != n2:
-                    raise RuntimeError(
-                        f"{msg_prefix} rank {r1} is all-gathering {n1} parameters "
-                        f"while rank {r2} is all-gathering {n2} parameters"
-                    )
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                # TODO(voz): Don't graph break on this - dynamo hates the n1 != n2
+                # tensor comparison control flow.
+                for (r1, n1), (r2, n2) in itertools.combinations(
+                    (
+                        (rank, world_num_valid_indices[rank])
+                        for rank in range(self.world_size)
+                    ),
+                    2,
+                ):
+                    if n1 != n2:
+                        raise RuntimeError(
+                            f"{msg_prefix} rank {r1} is all-gathering {n1} parameters "
+                            f"while rank {r2} is all-gathering {n2} parameters"
+                        )
             world_indices = torch.zeros(  # type: ignore[call-overload]
                 self.world_size * num_valid_indices, **tensor_kwargs
             )
@@ -239,26 +242,31 @@ def _check_order(self, handle: FlatParamHandle, is_training: bool) -> None:
             # Copy entire tensor from D2H once to avoid per element D2H copies
             world_indices = world_indices.cpu()
             # Check that all ranks plan to all-gather the same index parameters
-            for (r1, i1), (r2, i2) in itertools.combinations(
-                (
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                # TODO(voz): Don't graph break on this - dynamo hates the i1 != i2
+                # tensor comparison control flow.
+                for (r1, i1), (r2, i2) in itertools.combinations(
                     (
-                        rank,
-                        world_indices[
-                            rank * num_valid_indices : (rank + 1) * num_valid_indices
-                        ],
-                    )
-                    for rank in range(self.world_size)
-                ),
-                2,
-            ):
-                if i1 != i2:
-                    r1_param_names = self._get_names_from_handle_indices(i1)
-                    r2_param_names = self._get_names_from_handle_indices(i2)
-                    raise RuntimeError(
-                        f"{msg_prefix} rank {r1} is all-gathering parameters "
-                        f"for {r1_param_names} while rank {r2} is all-gathering "
-                        f"parameters for {r2_param_names}"
-                    )
+                        (
+                            rank,
+                            world_indices[
+                                rank
+                                * num_valid_indices : (rank + 1)
+                                * num_valid_indices
+                            ],
+                        )
+                        for rank in range(self.world_size)
+                    ),
+                    2,
+                ):
+                    if i1 != i2:
+                        r1_param_names = self._get_names_from_handle_indices(i1)
+                        r2_param_names = self._get_names_from_handle_indices(i2)
+                        raise RuntimeError(
+                            f"{msg_prefix} rank {r1} is all-gathering parameters "
+                            f"for {r1_param_names} while rank {r2} is all-gathering "
+                            f"parameters for {r2_param_names}"
+                        )
         elif self._checking_order:
             # Only issue warnings on the first deviating iteration and stop
             # checking thereafter to avoid flooding the console

torch/distributed/fsdp/_init_utils.py

Lines changed: 2 additions & 2 deletions
@@ -78,10 +78,10 @@
     ShardingStrategy.HYBRID_SHARD: HandleShardingStrategy.HYBRID_SHARD,
     ShardingStrategy._HYBRID_SHARD_ZERO2: HandleShardingStrategy._HYBRID_SHARD_ZERO2,
 }
-HYBRID_SHARDING_STRATEGIES = {
+HYBRID_SHARDING_STRATEGIES = [
     ShardingStrategy.HYBRID_SHARD,
     ShardingStrategy._HYBRID_SHARD_ZERO2,
-}
+]
 NO_RESHARD_AFTER_FORWARD_STRATEGIES = (
     ShardingStrategy.SHARD_GRAD_OP,
     ShardingStrategy._HYBRID_SHARD_ZERO2,
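
HYBRID_SHARDING_STRATEGIES changes from a set literal to a list; membership checks such as "strategy in HYBRID_SHARDING_STRATEGIES" behave the same, and a constant list is presumably simpler for Dynamo to specialize on than a set of enum members. A quick standalone check of that equivalence, using a reduced stand-in enum rather than the real ShardingStrategy:

    from enum import Enum, auto


    class DemoStrategy(Enum):  # stand-in for torch.distributed.fsdp.ShardingStrategy
        FULL_SHARD = auto()
        HYBRID_SHARD = auto()
        _HYBRID_SHARD_ZERO2 = auto()


    AS_SET = {DemoStrategy.HYBRID_SHARD, DemoStrategy._HYBRID_SHARD_ZERO2}
    AS_LIST = [DemoStrategy.HYBRID_SHARD, DemoStrategy._HYBRID_SHARD_ZERO2]

    for strategy in DemoStrategy:
        # Only the container type changes; membership results are identical.
        assert (strategy in AS_SET) == (strategy in AS_LIST)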

torch/distributed/fsdp/_runtime_utils.py

Lines changed: 6 additions & 3 deletions
@@ -348,9 +348,12 @@ def _reshard(
     """
     handle.reshard(free_unsharded_flat_param)
     if state.limit_all_gathers and free_unsharded_flat_param:
-        free_event = state._device_handle.Event()
-        free_event.record()
-        state._free_event_queue.enqueue(free_event)
+        if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+            # We don't run an event queue for freeing under torch compile atm
+            # But maybe we need to? TODO(voz): Look into this
+            free_event = state._device_handle.Event()
+            free_event.record()
+            state._free_event_queue.enqueue(free_event)
     handle.post_reshard()
     # Since we prefetch entire handles keys at a time, conservatively mark
     # the entire key as no longer prefetched once we free at least one

torch/distributed/fsdp/flat_param.py

Lines changed: 2 additions & 1 deletion
@@ -27,7 +27,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.distributed._tensor import DTensor
 from torch.distributed.fsdp._common_utils import (
     _FSDPDeviceHandle,
     _named_parameters_with_duplicates,
@@ -1797,6 +1796,8 @@ def _use_unsharded_views(self, as_params: bool) -> None:
         flat_param = self.flat_param
         self._check_unsharded(flat_param)
         views = self._get_unflat_views()
+        from torch.distributed._tensor import DTensor
+
         for i, (view, (param_name, module, _)) in enumerate(
             zip(views, flat_param._param_infos)
         ):
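
The module-level DTensor import moves inside _use_unsharded_views, so it only runs when that code path actually executes; the function's behavior is unchanged. A tiny sketch of the deferred-import pattern, assuming a build where torch.distributed._tensor is importable:

    import torch


    def is_dtensor(x: torch.Tensor) -> bool:
        # Deferred import: DTensor is pulled in on first use instead of at module
        # import time, which sidesteps import-order and import-cost concerns.
        from torch.distributed._tensor import DTensor

        return isinstance(x, DTensor)


    print(is_dtensor(torch.ones(2)))  # False for a plain tensor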
