
Commit a9c726c

[FSDP] Allow to use TorchDispatch with FSDP

ghstack-source-id: 6786e65
Pull Request resolved: #88014
1 parent ff94494 commit a9c726c
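
This commit routes FSDP's internal `Tensor.record_stream` calls through a small `no_dispatch()` helper so they bypass any active `TorchDispatchMode`. The sketch below is illustrative only; the toy module, mode, and setup are assumptions, not part of the PR. It shows the kind of usage the change is meant to support: running an FSDP-wrapped model under a dispatch mode without FSDP's stream-recording calls tripping it up.

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.utils._python_dispatch import TorchDispatchMode

class OpLoggingMode(TorchDispatchMode):
    """Toy mode that logs every dispatched op and then runs it normally."""
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        print(f"dispatched: {func}")
        return func(*args, **kwargs)

# Assumes torch.distributed.init_process_group() has already been called and
# a CUDA device has been selected for this rank.
model = FSDP(nn.Linear(8, 8).cuda())
with OpLoggingMode():
    loss = model(torch.randn(4, 8, device="cuda")).sum()
    loss.backward()  # record_stream calls inside FSDP now run under no_dispatch()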

File tree

3 files changed, 21 insertions(+), 9 deletions(-)

torch/distributed/fsdp/_utils.py
torch/distributed/fsdp/flat_param.py
torch/distributed/fsdp/fully_sharded_data_parallel.py


torch/distributed/fsdp/_utils.py

7 additions, 1 deletion

@@ -1,14 +1,15 @@
 import dataclasses
 import traceback
 from collections import OrderedDict
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union

 import torch
 from torch.nn.modules.batchnorm import _BatchNorm
 from torch.nn.parallel.scatter_gather import (  # type: ignore[attr-defined]
     _is_namedtuple,
 )
 from torch.nn.utils.rnn import PackedSequence
+from torch.utils._mode_utils import no_dispatch


 def _contains_batchnorm(module):
@@ -107,6 +108,11 @@ def _same_storage(x: torch.Tensor, y: torch.Tensor) -> bool:
     return x.storage().data_ptr() == y.storage().data_ptr()


+def _no_dispatch_record_stream(tensor: torch.Tensor, stream: torch.cuda.Stream) -> None:
+    with no_dispatch():
+        tensor.record_stream(cast(torch._C.Stream, stream))
+
+
 def p_assert(cond: Any, s: Any, raise_assertion_error: bool = True) -> None:
     """This is used as an alternate to ``assert`` when in the backward context
     to print the error message ``s`` since otherwise, it is swallowed."""
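
The new `_no_dispatch_record_stream` helper temporarily disables any active TorchDispatch mode before calling `record_stream`, so the call goes straight to the regular CUDA implementation rather than through `__torch_dispatch__`, where a `torch._C.Stream` argument is not something a mode can generally handle. A minimal usage sketch, assuming a CUDA device; the tensor here is just a stand-in for FSDP's internal buffers:

import torch
from torch.distributed.fsdp._utils import _no_dispatch_record_stream

t = torch.empty(1024, device="cuda")
# Same effect as t.record_stream(torch.cuda.current_stream()), except that any
# active TorchDispatchMode is bypassed while the stream is recorded.
_no_dispatch_record_stream(t, torch.cuda.current_stream())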

torch/distributed/fsdp/flat_param.py

8 additions, 5 deletions

@@ -5,7 +5,6 @@
 from itertools import accumulate, chain
 from typing import (
     Any,
-    cast,
     Dict,
     Generator,
     Iterator,
@@ -29,7 +28,13 @@
 )

 from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
-from ._utils import _alloc_storage, _free_storage, _same_storage, p_assert
+from ._utils import (
+    _alloc_storage,
+    _free_storage,
+    _no_dispatch_record_stream,
+    _same_storage,
+    p_assert,
+)

 __all__ = [
     "FlatParameter",
@@ -1121,9 +1126,7 @@ def _free_unsharded_flat_param(self):
         self._check_storage_allocated(unsharded_flat_param)
         self._check_on_compute_device(unsharded_flat_param)
         # Do not free the memory until all ops in the current stream finish
-        unsharded_flat_param.record_stream(
-            cast(torch._C.Stream, torch.cuda.current_stream())
-        )
+        _no_dispatch_record_stream(unsharded_flat_param, torch.cuda.current_stream())
         _free_storage(unsharded_flat_param)

     def _use_sharded_flat_param(self) -> None:
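
The call being replaced here follows the usual caching-allocator safety pattern: record the unsharded flat parameter on the current stream so its memory is not handed back out while kernels on that stream may still be reading it, and only then free the storage. A self-contained sketch of the hazard `record_stream` guards against; the side stream and sizes are illustrative assumptions:

import torch

side = torch.cuda.Stream()
t = torch.randn(1 << 20, device="cuda")
with torch.cuda.stream(side):
    side.wait_stream(torch.cuda.current_stream())  # order after t's producer
    out = t * 2        # kernel on `side` reads `t` asynchronously
t.record_stream(side)  # tell the allocator `t` is still in use on `side`
del t                  # safe: t's memory is not reused until `side` catches up
torch.cuda.synchronize()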

torch/distributed/fsdp/fully_sharded_data_parallel.py

6 additions, 3 deletions

@@ -90,6 +90,7 @@
     _apply_to_tensors,
     _contains_batchnorm,
     _free_storage,
+    _no_dispatch_record_stream,
     _override_batchnorm_mixed_precision,
     p_assert,
 )
@@ -2686,14 +2687,16 @@ def _post_backward_hook(
                         grad.detach(), non_blocking=True
                     )
                     # Don't let this memory get reused until after the transfer.
-                    grad.data.record_stream(torch.cuda.current_stream())
+                    _no_dispatch_record_stream(grad.data, torch.cuda.current_stream())

                 # After _post_backward_hook returns, orig_grad_data will eventually
                 # go out of scope, at which point it could otherwise be freed for
                 # further reuse by the main stream while the div/reduce_scatter/copy
                 # are underway in the post_backward stream. See:
                 # github.com/NVIDIA/apex/blob/master/apex/parallel/distributed.py
-                orig_grad_data.record_stream(self._streams["post_backward"])
+                _no_dispatch_record_stream(
+                    orig_grad_data, self._streams["post_backward"]
+                )

             if handle._use_orig_params:
                 # Since the handle's `FlatParameter` completed its gradient
@@ -2727,7 +2730,7 @@ def _cast_grad_to_param_dtype(
         grad.data = grad.data.to(dtype=param.dtype)
         # Do not let the low precision gradient memory get reused until
         # the cast to full parameter precision completes
-        low_prec_grad_data.record_stream(torch.cuda.current_stream())
+        _no_dispatch_record_stream(low_prec_grad_data, torch.cuda.current_stream())

     def _should_free_unsharded_flat_param(self, handle: FlatParamHandle):
         return (
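
In `_cast_grad_to_param_dtype`, the same helper now protects the mixed-precision path: the low-precision gradient buffer must stay reserved until its asynchronous cast to full parameter precision finishes on the current stream. A hedged sketch of that pattern in isolation; `grad` and `param` are placeholders, and the helper is the one added in torch/distributed/fsdp/_utils.py above:

import torch
from torch.distributed.fsdp._utils import _no_dispatch_record_stream

def cast_grad_to_param_dtype(grad: torch.Tensor, param: torch.nn.Parameter) -> None:
    low_prec_grad_data = grad.data
    grad.data = grad.data.to(dtype=param.dtype)  # async cast on the current stream
    # Keep the low-precision buffer from being reused until the cast completes.
    _no_dispatch_record_stream(low_prec_grad_data, torch.cuda.current_stream())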
