
Commit ab9f9de

Author: Andrew Gu

Update on "[FSDP] Add fast path for NO_SHARD clip_grad_norm_()"

[ghstack-poisoned]

1 parent f83b576

2 files changed: +6 -4 lines

test/distributed/fsdp/test_fsdp_clip_grad_norm.py

Lines changed: 4 additions & 3 deletions
@@ -218,9 +218,10 @@ def _test_ddp_parity(
         # Run a few more iterations
         # TODO: We cannot run too many iterations, or else there is drift:
         # https://github.com/pytorch/pytorch/issues/89136
-        for _ in range(3):
-            ddp_optim.zero_grad(set_to_none=True)
-            fsdp_optim.zero_grad(set_to_none=True)
+        for i in range(3):
+            set_to_none = i % 2 == 0  # exercise both
+            ddp_optim.zero_grad(set_to_none=set_to_none)
+            fsdp_optim.zero_grad(set_to_none=set_to_none)
         inp = ddp_model.module.get_input(device)
         for model in (ddp_model, fsdp_model):
             out = model(*inp)
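
The test now alternates `set_to_none` so that both `zero_grad()` modes are exercised. As a quick standalone illustration (not part of the diff) of why the two modes differ, assuming a throwaway `torch.nn.Linear` model:

import torch

# set_to_none=True frees the gradient tensors; set_to_none=False zeroes them in place.
lin = torch.nn.Linear(2, 2)
optim = torch.optim.SGD(lin.parameters(), lr=0.1)

lin(torch.randn(4, 2)).sum().backward()
optim.zero_grad(set_to_none=True)
assert all(p.grad is None for p in lin.parameters())  # grads are gone

lin(torch.randn(4, 2)).sum().backward()
optim.zero_grad(set_to_none=False)
assert all(p.grad is not None and torch.all(p.grad == 0) for p in lin.parameters())  # zero-filled grads kept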

torch/distributed/fsdp/fully_sharded_data_parallel.py

Lines changed: 2 additions & 1 deletion
@@ -1161,7 +1161,8 @@ def clip_grad_norm_(
            self._streams["unshard"],
            self._streams["pre_unshard"],
        )
-        # Check for an early return if every FSDP instance uses `NO_SHARD`
+        # If every FSDP instance uses `NO_SHARD`, then we can directly use
+        # the normal `nn.utils` one targeting local gradients
        all_no_shard = all(
            not handle.uses_sharded_strategy
            for handle in FullyShardedDataParallel._fsdp_handles(self)
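
The new comment describes the fast path this stack adds: when every FSDP instance uses `NO_SHARD`, each rank already holds full, unsharded local gradients, so clipping can delegate to the standard `torch.nn.utils.clip_grad_norm_`. A minimal sketch of that idea (not the actual FSDP implementation), reusing `uses_sharded_strategy` from the diff and assuming hypothetical `module`, `handles`, and `max_norm` inputs:

import torch
import torch.nn as nn

def clip_grad_norm_sketch(module: nn.Module, handles, max_norm: float) -> torch.Tensor:
    # Fast path: with NO_SHARD everywhere, gradients are already unsharded,
    # so the plain local clipping utility is correct on each rank.
    if all(not handle.uses_sharded_strategy for handle in handles):
        return torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm)
    # Sharded strategies require computing a global gradient norm across ranks,
    # which is not reproduced in this sketch.
    raise NotImplementedError("sharded-gradient path omitted")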
