Commit ee1d375

Andrew Gu authored and pytorchmergebot committed
[FSDP] Add fast path for NO_SHARD clip_grad_norm_() (#89137)
Pull Request resolved: #89137
Approved by: https://github.com/rohan-varma
1 parent e70f446 commit ee1d375
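
For context on the change, here is a minimal sketch of the idea behind the fast path: when every FSDP instance runs with `NO_SHARD`, each rank already holds full, unsharded gradients, so gradient clipping reduces to the ordinary local `torch.nn.utils.clip_grad_norm_()` call with no extra cross-rank norm combination. This sketch is illustrative only and not the patch itself; the free-standing function and its `handles` argument are hypothetical, while `uses_sharded_strategy` is the handle attribute the diff below actually checks.

import torch

def clip_grad_norm_fast_path_sketch(module, handles, max_norm, norm_type=2.0):
    """Illustrative sketch of the NO_SHARD fast path (not the FSDP source)."""
    # When no handle uses a sharded strategy, local gradients are already the
    # full gradients, so the standard utility computes the correct total norm.
    if all(not handle.uses_sharded_strategy for handle in handles):
        return torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm, norm_type)
    # Otherwise sharded and non-sharded gradients must be normed separately and
    # combined across ranks (the existing, slower path in the real method).
    raise NotImplementedError("sharded path omitted from this sketch")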

File tree

2 files changed: +41 −2 lines

test/distributed/fsdp/test_fsdp_clip_grad_norm.py

Lines changed: 29 additions & 0 deletions
@@ -209,6 +209,35 @@ def _test_ddp_parity(
             self.assertEqual(n1, n2)
             self.assertEqual(p1, p2)
 
+        if offload_params:
+            # TODO: Gradient computation on CPU and GPU differ slightly causing
+            # drift unrelated to `clip_grad_norm_()`.
+            # https://github.com/pytorch/pytorch/issues/89133
+            return
+
+        # Run a few more iterations
+        # TODO: We cannot run too many iterations, or else there is drift:
+        # https://github.com/pytorch/pytorch/issues/89136
+        for i in range(3):
+            set_to_none = i % 2 == 0  # exercise both
+            ddp_optim.zero_grad(set_to_none=set_to_none)
+            fsdp_optim.zero_grad(set_to_none=set_to_none)
+            inp = ddp_model.module.get_input(device)
+            for model in (ddp_model, fsdp_model):
+                out = model(*inp)
+                out.sum().backward()
+            ddp_total_norm = torch.nn.utils.clip_grad_norm_(
+                ddp_model.parameters(),
+                max_norm=max_norm,
+                norm_type=norm_type,
+            )
+            fsdp_total_norm = fsdp_model.clip_grad_norm_(
+                max_norm=max_norm, norm_type=norm_type
+            )
+            self.assertEqual(ddp_total_norm, fsdp_total_norm)
+            ddp_optim.step()
+            fsdp_optim.step()
+
 
 instantiate_parametrized_tests(TestClipGradNorm)
 
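As background for what the new iterations assert, here is a minimal single-process sketch (not part of the test; the toy `Linear` model is assumed for illustration): `torch.nn.utils.clip_grad_norm_()` returns the total gradient norm computed before clipping and rescales gradients in place, which is why identical DDP and FSDP replicas must report identical total norms and stay in lockstep after each optimizer step.

import torch

# Minimal single-process illustration of the clip_grad_norm_ contract that the
# DDP/FSDP parity loop above relies on (toy model, not from the test).
torch.manual_seed(0)
model = torch.nn.Linear(4, 4)
model(torch.randn(8, 4)).sum().backward()

# Returned value is the total norm *before* clipping; grads are scaled in place.
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2.0)
post_norm = torch.linalg.vector_norm(
    torch.stack([torch.linalg.vector_norm(p.grad, 2) for p in model.parameters()]), 2
)
print(f"pre-clip total norm: {total_norm:.4f}, post-clip norm: {post_norm:.4f}")
assert post_norm <= max(total_norm, 1.0) + 1e-6  # grads were scaled to respect max_norm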
torch/distributed/fsdp/fully_sharded_data_parallel.py

Lines changed: 12 additions & 2 deletions
@@ -1161,10 +1161,20 @@ def clip_grad_norm_(
             self._streams["unshard"],
             self._streams["pre_unshard"],
         )
+        # If every FSDP instance uses `NO_SHARD`, then we can directly use
+        # the normal `nn.utils` one targeting local gradients
+        all_no_shard = all(
+            not handle.uses_sharded_strategy
+            for handle in FullyShardedDataParallel._fsdp_handles(self)
+        )
+        if all_no_shard:
+            return torch.nn.utils.clip_grad_norm_(
+                self.parameters(), max_norm, norm_type
+            )
+        # Otherwise, there exists some FSDP instance using a sharded strategy,
+        # where sharded and non-sharded parameters must be handled separately
         max_norm = float(max_norm)
         norm_type = float(norm_type)
-        # Perform local gradient norm computation, where sharded and
-        # non-sharded parameters must be handled separately
         sharded_params = set()
         nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
         for handle in FullyShardedDataParallel._fsdp_handles(self):
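
The relocated comment hints at why the slower path remains: under a sharded strategy each rank holds only a shard of every gradient, so local norms are partial and must be combined across ranks before clipping. Below is a simplified sketch of that combination for finite p-norms; it is not FSDP's actual implementation, the function name is hypothetical, and the infinity norm (which needs a MAX reduction instead of SUM) is deliberately ignored.

import torch
import torch.distributed as dist

def total_norm_of_sharded_grads_sketch(local_grad_shards, norm_type=2.0):
    """Simplified sketch: combine per-rank partial norms into a global norm."""
    # Each rank norms only the gradient shards it owns ...
    local_norm = torch.linalg.vector_norm(
        torch.stack([torch.linalg.vector_norm(g, norm_type) for g in local_grad_shards]),
        norm_type,
    )
    # ... then the p-th powers are summed across ranks and the p-th root taken,
    # recovering the norm a single rank with the full gradients would compute.
    total = local_norm ** norm_type
    dist.all_reduce(total, op=dist.ReduceOp.SUM)
    return total ** (1.0 / norm_type)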
