Commit f70bd71
Andrew Gu authored and pytorchmergebot committed
[FSDP2] Computed grad divide factors at runtime (#125484)
**Context**

We are interested in supporting the case where HSDP reduce-scatters but does not all-reduce in a microbatch backward. This saves communication while still saving memory. Only on the last microbatch do we need to both reduce-scatter and all-reduce. This is not implemented yet and will hopefully come in a future PR.

There is one notable part of doing this. On the last microbatch, we need to perform an accumulation step after the reduce-scatter and before the all-reduce. If we do not, then the preceding microbatch's gradients will not be contributed across the replica group. (In other words, we cannot simply accumulate _after_ the all-reduce.) Consider 32 GPUs with 4-way replication, 8-way sharding, and 2 microbatches, and focus on global rank 0.
- After the first microbatch, rank 0 will have its shard of $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)}$, where we define $S(0) = \{0, 1, \dots, 7\}$ to be the ranks in its shard group and we use the $(1)$ superscript to denote the first microbatch.
- Upon the second microbatch, rank 0 after its reduce-scatter will additionally have its shard of $\frac{1}{8} \sum_{i \in S(0)} g_i^{(2)}$. If we only all-reduce this, then the second microbatch's gradients become $\frac{1}{32} \sum_{i=0}^{31} g_i^{(2)}$, so in total, rank 0 has $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)} + \frac{1}{32} \sum_{i=0}^{31} g_i^{(2)}$, which is wrong.
- Importantly, we must accumulate $\frac{1}{8} \sum_{i \in S(0)} g_i^{(1)} + \frac{1}{8} \sum_{i \in S(0)} g_i^{(2)} = \frac{1}{8} \sum_{i \in S(0)} (g_i^{(1)} + g_i^{(2)})$ first before all-reducing to get $\frac{1}{32} \sum_{i=0}^{31} (g_i^{(1)} + g_i^{(2)})$.

Note how under this approach, we want a factor of $\frac{1}{8}$ only (i.e. the reciprocal of the shard group size), not $\frac{1}{32}$, for the first microbatch's gradients.
- For bf16/fp32, since we use `ReduceOp.AVG` and we only reduce-scatter on the first microbatch, we correctly get a factor of $\frac{1}{8}$ on the first microbatch.
- For fp16, since we precompute the gradient divide factors at init time assuming we always reduce over both the shard and replica groups, we incorrectly get a factor of $\frac{1}{32}$ on the first microbatch, deviating from the bf16/fp32 case.

We can address this issue and match the bf16/fp32 and fp16 semantics by computing the divide factors at runtime based on which process groups were passed into the reduction function (`foreach_reduce`). A small local simulation of the accumulation-order argument is sketched below.

**Additional Notes**

How to implement the HSDP reduce-scatter without all-reduce is not entirely clear yet. (What is the cleanest way to do this?) We need to store the partial reduce-scatter output and check for it upon the next backward. We should also be sure to error if the set of parameters receiving gradients changes, in which case we cannot support this easily. In any case, we will implement this in a follow-up.

Pull Request resolved: #125484
Approved by: https://github.com/wanchaol
ghstack dependencies: #125431, #125479
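As a sanity check of the accumulation-order argument, here is a minimal local simulation (not part of this PR). It models each rank's gradient as a scalar on the 4×8 mesh with two microbatches from the example, and checks that accumulating the shard-group partials before the replica all-reduce reproduces the global average while accumulating after does not:

```python
import torch

torch.manual_seed(0)
num_replicas, shard_size = 4, 8            # 4-way replication x 8-way sharding = 32 ranks
world_size = num_replicas * shard_size
g = torch.randn(2, world_size)             # g[m, i] = microbatch-m gradient on rank i (scalar model)

# Target: every rank ends with (1/32) * sum_i (g_i^(1) + g_i^(2))
expected = g.sum() / world_size

def shard_group_avg(m: int, r: int) -> torch.Tensor:
    # (1/8) * sum over shard group r = {8r, ..., 8r+7} for microbatch m (the reduce-scatter result)
    return g[m, r * shard_size : (r + 1) * shard_size].sum() / shard_size

# Correct order: accumulate the two reduce-scatter partials, then all-reduce (average) over replicas
correct = torch.stack(
    [shard_group_avg(0, r) + shard_group_avg(1, r) for r in range(num_replicas)]
).mean()

# Wrong order: all-reduce only the second microbatch, then add the first microbatch's partial
wrong = shard_group_avg(0, 0) + torch.stack(
    [shard_group_avg(1, r) for r in range(num_replicas)]
).mean()

assert torch.allclose(correct, expected)
assert not torch.allclose(wrong, expected)
print(f"expected={expected.item():+.4f} correct={correct.item():+.4f} wrong={wrong.item():+.4f}")
```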
1 parent dba689b commit f70bd71

File tree

- test/distributed/_composable/fsdp/test_fully_shard_comm.py
- torch/distributed/_composable/fsdp/_fsdp_collectives.py
- torch/distributed/_composable/fsdp/_fsdp_param_group.py

3 files changed: +59 -64 lines changed

test/distributed/_composable/fsdp/test_fully_shard_comm.py

Lines changed: 24 additions & 3 deletions

@@ -18,6 +18,8 @@
     OffloadPolicy,
 )
 from torch.distributed._composable.fsdp._fsdp_collectives import (
+    _div_if_needed,
+    _get_gradient_divide_factors,
     foreach_all_gather,
     foreach_all_gather_copy_out,
     foreach_reduce,
@@ -207,6 +209,18 @@ def test_reduce_scatter_fp32(self):
                 reduce_scatter_dtype=torch.float32,
             )

+    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    def test_reduce_scatter_fp16(self):
+        param_sizes = self._get_param_sizes()
+        default_stream = torch.cuda.current_stream()
+        stream = torch.cuda.Stream()
+        for reduce_scatter_stream in (default_stream, stream):
+            self._test_reduce_scatter(
+                param_sizes,
+                reduce_scatter_stream=reduce_scatter_stream,
+                reduce_scatter_dtype=torch.float16,
+            )
+
     def _test_reduce_scatter(
         self,
         param_sizes: List[torch.Size],
@@ -238,17 +252,24 @@ def _test_reduce_scatter(
             orig_dtype=orig_params[0].dtype,
             reduce_dtype=reduce_scatter_dtype,
             device=self.device,
-            divide_factors=fsdp_param_group._grad_divide_factors,
             all_reduce_group=None,
             all_reduce_stream=all_reduce_stream,
         )
         torch.cuda.current_stream().wait_event(view_out_event)

         # Check reduce-scatter correctness
+        predivide_factor, postdivide_factor = _get_gradient_divide_factors(
+            group, None, reduce_scatter_dtype
+        )
         reduced_grads = [grad.detach().clone() for grad in unsharded_grads]
         for grad in reduced_grads:
-            dist.all_reduce(grad, group=group)
-            grad /= self.world_size
+            _div_if_needed(grad, predivide_factor)
+            dist.all_reduce(
+                grad,
+                group=group,
+                op=dist.ReduceOp.AVG if predivide_factor is None else dist.ReduceOp.SUM,
+            )
+            _div_if_needed(grad, postdivide_factor)
         for fsdp_param, reduced_grad in zip(fsdp_params, reduced_grads):
             sharded_grad = fsdp_param.sharded_param.grad
             self.assertIsInstance(sharded_grad, DTensor)
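As a side note, a minimal single-process sketch (not from this PR; the `_div_if_needed` below is a stand-in for the imported helper, not necessarily its exact implementation) of the pre-divide / SUM-reduce / post-divide pattern the updated reference check uses, confirming it reproduces a plain average whenever the two factors multiply to the world size:

```python
import torch
from typing import Optional

def _div_if_needed(tensor: torch.Tensor, div_factor: Optional[float]) -> None:
    # Stand-in for the imported helper: divide in place only when a factor is given
    if div_factor is not None and div_factor > 1:
        tensor.div_(div_factor)

world_size = 32
predivide_factor, postdivide_factor = 8.0, 4.0     # fp16 factors for 32 ranks; their product is the world size
grads = [torch.randn(16) for _ in range(world_size)]
reference_mean = torch.stack(grads).mean(dim=0)    # what ReduceOp.AVG would produce

pre = [g / predivide_factor for g in grads]        # per-rank pre-divide
reduced = torch.stack(pre).sum(dim=0)              # stand-in for the ReduceOp.SUM all-reduce
_div_if_needed(reduced, postdivide_factor)         # post-divide after the reduction

assert torch.allclose(reduced, reference_mean, atol=1e-6)
```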

torch/distributed/_composable/fsdp/_fsdp_collectives.py

Lines changed: 34 additions & 32 deletions

@@ -125,7 +125,6 @@ def foreach_reduce(
     orig_dtype: torch.dtype,
     reduce_dtype: Optional[torch.dtype],
     device: torch.device,
-    divide_factors: Union[Tuple[None, None], Tuple[float, float]],
     all_reduce_group: Optional[dist.ProcessGroup],
     all_reduce_stream: torch.cuda.Stream,
 ) -> torch.cuda.Event:
@@ -142,7 +141,9 @@ def foreach_reduce(
     )
     grad_dtype = unsharded_grads[0].dtype
     reduce_dtype = reduce_dtype or grad_dtype
-    predivide_factor, postdivide_factor = divide_factors
+    predivide_factor, postdivide_factor = _get_gradient_divide_factors(
+        reduce_scatter_group, all_reduce_group, reduce_dtype
+    )
     world_size = reduce_scatter_group.size()
     padded_unsharded_sizes = tuple(
         _get_dim0_padded_size(grad.size(), world_size) for grad in unsharded_grads
@@ -166,18 +167,22 @@ def foreach_reduce(
             (reduce_scatter_output_numel,)
         )
         _div_if_needed(reduce_scatter_input, predivide_factor)
-        _reduce_scatter(
-            post_reduce_output,
-            reduce_scatter_input,
-            reduce_scatter_group,
-            divide_factors,
+        dist.reduce_scatter_tensor(
+            output=post_reduce_output,
+            input=reduce_scatter_input,
+            group=reduce_scatter_group,
+            op=ReduceOp.AVG if predivide_factor is None else ReduceOp.SUM,
         )
         view_out_stream = reduce_scatter_stream
         if all_reduce_group is not None:
             view_out_stream = all_reduce_stream
             all_reduce_stream.wait_stream(reduce_scatter_stream)
             with torch.cuda.stream(all_reduce_stream):
-                _all_reduce(post_reduce_output, all_reduce_group, divide_factors)
+                dist.all_reduce(
+                    post_reduce_output,
+                    group=all_reduce_group,
+                    op=ReduceOp.AVG if predivide_factor is None else ReduceOp.SUM,
+                )
         with torch.cuda.stream(view_out_stream):
             _div_if_needed(post_reduce_output, postdivide_factor)
             post_reduce_output = _to_dtype_if_needed(post_reduce_output, orig_dtype)
@@ -257,30 +262,27 @@ def _get_all_gather_input_metadatas(
     )


-def _reduce_scatter(
-    output: torch.Tensor,
-    input: torch.Tensor,
-    group: dist.ProcessGroup,
-    divide_factors: Union[Tuple[None, None], Tuple[float, float]],
-) -> None:
-    if divide_factors[0]:
-        dist.reduce_scatter_tensor(output, input, group=group)
-    else:
-        # Using NCCL's reduce-scatter to do the division by world size saves
-        # extra memory read/write from a separate division kernel
-        dist.reduce_scatter_tensor(output, input, op=ReduceOp.AVG, group=group)
-
-
-def _all_reduce(
-    tensor: torch.Tensor,
-    group: dist.ProcessGroup,
-    divide_factors: Union[Tuple[None, None], Tuple[float, float]],
-) -> None:
-    if divide_factors[0]:
-        dist.all_reduce(tensor, group=group)
-    else:
-        # saves extra memory read/write from a separate division kernel
-        dist.all_reduce(tensor, op=ReduceOp.AVG, group=group)
+def _get_gradient_divide_factors(
+    reduce_scatter_group: dist.ProcessGroup,
+    all_reduce_group: Optional[dist.ProcessGroup],
+    reduce_dtype: torch.dtype,
+) -> Union[Tuple[None, None], Tuple[float, float]]:
+    # For fp32/bf16, we do not need to worry about overflow/underflow, so we
+    # use NCCL's built-in division to avoid separate div kernels
+    if reduce_dtype in (torch.float32, torch.bfloat16):
+        return None, None
+    data_parallel_size = reduce_scatter_group.size()
+    if all_reduce_group is not None:
+        data_parallel_size *= all_reduce_group.size()
+    # Since fp16 has smaller dynamic range than fp32/bf16, we want to avoid
+    # overflow/underflow. For N data parallel workers, each worker computes
+    # g_i, and they collectively reduce (g_1 + ... + g_N) / N. To avoid
+    # overflow/underflow, we divide by ~sqrt(N) before/after the reduction.
+    factor: int = 1
+    while data_parallel_size % factor == 0 and data_parallel_size / factor > factor:
+        factor *= 2
+    factor = float(factor)
+    return (factor, data_parallel_size / factor)


 def _div_if_needed(tensor: torch.Tensor, div_factor: Optional[float]) -> None:
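For reference, a standalone mirror of the new factor computation (plain integers in place of process groups so it runs without `torch.distributed`; illustrative only, and `gradient_divide_factors` is a hypothetical name, not the library function):

```python
from typing import Optional, Tuple, Union

def gradient_divide_factors(
    shard_world_size: int,
    replicate_world_size: Optional[int],
    reduce_dtype: str,
) -> Union[Tuple[None, None], Tuple[float, float]]:
    # fp32/bf16: return (None, None) so the caller relies on NCCL's ReduceOp.AVG
    if reduce_dtype in ("float32", "bfloat16"):
        return None, None
    data_parallel_size = shard_world_size
    if replicate_world_size is not None:
        data_parallel_size *= replicate_world_size
    # fp16: split the divide-by-N into ~sqrt(N) before and after the reduction
    factor = 1
    while data_parallel_size % factor == 0 and data_parallel_size / factor > factor:
        factor *= 2
    return float(factor), data_parallel_size / factor

print(gradient_divide_factors(8, 4, "float16"))     # (8.0, 4.0): 8 * 4 == 32
print(gradient_divide_factors(8, None, "float16"))  # (4.0, 2.0): 4 * 2 == 8
print(gradient_divide_factors(8, 4, "bfloat16"))    # (None, None): ReduceOp.AVG handles the division
```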

torch/distributed/_composable/fsdp/_fsdp_param_group.py

Lines changed: 1 addition & 29 deletions

@@ -1,6 +1,6 @@
 import contextlib

-from typing import Any, cast, Dict, List, NamedTuple, Optional, Set, Tuple, Union
+from typing import Any, cast, Dict, List, NamedTuple, Optional, Set, Tuple

 import torch
 import torch.distributed as dist
@@ -164,32 +164,6 @@ def _init_mp_dtypes(self) -> None:
         )
         self._reduce_dtype = next(iter(reduce_dtypes))

-    def _init_grad_divide_factors(self):
-        data_parallel_world_size = 1
-        data_parallel_world_size *= self.mesh_info.shard_mesh_size
-        if self._is_hsdp:
-            data_parallel_world_size *= self.mesh_info.replicate_mesh_size
-        if self._reduce_dtype in (torch.float32, torch.bfloat16):
-            # Use NCCL's AVG op to divide after reduction since it is more
-            # performant and fp32 has sufficient precision
-            self._grad_divide_factors: Union[Tuple[None, None], Tuple[float, float]] = (
-                None,
-                None,
-            )
-            return
-        # Since fp16 has smaller dynamic range than fp32/bf16, we want to avoid
-        # overflow/underflow. For N data parallel workers, each worker computes
-        # g_i, and they collectively reduce (g_1 + ... + g_N) / N. To avoid
-        # overflow/underflow, we divide by ~sqrt(N) before/after the reduction.
-        factor: int = 1
-        while (
-            data_parallel_world_size % factor == 0
-            and data_parallel_world_size / factor > factor
-        ):
-            factor *= 2
-        factor = float(factor)
-        self._grad_divide_factors = (factor, data_parallel_world_size / factor)
-
     def lazy_init(self):
         # Lazy init should be idempotent
         param_names_on_meta = [
@@ -207,7 +181,6 @@ def lazy_init(self):
         # Initialize mixed precision attributes lazily in case the user changes
         # the parameter dtypes after construction time but before forward
         self._init_mp_dtypes()
-        self._init_grad_divide_factors()
         self._register_state_dict_hooks()

     # Runtime #
@@ -346,7 +319,6 @@ def post_backward(self, *unused: Any):
             self._orig_dtype,
             self._reduce_dtype,
             self.device,
-            self._grad_divide_factors,
             self._all_reduce_process_group
             if self._is_hsdp and self.all_reduce_grads
             else None,
