Commit ff061ba

wanchaol authored and pytorchmergebot committed
[comm_mode] adding some initial c10d ops to CommDebugMode (#125475)
looks like we can make it work :)

Pull Request resolved: #125475
Approved by: https://github.com/awgu
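Concretely, CommDebugMode previously only counted functional collectives (torch.ops.c10d_functional / torch.ops._c10d_functional); with this change it also counts the four c10d ops registered in comm_mode.py below (allreduce_, _allgather_base_, _reduce_scatter_base_, broadcast_). A minimal usage sketch mirroring the new test, assuming an initialized NCCL process group and a CUDA device:

import torch
import torch.distributed as dist
from torch.distributed._tensor.debug.comm_mode import CommDebugMode

c10d_ops = torch.ops.c10d

# Assumes dist.init_process_group("nccl") has already been run on each rank.
inp = torch.rand(2, 8, 16).cuda()

comm_mode = CommDebugMode()
with comm_mode:
    dist.all_reduce(inp)    # non-functional c10d collective
    dist.broadcast(inp, 0)  # also counted after this change

comm_counts = comm_mode.get_comm_counts()
print(comm_counts[c10d_ops.allreduce_])  # expected: 1
print(comm_counts[c10d_ops.broadcast_])  # expected: 1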
1 parent d4727fd commit ff061ba

File tree

5 files changed: +48 −24 lines changed

test/distributed/_tensor/debug/test_comm_mode.py

Lines changed: 22 additions & 0 deletions
@@ -9,11 +9,13 @@
 
 from torch.distributed._tensor.debug.comm_mode import CommDebugMode
 from torch.distributed._tensor.placement_types import Shard
+from torch.testing._internal.common_distributed import requires_nccl
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule
 from torch.testing._internal.distributed.fake_pg import FakeStore
 
 c10d_functional = torch.ops.c10d_functional
+c10d_ops = torch.ops.c10d
 
 
 class TestCommMode(TestCase):
@@ -79,6 +81,26 @@ def f(x, y):
         self.assertEqual(comm_counts[c10d_functional.all_gather_into_tensor], 1)
         self.assertEqual(comm_counts[c10d_functional.reduce_scatter_tensor], 0)
 
+    @requires_nccl()
+    def test_comm_mode_with_c10d(self):
+        world_pg = self.world_pg
+
+        inp = torch.rand(2, 8, 16).cuda()
+        all_gather_out = inp.new_empty(self.world_size * 2, 8, 16)
+
+        comm_mode = CommDebugMode()
+        with comm_mode:
+            dist.all_reduce(inp)
+            dist.all_gather_into_tensor(all_gather_out, inp)
+            dist.reduce_scatter_tensor(inp, all_gather_out)
+            dist.broadcast(inp, 0)
+
+        comm_counts = comm_mode.get_comm_counts()
+        self.assertEqual(comm_counts[c10d_ops.allreduce_], 1)
+        self.assertEqual(comm_counts[c10d_ops._allgather_base_], 1)
+        self.assertEqual(comm_counts[c10d_ops._reduce_scatter_base_], 1)
+        self.assertEqual(comm_counts[c10d_ops.broadcast_], 1)
+
 
 if __name__ == "__main__":
     run_tests()

test/distributed/_tensor/test_utils.py

Lines changed: 1 addition & 3 deletions
@@ -144,9 +144,8 @@ def test_fsdp1_tp_2d_dtensor_local_shards_and_offsets(self):
             global_tensor, tp_mesh, placements=[Shard(0)]
         )
         dtensor_2d = DTensor.from_local(
-            dtensor_tp.to_local(), mesh_2d, [Replicate(), Shard(0)]
+            dtensor_tp.to_local(), mesh_2d, [Replicate(), Shard(0)], run_check=False
         ).redistribute(mesh_2d, [Shard(0), Shard(0)])
-        self.assertEqual(len(comm_mode.get_comm_counts()), 1)
         self.assertEqual(
             comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1
         )
@@ -196,7 +195,6 @@ def test_fsdp2_tp_2d_dtensor_local_shards_and_offsets(self):
             stride=global_tensor.stride(),
         )
 
-        self.assertEqual(len(comm_mode.get_comm_counts()), 0)
         self.assertEqual(
             comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 0
         )

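A likely reason for the run_check=False change (not stated in the commit message): DTensor.from_local with run_check=True can issue a collective to validate replicated placements, and such a collective now shows up in CommDebugMode's counts, so the test pins the exact op it cares about instead of asserting on the total number of distinct entries. A rough sketch of the updated assertion style, reusing the names from the diff above:

# Hypothetical sketch: assert on the specific collective of interest rather
# than on len() of the whole counter, which can grow as CommDebugMode learns
# to see more op namespaces.
with comm_mode:
    dtensor_2d = DTensor.from_local(
        dtensor_tp.to_local(), mesh_2d, [Replicate(), Shard(0)], run_check=False
    ).redistribute(mesh_2d, [Shard(0), Shard(0)])
self.assertEqual(
    comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1
)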
test/distributed/fsdp/test_fsdp_tp_integration.py

Lines changed: 7 additions & 2 deletions
@@ -201,7 +201,7 @@ def _get_grads_as_flattened(
         all_grads_as_flattened = torch.cat(
             [torch.empty_like(local_grads_as_flattened) for _ in range(fsdp_pg.size())]
         ).contiguous()
-        dist._all_gather_base(
+        dist.all_gather_into_tensor(
             all_grads_as_flattened, local_grads_as_flattened, group=fsdp_pg
         )
         if not uses_tp:
@@ -387,11 +387,16 @@ def forward(self, x):
         fsdp_2d_model(torch.rand(2, 10).cuda(self.rank)).sum().backward()
 
         funcol = torch.ops.c10d_functional
+        c10d_ops = torch.ops.c10d
         comm_counts = comm_mode.get_comm_counts()
-        self.assertEqual(comm_mode.get_total_counts(), 5)
+        self.assertEqual(comm_mode.get_total_counts(), 7)
+        # TP comms
         self.assertEqual(comm_counts[funcol.reduce_scatter_tensor], 2)
         self.assertEqual(comm_counts[funcol.all_gather_into_tensor], 2)
         self.assertEqual(comm_counts[funcol.all_reduce], 1)
+        # FSDP comms
+        self.assertEqual(comm_counts[c10d_ops._allgather_base_], 1)
+        self.assertEqual(comm_counts[c10d_ops._reduce_scatter_base_], 1)
 
         grads = [p.grad for p in fsdp_2d_model.parameters() if p.grad is not None]
 

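The expected total rises from 5 to 7 because the five TP functional collectives are now joined by the two c10d collectives that the diff attributes to FSDP (the "# FSDP comms" assertions: one _allgather_base_ and one _reduce_scatter_base_), which CommDebugMode previously could not see. A small sketch of summarizing the two namespaces separately, using the same handles as in the diff:

funcol = torch.ops.c10d_functional
c10d_ops = torch.ops.c10d

comm_counts = comm_mode.get_comm_counts()
tp_ops = {funcol.reduce_scatter_tensor, funcol.all_gather_into_tensor, funcol.all_reduce}
fsdp_ops = {c10d_ops._allgather_base_, c10d_ops._reduce_scatter_base_}

# Split the total by op namespace: the functional collectives come from TP,
# the in-place c10d ops come from FSDP in this 2D setup.
tp_total = sum(comm_counts[op] for op in tp_ops)      # expected: 5
fsdp_total = sum(comm_counts[op] for op in fsdp_ops)  # expected: 2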
test/distributed/tensor/parallel/test_tp_style.py

Lines changed: 8 additions & 18 deletions
@@ -49,10 +49,8 @@ def test_colwise_parallel_style(self):
         model = nn.Linear(16, 16, device=self.device_type)
 
         default_col_parallel = ColwiseParallel()
+        colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (8, 16 // self.world_size))
@@ -65,10 +63,8 @@ def test_colwise_parallel_style(self):
         self.assertEqual(comm_mode.get_total_counts(), 1)
 
         sharded_col_parallel = ColwiseParallel(input_layouts=Shard(0))
+        colwise_mod = parallelize_module(deepcopy(model), mesh, sharded_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, sharded_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (8 * self.world_size, 16 // self.world_size))
@@ -94,10 +90,8 @@ def test_colwise_parallel_embedding(self):
         model = nn.Embedding(16, 16, device=self.device_type)
 
         default_col_parallel = ColwiseParallel()
+        colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (4, 2, 16 // self.world_size))
@@ -119,10 +113,8 @@ def test_rowwise_parallel_style(self):
         model = nn.Linear(16, 16, device=self.device_type)
 
         default_row_parallel = RowwiseParallel()
+        rowwise_mod = parallelize_module(deepcopy(model), mesh, default_row_parallel)
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_row_parallel
-            )
             out = rowwise_mod(tensor)
             # ensure output replicated
             self.assertEqual(out.shape, (8, 16))
@@ -135,10 +127,8 @@ def test_rowwise_parallel_style(self):
         self.assertEqual(comm_mode.get_total_counts(), 1)
 
         sharded_row_parallel = RowwiseParallel(output_layouts=Shard(0))
+        rowwise_mod = parallelize_module(deepcopy(model), mesh, sharded_row_parallel)
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, sharded_row_parallel
-            )
             out = rowwise_mod(tensor)
             # ensure output replicated
             self.assertEqual(out.shape, (8 // self.world_size, 16))
@@ -163,10 +153,10 @@ def test_rowwise_parallel_embedding(self):
         tensor = torch.arange(8, device=self.device_type).reshape(4, 2)
         model = nn.Embedding(16, 16, device=self.device_type)
 
+        rowwise_mod = parallelize_module(
+            deepcopy(model), mesh, RowwiseParallel(input_layouts=Replicate())
+        )
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, RowwiseParallel(input_layouts=Replicate())
-            )
             out = rowwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (4, 2, 16))

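Hoisting parallelize_module out of the with comm_mode: block is presumably needed because parallelizing a module can itself run c10d collectives (for example broadcasting parameters across the mesh), which would now inflate counts that these tests pin to the forward pass alone. The resulting pattern, repeated across this file:

# Construct and parallelize the module first, outside the debug mode...
colwise_mod = parallelize_module(deepcopy(model), mesh, ColwiseParallel())
# ...then count only the collectives issued by the forward pass.
with comm_mode:
    out = colwise_mod(tensor)
self.assertEqual(comm_mode.get_total_counts(), 1)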
torch/distributed/_tensor/debug/comm_mode.py

Lines changed: 10 additions & 1 deletion
@@ -9,6 +9,7 @@
 funcol_native = torch.ops._c10d_functional
 funcol_py = torch.ops.c10d_functional
 funcol_autograd = torch.ops._c10d_functional_autograd
+c10d_ops = torch.ops.c10d
 
 NATIVE_TO_PY_MAPPING = {
     funcol_native.all_gather_into_tensor: funcol_py.all_gather_into_tensor,
@@ -22,6 +23,13 @@
     funcol_autograd.all_to_all_single: funcol_py.all_to_all_single,
 }
 
+c10d_collective_ops = {
+    c10d_ops.allreduce_,
+    c10d_ops._allgather_base_,
+    c10d_ops._reduce_scatter_base_,
+    c10d_ops.broadcast_,
+}
+
 
 class CommDebugMode(TorchDispatchMode):
     """
@@ -88,7 +96,8 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # the need to modify all tests to accommodate the two implementations,
         # we make CommDebugMode translate native funcol ops into legacy funcol
         # ops until the migration finishes.
-        if func_packet in self.comm_registry:
+
+        if func_packet in self.comm_registry or func_packet in c10d_collective_ops:
             if func_packet in NATIVE_TO_PY_MAPPING:
                 func_packet = NATIVE_TO_PY_MAPPING[func_packet]
             self.comm_counts[func_packet] += 1

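The counting hook lives in __torch_dispatch__: an op is counted if its OpOverloadPacket is either a registered functional collective or one of the four c10d ops in c10d_collective_ops. A simplified, self-contained sketch of that mechanism follows; it is an illustration only, not the real CommDebugMode, which additionally translates native funcol ops to their legacy py-funcol equivalents as the comment in the diff explains. The class name TinyCommCounter is hypothetical.

from collections import defaultdict

import torch
from torch.utils._python_dispatch import TorchDispatchMode

c10d_ops = torch.ops.c10d
c10d_collective_ops = {
    c10d_ops.allreduce_,
    c10d_ops._allgather_base_,
    c10d_ops._reduce_scatter_base_,
    c10d_ops.broadcast_,
}


class TinyCommCounter(TorchDispatchMode):
    """Counts the c10d collectives dispatched while the mode is active."""

    def __init__(self):
        super().__init__()
        self.comm_counts = defaultdict(int)

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # func is an OpOverload; the set above is keyed on its overload packet.
        if func.overloadpacket in c10d_collective_ops:
            self.comm_counts[func.overloadpacket] += 1
        return func(*args, **kwargs)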