Commit ed26b4f

Update on "[comm_mode] adding some initial c10d ops to CommDebugMode"
looks like we can make it work :) cc mrshenli pritamdamania87 zhaojuanmao satgera gqchen aazzolini osalpekar jiayisuse H-Huang kwen2501 awgu penguinwu fegin XilunWu fduwjj wz337 tianyu-l wconstab yf225 chauhang d4l3k [ghstack-poisoned]
1 parent: 1f5b128 · commit: ed26b4f
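
For context, CommDebugMode is the DTensor debugging mode the tests below exercise: it counts every collective issued under it, keyed by the op overload. After this change it also records the non-functional c10d ops that FSDP issues, not just the torch.ops.c10d_functional collectives. A minimal sketch of the usage pattern — the process-group setup and the exact import path are assumptions, not part of this commit:

    import torch
    import torch.distributed as dist
    from torch.distributed._tensor.debug import CommDebugMode

    # assumes a default process group is already initialized (e.g. via torchrun)
    comm_mode = CommDebugMode()
    inp = torch.ones(4, device="cuda")
    out = torch.empty(4 * dist.get_world_size(), device="cuda")

    with comm_mode:
        # dispatches to torch.ops.c10d._allgather_base_, now visible to the mode
        dist.all_gather_into_tensor(out, inp)

    print(comm_mode.get_total_counts())                                  # expect 1
    print(comm_mode.get_comm_counts()[torch.ops.c10d._allgather_base_])  # expect 1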

2 files changed: 15 additions, 20 deletions
test/distributed/fsdp/test_fsdp_tp_integration.py (7 additions, 2 deletions)

@@ -201,7 +201,7 @@ def _get_grads_as_flattened(
         all_grads_as_flattened = torch.cat(
             [torch.empty_like(local_grads_as_flattened) for _ in range(fsdp_pg.size())]
         ).contiguous()
-        dist._all_gather_base(
+        dist.all_gather_into_tensor(
            all_grads_as_flattened, local_grads_as_flattened, group=fsdp_pg
         )
         if not uses_tp:

@@ -387,11 +387,16 @@ def forward(self, x):
         fsdp_2d_model(torch.rand(2, 10).cuda(self.rank)).sum().backward()

         funcol = torch.ops.c10d_functional
+        c10d_ops = torch.ops.c10d
         comm_counts = comm_mode.get_comm_counts()
-        self.assertEqual(comm_mode.get_total_counts(), 5)
+        self.assertEqual(comm_mode.get_total_counts(), 7)
+        # TP comms
         self.assertEqual(comm_counts[funcol.reduce_scatter_tensor], 2)
         self.assertEqual(comm_counts[funcol.all_gather_into_tensor], 2)
         self.assertEqual(comm_counts[funcol.all_reduce], 1)
+        # FSDP comms
+        self.assertEqual(comm_counts[c10d_ops._allgather_base_], 1)
+        self.assertEqual(comm_counts[c10d_ops._reduce_scatter_base_], 1)

         grads = [p.grad for p in fsdp_2d_model.parameters() if p.grad is not None]
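
Two things change in this file. First, the deprecated private collective dist._all_gather_base is replaced by its public successor dist.all_gather_into_tensor; both spellings reach the same underlying ProcessGroup collective (a sketch, assuming output_tensor, input_tensor, and fsdp_pg are defined as in the test):

    # equivalent calls; the underscore-prefixed one is deprecated
    dist._all_gather_base(output_tensor, input_tensor, group=fsdp_pg)
    dist.all_gather_into_tensor(output_tensor, input_tensor, group=fsdp_pg)

Second, the expected total rises from 5 to 7 because CommDebugMode now also records the two c10d collectives FSDP issues during this step (one _allgather_base_ and one _reduce_scatter_base_) on top of the five functional TP collectives.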

test/distributed/tensor/parallel/test_tp_style.py (8 additions, 18 deletions)

@@ -49,10 +49,8 @@ def test_colwise_parallel_style(self):
         model = nn.Linear(16, 16, device=self.device_type)

         default_col_parallel = ColwiseParallel()
+        colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (8, 16 // self.world_size))

@@ -65,10 +63,8 @@ def test_colwise_parallel_style(self):
         self.assertEqual(comm_mode.get_total_counts(), 1)

         sharded_col_parallel = ColwiseParallel(input_layouts=Shard(0))
+        colwise_mod = parallelize_module(deepcopy(model), mesh, sharded_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, sharded_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (8 * self.world_size, 16 // self.world_size))

@@ -94,10 +90,8 @@ def test_colwise_parallel_embedding(self):
         model = nn.Embedding(16, 16, device=self.device_type)

         default_col_parallel = ColwiseParallel()
+        colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel)
         with comm_mode:
-            colwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_col_parallel
-            )
             out = colwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (4, 2, 16 // self.world_size))

@@ -119,10 +113,8 @@ def test_rowwise_parallel_style(self):
         model = nn.Linear(16, 16, device=self.device_type)

         default_row_parallel = RowwiseParallel()
+        rowwise_mod = parallelize_module(deepcopy(model), mesh, default_row_parallel)
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, default_row_parallel
-            )
             out = rowwise_mod(tensor)
             # ensure output replicated
             self.assertEqual(out.shape, (8, 16))

@@ -135,10 +127,8 @@ def test_rowwise_parallel_style(self):
         self.assertEqual(comm_mode.get_total_counts(), 1)

         sharded_row_parallel = RowwiseParallel(output_layouts=Shard(0))
+        rowwise_mod = parallelize_module(deepcopy(model), mesh, sharded_row_parallel)
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, sharded_row_parallel
-            )
             out = rowwise_mod(tensor)
             # ensure output replicated
             self.assertEqual(out.shape, (8 // self.world_size, 16))

@@ -163,10 +153,10 @@ def test_rowwise_parallel_embedding(self):
         tensor = torch.arange(8, device=self.device_type).reshape(4, 2)
         model = nn.Embedding(16, 16, device=self.device_type)

+        rowwise_mod = parallelize_module(
+            deepcopy(model), mesh, RowwiseParallel(input_layouts=Replicate())
+        )
         with comm_mode:
-            rowwise_mod = parallelize_module(
-                deepcopy(model), mesh, RowwiseParallel(input_layouts=Replicate())
-            )
             out = rowwise_mod(tensor)
             # ensure output shard on the last dim
             self.assertEqual(out.shape, (4, 2, 16))
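
The repeated edit in this file is the same everywhere: parallelize_module is hoisted out of the comm_mode context. Plausibly this is because distributing a module's parameters can itself issue c10d collectives, and now that CommDebugMode counts c10d ops those would inflate the totals the tests assert on; scoping the context to the forward pass keeps the counts to just the TP communication under test. The resulting pattern, as a sketch:

    # setup outside the mode, measurement inside it
    colwise_mod = parallelize_module(deepcopy(model), mesh, ColwiseParallel())
    with comm_mode:
        out = colwise_mod(tensor)
    self.assertEqual(comm_mode.get_total_counts(), 1)  # only the forward's collective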
