     DynamoDistributedMultiProcTestCase,
     _dynamo_dist_per_rank_init,
     requires_nccl,
+    run_with_legacy_funcol,
+    run_with_both_funcol_impls,
+    run_with_both_funcol_impls_with_arg,
     skip_if_lt_x_gpu,
 )
-from torch.testing._internal.common_utils import requires_cuda
+from torch.testing._internal.common_utils import instantiate_parametrized_tests, requires_cuda
 from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
 from torch.utils._triton import has_triton
 from torch._inductor.utils import run_and_get_triton_code
@@ -53,6 +56,7 @@ def world_size(self) -> int:
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_broadcast_inductor(self):
         """
         Testing if broadcast works correctly when using inductor
@@ -89,6 +93,7 @@ def compile(func, example_inputs):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_allreduce_inductor(self):
         """
         This matmul/cat/allreduce is a pattern we aim to optimize.
@@ -131,6 +136,7 @@ def test_c10d_functional_tagged_pt2_compliant(self):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_eager_allreduce_inductor_wait(self):

         def eager_func(a, b, c, d, *, tag, ranks, group_size):
@@ -170,6 +176,7 @@ def compile(func, example_inputs):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_inductor_allreduce_eager_wait(self):

         def inductor_func(a, b, c, d, *, tag, ranks, group_size):
@@ -208,6 +215,7 @@ def compile(func, example_inputs):
     @patch.object(torch._inductor.config, "allow_buffer_reuse", True)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_allreduce_input_buffer_reuse(self):
         def func(a, *, tag, ranks, group_size):
             ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
@@ -227,6 +235,7 @@ def func(a, *, tag, ranks, group_size):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_permute_tensor(self):
         def func(tensor, src_dst_pairs, *, tag, ranks, group_size):
             return _functional_collectives.permute_tensor(tensor, src_dst_pairs, ranks, tag)
@@ -256,6 +265,7 @@ def func(tensor, src_dst_pairs, *, tag, ranks, group_size):
     @patch.object(torch._inductor.config, "allow_buffer_reuse", True)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_allgather_output_buffer_reuse(self):
         class Model(torch.nn.Module):
             def __init__(self, *args, **kwargs) -> None:
@@ -281,6 +291,7 @@ def forward(self, x, world_size, tag, ranks, group_size):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_allgather_contiguous_input(self):
         class Model(torch.nn.Module):
             def __init__(self, *args, **kwargs) -> None:
@@ -307,6 +318,7 @@ def forward(self, x, world_size, tag, ranks, group_size):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_allgather_into_tensor_inductor(self):
         """
         This matmul/cat/allreduce is a pattern we aim to optimize.
@@ -339,6 +351,7 @@ def compile(func, example_inputs):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_reduce_scatter_tensor_inductor(self):
         def example(a, b, *, tag, ranks, group_size):
             c = torch.matmul(a, b)
@@ -369,6 +382,7 @@ def compile(func, example_inputs):
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_all_to_all_single_inductor(self):
         def example(inp, input_split_sizes_tensor, output_split_sizes_tensor, *, tag, ranks, group_size):
             input_split_sizes = _tolist_with_constrain_as_size(input_split_sizes_tensor)
@@ -454,6 +468,7 @@ def example(inp, input_split_sizes_tensor, *, tag, ranks, group_size):
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_all_to_all_single_inductor_input_split_sizes_none(self):
         def example(inp, output_split_sizes_tensor, *, tag, ranks, group_size):
             output_split_sizes = _tolist_with_constrain_as_size(output_split_sizes_tensor)
@@ -495,6 +510,7 @@ def example(inp, output_split_sizes_tensor, *, tag, ranks, group_size):
     @skip_if_lt_x_gpu(2)
     # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor
     @patch.object(torch._inductor.config, "compile_threads", 1)
+    @run_with_legacy_funcol
     def test_all_to_all_single_inductor_split_sizes_none(self):
         def example(inp, *, tag, ranks, group_size):
             a2a = torch.ops.c10d_functional.all_to_all_single(
@@ -524,6 +540,7 @@ def example(inp, *, tag, ranks, group_size):
             self.assertTrue(same(eager_out, inductor_out, tol=0.001))


+@instantiate_parametrized_tests
 @requires_nccl()
 @requires_cuda
 class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
@@ -539,6 +556,7 @@ def get_world_trs(self, world_size=1):

     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch(debug=True)
+    @run_with_legacy_funcol  # impl specific
     def test_inductor_single_op(self):

         def func(inp, *, tag, ranks, group_size):
@@ -567,6 +585,7 @@ def func(inp, *, tag, ranks, group_size):

     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch(debug=True)
+    @run_with_legacy_funcol  # impl specific
     def test_inductor_steal_buffer(self):
         """
         it's ok and optimal if inductor allreduce mutates the buffer of an intermediate
@@ -604,6 +623,7 @@ def func(inp, *, tag, ranks, group_size):

     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False})
+    @run_with_legacy_funcol  # impl specific
     def test_inductor_doesnt_mutate_shared(self):
         """
         make sure that an intermediate that's going to be reused isn't mutated unless copied
@@ -641,40 +661,49 @@ def func(inp, *, tag, ranks, group_size):
         correct = func(inputs, **self.get_world_trs())
         self.assertTrue(same(out, correct))

-    def test_dynamo_trace_allreduce(self):
+    @run_with_both_funcol_impls_with_arg
+    def test_dynamo_trace_allreduce(self, use_native_funcol):

-        def func(inp, *, tag, ranks, group_size):
-            ar = _functional_collectives.all_reduce(inp, "sum", ranks, tag)
+        def func(inp):
+            if use_native_funcol:
+                ar = _functional_collectives.all_reduce(inp, "sum", "0")
+            else:
+                ar = _functional_collectives.all_reduce(inp, "sum", [0], "")
             return ar

         inputs = torch.ones(4, 4, device="cuda")
         counter = CompileCounter()
         compiled = torch.compile(func, backend=counter)
-        out = compiled(inputs, **self.get_world_trs())
-        correct = func(inputs, **self.get_world_trs())
+        out = compiled(inputs)
+        correct = func(inputs)
         self.assertEqual(counter.frame_count, 1)

         # should test more precisely, but the 2 is supposed to be (all_reduce, wait)
         self.assertEqual(counter.op_count, 2)
         self.assertTrue(same(out, correct))

-    def test_dynamo_trace_all_gather_tensor(self):
+    @run_with_both_funcol_impls_with_arg
+    def test_dynamo_trace_all_gather_tensor(self, use_native_funcol):

-        def func(inp, *, tag, ranks, group_size):
-            ar = _functional_collectives.all_gather_tensor(inp, 0, ranks, tag)
+        def func(inp):
+            if use_native_funcol:
+                ar = _functional_collectives.all_gather_tensor(inp, 0, "0")
+            else:
+                ar = _functional_collectives.all_gather_tensor(inp, 0, [0], "")
             return ar

         inputs = torch.ones(4, 4, device="cuda")
         counter = CompileCounter()
         compiled = torch.compile(func, backend=counter)
-        out = compiled(inputs, **self.get_world_trs())
-        correct = func(inputs, **self.get_world_trs())
+        out = compiled(inputs)
+        correct = func(inputs)
         self.assertEqual(counter.frame_count, 1)

         # should test more precisely, but the 2 is supposed to be (all_gather, wait)
         self.assertEqual(counter.op_count, 2)
         self.assertTrue(same(out, correct))

+    @run_with_both_funcol_impls
     def test_dynamo_trace_all_gather_tensor_pg(self):

         def func(inp, *, pg):
@@ -692,6 +721,7 @@ def func(inp, *, pg):
         self.assertEqual(counter.op_count, 2)
         self.assertTrue(same(out, correct))

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_all_gather(self):

         def func(inp, out, *, pg):
@@ -717,6 +747,7 @@ def func(inp, out, *, pg):
         assert counter.op_count == 3
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_all_gather_list(self):

         def func(inp, out, *, pg):
@@ -739,6 +770,7 @@ def func(inp, out, *, pg):
         assert counter.frame_count == 1
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_all_gather_args_match(self):
         # Duplicated most of the structure from test_dynamo_rewrite_dist_all_gather
         # except uses kwargs to ensure rewrite has matching arg names
@@ -766,6 +798,7 @@ def func(inp, out, *, pg):
         assert counter.op_count == 3
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_reduce_scatter(self):

         def func(inp, out, *, pg):
@@ -791,6 +824,7 @@ def func(inp, out, *, pg):
         assert counter.op_count == 3
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_allreduce(self):

         def func(tensor, pg):
@@ -813,6 +847,7 @@ def func(tensor, pg):
         assert counter.op_count == 3
         assert same(inputs_compiled, inputs_eager)

+    @run_with_both_funcol_impls
     def test_dynamo_rewrite_dist_all_to_all_single(self):

         def func(output, input, pg):
@@ -836,6 +871,7 @@ def func(output, input, pg):
         assert counter.frame_count == 1
         assert same(output_compiled, output_eager)

+    @run_with_both_funcol_impls
     def test_dynamo_support_collective_op_with_async_op_False(self):

         def func(inp, out, *, pg):
@@ -862,6 +898,7 @@ def func(inp, out, *, pg):
         assert counter.op_count == 3
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_graphbreaks_unsupported_async_op(self):

         def func(inp, out, *, pg):
@@ -887,6 +924,7 @@ def func(inp, out, *, pg):
         assert counter.op_count == 0
         assert same(outputs, correct_outputs)

+    @run_with_both_funcol_impls
     def test_dynamo_pg_var(self):
         def func(inp, *, pg):
             x = pg.rank() + 1 % pg.size()
@@ -903,23 +941,28 @@ def func(inp, *, pg):
         assert counter.op_count == 1
         assert same(outputs, correct_outputs)

-    def test_dynamo_trace_reduce_scatter_tensor(self):
+    @run_with_both_funcol_impls_with_arg
+    def test_dynamo_trace_reduce_scatter_tensor(self, use_native_funcol):

-        def func(inp, *, tag, ranks, group_size):
-            ar = _functional_collectives.reduce_scatter_tensor(inp, "sum", 0, ranks, tag)
+        def func(inp):
+            if use_native_funcol:
+                ar = _functional_collectives.reduce_scatter_tensor(inp, "sum", 0, "0")
+            else:
+                ar = _functional_collectives.reduce_scatter_tensor(inp, "sum", 0, [0], "")
             return ar

         inputs = torch.ones(4, 4, device="cuda")
         counter = CompileCounter()
         compiled = torch.compile(func, backend=counter)
-        out = compiled(inputs, **self.get_world_trs())
-        correct = func(inputs, **self.get_world_trs())
+        out = compiled(inputs)
+        correct = func(inputs)
         self.assertEqual(counter.frame_count, 1)

         # should test more precisely, but the 2 is supposed to be (reduce_scatter, wait)
         self.assertEqual(counter.op_count, 2)
         self.assertTrue(same(out, correct))

+    @run_with_both_funcol_impls
     def test_dynamo_trace_allgather_coalesced(self):
         def func(inp, *, tag, ranks, group_size):
             ar = torch.ops.c10d_functional.all_gather_into_tensor_coalesced(inp, tag, ranks, group_size)
@@ -935,25 +978,29 @@ def func(inp, *, tag, ranks, group_size):
         assert same(out, correct)


-    def test_backwards(self):
+    @run_with_both_funcol_impls_with_arg
+    def test_backwards(self, use_native_funcol):
         """
         It's probably not that common to need backwards support for collectives.

         However, I wanted to at least see if it was possible to support it as a design goal.
         """
-        def func(inp, *, tag, ranks, group_size):
-            ar = _functional_collectives.all_reduce(inp, "sum", ranks, tag)
+        def func(inp):
+            if use_native_funcol:
+                ar = _functional_collectives.all_reduce(inp, "sum", "0")
+            else:
+                ar = _functional_collectives.all_reduce(inp, "sum", [0], "")
             return ar

         input = torch.ones(4, 4, device="cuda", requires_grad=True)
         # TODO implement backwards
         with self.assertRaisesRegex(RuntimeError, "element 0 of tensors does not require grad and does not have a grad_fn"):
             compiled = torch.compile(func, backend="aot_eager")  # inductor bug with single-op allreduce graph
-            out = compiled(input, **self.get_world_trs())
+            out = compiled(input)
             out.sum().backward()

             correct_input = input.clone().detach().requires_grad_()
-            correct = func(correct_input, **self.get_world_trs())
+            correct = func(correct_input)
             correct.sum().backward()
             self.assertTrue(same(out, correct))
             self.assertTrue(same(input.grad, correct_input.grad))
@@ -965,6 +1012,7 @@ def test_meta(self):

     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False})
+    @run_with_legacy_funcol  # impl specific
     def test_inductor_all_gather_coalesced(self):
         """
         make sure that an intermediate that's going to be reused isn't mutated unless copied
@@ -1011,6 +1059,7 @@ def func(inp, *, tag, ranks, group_size):

     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False})
+    @run_with_legacy_funcol  # impl specific
     def test_inductor_reduce_scatter_coalesced(self):
         """
         make sure that an intermediate that's going to be reused isn't mutated unless copied
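Note on the pattern used in this diff: run_with_legacy_funcol, run_with_both_funcol_impls, and run_with_both_funcol_impls_with_arg are imported from the same torch.testing._internal distributed test utilities as the other helpers at the top of the file, and their definitions are not part of this diff. Together with @instantiate_parametrized_tests on the test class, the *_with_arg variant runs each decorated test once per functional-collectives implementation and passes a use_native_funcol flag into the test body. The sketch below is only a hedged, hypothetical illustration of how such a decorator could be built on the parametrize utility this diff already relies on; the _native_funcol_enabled context manager and the USE_NATIVE_FUNCOL toggle are assumptions for illustration, not the actual PyTorch helpers.

# Hypothetical sketch only -- NOT the actual PyTorch helper. It shows one way a
# "run with both implementations" decorator can be expressed on top of the
# parametrize / @instantiate_parametrized_tests machinery used in this file.
import functools
import os
from contextlib import contextmanager

from torch.testing._internal.common_utils import parametrize


@contextmanager
def _native_funcol_enabled(enabled):
    # Assumed mechanism: an environment-variable toggle read by a hypothetical
    # dispatch layer. The real switch used by the PyTorch helpers may differ.
    prev = os.environ.get("USE_NATIVE_FUNCOL")
    os.environ["USE_NATIVE_FUNCOL"] = "1" if enabled else "0"
    try:
        yield
    finally:
        if prev is None:
            os.environ.pop("USE_NATIVE_FUNCOL", None)
        else:
            os.environ["USE_NATIVE_FUNCOL"] = prev


def run_with_both_impls_with_arg(test_fn):
    # Parametrize the test over use_native_funcol in {True, False}; the class
    # must be decorated with @instantiate_parametrized_tests so the two
    # variants are generated as separate test cases.
    @parametrize("use_native_funcol", [True, False])
    @functools.wraps(test_fn)
    def wrapper(self, use_native_funcol, **kwargs):
        with _native_funcol_enabled(use_native_funcol):
            return test_fn(self, use_native_funcol, **kwargs)

    return wrapper

A legacy-only variant would simply pin the toggle to False around the test instead of parametrizing it, which matches how the diff tags implementation-specific Inductor tests with @run_with_legacy_funcol.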