
Commit b95f493

[DTensor] implement dist_cat as a sharding prop rule
ghstack-source-id: d4cbf11
Pull Request resolved: #92677
1 parent: f6acd95
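
Context: this change moves torch.cat support for DTensor from a hand-written local implementation to a sharding propagation rule, so cat goes through the same propagation path as other ops. A minimal usage sketch of what the rule handles, assuming a 1-D mesh over 4 ranks and the private torch.distributed._tensor API as of this commit (names may differ in later releases):

# Hedged sketch, not part of the patch: concatenate two row-sharded DTensors.
# Assumes torch.distributed is already initialized with 4 ranks.
import torch
from torch.distributed._tensor import DeviceMesh, DTensor, Shard

mesh = DeviceMesh("cuda", list(range(4)))
placements = [Shard(0)]  # shard dim 0 across the mesh

a = DTensor.from_local(torch.rand(3, 5), mesh, placements)
b = DTensor.from_local(torch.rand(3, 5), mesh, placements)

# Concatenating along a non-sharded dim keeps the inputs' sharding; if the
# cat dim were the sharded dim, cat_rule instead returns a reshard suggestion.
c = torch.cat([a, b], dim=1)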

5 files changed: +109 / -33 lines

test/distributed/_tensor/test_dtensor_ops.py

Lines changed: 0 additions & 1 deletion
@@ -150,7 +150,6 @@ def wrapped(fn):
     xfail("diff"),
     xfail("dist"),
     xfail("dot"),
-    xfail("dstack"),
     xfail("einsum"),
     xfail("empty"),
     xfail("empty_like"),

test/distributed/_tensor/test_tensor_ops.py

Lines changed: 18 additions & 0 deletions
@@ -354,6 +354,24 @@ def test_index(self):
             torch.randint(5, (12, 8, 12)),
         )
 
+    @with_comms
+    def test_sharded_cat(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor_1 = torch.rand(3, 5, 6)
+        tensor_2 = torch.rand(3, 5, 6)
+        tensor_3 = torch.rand(3, 5, 6)
+        sharding = [Shard(0)]
+        dt_1 = DTensor.from_local(tensor_1, device_mesh, sharding)
+        dt_2 = DTensor.from_local(tensor_2, device_mesh, sharding)
+        dt_3 = DTensor.from_local(tensor_3, device_mesh, sharding)
+        new_dt = torch.cat([dt_1, dt_2, dt_3])
+        cat_dt = DTensor.from_local(
+            torch.cat([tensor_1, tensor_2, tensor_3]), device_mesh, sharding
+        )
+        self.assertEqual(new_dt.to_local(), cat_dt.to_local())
+        self.assertEqual(new_dt.size(), cat_dt.size())
+
 
 if __name__ == "__main__":
     run_tests()
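
A quick size check for what the new test asserts, assuming the default world_size of 4 used by these DTensor test classes:

# Hedged arithmetic sketch, not part of the patch.
local_shape = (3, 5, 6)
world_size = 4  # assumption; matches the usual DTensor test world size

# Shard(0) means each rank holds a dim-0 slice, so each input DTensor has
# global shape (12, 5, 6).
global_shape = (local_shape[0] * world_size, *local_shape[1:])

# Concatenating three such DTensors along dim 0 triples the global dim-0
# size, matching cat_dt built from the locally concatenated (9, 5, 6) tensor.
cat_global_shape = (3 * global_shape[0], *global_shape[1:])
print(global_shape, cat_global_shape)  # (12, 5, 6) (36, 5, 6)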

test/distributed/_tensor/test_tp_sharding_ops.py

Lines changed: 0 additions & 18 deletions
@@ -69,24 +69,6 @@ def test_replicated_permute(self):
         self.assertEqual(new_dt.to_local(), tensor.permute(1, 0, 2))
         self.assertEqual(new_dt.stride(), tensor.permute(1, 0, 2).stride())
 
-    @with_comms
-    def test_sharded_cat(self):
-        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        torch.manual_seed(self.rank)
-        tensor_1 = torch.rand(3, 5, 6)
-        tensor_2 = torch.rand(3, 5, 6)
-        tensor_3 = torch.rand(3, 5, 6)
-        sharding = [Shard(0)]
-        dt_1 = DTensor.from_local(tensor_1, device_mesh, sharding)
-        dt_2 = DTensor.from_local(tensor_2, device_mesh, sharding)
-        dt_3 = DTensor.from_local(tensor_3, device_mesh, sharding)
-        new_dt = torch.cat([dt_1, dt_2, dt_3])
-        cat_dt = DTensor.from_local(
-            torch.cat([tensor_1, tensor_2, tensor_3]), device_mesh, sharding
-        )
-        self.assertEqual(new_dt.to_local(), cat_dt.to_local())
-        self.assertEqual(new_dt.size(), cat_dt.size())
-
     @with_comms
     def test_sharded_split(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))

torch/distributed/_tensor/ops/tensor_ops.py

Lines changed: 91 additions & 1 deletion
@@ -11,7 +11,7 @@
     Shard,
 )
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
-from torch.distributed._tensor.ops.common_rules import pointwise_rule
+from torch.distributed._tensor.ops.common_rules import einop_rule, pointwise_rule
 from torch.distributed._tensor.ops.utils import register_prop_rule
 
 
@@ -472,3 +472,93 @@ def place(vp: Placement, ip: Placement) -> Placement:
         ],
     )
     return result
+
+
+@register_prop_rule("aten.cat.default")
+def cat_rule(op_schema: OpSchema) -> OutputSharding:
+    dim = 0  # default dim = 0
+    tensor_list_specs = cast(List[DTensorSpec], op_schema.args_schema[0])
+    if (len(op_schema.args_schema) > 1):
+        dim = cast(int, op_schema.args_schema[1])
+    # normalize arguments
+    if dim < 0:
+        dim += tensor_list_specs[0].ndim
+
+    # check concat dim
+    needs_reshard_on_cat_dim = False
+    for spec in tensor_list_specs:
+        if dim < len(spec.placements) and spec.placements[dim].is_shard():
+            needs_reshard_on_cat_dim = True
+            spec.placements = unshard_tensor_dim(spec.placements, dim=dim)
+    if needs_reshard_on_cat_dim:
+        args_schema = (tensor_list_specs,) + op_schema.args_schema[1:]
+        suggested_schema = OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=args_schema,
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+        return OutputSharding(
+            None,
+            schema_suggestions=[suggested_schema],
+            failed_reason="All tensors in concat must have no sharding on cat dim, need to reshard!",
+        )
+    alphabet = "abcdefghijklmnopqrstuvwxyz"
+    einop_equation = ""
+    for spec in tensor_list_specs:
+        einop_equation += alphabet[:spec.ndim]
+        einop_equation += ','
+    einop_equation = einop_equation[:-1] + "->" + alphabet[:tensor_list_specs[0].ndim]
+    output_sharding = einop_rule(
+        einop_equation,
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=tuple(tensor_list_specs),
+            kwargs_schema={},
+        ),
+        linearity=False
+    )
+
+    if output_sharding.output_spec is None:
+        if output_sharding.schema_suggestions is not None:
+            return _update_schema_suggestion_for_cat(
+                output_sharding,
+                op_schema,
+                dim,
+            )
+        else:
+            return OutputSharding(None)
+    # change output shape
+    new_size = 0
+    for spec in tensor_list_specs:
+        new_size += spec.shape[dim]
+    assert isinstance(output_sharding.output_spec, DTensorSpec)
+    output_sharding.output_spec.shape = torch.Size(
+        tuple(output_sharding.output_spec.shape[:dim])
+        + (new_size,)
+        + tuple(output_sharding.output_spec.shape[dim + 1 :])
+    )
+    return output_sharding
+
+
+def _update_schema_suggestion_for_cat(
+    output_sharding: OutputSharding,
+    op_schema: OpSchema,
+    dim: int,
+) -> OutputSharding:
+    assert output_sharding.schema_suggestions is not None
+    suggestion_specs = output_sharding.schema_suggestions[0].args_spec
+
+    # check concat dim
+    for spec in suggestion_specs:
+        if dim < len(spec.placements) and spec.placements[dim].is_shard():
+            spec.placements = unshard_tensor_dim(spec.placements, dim=dim)
+    args_schema = (suggestion_specs,) + op_schema.args_schema[1:]
+
+    output_sharding.schema_suggestions = [
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=args_schema,
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+    ]
+    return output_sharding
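
To make the equation-building step in cat_rule concrete: each input contributes the same subscript group, so for three rank-3 inputs the rule hands einop_rule the equation "abc,abc,abc->abc", and the cat-dim size of the resulting spec is then patched to the sum of the inputs' cat-dim sizes. A standalone sketch of just the string construction (the ndims helper is hypothetical, for illustration only):

# Hedged sketch mirroring the equation construction above.
alphabet = "abcdefghijklmnopqrstuvwxyz"

def cat_einop_equation(ndims):
    # One identical subscript group per input, e.g. [3, 3, 3] -> "abc,abc,abc->abc".
    lhs = ",".join(alphabet[:n] for n in ndims)
    return lhs + "->" + alphabet[:ndims[0]]

print(cat_einop_equation([3, 3, 3]))  # abc,abc,abc->abc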

torch/distributed/_tensor/ops/tp_sharding_ops.py

Lines changed: 0 additions & 13 deletions
@@ -2,7 +2,6 @@
 # implement matrix related ops for distributed tensor
 from typing import List
 
-import torch
 import torch.utils._pytree as pytree
 from torch.distributed._tensor.api import DTensor
 from torch.distributed._tensor.ops.utils import register_impl, unwrap_single_placement
@@ -16,18 +15,6 @@
 """
 
 
-@register_impl("aten.cat.default")
-def dist_cat(tensor_list: List[DTensor], dim: int = 0) -> DTensor:
-    local_inputs = pytree.tree_map(unwrap_local_tensor, tensor_list)
-    local_tensor = torch.ops.aten.concat(local_inputs, dim=dim)
-    return DTensor.from_local(
-        local_tensor,
-        tensor_list[0].device_mesh,
-        tensor_list[0].placements,
-        run_check=False,
-    )
-
-
 @register_impl("aten.split.Tensor")
 # pyre-fixme[2]: Parameter must be annotated.
 def dist_split(self: DTensor, split_size_or_sections, dim=0) -> List[DTensor]:
