
Commit 98e3432

[DTensor] implement dist_split as a sharding prop rule

ghstack-source-id: 17951d9
Pull Request resolved: #93306
Parent: 04082fc

File tree: 4 files changed (+90, −3 lines)

test/distributed/_tensor/test_dtensor_ops.py (2 additions, 1 deletion)

@@ -453,7 +453,7 @@ def wrapped(fn):
     xfail("special.spherical_bessel_j0"),
     xfail("special.xlog1py"),
     xfail("special.zeta"),
-    xfail("split"),
+    #xfail("split"),
     xfail("split", "list_args"),
     xfail("split_with_sizes"),
     xfail("squeeze", "multiple"),
@@ -553,6 +553,7 @@ def wrapped(fn):
     "torch.eq",
     "torch.isfinite",
     "torch.isnan",
+    #"torch.functional.split",
 ]

test/distributed/_tensor/test_tp_sharding_ops.py (16 additions, 1 deletion)

@@ -70,7 +70,7 @@ def test_replicated_permute(self):
         self.assertEqual(new_dt.stride(), tensor.permute(1, 0, 2).stride())
 
     @with_comms
-    def test_sharded_split(self):
+    def test_sharded_split_1(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         torch.manual_seed(self.rank)
         tensor = torch.rand(3, 5, 6, device=self.device_type)
@@ -82,6 +82,21 @@ def test_sharded_split(self):
             self.assertTrue(dt.placements[0].is_shard(dim=2))
             self.assertEqual(dt.to_local(), local_tensors[idx])
 
+    @with_comms
+    def test_sharded_split_2(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(0)
+        tensor = torch.rand(4, 4, 4, device=self.device_type, requires_grad=True)
+        sharding = [Replicate()]
+        dist_tensor = distribute_tensor(tensor, device_mesh, sharding)
+        dt_list = dist_tensor.split(dist_tensor.size(0) // 2, dim=0)
+        print(dt_list)
+        local_tensors = tensor.split(2, dim=0)
+        for idx, dt in enumerate(dt_list):
+            #self.assertTrue(dt.placements[0].is_shard(dim=0))
+            self.assertEqual(dt.to_local(), local_tensors[idx])
+        dt_list[0].to_local().sum().backward()
+
 
 if __name__ == "__main__":
     run_tests()
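
For reference, the behavior test_sharded_split_2 exercises can be sketched in a single process. The snippet below is not part of the commit: it assumes a gloo backend is available, stands in a one-rank process group for the @with_comms fixture, and assumes DeviceMesh, Replicate, and distribute_tensor are importable from torch.distributed._tensor. With a Replicate() placement every rank holds the full tensor, so each piece of the DTensor split should match the corresponding chunk of a plain torch.split.

import os

import torch
import torch.distributed as dist
from torch.distributed._tensor import DeviceMesh, Replicate, distribute_tensor

# One-rank gloo group standing in for the @with_comms test fixture.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)
mesh = DeviceMesh("cpu", [0])

torch.manual_seed(0)
tensor = torch.rand(4, 4, 4, requires_grad=True)
dist_tensor = distribute_tensor(tensor, mesh, [Replicate()])

# Splitting the replicated DTensor should line up chunk-for-chunk with a
# local torch.split of the original tensor.
dt_list = dist_tensor.split(dist_tensor.size(0) // 2, dim=0)
local_tensors = tensor.split(2, dim=0)
for dt, local in zip(dt_list, local_tensors):
    assert torch.equal(dt.to_local(), local)

dist.destroy_process_group()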

torch/distributed/_tensor/ops/tensor_ops.py (72 additions, 0 deletions)

@@ -600,3 +600,75 @@ def _update_schema_suggestion_for_cat(
             )
         ]
     return output_sharding
+
+@register_prop_rule("aten.split.Tensor")
+def split_rule(op_schema: OpSchema) -> OutputSharding:
+    """
+    The OutputSpecType of tensor split should be Sequence[DTensorSpec]
+    """
+    print(op_schema)
+    output_spec_list: List[DTensorSpec] = []
+    input_spec = cast(DTensorSpec, op_schema.args_schema[0])
+    ndim = input_spec.ndim
+    split_size_or_sections = op_schema.args_schema[1]
+    dim = 0
+    if len(op_schema.args_schema) > 2:
+        dim = cast(int, op_schema.args_schema[2])
+        dim = normalize_dim(dim, ndim)
+
+    # TODO: just like slice op, split replicates before splitting
+    # on a sharded dimension
+    # TODO: shall we consider partial???
+    # TODO: consider splitting an empty tensor
+    need_reshard = False
+    if is_tensor_dim_sharded(input_spec, dim=dim):
+        need_reshard = True
+        input_spec = DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=unshard_tensor_dim(input_spec.placements, dim=dim),
+            shape=input_spec.shape,
+            ndim=input_spec.ndim,
+        )
+
+    if need_reshard:
+        return OutputSharding(
+            None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(input_spec,) + op_schema.args_schema[1:],
+                    kwargs_schema={},
+                ),
+            ]
+        )
+
+    def size_split(N, i):
+        # Last chunk will be smaller if the tensor size N
+        # along the given dimension dim is not divisible by i.
+        assert i > 0
+        return [i] * (N // i) + ([N % i] if N % i != 0 else [])
+
+    output_size_list = (
+        size_split(input_spec.shape[dim], split_size_or_sections)
+        if isinstance(split_size_or_sections, int)
+        else split_size_or_sections
+    )
+    output_shape_list = [
+        torch.Size(
+            tuple(input_spec.shape[:dim])
+            + (size,)
+            + tuple(input_spec.shape[dim + 1 :])
+        )
+        for size in output_size_list
+    ]
+    output_spec_list = [
+        DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=input_spec.placements,
+            shape=shape,
+            ndim=input_spec.ndim,
+        )
+        for shape in output_shape_list
+    ]
+    print(output_spec_list)
+    return OutputSharding(output_spec_list)
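
The chunk sizes that split_rule computes are meant to mirror torch.split: full chunks of split_size plus a smaller trailing chunk when the dimension is not evenly divisible. Below is a small standalone sketch (plain torch, no distributed setup, not part of the commit) that checks the size_split helper and the output_shape_list construction against an actual torch.split on a local tensor.

import torch

def size_split(N, i):
    # Same helper as in split_rule: full chunks of size i, plus a smaller
    # trailing chunk when N is not divisible by i.
    assert i > 0
    return [i] * (N // i) + ([N % i] if N % i != 0 else [])

t = torch.rand(10, 4)
dim = 0
chunks = t.split(3, dim=dim)

# Chunk sizes along dim match torch.split: [3, 3, 3, 1] for N=10, split_size=3.
assert [c.size(dim) for c in chunks] == size_split(t.shape[dim], 3)

# Per-output shapes, built the same way split_rule builds output_shape_list:
# only the size along dim changes; every other dimension is carried over.
shapes = [
    torch.Size(tuple(t.shape[:dim]) + (size,) + tuple(t.shape[dim + 1 :]))
    for size in size_split(t.shape[dim], 3)
]
assert shapes == [c.shape for c in chunks]

This is also why each output DTensorSpec reuses the input's mesh and placements unchanged: once the split dimension is guaranteed not to be sharded, only the global shape of each output differs.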

torch/distributed/_tensor/ops/tp_sharding_ops.py (0 additions, 1 deletion)

@@ -15,7 +15,6 @@
 """
 
 
-@register_impl("aten.split.Tensor")
 # pyre-fixme[2]: Parameter must be annotated.
 def dist_split(self: DTensor, split_size_or_sections, dim=0) -> List[DTensor]:
     local_mat = pytree.tree_map(unwrap_local_tensor, self)
