
Commit da123be

[DeviceMesh] Simplifying internal bookkeeping with CuTe layout
ghstack-source-id: eb6752b
Pull Request resolved: #163213
1 parent 88a7906 commit da123be


3 files changed: +354, -133 lines changed


test/distributed/test_device_mesh.py

Lines changed: 113 additions & 12 deletions
@@ -440,6 +440,7 @@ def test_device_mesh_parent_child_hash(self):
         ep_mesh = ep_mesh_1 if self.rank < self.world_size // 2 else ep_mesh_2
         # ep_mesh is considered different from mesh_2d["TP"]
         self.assertEqual(mesh_2d["TP"]._flatten_mesh_list, ep_mesh._flatten_mesh_list)
+        self.assertEqual(mesh_2d["TP"]._layout, ep_mesh._layout)
         self.assertEqual(mesh_2d["TP"].mesh.shape, ep_mesh.mesh.shape)
         self.assertEqual(mesh_2d["TP"].device_type, ep_mesh.device_type)
         self.assertNotEqual(mesh_2d["TP"].mesh_dim_names, ep_mesh.mesh_dim_names)
@@ -454,6 +455,7 @@ def test_device_mesh_parent_child_hash(self):
         )
         # another_mesh is considered the same as ep_mesh
         self.assertEqual(ep_mesh._flatten_mesh_list, another_mesh._flatten_mesh_list)
+        self.assertEqual(ep_mesh._layout, another_mesh._layout)
         self.assertEqual(ep_mesh.mesh.shape, another_mesh.mesh.shape)
         self.assertEqual(ep_mesh.device_type, another_mesh.device_type)
         self.assertEqual(ep_mesh.mesh_dim_names, another_mesh.mesh_dim_names)
@@ -539,7 +541,6 @@ def test_from_group_with_mesh_shape_2d(self):
             mesh_dim_names=("dp_replicate", "dp_shard"),
         )

-        # self.assertEqual(ref_mesh._dim_group_names, dp_mesh._dim_group_names)
         for mesh_dim_group, ref_mesh_dim_group in zip(
             dp_mesh.get_all_groups(), ref_mesh.get_all_groups()
         ):
@@ -800,6 +801,10 @@ def test_get_item_3d(self):
         # Test slicing out 1D mesh from a sub-2D mesh.
         shard_mesh = hsdp_mesh_2["Shard"]
         self.assertEqual(shard_mesh.mesh.tolist(), shard_group[shard_group_idx])
+        replicate_mesh = hsdp_mesh_2["Replicate"]
+        self.assertEqual(
+            replicate_mesh.mesh.tolist(), replicate_group[replicate_group_idx]
+        )

     @with_comms
     def test_cache_and_reuse_submesh_slice_result(self):
@@ -838,11 +843,14 @@ def test_get_item_3d_noncontiguous_slicing(self):
         # Check on the current dp_local_rank, whether the cp mesh tensor is the same.
         self.assertEqual(dp_cp_mesh.mesh[dp_local_rank], cp_mesh.mesh)

-        with self.assertRaisesRegex(
-            KeyError,
-            "Invalid mesh_dim_names",
-        ):
-            mesh_3d["cp", "dp"]
+        # Support transpose slicing.
+        cp_dp_mesh = mesh_3d["cp", "dp"]
+        expected_mesh_tensor = (
+            torch.tensor([[0, 4], [1, 5]], dtype=torch.int)
+            if self.rank in (0, 1, 4, 5)
+            else torch.tensor([[2, 6], [3, 7]], dtype=torch.int)
+        )
+        self.assertEqual(cp_dp_mesh.mesh, expected_mesh_tensor)

     @with_comms
     def test_flatten_mesh_1d(self):
@@ -875,10 +883,14 @@ def test_flatten_mesh_3d(self):
         self.assertEqual(flattened_dp_cp_mesh.mesh_dim_names[0], "dp_cp")
         root_mesh = _mesh_resources.get_root_mesh(dp_cp_mesh)
         self.assertEqual(root_mesh, mesh_3d)
-        flatten_mesh_root_dims = _mesh_resources.flatten_name_to_root_dims[root_mesh][
+        flatten_mesh_layout = _mesh_resources.flatten_name_to_root_layout[root_mesh][
             "dp_cp"
         ]
-        self.assertEqual(flatten_mesh_root_dims, (0, 1))
+        self.assertEqual(flatten_mesh_layout, flattened_dp_cp_mesh._layout)
+        self.assertEqual(
+            flattened_dp_cp_mesh._layout.global_ranks(8),
+            [[0, 2, 4, 6], [1, 3, 5, 7]],
+        )

         ref_pg_count = _world.group_count
         # Calling flatten again should not create a new pg.
@@ -893,10 +905,14 @@ def test_flatten_mesh_3d(self):
         self.assertEqual(flattened_dp_tp_mesh.mesh_dim_names[0], "dp_tp")
         root_mesh = _mesh_resources.get_root_mesh(dp_tp_mesh)
         self.assertEqual(root_mesh, mesh_3d)
-        flatten_mesh_root_dims = _mesh_resources.flatten_name_to_root_dims[root_mesh][
-            "dp_tp"
-        ]
-        self.assertEqual(flatten_mesh_root_dims, (0, 2))
+        flatten_mesh_root_layout = _mesh_resources.flatten_name_to_root_layout[
+            root_mesh
+        ]["dp_tp"]
+        self.assertEqual(flatten_mesh_root_layout, flattened_dp_tp_mesh._layout)
+        self.assertEqual(
+            flattened_dp_tp_mesh._layout.global_ranks(8),
+            [[0, 1, 4, 5], [2, 3, 6, 7]],
+        )

         # Test flatten with a flattened mesh_dim_name
         cp_tp_mesh = mesh_3d["cp", "tp"]
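The two expected global_ranks results above fall out of the 3D mesh's row-major layout. A minimal standalone sketch (not part of the diff), assuming mesh_3d is a 2x2x2 mesh over ranks 0..7 with dims ("dp", "cp", "tp"):

import torch

# Assumed setup: row-major 2x2x2 mesh over ranks 0..7 with dims (dp, cp, tp).
mesh = torch.arange(8).view(2, 2, 2)

# Flattening ("dp", "cp") leaves one group per tp index: all ranks sharing that tp.
dp_cp_groups = [mesh[:, :, tp].flatten().tolist() for tp in range(2)]
assert dp_cp_groups == [[0, 2, 4, 6], [1, 3, 5, 7]]

# Flattening ("dp", "tp") leaves one group per cp index: all ranks sharing that cp.
dp_tp_groups = [mesh[:, cp, :].flatten().tolist() for cp in range(2)]
assert dp_tp_groups == [[0, 1, 4, 5], [2, 3, 6, 7]]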
@@ -1498,6 +1514,91 @@ def test_composition(self):
         right_l = _Layout((2,), (3,))
         orig_l.composition(right_l)

+    def test_check_overlap(self):
+        """Test the check_overlap method for various layout configurations."""
+        # Test 1: Valid layout - no overlap
+        # sizes=(2,3), strides=(6,1) - stride 6 > span 3, so no overlap
+        layout1 = _Layout((2, 3), (6, 1))
+        self.assertTrue(layout1.check_overlap())
+
+        # Test 2: Invalid layout - overlap due to stride < previous span
+        # sizes=(2,3), strides=(2,1) - stride 2 < span 3, causes overlap
+        layout2 = _Layout((2, 3), (2, 1))
+        self.assertFalse(layout2.check_overlap())
+
+        # Test 3: Invalid layout - duplicate strides
+        # sizes=(2,3), strides=(1,1) - same stride, causes overlap
+        layout3 = _Layout((2, 3), (1, 1))
+        self.assertFalse(layout3.check_overlap())
+
+        # Test 4: Valid layout - single dimension
+        layout4 = _Layout((4,), (1,))
+        self.assertTrue(layout4.check_overlap())
+
+        # Test 5: Valid layout - exact boundary case
+        # sizes=(2,3), strides=(3,1) - stride 3 == span 3, valid
+        layout5 = _Layout((2, 3), (3, 1))
+        self.assertTrue(layout5.check_overlap())
+
+        # Test 6: Valid layout - multi-dimensional with proper spacing
+        layout6 = _Layout((2, 2, 2), (8, 4, 1))
+        self.assertTrue(layout6.check_overlap())
+
+        # Test 7: Valid layout - permuted strides, dims interleave without overlap
+        layout7 = _Layout((2, 2, 2), (4, 1, 2))
+        self.assertTrue(layout7.check_overlap())
+
+    def test_to_remapping_tensor(self):
+        """Test the to_remapping_tensor method for various scenarios."""
+        # Test 1: Consecutive ranks, full world - should return logical groups directly
+        original_mesh = torch.tensor([[0, 1], [2, 3]], dtype=torch.int)
+        layout1 = _Layout((2, 2), (2, 1))  # row-major 2x2
+        result1 = layout1.to_remapping_tensor(original_mesh, world_size=4)
+        expected1 = torch.tensor([[[0, 1], [2, 3]]], dtype=torch.int)
+        self.assertEqual(result1, expected1)
+
+        # Test 2: Non-consecutive ranks - should map to actual ranks
+        original_mesh = torch.tensor([[10, 20], [30, 40]], dtype=torch.int)
+        layout2 = _Layout((2, 2), (2, 1))
+        result2 = layout2.to_remapping_tensor(original_mesh, world_size=4)
+        expected2 = torch.tensor([[[10, 20], [30, 40]]], dtype=torch.int)
+        self.assertEqual(result2, expected2)
+
+        # Test 3: Partial world (mesh smaller than world_size) - requires stride scaling
+        original_mesh = torch.tensor([1, 2], dtype=torch.int)
+        layout3 = _Layout((2,), (4,))  # stride=4 for world_size=8
+        result3 = layout3.to_remapping_tensor(original_mesh, world_size=8)
+        expected3 = torch.tensor([[1, 2]], dtype=torch.int)
+        self.assertEqual(result3, expected3)
+
+        # Test 4: 1D layout with consecutive ranks
+        original_mesh = torch.tensor([0, 1, 2, 3], dtype=torch.int)
+        layout4 = _Layout((4,), (1,))
+        result4 = layout4.to_remapping_tensor(original_mesh, world_size=4)
+        expected4 = torch.tensor([[0, 1, 2, 3]], dtype=torch.int)
+        self.assertEqual(result4, expected4)
+
+        # Test 5: Complex strided layout with non-consecutive ranks
+        original_mesh = torch.tensor([5, 10, 15, 20], dtype=torch.int)
+        layout5 = _Layout((2, 2), (2, 1))
+        result5 = layout5.to_remapping_tensor(original_mesh, world_size=4)
+        expected5 = torch.tensor([[[5, 10], [15, 20]]], dtype=torch.int)
+        self.assertEqual(result5, expected5)
+
+        # Test 6: CuTe representation of a 2D mesh
+        original_mesh = torch.tensor([[0, 2], [1, 3]], dtype=torch.int)
+        layout6 = _Layout((2, 2), (1, 2))  # column-major style
+        result6 = layout6.to_remapping_tensor(original_mesh, world_size=4)
+        expected6 = torch.tensor([[[0, 2], [1, 3]]], dtype=torch.int)
+        self.assertEqual(result6, expected6)
+
+        # Test 7: Layout with different stride pattern
+        original_mesh = torch.tensor([0, 2, 1, 4], dtype=torch.int)
+        layout7 = _Layout((2, 2), (1, 2))  # column-major style
+        result7 = layout7.to_remapping_tensor(original_mesh, world_size=4)
+        expected7 = torch.tensor([[[0, 1], [2, 4]]], dtype=torch.int)
+        self.assertEqual(result7, expected7)
+

 if __name__ == "__main__":
     run_tests()
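The test_to_remapping_tensor cases above all reduce to one indexing idiom: when the mesh tensor does not hold consecutive ranks, the ranks produced by the layout are treated as indices into the real mesh tensor. A minimal sketch (not part of the diff), reproducing Test 5's expectation:

import torch

# Non-consecutive device ranks, as in Test 5 above.
original_mesh = torch.tensor([5, 10, 15, 20], dtype=torch.int)

# Logical groups a (2, 2):(2, 1) layout generates over 4 consecutive ranks.
logical_groups = torch.tensor([[[0, 1], [2, 3]]])

# Use the logical ranks as indices into the flattened mesh to get actual ranks.
remapped = original_mesh.flatten()[logical_groups]
assert remapped.tolist() == [[[5, 10], [15, 20]]]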

torch/distributed/_mesh_layout.py

Lines changed: 152 additions & 0 deletions
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 from itertools import product

+import torch
 from torch.distributed._pycute import (
     coalesce,
     complement,
@@ -74,6 +75,16 @@ def __getitem__(self, i: int) -> "_MeshLayout":
         layout = super().__getitem__(i)
         return _MeshLayout(layout.shape, layout.stride)

+    def __getstate__(self) -> dict[str, IntTuple]:
+        return {
+            "shape": self.shape,
+            "stride": self.stride,
+        }
+
+    def __setstate__(self, state: dict[str, IntTuple]) -> None:
+        object.__setattr__(self, "shape", state["shape"])
+        object.__setattr__(self, "stride", state["stride"])
+
     def coalesce(self) -> "_MeshLayout":
         """
         A layout is represented by (sizes):(strides), e.g. (3,2):(4,2).
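One note on the __getstate__/__setstate__ pair added above: a sketch (not part of the diff), assuming _MeshLayout behaves like a frozen dataclass, of why state is restored through object.__setattr__ rather than ordinary attribute assignment:

import pickle
from dataclasses import dataclass


# Hypothetical stand-in for _MeshLayout, assumed frozen; a plain `self.shape = ...`
# inside __setstate__ would raise FrozenInstanceError, so object.__setattr__ is used.
@dataclass(frozen=True)
class _ToyLayout:
    shape: tuple
    stride: tuple

    def __getstate__(self) -> dict:
        return {"shape": self.shape, "stride": self.stride}

    def __setstate__(self, state: dict) -> None:
        object.__setattr__(self, "shape", state["shape"])
        object.__setattr__(self, "stride", state["stride"])


layout = _ToyLayout((2, 3), (6, 1))
assert pickle.loads(pickle.dumps(layout)) == layout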
@@ -210,3 +221,144 @@ def global_ranks(self, world_size: int) -> list[list[int]]:
             [group_offset + group_rank for group_rank in self.member_ranks()]
             for group_offset in self.complement(world_size).member_ranks()
         ]
+
+    def check_non_overlap(self) -> bool:
+        """
+        Check whether the layout has any overlap between the ranks it generates.
+        If there is overlap, we return False, otherwise True.
+
+        Aside from index 0, the indices generated by each dim of the layout must be
+        non-overlapping.
+
+        Here is how it works:
+        1. Sort dimensions by stride (smallest stride first)
+        2. For each dimension, check if:
+           - It has the same stride as the previous dimension (duplicate mapping)
+           - Its stride overlaps with the previous dimension's span
+
+        A dimension's "span" is size * stride, representing the address space it covers.
+
+        Example 1 - Valid (no overlap):
+            Layout: sizes=(2,3), strides=(6,1)
+            - Dim 1: stride=1, span=3*1=3, covers addresses [0,1,2]
+            - Dim 0: stride=6, span=2*6=12, covers addresses [0,6]
+            → No overlap since 6 > 3
+
+        Example 2 - Invalid (overlap):
+            Layout: sizes=(2,3), strides=(2,1)
+            - Dim 1: stride=1, span=3*1=3, covers addresses [0,1,2]
+            - Dim 0: stride=2, span=2*2=4, covers addresses [0,2]
+            → Overlap! stride=2 < span=3, so addresses [0,2] are duplicated
+
+        Returns:
+            bool: True if no overlap exists (valid layout), False if overlap detected
+        """
+        previous_span = -1
+        previous_stride = -1
+        for size, stride in sorted(self.sizes_and_strides, key=lambda x: x[1]):
+            if size == 1:
+                continue
+            if previous_stride == stride or stride < previous_span:
+                return False
+            previous_stride = stride
+            previous_span = size * stride
+        return True
+
+    def to_remapping_tensor(
+        self,
+        original_mesh_tensor: torch.Tensor,
+        world_size: int,
+    ) -> torch.Tensor:
+        """
+        Convert this layout into a tensor representation that maps the logical mesh
+        structure to actual device ranks, handling cases where the mesh does not use
+        consecutive ranks or does not span the full world size (neither is directly
+        representable as a CuTe layout).
+
+        With this method, the CuTe layout serves as the backend for index bookkeeping
+        of the mesh tensor during flatten, unflatten and slicing operations, while the
+        mesh tensor itself still represents the actual device assignment and ranks. We
+        need this function to specify device allocation and to create backends for a mesh.
+
+        Overview:
+        1. Generate logical process groups using this layout's structure
+        2. Check if the original mesh uses consecutive ranks (0,1,2,...)
+        3. If consecutive: return the logical groups directly
+        4. If non-consecutive or partial world: map logical indices to actual ranks
+
+        Examples:
+
+        Case 1 - Consecutive ranks, full world:
+            original_mesh_tensor = [[0,1],[2,3]]  # 2x2 mesh, ranks 0-3
+            world_size = 4
+            layout = Layout(2:2)
+            → Returns logical groups directly: [[0,2],[1,3]]
+
+        Case 2 - Non-consecutive ranks:
+            original_mesh_tensor = [[10,20],[30,40]]  # custom rank assignment
+            world_size = 4
+            layout = Layout(2:2)
+            → Maps logical indices to actual ranks: [[[10,30],[20,40]]]
+
+        Case 3 - Partial world (stride scaling needed):
+            original_mesh_tensor = [[0,1]]  # 1x2 mesh in world_size=8
+            world_size = 8
+            layout = Layout((2,), (4,))  # every 4th rank
+            → Scale down stride: (4,) → (1,) to fit the mesh size
+            → Map scaled indices to actual ranks: [[0,1]]
+
+        Args:
+            original_mesh_tensor: The concrete mesh tensor with actual device ranks
+            world_size: Total number of ranks in the distributed system
+
+        Returns:
+            torch.Tensor: A tensor representing the actual device ranks from original_mesh_tensor
+        """
+
+        def scale_stride(scale: int, strides: IntTuple) -> IntTuple:
+            """
+            Recursively scale strides down by a factor to fit within a smaller mesh.
+
+            When the layout expects world_size=8 but the mesh only has 4 elements,
+            we need to scale the strides down by a factor of 2 to generate valid indices.
+
+            Example: stride=4 with scale=2 → stride=2 (or keep as-is if stride < scale)
+            """
+            if is_int(strides):
+                return strides if strides < scale else strides // scale
+            else:
+                return tuple(scale_stride(scale, stride) for stride in strides)

+        # Create tensor representation of the mesh
+        pg_ranks_by_dim = self.global_ranks(original_mesh_tensor.numel())
+        sizes = flatten(self.sizes)
+        tensor = torch.tensor(pg_ranks_by_dim, device="cpu", dtype=torch.int).view(
+            -1,
+            *sizes,  # type: ignore[arg-type]
+        )
+
+        # When the mesh tensor's values can be represented as a CuTe layout, we can use
+        # the global ranks generated by the layout directly for the mesh tensor. Otherwise,
+        # the ranks generated by the layout are used as indices to look up the actual ranks
+        # in the original mesh tensor.
+        if torch.equal(
+            original_mesh_tensor.flatten().sort().values,
+            torch.arange(
+                original_mesh_tensor.numel(),
+                device=original_mesh_tensor.device,
+                dtype=original_mesh_tensor.dtype,
+            ),
+        ):
+            return tensor
+
+        # This matters because the indices generated by the layout will exceed the range of
+        # the original mesh tensor when that tensor does not contain all ranks in the world.
+        # So we scale the layout's strides by world_size // mesh_tensor.numel() so that the
+        # generated indices stay within the range of the original mesh tensor.
+        if original_mesh_tensor.numel() != world_size:
+            scale_factor = world_size // original_mesh_tensor.numel()
+            scaled_strides = scale_stride(scale_factor, self.strides)
+            scaled_layout = _MeshLayout(self.sizes, scaled_strides)
+            pg_ranks_by_dim = scaled_layout.global_ranks(original_mesh_tensor.numel())
+            tensor = torch.tensor(pg_ranks_by_dim, device="cpu", dtype=torch.int).view(
+                -1,
+                *sizes,  # type: ignore[arg-type]
+            )
+        return original_mesh_tensor.flatten()[tensor]
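For reference, a standalone sketch (not part of the diff) of the non-overlap rule described in the docstring above, written against plain (sizes, strides) tuples and checked against the docstring's two examples:

def has_no_overlap(sizes: tuple[int, ...], strides: tuple[int, ...]) -> bool:
    previous_span = -1
    previous_stride = -1
    # Walk dims from smallest stride to largest, skipping trivial size-1 dims.
    for size, stride in sorted(zip(sizes, strides), key=lambda x: x[1]):
        if size == 1:
            continue
        # Overlap if two dims share a stride, or a stride starts inside the
        # span (size * stride) already covered by the previous dim.
        if previous_stride == stride or stride < previous_span:
            return False
        previous_stride = stride
        previous_span = size * stride
    return True


assert has_no_overlap((2, 3), (6, 1))      # Example 1: stride 6 >= span 3, valid
assert not has_no_overlap((2, 3), (2, 1))  # Example 2: stride 2 < span 3, overlap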
