
Commit 4582ceb

IvanKobzarev authored and pytorchmergebot committed
[distributed][sharded_tensor] Move local_shards check from ShardedTensorBase to ShardedTensor (#100197)
Differential Revision: [D45369211](https://our.internmc.facebook.com/intern/diff/D45369211)
Pull Request resolved: #100197
Approved by: https://github.com/fduwjj
1 parent 8556cf2 commit 4582ceb
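In effect, ShardedTensorBase._init_from_local_shards_and_global_metadata no longer performs the per-shard validation (layout, size, device, dtype, pin_memory, requires_grad, contiguity); those checks now live in the ShardedTensor override, so the base class can be assembled without a default process group. A minimal sketch of what this enables, mirroring the new test added below (this snippet is illustrative only and is not part of the commit):

# Build a ShardedTensorBase from local shards + global metadata without any
# process group initialized. Mirrors TestCreateTensorNoProcessGroupMode below.
import torch
from torch.distributed._shard.sharded_tensor import (
    Shard,
    ShardedTensorBase,
    ShardedTensorMetadata,
)
from torch.distributed._shard.sharding_spec import ShardMetadata

st_metadata = ShardedTensorMetadata(
    shards_metadata=[
        ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2, 2], placement="rank:0/cpu"),
        ShardMetadata(shard_offsets=[2, 0], shard_sizes=[2, 2], placement="rank:1/cpu"),
    ],
    size=torch.Size([4, 2]),
)
local_shards = [
    Shard(
        tensor=torch.zeros(md.shard_sizes, device=md.placement.device()),
        metadata=md,
    )
    for md in st_metadata.shards_metadata
]

# Succeeds without torch.distributed being initialized; the stricter per-shard
# checks now run only in ShardedTensor._init_from_local_shards_and_global_metadata.
st_base = ShardedTensorBase._init_from_local_shards_and_global_metadata(
    local_shards=local_shards,
    sharded_tensor_metadata=st_metadata,
)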

File tree (2 files changed: +156, -67 lines):

  test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
  torch/distributed/_shard/sharded_tensor/api.py

test/distributed/_shard/sharded_tensor/test_sharded_tensor.py (61 additions, 0 deletions)
@@ -23,6 +23,8 @@
     pre_load_state_dict_hook,
     state_dict_hook,
     ShardedTensor,
+    ShardedTensorBase,
+    ShardedTensorMetadata,
     Shard
 )
 from torch.distributed._shard.sharding_spec import (
@@ -2776,5 +2778,64 @@ def test_create_shard_with_no_placement(self):
         shard = Shard(torch.zeros(10), md)
         self.assertIsNone(shard.metadata.placement)

+class TestCreateTensorNoProcessGroupMode(TestCase):
+    def test_init_from_local_shards_and_global_metadata(self):
+        st_metadata: ShardedTensorMetadata = ShardedTensorMetadata(
+            shards_metadata=[
+                ShardMetadata(
+                    shard_offsets=[0, 0], shard_sizes=[2, 2], placement="rank:0/cpu"
+                ),
+                ShardMetadata(
+                    shard_offsets=[2, 0], shard_sizes=[2, 2], placement="rank:1/cpu"
+                ),
+            ],
+            size=torch.Size([4, 2]),
+        )
+        st_local_shards: List[Shard] = []
+        for shard_metadata in st_metadata.shards_metadata:
+            st_local_shards.append(
+                Shard(
+                    tensor=torch.zeros(
+                        shard_metadata.shard_sizes,
+                        device=shard_metadata.placement.device(),
+                    ),
+                    metadata=shard_metadata,
+                )
+            )
+
+        ShardedTensorBase._init_from_local_shards_and_global_metadata(
+            local_shards=st_local_shards,
+            sharded_tensor_metadata=st_metadata,
+        )
+
+    def test_non_contiguous_local_shards(self):
+        st_metadata: ShardedTensorMetadata = ShardedTensorMetadata(
+            shards_metadata=[
+                ShardMetadata(
+                    shard_offsets=[0, 0], shard_sizes=[2, 2], placement="rank:0/cpu"
+                ),
+                ShardMetadata(
+                    shard_offsets=[2, 0], shard_sizes=[2, 2], placement="rank:1/cpu"
+                ),
+            ],
+            size=torch.Size([4, 2]),
+        )
+        st_local_shards: List[Shard] = []
+        src = torch.randn(4, 2)
+        for shard_metadata in st_metadata.shards_metadata:
+            offsets = shard_metadata.shard_offsets
+            sizes = shard_metadata.shard_sizes
+            st_local_shards.append(
+                Shard(
+                    tensor=src[offsets[0]:offsets[0] + sizes[0], offsets[1]:offsets[1] + sizes[1]],
+                    metadata=shard_metadata,
+                )
+            )
+
+        ShardedTensorBase._init_from_local_shards_and_global_metadata(
+            local_shards=st_local_shards,
+            sharded_tensor_metadata=st_metadata,
+        )
+
 if __name__ == '__main__':
     run_tests()

torch/distributed/_shard/sharded_tensor/api.py (95 additions, 67 deletions)
@@ -136,7 +136,7 @@ def _init_from_local_shards_and_global_metadata(
         local_shards: List[Shard],
         sharded_tensor_metadata: ShardedTensorMetadata,
         sharding_spec=None,
-    ) -> "ShardedTensor":
+    ) -> "ShardedTensorBase":
         """
         Initialize a ShardedTensorBase with local shards and a global
         ShardedTensorMetadata built on each rank.
@@ -158,7 +158,7 @@ def _init_from_local_shards_and_global_metadata(
         else:
             spec = sharding_spec

-        sharded_tensor_base = ShardedTensor.__new__(
+        sharded_tensor_base = ShardedTensorBase.__new__(
             ShardedTensor,
             spec,
             sharded_tensor_metadata.size,
@@ -168,67 +168,6 @@ def _init_from_local_shards_and_global_metadata(
             requires_grad=tensor_properties.requires_grad,
         )

-        def _raise_if_mismatch(expected, actual, prop_name, rank, is_property=False):
-            tensor_property_or_metadata = (
-                "tensor property" if is_property else "local ShardMetadata"
-            )
-            if expected != actual:
-                raise ValueError(
-                    f"Local shards' tensor {prop_name} property is incompatible with "
-                    f"{tensor_property_or_metadata} on rank {rank}: "
-                    f"{tensor_property_or_metadata} {prop_name}={expected}, "
-                    f"local shard tensor {prop_name}={actual}."
-                )
-
-        for shard in local_shards:
-            shard_meta = shard.metadata
-            local_shard_tensor = shard.tensor
-            placement = shard_meta.placement
-            assert placement is not None, "Must specify placement for `Shard`!"
-            rank = placement.rank()
-            local_device = placement.device()
-
-            _raise_if_mismatch(
-                tensor_properties.layout,
-                local_shard_tensor.layout,
-                "layout",
-                rank,
-                True,
-            )
-            if not local_shard_tensor.is_contiguous():
-                raise ValueError(
-                    "Only torch.contiguous_format memory_format is currently supported"
-                )
-
-            _raise_if_mismatch(
-                shard_meta.shard_sizes,
-                list(local_shard_tensor.size()),
-                "size",
-                rank,
-            )
-            _raise_if_mismatch(
-                tensor_properties.pin_memory,
-                local_shard_tensor.is_pinned(),
-                "pin_memory",
-                rank,
-                True,
-            )
-            _raise_if_mismatch(local_device, local_shard_tensor.device, "device", rank)
-            _raise_if_mismatch(
-                tensor_properties.dtype,
-                local_shard_tensor.dtype,
-                "dtype",
-                rank,
-                True,
-            )
-            _raise_if_mismatch(
-                tensor_properties.requires_grad,
-                local_shard_tensor.requires_grad,
-                "requires_grad",
-                rank,
-                True,
-            )
-
         # check if shards_metadata have overlap shards
         validate_non_overlapping_shards_metadata(shards_metadata)

@@ -925,11 +864,100 @@ def _init_from_local_shards_and_global_metadata( # type: ignore[override]
                 f'shards metadata in sharded_tensor_metadata ({len(local_shard_metadatas)}) '
                 f'on rank ({current_rank}) '
             )
-        sharded_tensor = super(
-            ShardedTensor, cls
-        )._init_from_local_shards_and_global_metadata(
-            local_shards, sharded_tensor_metadata, sharding_spec=sharding_spec
+
+        shards_metadata = sharded_tensor_metadata.shards_metadata
+        tensor_properties = sharded_tensor_metadata.tensor_properties
+
+        if len(shards_metadata) == 0:
+            raise ValueError("shards_metadata must not be empty!")
+
+        if tensor_properties.layout != torch.strided:
+            raise ValueError("Only torch.strided layout is currently supported")
+
+        if sharding_spec is None:
+            spec = shard_spec._infer_sharding_spec_from_shards_metadata(shards_metadata)
+        else:
+            spec = sharding_spec
+
+        sharded_tensor = ShardedTensor.__new__(
+            ShardedTensor,
+            spec,
+            sharded_tensor_metadata.size,
+            dtype=tensor_properties.dtype,
+            layout=tensor_properties.layout,
+            pin_memory=tensor_properties.pin_memory,
+            requires_grad=tensor_properties.requires_grad,
         )
+
+        def _raise_if_mismatch(expected, actual, prop_name, rank, is_property=False):
+            tensor_property_or_metadata = (
+                "tensor property" if is_property else "local ShardMetadata"
+            )
+            if expected != actual:
+                raise ValueError(
+                    f"Local shards' tensor {prop_name} property is incompatible with "
+                    f"{tensor_property_or_metadata} on rank {rank}: "
+                    f"{tensor_property_or_metadata} {prop_name}={expected}, "
+                    f"local shard tensor {prop_name}={actual}."
+                )
+
+        for shard in local_shards:
+            shard_meta = shard.metadata
+            local_shard_tensor = shard.tensor
+            placement = shard_meta.placement
+            assert placement is not None, "Must specify placement for `Shard`!"
+            rank = placement.rank()
+            local_device = placement.device()
+
+            _raise_if_mismatch(
+                tensor_properties.layout,
+                local_shard_tensor.layout,
+                "layout",
+                rank,
+                True,
+            )
+            if not local_shard_tensor.is_contiguous():
+                raise ValueError(
+                    "Only torch.contiguous_format memory_format is currently supported"
+                )
+
+            _raise_if_mismatch(
+                shard_meta.shard_sizes,
+                list(local_shard_tensor.size()),
+                "size",
+                rank,
+            )
+            _raise_if_mismatch(
+                tensor_properties.pin_memory,
+                local_shard_tensor.is_pinned(),
+                "pin_memory",
+                rank,
+                True,
+            )
+            _raise_if_mismatch(local_device, local_shard_tensor.device, "device", rank)
+            _raise_if_mismatch(
+                tensor_properties.dtype,
+                local_shard_tensor.dtype,
+                "dtype",
+                rank,
+                True,
+            )
+            _raise_if_mismatch(
+                tensor_properties.requires_grad,
+                local_shard_tensor.requires_grad,
+                "requires_grad",
+                rank,
+                True,
+            )
+
+        # check if shards_metadata have overlap shards
+        validate_non_overlapping_shards_metadata(shards_metadata)
+
+        # check if the shards_metadata is compatible with overall size of the sharded tensor.
+        check_tensor(shards_metadata, list(sharded_tensor_metadata.size))
+
+        # done validation, add local_shards
+        sharded_tensor._local_shards = local_shards
         sharded_tensor._prepare_init(process_group=process_group, init_rrefs=init_rrefs)

         # run post initialization, i.e. map registration, rpc initialization