Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions test/distributed/_tensor/test_device_mesh.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import os
import sys

import torch
from torch.distributed._tensor.device_mesh import DeviceMesh
Expand All @@ -8,6 +10,7 @@
from torch.distributed.distributed_c10d import (
get_global_rank,
get_world_size,
is_initialized,
new_group,
ProcessGroup,
)
Expand All @@ -16,13 +19,30 @@
DTensorTestBase,
with_comms,
)
from torch.testing._internal.common_distributed import TEST_SKIPS


class DeviceMeshTest(DTensorTestBase):
@property
def world_size(self) -> int:
    """Return the world size used by the tests in this class (fixed at 8)."""
    return 8

def test_init_process_group(self):
    """Check that constructing a ``DeviceMesh`` initializes the default process group.

    This test is not wrapped in ``with_comms``: it starts with no process
    group, exports the rendezvous environment variables itself, and then
    asserts that the process group is initialized once the ``DeviceMesh``
    has been constructed.
    """
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    backend = "nccl" if device_type == "cuda" else "gloo"
    # skip the test if not enough GPUs
    if backend == "nccl" and torch.cuda.device_count() < self.world_size:
        sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
    mesh_tensor = torch.arange(4).reshape(2, 2)
    # Precondition: nothing has initialized the default process group yet.
    self.assertFalse(is_initialized())
    # Rendezvous settings; presumably consumed by init_process_group's
    # default environment-variable initialization -- TODO confirm.
    # NOTE(review): the port is hard-coded, so concurrent runs on the same
    # host could collide.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "25364"
    os.environ["WORLD_SIZE"] = str(self.world_size)
    os.environ["RANK"] = str(self.rank)
    # Constructed only for its side effect; the mesh object itself is not used.
    DeviceMesh(device_type, mesh_tensor)
    self.assertTrue(is_initialized())
    self.destroy_pg()

@with_comms
def test_device_mesh_2d(self):
mesh_tensor = torch.arange(4).reshape(2, 2)
Expand Down
10 changes: 9 additions & 1 deletion torch/distributed/_tensor/device_mesh.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
get_rank,
get_world_size,
GroupMember,
init_process_group,
is_initialized,
new_group,
ProcessGroup,
reduce_scatter,
Expand Down Expand Up @@ -109,7 +111,7 @@ def __init__(
if isinstance(mesh, torch.Tensor)
else torch.tensor(mesh, dtype=torch.int)
)
default_pg = _get_default_group()
default_pg = self._get_or_create_default_group()
self._backend = default_pg._get_backend_name()
# TODO: if the user wants to pass pg_options, offer a way to do it
# check default pg backend, should support device_type
Expand Down Expand Up @@ -215,6 +217,12 @@ def __init__(
)
self._dim_groups.append(new_subgroup)

def _get_or_create_default_group(self):
    """Return the default process group, initializing it first when needed.

    If no process group has been initialized yet, one is created with the
    "gloo" backend when ``self.device_type`` is "cpu" and with "nccl" for
    any other device type. No other arguments are passed to
    ``init_process_group``.
    """
    if is_initialized():
        return _get_default_group()
    # No default group yet: pick the backend from the mesh's device type.
    backend_name = "nccl" if self.device_type != "cpu" else "gloo"
    init_process_group(backend=backend_name)
    return _get_default_group()

def __enter__(self) -> "DeviceMesh":
# set global device_mesh to this instance
set_global_device_mesh(self)
Expand Down