|
20 | 20 | import torch.nn.functional as F |
21 | 21 | import torch.distributed as c10d |
22 | 22 | import torch.distributed as dist |
| 23 | +import torch.multiprocessing as mp |
23 | 24 | from torch.nn.parallel import DistributedDataParallel |
24 | 25 |
|
25 | | -from common_utils import TestCase, load_tests, run_tests |
| 26 | +from common_utils import TestCase, load_tests, run_tests, PY3 |
26 | 27 | from common_utils import retry_on_address_already_in_use_error |
27 | 28 |
|
28 | 29 | # load_tests from common_utils is used to automatically filter tests for |
@@ -1606,6 +1607,54 @@ def allreduce(tensors): |
1606 | 1607 | tensors_list[i - 2][j]) |
1607 | 1608 |
|
1608 | 1609 |
|
| 1610 | +class ProcessGroupShareTensorTest(TestCase): |
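 | | +    """Tests that CUDA tensors moved to shared memory can be passed to |
 | | +    subprocesses spawned via torch.multiprocessing and allreduced there |
 | | +    through Gloo and NCCL process groups.""" |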
| 1611 | + |
 | 1612 | +    @property |
 | 1613 | +    def world_size(self): |
 | 1614 | +        return 2 |
| 1615 | + |
 | | +    @staticmethod |
 | 1616 | +    def opts(threads=2): |
 | 1617 | +        opts = c10d.ProcessGroupGloo.Options() |
 | 1618 | +        opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] |
 | 1619 | +        opts.timeout = 5.0 |
 | 1620 | +        opts.threads = threads |
 | 1621 | +        return opts |
| 1622 | + |
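 | | +    # Entry point for each spawned process: ranks rendezvous through the |
 | | +    # shared FileStore, allreduce their shared tensor, and verify the sum. |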
 | | +    @staticmethod |
 | 1623 | +    def _test_allreduce_gloo_process(rank, filename, shared_tensors, world_size): |
 | 1624 | +        store = c10d.FileStore(filename, world_size) |
 | 1625 | +        pg = c10d.ProcessGroupGloo( |
 | 1626 | +            store, rank, world_size, ProcessGroupShareTensorTest.opts()) |
 | 1627 | +        xs = [shared_tensors[rank]] |
 | 1628 | +        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait() |
 | 1629 | +        assert xs[0].to('cpu').allclose(torch.ones(2, 2) * world_size) |
| 1630 | + |
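 | | +    # Parent process: create one shared ones-tensor per GPU, then spawn |
 | | +    # world_size workers that allreduce them through a Gloo process group. |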
 | 1631 | +    @unittest.skipIf(not PY3, "Python 3 needed") |
 | 1632 | +    @skip_if_not_multigpu |
 | 1633 | +    def test_allreduce_gloo(self): |
 | 1634 | +        file = tempfile.NamedTemporaryFile(delete=False) |
 | 1635 | +        shared_tensors = [torch.ones(2, 2).to(i).share_memory_() for i in range(self.world_size)] |
 | 1636 | +        mp.spawn(ProcessGroupShareTensorTest._test_allreduce_gloo_process, |
 | 1637 | +                 args=(file.name, shared_tensors, self.world_size), |
 | 1638 | +                 nprocs=self.world_size, |
 | 1639 | +                 join=True) |
| 1640 | + |
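 | | +    # Same as the Gloo worker above, but builds a NCCL process group. |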
 | | +    @staticmethod |
 | 1641 | +    def _test_allreduce_nccl_process(rank, filename, shared_tensors, world_size): |
 | 1642 | +        store = c10d.FileStore(filename, world_size) |
 | 1643 | +        pg = c10d.ProcessGroupNCCL(store, rank, world_size) |
 | 1644 | +        xs = [shared_tensors[rank]] |
 | 1645 | +        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait() |
 | 1646 | +        assert xs[0].to('cpu').allclose(torch.ones(2, 2) * world_size) |
| 1647 | + |
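 | | +    # Same as test_allreduce_gloo, but exercises the NCCL backend. |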
 | 1648 | +    @unittest.skipIf(not PY3, "Python 3 needed") |
 | 1649 | +    @skip_if_not_multigpu |
 | 1650 | +    def test_allreduce_nccl(self): |
 | 1651 | +        file = tempfile.NamedTemporaryFile(delete=False) |
 | 1652 | +        shared_tensors = [torch.ones(2, 2).to(i).share_memory_() for i in range(self.world_size)] |
 | 1653 | +        mp.spawn(ProcessGroupShareTensorTest._test_allreduce_nccl_process, |
 | 1654 | +                 args=(file.name, shared_tensors, self.world_size), |
 | 1655 | +                 nprocs=self.world_size, |
 | 1656 | +                 join=True) |
 | 1657 | + |
 | | + |
1609 | 1658 | class Net(nn.Module): |
1610 | 1659 | def __init__(self): |
1611 | 1660 | super(Net, self).__init__() |
|