Fix Process Group for tensors shared across processes #21449
Closed
Commits (13, all by mrshenli):
- `d23f4c1` Fix Process Group for tensors shared across processes
- `707128a` 1. fix lint; 2. add more tests
- `bfde4d4` fix test skip
- `38b390d` fix rocm build
- `5f37eda` move spawn tests from test_c10d.py to test_c10d_spawn.py
- `62febed` use deliberate test error to check whether test suite can retrieve me…
- `40b8ab9` Try deliberate failure, 2nd attempt
- `8409560` enable test_c10d_spawn.py in multi-gpu tests
- `1a451e7` enable test_c10d_spawn.py, 2nd attempt
- `22e5724` verified that CI can show correct error messages on deliberately fail…
- `65a3200` Merge remote-tracking branch 'upstream/master' into record
- `b6f05ba` revert third_party
- `abe828e` fix ROCM blacklist
The first change registers the new suite in the test driver's list of test modules:
```diff
@@ -21,6 +21,7 @@
     'autograd',
     'cpp_extensions',
     'c10d',
+    'c10d_spawn',
     'cuda',
     'cuda_primary_ctx',
     'dataloader',
```
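As context (not part of the diff): entries in this list map to `test_<name>.py` modules that the shared test driver executes, so registering `'c10d_spawn'` lets CI pick up the new file. The suite can also be run on its own, since it ends with `run_tests()`. A minimal sketch of direct invocation, assuming the standard `test/` directory layout (the path is an assumption, not part of the PR):

```python
# Hypothetical direct run of the new suite as its own process.
# The module guards itself and exits with status 0 when c10d or the
# 'spawn' start method is unavailable, so this is safe everywhere.
import subprocess
import sys

subprocess.check_call([sys.executable, 'test/test_c10d_spawn.py'])
```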
The second change adds the new test file, test_c10d_spawn.py (192 lines):

```python
import sys
import tempfile
import unittest

import torch
import torch.distributed as c10d
import torch.multiprocessing as mp

from common_cuda import TEST_MULTIGPU
from common_utils import TestCase, load_tests, run_tests
from common_utils import NO_MULTIPROCESSING_SPAWN

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests

if not c10d.is_available():
    print('c10d not available, skipping tests')
    sys.exit(0)


if NO_MULTIPROCESSING_SPAWN:
    print('spawn not available, skipping tests')
    sys.exit(0)


NO_NCCL = not hasattr(c10d, "ProcessGroupNCCL")


class ProcessGroupShareTensorTest(TestCase):

    world_size = 2

    @classmethod
    def opts(cls, threads=2):
        opts = c10d.ProcessGroupGloo.Options()
        opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        opts.timeout = 5.0
        opts.threads = threads
        return opts

    @classmethod
    def _init_pg_gloo(cls, rank, filename, world_size):
        store = c10d.FileStore(filename, world_size)
        return c10d.ProcessGroupGloo(
            store, rank, world_size, ProcessGroupShareTensorTest.opts())

    @classmethod
    def _init_pg_nccl(cls, rank, filename, world_size):
        store = c10d.FileStore(filename, world_size)
        return c10d.ProcessGroupNCCL(store, rank, world_size)

    def _test_multiprocess(self, f, shared_tensors, init_pg, n_output):
        ws = self.world_size
        # file store will delete the test file on destruction
        file = tempfile.NamedTemporaryFile(delete=False)
        ctx = mp.get_context('spawn')
        c2p = ctx.Queue(2)
        p2c = ctx.Queue(2)
        ps = []
        for i in range(ws):
            p = ctx.Process(
                target=f,
                args=(i, file.name, shared_tensors, ws, init_pg, c2p, p2c))

            p.start()
            ps.append(p)

        for _ in range(ws * n_output):
            pid, expected, result = c2p.get()
            self.assertEqual(
                expected,
                result,
                (
                    "Expect rank {} to broadcast result {} but got {}."
                ).format(pid, expected, result)
            )

        for _ in range(ws):
            p2c.put(0)

        for p in ps:
            p.join(2)

    # Why classmethod? multiprocessing cannot pickle TestCase subclass when in
    # spawn mode. See https://bugs.python.org/issue33884.
    @classmethod
    def _test_broadcast_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        pg.broadcast(xs).wait()
        c2p.put((rank, torch.zeros(2, 2), xs[0].to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_broadcast_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_broadcast_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            1)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_broadcast_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_broadcast_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_allreduce_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        pg.allreduce(xs, op=c10d.ReduceOp.SUM).wait()
        c2p.put((rank, torch.ones(2, 2) * 2, xs[0].to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_allreduce_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allreduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            1)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_allreduce_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allreduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_reduce_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        x = shared_tensors[rank]
        pg.reduce(x, root=0, op=c10d.ReduceOp.SUM).wait()
        if rank == 0:
            c2p.put((rank, torch.ones(2, 2) * 2, x.to("cpu")))
        else:
            c2p.put((rank, torch.ones(2, 2), x.to("cpu")))
        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_reduce_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_reduce_process,
            [torch.ones(2, 2).to(i) for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            1)

    @classmethod
    def _test_allgather_process(
            cls, rank, filename, shared_tensors, world_size, init_pg, c2p, p2c):
        pg = init_pg(rank, filename, world_size)
        xs = [shared_tensors[rank]]
        ys = [[torch.zeros_like(xs[0]) for i in range(world_size)]]
        pg.allgather(ys, xs).wait()
        for i in range(world_size):
            c2p.put((rank, torch.ones(2, 2) * i, ys[0][i].to("cpu")))

        p2c.get()

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    def test_shared_allgather_gloo(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allgather_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_gloo,
            self.world_size)

    @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
    @unittest.skipIf(NO_NCCL, "NCCL needed")
    def test_shared_allgather_nccl(self):
        self._test_multiprocess(
            ProcessGroupShareTensorTest._test_allgather_process,
            [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
            ProcessGroupShareTensorTest._init_pg_nccl,
            self.world_size)


if __name__ == '__main__':
    run_tests()
```
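To see the pattern under test in isolation, here is a minimal sketch distilled from the suite above (assumptions: two visible CUDA devices and a Linux `lo` loopback interface, as the tests themselves assume). Each spawned worker builds its own process group over a shared `FileStore` and collectively operates on CUDA tensors that were allocated in the parent process, which is exactly the sharing path this PR fixes:

```python
# Minimal sketch: CUDA tensors allocated in the parent are inherited by
# 'spawn' workers, and each worker builds a fresh ProcessGroupGloo over
# a shared FileStore before running a collective on them.
import tempfile

import torch
import torch.distributed as c10d
import torch.multiprocessing as mp


def broadcast_worker(rank, filename, shared_tensors, world_size):
    store = c10d.FileStore(filename, world_size)
    opts = c10d.ProcessGroupGloo.Options()
    opts.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    opts.timeout = 5.0
    pg = c10d.ProcessGroupGloo(store, rank, world_size, opts)
    xs = [shared_tensors[rank]]
    pg.broadcast(xs).wait()  # root defaults to rank 0
    # Rank 0 holds ones * 0 == zeros, so every rank should now see zeros.
    assert xs[0].to("cpu").eq(0).all()


if __name__ == '__main__':
    world_size = 2
    file = tempfile.NamedTemporaryFile(delete=False)
    tensors = [torch.ones(2, 2).to(i) * i for i in range(world_size)]
    ctx = mp.get_context('spawn')
    ps = [ctx.Process(target=broadcast_worker,
                      args=(rank, file.name, tensors, world_size))
          for rank in range(world_size)]
    for p in ps:
        p.start()
    for p in ps:
        p.join()
```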
Review comment on the classmethod workaround:

FWIW, test_multiprocessing.py does this by just having the test runner methods as honest-to-goodness top-level functions. This is just an informational comment, since a classmethod is just as good.
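For illustration only (not part of the PR), the alternative the comment describes would be a module-level worker function; under the `'spawn'` start method the child pickles the target by qualified name, so no classmethod indirection is needed. This hypothetical rewrite reuses the names from the test file above:

```python
# Hypothetical rewrite of _test_broadcast_process as a plain top-level
# function, in the style the comment attributes to test_multiprocessing.py.
# The body is identical to the classmethod version in the diff above.
import torch


def broadcast_process(rank, filename, shared_tensors, world_size,
                      init_pg, c2p, p2c):
    pg = init_pg(rank, filename, world_size)
    xs = [shared_tensors[rank]]
    pg.broadcast(xs).wait()
    c2p.put((rank, torch.zeros(2, 2), xs[0].to("cpu")))
    p2c.get()
```

Either style works for the same reason: the spawned child never has to pickle the TestCase subclass itself, which is what the bug linked in the code comment (https://bugs.python.org/issue33884) rules out.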