Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 92 additions & 1 deletion test/test_c10d.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datetime import timedelta

from itertools import groupby
from functools import wraps
from functools import partial, reduce, wraps
from collections import namedtuple

import torch
Expand Down Expand Up @@ -157,6 +157,49 @@ def simple_multi_input_reduce_tests(rank, world_size):
]


def simple_sparse_reduce_tests(rank, world_size, num_inputs=1):
    """
    Generate a number of basic test cases for sparse reduction.
    These cover tensors with a varying number of sparse dimensions and a varying
    number of dense dimensions. The only reduction operation we support is sum.

    Arguments:
        rank: rank of the calling process.
        world_size: number of participating processes.
        num_inputs: number of input tensors contributed by every process.

    Returns:
        A list of ``(inputs, outputs)`` tuples, one per tensor layout.
        ``inputs`` holds the ``num_inputs`` sparse tensors for this rank;
        ``outputs`` holds ``num_inputs`` equal tensors, each the sum of the
        tensors generated for all ``num_inputs * world_size`` virtual ranks.
    """
    def generate(rank, world_size, sparse_dims=1, dense_dims=0):
        # First sparse dimension is [0..rank].
        # Subsequent dimensions are always 0, so we know there is
        # a non-empty intersection between any two sparse tensors.
        nnz = rank + 1
        # There is always at least one sparse dimension.
        extra_sparse_dims = max(sparse_dims - 1, 0)
        indices = [list(range(nnz))]
        indices.extend([0] * nnz for _ in range(extra_sparse_dims))
        dense_shape = [2] * dense_dims
        # All sparse dimensions must come before the dense dimensions in the
        # overall size, otherwise combining sparse_dims > 1 with
        # dense_dims > 0 yields a size that does not match `values`.
        shape = [world_size] * (1 + extra_sparse_dims) + dense_shape
        values = torch.ones([nnz] + dense_shape)
        return torch.sparse_coo_tensor(indices, values, shape)

    def compute_sum(fn, world_size):
        # Expected allreduce result: sum of what every (virtual) rank generates.
        return reduce(lambda a, b: a + b, [fn(r, world_size) for r in range(world_size)])

    return [
        (
            # Each input of each process acts as its own virtual rank.
            [
                fn(num_inputs * rank + i, num_inputs * world_size)
                for i in range(num_inputs)
            ],
            [
                compute_sum(fn, num_inputs * world_size)
                for _ in range(num_inputs)
            ],
        )
        for fn in [
            partial(generate, sparse_dims=1),
            partial(generate, sparse_dims=2),
            partial(generate, sparse_dims=3),
            partial(generate, dense_dims=1),
            partial(generate, dense_dims=2),
            partial(generate, dense_dims=3),
        ]
    ]


class StoreTestBase(object):
# Store factory hook taking a single argument `i`.
# NOTE(review): presumably overridden by concrete store test classes; the
# subclasses are not visible here -- confirm.
def _create_store(self, i):
# This base version unconditionally raises RuntimeError.
raise RuntimeError("not implemented")
Expand Down Expand Up @@ -788,6 +831,54 @@ def test_allreduce_stress_cuda(self):
inputs = [torch.Tensor([i + self.rank]).cuda() for i in range(1000)]
self._test_allreduce_stress(inputs)

def test_sparse_allreduce_checks(self):
    """
    Check that allreduce rejects invalid sparse inputs with a ValueError:
    an empty tensor list, mixed dense/sparse layouts, mismatched sizes, and
    any reduction operation other than SUM.
    """
    store = c10d.FileStore(self.file.name, self.world_size)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    dense = torch.zeros([1])
    sparse_len2 = torch.sparse_coo_tensor([[0]], [1], size=(2,))
    sparse_len4 = torch.sparse_coo_tensor([[0]], [1], size=(4,))

    # (tensor list, expected error message) pairs, checked with default options.
    rejected_inputs = [
        ([], "requires non-empty tensor list"),
        ([dense, sparse_len2], "invalid tensor layout"),
        ([sparse_len2, sparse_len4], "invalid tensor size"),
    ]
    for tensors, message in rejected_inputs:
        with self.assertRaisesRegex(ValueError, message):
            pg.allreduce(tensors, c10d.AllreduceOptions())

    # Sparse allreduce only works with c10d.ReduceOp.SUM.
    unsupported_ops = (c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX)
    for unsupported_op in unsupported_ops:
        with self.assertRaisesRegex(ValueError, "unsupported reduction operation"):
            options = c10d.AllreduceOptions()
            options.reduceOp = unsupported_op
            pg.allreduce([sparse_len4], options)

def _test_sparse_allreduce_basics(self, fn):
    """
    Run the generated sparse reduction cases through allreduce, with one and
    with two inputs per rank, and compare the work's result to the expected
    outputs. `fn` is applied to every input tensor before it is reduced.
    """
    store = c10d.FileStore(self.file.name, self.world_size)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    for inputs_per_rank in (1, 2):
        cases = simple_sparse_reduce_tests(
            self.rank, self.world_size, num_inputs=inputs_per_rank
        )
        for tensors, expected in cases:
            prepared = [fn(tensor) for tensor in tensors]
            work = pg.allreduce(prepared)
            work.wait()
            self.assertEqual(work.result(), expected)

def test_sparse_allreduce_basics(self):
# Runs the shared sparse allreduce checks with the generated input tensors
# passed through unchanged (identity transform).
self._test_sparse_allreduce_basics(lambda t: t)

@skip_if_not_multigpu
def test_sparse_allreduce_basics_cuda(self):
# Runs the shared sparse allreduce checks on CUDA tensors: every generated
# input is cloned and the clone is moved with .cuda() before being reduced.
self._test_sparse_allreduce_basics(lambda t: t.clone().cuda())

def test_scatter_checks(self):
store = c10d.FileStore(self.file.name, self.world_size)
pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())
Expand Down
9 changes: 9 additions & 0 deletions torch/csrc/distributed/c10d/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,15 @@ They are used in specifying strategies for reduction collectives, e.g.,
.def("is_success", &::c10d::ProcessGroup::Work::isSuccess)
.def("exception", &::c10d::ProcessGroup::Work::exception)
.def("source_rank", &::c10d::ProcessGroup::Work::sourceRank)
// Python binding for Work::result(): fetches the work's result tensors and
// returns them as a list.
.def(
"result",
[](::c10d::ProcessGroup::Work& work) -> std::vector<at::Tensor> {
auto tensors = work.result();
// Each tensor is replaced by autograd::make_variable(tensor) before
// returning. NOTE(review): presumably because the Python side expects
// Variables rather than plain tensors -- confirm.
for (auto& tensor : tensors) {
tensor = autograd::make_variable(tensor);
}
return tensors;
})
.def("synchronize", &::c10d::ProcessGroup::Work::synchronize)
.def(
"wait",
Expand Down
4 changes: 4 additions & 0 deletions torch/lib/c10d/ProcessGroup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ int ProcessGroup::Work::sourceRank() const {
"that correspond to a recv or recv-from-any call.");
}

// Default implementation of result(): there are no result tensors to hand
// out, so this always throws std::runtime_error. The method is declared
// virtual in ProcessGroup.hpp; NOTE(review): presumably Work subclasses that
// produce result tensors override it -- confirm.
std::vector<at::Tensor> ProcessGroup::Work::result() const {
throw std::runtime_error("result() not implemented.");
}

void ProcessGroup::Work::synchronize() {}

void ProcessGroup::Work::wait() {
Expand Down
3 changes: 3 additions & 0 deletions torch/lib/c10d/ProcessGroup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class ProcessGroup {
// Returns source rank if this object represents a recv-from-any.
virtual int sourceRank() const;

// Returns result tensors, if applicable.
// The base-class implementation (ProcessGroup.cpp) throws
// std::runtime_error("result() not implemented.").
virtual std::vector<at::Tensor> result() const;

// Ensures that operations on the output tensors that are invoked
// after this function returns are correctly sequenced after the
// asynchronous completion of this work.
Expand Down
Loading