
Commit 216c384

Delay reduction of unused parameters until first autograd hook is called
Reduction of gradients for unused parameters should happen as soon as possible, because they can block reduction of gradients for used parameters. Previously, this happened immediately when `prepare_for_backward` was called and found parameters that didn't contribute to the output. As a result, if a model had unused parameters and you wanted to discard its output (i.e. not call `backward` on some loss), reduction of the gradients of those unused parameters was kicked off anyway, and you would see an error the next time you called `forward`.

This commit changes that approach slightly: reduction of the gradients of unused parameters is delayed until the first autograd hook is called. This means you can now discard the model output regardless of whether the model has unused parameters. This is a prerequisite for making the `find_unused_parameters` argument to DDP default to `True`.
1 parent 1d705b4 commit 216c384
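
For context, the behavior this enables looks roughly like the sketch below, adapted from the tests added in this commit. The `Net` module and the surrounding setup are illustrative only; a default process group is assumed to have been initialized elsewhere (e.g. via `torch.distributed.init_process_group`).

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel

# Illustrative module: fc3 never participates in forward(), so its
# parameters are "unused" from DDP's point of view.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(2, 10, bias=False)
        self.fc2 = nn.Linear(10, 4, bias=False)
        self.fc3 = nn.Linear(4, 4, bias=False)  # unused

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(x, dim=1)

# Assumes a process group has already been initialized.
model = DistributedDataParallel(Net(), find_unused_parameters=True)

input = torch.rand(4, 2)
for _ in range(4):
    output = model(input)
    # Discard the output without calling backward(). Before this commit,
    # reduction of fc3's gradients was kicked off from prepare_for_backward
    # and the next forward() would raise an error; after this commit, nothing
    # is reduced until an autograd hook fires, so this loop is fine.
    del output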

File tree

3 files changed: +115 −61 lines changed

  test/test_c10d.py
  torch/csrc/distributed/c10d/reducer.cpp
  torch/csrc/distributed/c10d/reducer.h

test/test_c10d.py

Lines changed: 49 additions & 9 deletions
@@ -2592,15 +2592,13 @@ def step_model(model, input, target):
             torch.manual_seed(1337 + iteration)
             input = input[torch.randperm(global_batch_size)]
 
-    @skip_if_not_nccl
-    @skip_if_not_multigpu
     def test_ignored_output(self):
         """
-        Note: this test can be sped up by only running it on a CPU module
-        once DistributedDataParallel supports them.
+        Test that the output of a model can be ignored and that there is no
+        implicit requirement that `backward` gets called.
         """
         store = c10d.FileStore(self.file.name, self.world_size)
-        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
 
         class IgnoredOutput(nn.Module):
             def __init__(self):
@@ -2614,17 +2612,59 @@ def forward(self, x):
                 x = self.relu(self.fc2(x))
                 return F.softmax(x, dim=1)
 
-        device_id = gpus_for_rank(self.world_size)[self.rank][0]
         model = DistributedDataParallel(
-            IgnoredOutput().float().to(device_id),
-            device_ids=[device_id],
+            IgnoredOutput().float(),
             process_group=process_group,
         )
 
         batch_size = 4
         criterion = nn.CrossEntropyLoss()
         input = torch.rand([batch_size, 2], dtype=torch.float)
-        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id)
+        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
+
+        # Run a few iterations where we ignore the output.
+        for _ in range(4):
+            output = model(input)
+            del output
+
+        # Run a few iterations where we use the output.
+        for _ in range(4):
+            output = model(input)
+            loss = criterion(output, target)
+            loss.backward()
+
+    def test_ignored_output_with_unused_parameters(self):
+        """
+        Test that the output of a model can be ignored and that there is no
+        implicit requirement that `backward` gets called, if not all model
+        parameters participated in computing the model output.
+        """
+        store = c10d.FileStore(self.file.name, self.world_size)
+        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
+
+        class IgnoredOutputWithUnusedParameters(nn.Module):
+            def __init__(self):
+                super(IgnoredOutputWithUnusedParameters, self).__init__()
+                self.fc1 = nn.Linear(2, 10, bias=False)
+                self.fc2 = nn.Linear(10, 4, bias=False)
+                self.fc3 = nn.Linear(4, 4, bias=False)
+                self.relu = nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(self.fc1(x))
+                x = self.relu(self.fc2(x))
+                return F.softmax(x, dim=1)
+
+        model = DistributedDataParallel(
+            IgnoredOutputWithUnusedParameters().float(),
+            process_group=process_group,
+            find_unused_parameters=True,
+        )
+
+        batch_size = 4
+        criterion = nn.CrossEntropyLoss()
+        input = torch.rand([batch_size, 2], dtype=torch.float)
+        target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)])
 
         # Run a few iterations where we ignore the output.
         for _ in range(4):

torch/csrc/distributed/c10d/reducer.cpp

Lines changed: 51 additions & 44 deletions
@@ -48,8 +48,8 @@ Reducer::Reducer(
       expect_sparse_gradients_(std::move(expect_sparse_gradients)),
       expect_autograd_hooks_(false),
       require_finalize_(false),
-      has_marked_unused_parameters_(false),
       next_bucket_(0),
+      has_marked_unused_parameters_(false),
       backward_stats_base_(0) {
   AT_ASSERTM(replicas_.size() >= 1, "Expected at least one model replica.");
   AT_ASSERTM(replicas_[0].size() >= 1, "Expected at least one parameter.");
@@ -118,6 +118,10 @@ Reducer::Reducer(
       for (size_t variable_index = 0; variable_index < variable_count;
            variable_index++) {
         auto& variable = replicas_[replica_index][variable_index];
+        const auto index = VariableIndex{
+            replica_index : replica_index,
+            variable_index : variable_index,
+        };
 
         // The gradient accumulator function is lazily initialized once.
         // Therefore we can use its presence in the autograd graph as
@@ -126,21 +130,14 @@ Reducer::Reducer(
 
         // Hook to execute after the gradient accumulator has executed.
         hooks_.emplace_back(
-            grad_accumulator->add_post_hook(
-                torch::make_unique<LambdaPostHook>([=] {
-                  std::lock_guard<std::mutex> lock(this->mutex_);
-                  this->mark_variable_ready(
-                      replica_index,
-                      variable_index,
-                      /* called_from_autograd= */ true);
-                })),
+            grad_accumulator->add_post_hook(torch::make_unique<LambdaPostHook>(
+                [=] { this->autograd_hook(index); })),
             grad_accumulator);
 
         // Map raw function pointer to replica index and parameter index.
         // This is used later on when the autograd graph is traversed
         // to check for parameters for which no gradient is computed.
-        func_[grad_accumulator.get()] =
-            std::make_tuple(replica_index, variable_index);
+        func_[grad_accumulator.get()] = index;
 
         // The gradient accumulator is stored as weak_ptr in the autograd
         // metadata of the variable, so we have to keep it alive here for
@@ -177,9 +174,9 @@ Reducer::~Reducer() noexcept(false) {
   }
 }
 
-void Reducer::mark_variable_ready_dense(
-    size_t replica_index,
-    size_t variable_index) {
+void Reducer::mark_variable_ready_dense(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   const auto& bucket_index = variable_locators_[variable_index];
   auto& bucket = buckets_[bucket_index.bucket_index];
   auto& replica = bucket.replicas[replica_index];
@@ -214,9 +211,9 @@ void Reducer::mark_variable_ready_dense(
   }
 }
 
-void Reducer::mark_variable_ready_sparse(
-    size_t replica_index,
-    size_t variable_index) {
+void Reducer::mark_variable_ready_sparse(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   const auto& bucket_index = variable_locators_[variable_index];
   auto& bucket = buckets_[bucket_index.bucket_index];
   auto& replica = bucket.replicas[replica_index];
@@ -235,22 +232,37 @@ void Reducer::mark_variable_ready_sparse(
   replica.contents = grad;
 }
 
-// Called when the gradient for the specified variable is ready.
-// It can be called from two places:
-// - By an autograd thread after executing a gradient accumulator function.
-// - By the `Reducer::prepare_for_backward` function if the variable doesn't
-//   show up in the autograd graph (and it wouldn't be called by autograd).
-void Reducer::mark_variable_ready(
-    size_t replica_index,
-    size_t variable_index,
-    bool called_from_autograd) {
+// The function `autograd_hook` is called after the gradient for a
+// model parameter has been accumulated into its gradient tensor.
+// This function is only to be called from the autograd thread.
+void Reducer::autograd_hook(VariableIndex index) {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+
   // Ignore if we don't expect to be called.
   // This may be the case if the user wants to accumulate gradients
   // for number of iterations before reducing them.
   if (!expect_autograd_hooks_) {
     return;
   }
 
+  // If there are model parameters that went unused when computing the model
+  // output, they won't be part of the autograd graph, and won't receive
+  // gradients. These parameters are discovered in the `prepare_for_backward`
+  // function and their indexes stored in the `unused_parameters_` vector.
+  if (!has_marked_unused_parameters_ && !unused_parameters_.empty()) {
+    has_marked_unused_parameters_ = true;
+    for (const auto& unused_index : unused_parameters_) {
+      mark_variable_ready(unused_index);
+    }
+  }
+
+  // Finally mark variable for which this function was originally called.
+  mark_variable_ready(index);
+}
+
+void Reducer::mark_variable_ready(VariableIndex index) {
+  const auto replica_index = index.replica_index;
+  const auto variable_index = index.variable_index;
   AT_ASSERTM(replica_index < replicas_.size(), "Out of range replica index.");
   AT_ASSERTM(
       variable_index < variable_locators_.size(),
@@ -293,9 +305,9 @@ void Reducer::mark_variable_ready(
   }
 
   if (bucket.expect_sparse_gradient) {
-    mark_variable_ready_sparse(replica_index, variable_index);
+    mark_variable_ready_sparse(index);
   } else {
-    mark_variable_ready_dense(replica_index, variable_index);
+    mark_variable_ready_dense(index);
  }
 
   // TODO(@pietern): Make this work for both CPU/CUDA tensors.
@@ -316,14 +328,10 @@ void Reducer::mark_variable_ready(
 
   // Run finalizer function once the final bucket was marked ready.
   if (next_bucket_ == buckets_.size()) {
-    if (called_from_autograd) {
-      torch::autograd::Engine::get_default_engine().queue_callback([=] {
-        std::lock_guard<std::mutex> lock(this->mutex_);
-        this->finalize_backward();
-      });
-    } else {
-      finalize_backward();
-    }
+    torch::autograd::Engine::get_default_engine().queue_callback([=] {
+      std::lock_guard<std::mutex> lock(this->mutex_);
+      this->finalize_backward();
+    });
   }
 }
 
@@ -489,8 +497,8 @@ void Reducer::prepare_for_backward(
   std::vector<torch::autograd::Function*> queue;
 
   // Check that any prior reduction has finished.
-  // The variable `expect_autograd_hooks` is true until gradients for all
-  // parameters have been received and all buckets are ready.
+  // The variable `require_finalize_` is true until all gradients
+  // have been computed and reduction of all buckets has been kicked off.
   if (require_finalize_) {
     AT_ERROR(
        "Expected to have finished reduction in the prior iteration before ",
@@ -513,7 +521,6 @@
   }
 
   // Reset accounting.
-  has_marked_unused_parameters_ = true;
   expect_autograd_hooks_ = true;
   next_bucket_ = 0;
   backward_stats_base_ = current_time_in_nanos();
@@ -524,11 +531,14 @@
     bucket.pending = bucket.replicas.size();
   }
 
+  // Reset unused parameter accounting.
+  has_marked_unused_parameters_ = false;
+  unused_parameters_.clear();
+
   // If no outputs are specified, we assume that autograd hooks for ALL
   // variables will be called, and we don't have to search the autograd graph
   // for presence of these hooks.
   if (outputs.empty()) {
-    has_marked_unused_parameters_ = false;
     return;
   }
 
@@ -562,10 +572,7 @@
      continue;
    }
 
-    size_t replica_index;
-    size_t variable_index;
-    std::tie(replica_index, variable_index) = it.second;
-    mark_variable_ready(replica_index, variable_index);
+    unused_parameters_.push_back(it.second);
  }
}

torch/csrc/distributed/c10d/reducer.h

Lines changed: 15 additions & 8 deletions
@@ -51,30 +51,37 @@ class Reducer {
   // Forward declaration.
   struct Bucket;
 
+  // Locates a specific variable by replica index and variable index.
+  struct VariableIndex {
+    size_t replica_index;
+    size_t variable_index;
+  };
+
   std::mutex mutex_;
   std::vector<std::vector<torch::autograd::Variable>> replicas_;
   std::shared_ptr<c10d::ProcessGroup> process_group_;
   std::vector<std::vector<bool>> expect_sparse_gradients_;
 
   std::vector<std::vector<std::shared_ptr<torch::autograd::Function>>>
       grad_accumulators_;
-  std::unordered_map<torch::autograd::Function*, std::tuple<int, int>> func_;
+  std::unordered_map<torch::autograd::Function*, VariableIndex> func_;
   std::vector<std::pair<uintptr_t, std::shared_ptr<torch::autograd::Function>>>
       hooks_;
 
   bool expect_autograd_hooks_;
   bool require_finalize_;
-  bool has_marked_unused_parameters_;
   size_t next_bucket_;
 
-  void mark_variable_ready_dense(size_t replica_index, size_t variable_index);
+  bool has_marked_unused_parameters_;
+  std::vector<VariableIndex> unused_parameters_;
+
+  void mark_variable_ready_dense(VariableIndex index);
+
+  void mark_variable_ready_sparse(VariableIndex index);
 
-  void mark_variable_ready_sparse(size_t replica_index, size_t variable_index);
+  void mark_variable_ready(VariableIndex index);
 
-  void mark_variable_ready(
-      size_t replica_index,
-      size_t variable_index,
-      bool called_from_autograd = false);
+  void autograd_hook(VariableIndex index);
 
   void mark_bucket_ready(size_t bucket_index);

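Taken together, the changes above boil down to a small amount of deferral logic. The following is a minimal Python sketch of that control flow, for illustration only; the names mirror the C++ members in reducer.cpp and reducer.h, but this is not the actual implementation.

class ReducerSketch:
    """Illustrative model of the deferral logic introduced in this commit."""

    def __init__(self):
        self.expect_autograd_hooks = False
        self.has_marked_unused_parameters = False
        self.unused_parameters = []

    def prepare_for_backward(self, unused_parameter_indices):
        # Remember which parameters did not participate in the output,
        # but no longer kick off their reduction from here.
        self.expect_autograd_hooks = True
        self.has_marked_unused_parameters = False
        self.unused_parameters = list(unused_parameter_indices)

    def autograd_hook(self, index):
        # Called when a parameter's gradient has been accumulated.
        if not self.expect_autograd_hooks:
            return
        # On the first hook of a backward pass, flush the unused parameters
        # so their buckets don't block reduction of the used ones.
        if not self.has_marked_unused_parameters and self.unused_parameters:
            self.has_marked_unused_parameters = True
            for unused_index in self.unused_parameters:
                self.mark_variable_ready(unused_index)
        self.mark_variable_ready(index)

    def mark_variable_ready(self, index):
        print("variable", index, "is ready for reduction")

If `backward` is never called, no autograd hook fires, nothing is marked ready, and no reduction is started; that is what makes it safe to discard the model output.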