🐛 Bug
SyncBatchNorm running_var value update issue
Issue
We (@unlimblue, @OscarZhangF) found that when the input size differs between GPUs, the running_var value does not stay in sync across GPUs.
For example:
gpu0: input size ? x 3 x 300 x 300
gpu1: input size ? x 3 x 500 x 500
In _functions.py, line 35:

```python
mean, invstd = torch.batch_norm_gather_stats(
    input,
    mean_all,
    invstd_all,
    running_mean,
    running_var,
    momentum,
    eps,
    int(input.numel() / input.size(1))
)
```

The argument `count=int(input.numel() / input.size(1))` is different between gpu0 and gpu1.
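For instance, with the shapes above the per-GPU count works out very differently. A quick sketch (the batch size `N` is a placeholder for the `?` above):

```python
import torch

N = 8  # hypothetical batch size standing in for "?" above

x0 = torch.empty(N, 3, 300, 300)  # gpu0 input
x1 = torch.empty(N, 3, 500, 500)  # gpu1 input

count0 = int(x0.numel() / x0.size(1))  # N * 300 * 300 = 720000
count1 = int(x1.numel() / x1.size(1))  # N * 500 * 500 = 2000000
```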
In Normalization.cuh, line 410:

```cuda
// first the reductions each thread does separately
for (int i = bid*blockDim.x+tid; i < feature_size; i += gridDim.x*blockDim.x) {
  accscalar_t avg = 0;
  accscalar_t var_n = 0;
  index_t n = 0;
  for (int j = 0; j < world_size; j++) {
    accscalar_t m = vec_mean[j][i];
    accscalar_t v = accscalar_t(1.0) / (vec_invstd[j][i]);
    v = (v * v - epsilon) * count;
    accscalar_t factor = 1.0 / (n + count);
    var_n += v + (avg - m) * (avg - m) * n * count * factor;
    avg = n * factor * avg + count * factor * m;
    n += count;
  }
  mean[i] = avg;
  invstd[i] = static_cast<accscalar_t>(1) / device_sqrt(var_n / n + epsilon);
  if (running_mean.data() != NULL) {
    running_mean[i] = static_cast<scalar_t>((1 - momentum) * running_mean[i] + momentum * avg);
  }
  accscalar_t unbiasedVar = var_n / (n - 1);
  if (running_var.data() != NULL) {
    running_var[i] = static_cast<scalar_t>((1 - momentum) * running_var[i] + momentum * unbiasedVar);
  }
}
```

The running_var update depends on count, so it does not stay in sync across GPUs: each rank applies its own local count to the statistics gathered from every rank.
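To make the dependence explicit, here is a rough Python transcription of that per-feature reduction (our own sketch, not PyTorch code). Every rank passes its own scalar count and uses it for all world_size entries, so n, var_n, and therefore unbiasedVar come out different on every rank even though the gathered means and invstds are identical:

```python
def gather_var_merge(means, invstds, count, eps):
    """Replay of the per-feature reduction above for a single feature,
    using one local `count` for every rank, as the kernel does."""
    avg, var_n, n = 0.0, 0.0, 0
    for m, invstd in zip(means, invstds):
        v = (1.0 / invstd ** 2 - eps) * count      # recover that rank's biased variance
        factor = 1.0 / (n + count)
        var_n += v + (avg - m) ** 2 * n * count * factor
        avg = n * factor * avg + count * factor * m
        n += count
    mean = avg
    unbiased_var = var_n / (n - 1)                 # this is what feeds running_var
    return mean, unbiased_var
```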
Test
"""run.py:"""
#!/usr/bin/env python
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.multiprocessing import Process
from torch.nn.parallel import DistributedDataParallel as DDP
def run(rank, config):
m = nn.SyncBatchNorm(2, momentum=0.99).to(config["device_ids"][rank])
ddp_model = DDP(m)
for i in range(100):
I = torch.ones(2, 2, config["sizes"][rank]) * config["values"][rank]
O = ddp_model(I)
print(rank, ddp_model.module.running_mean.sum(), ddp_model.module.running_var.sum())
def init_processes(rank, size, fn, config, backend='gloo'):
""" Initialize the distributed environment. """
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
dist.init_process_group(backend, rank=rank, world_size=size)
torch.manual_seed(1234)
fn(rank, config)
if __name__ == "__main__":
config = {
"device_ids": [0, 0, 0],
"sizes": [10, 10, 10],
# "sizes": [10, 50, 100],
"values": [1, 0.1, 0.01],
}
processes = []
world_size = 3
for rank in range(world_size):
p = Process(target=init_processes, args=(rank, world_size, run, config))
p.start()
processes.append(p)
for p in processes:
p.join()result is
```
0 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
1 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
2 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
```

but if we change config["sizes"] to [10, 50, 100], the result is:

```
2 tensor(0.7400, device='cuda:0') tensor(0.4003, device='cuda:0')
1 tensor(0.7400, device='cuda:0') tensor(0.4010, device='cuda:0')
0 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
```
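running_mean agrees on every rank, but running_var does not. Feeding the repro's gathered statistics into the gather_var_merge sketch above (per-rank count = 2 * size for an input of shape (2, 2, size); the within-rank variance is 0, so invstd ≈ 1/sqrt(eps)) gives per-rank unbiased variances very close to the numbers printed above, which points at the per-rank count as the source of the divergence:

```python
eps = 1e-5
means = [1.0, 0.1, 0.01]                    # per-rank batch means (inputs are constant)
invstds = [1.0 / (0.0 + eps) ** 0.5] * 3    # within-rank variance is 0

for rank, size in enumerate([10, 50, 100]):
    count = 2 * size                         # N * L for an input of shape (2, 2, size)
    mean, var = gather_var_merge(means, invstds, count, eps)
    # sum over the 2 identical channels, as running_var.sum() does above
    print(rank, round(2 * mean, 4), round(2 * var, 4))
# rank 0 -> ~0.4064, rank 1 -> ~0.4009, rank 2 -> ~0.4003
# (with equal sizes all three ranks would get the same value)
```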