
Sync Batchnorm running var update issue #22192

Description

🐛 Bug

SyncBatchNorm running_var value update issue

Issue

We (@unlimblue and @OscarZhangF) found that when the input size differs between GPUs, the running_var value does not stay in sync across GPUs.
e.g.:
gpu0: input size ? x 3 x 300 x 300
gpu1: input size ? x 3 x 500 x 500

In _functions.py, line 35:

        mean, invstd = torch.batch_norm_gather_stats(
            input,
            mean_all,
            invstd_all,
            running_mean,
            running_var,
            momentum,
            eps,
            int(input.numel() / input.size(1))
        )

The argument count = int(input.numel() / input.size(1)) is therefore different on gpu0 and gpu1.
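For concreteness, a minimal sketch of how count works out for the two example inputs (the batch size of 8 is an assumption, the original report leaves it as "?"):

import torch

# Hypothetical inputs matching the shapes above; batch size 8 is made up.
x0 = torch.empty(8, 3, 300, 300)   # gpu0
x1 = torch.empty(8, 3, 500, 500)   # gpu1

count0 = int(x0.numel() / x0.size(1))   # 8 * 300 * 300 = 720000
count1 = int(x1.numel() / x1.size(1))   # 8 * 500 * 500 = 2000000
print(count0, count1)   # each rank passes its own, different count to batch_norm_gather_stats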

In Normalization.cuh, line 410:

// first the reductions each thread does separately
  for (int i = bid*blockDim.x+tid; i < feature_size; i += gridDim.x*blockDim.x) {
    accscalar_t avg = 0;
    accscalar_t var_n = 0;
    index_t n = 0;
    for (int j = 0; j < world_size; j++) {
      accscalar_t m = vec_mean[j][i];
      accscalar_t v = accscalar_t(1.0) / (vec_invstd[j][i]);
      v = (v * v - epsilon) * count;
      accscalar_t factor = 1.0 / (n + count);
      var_n += v + (avg - m) * (avg - m) * n * count * factor;
      avg = n * factor * avg + count * factor * m;
      n += count;
    }
    mean[i] = avg;
    invstd[i] = static_cast<accscalar_t>(1) / device_sqrt(var_n / n + epsilon);
    if (running_mean.data() != NULL) {
      running_mean[i] = static_cast<scalar_t>((1 - momentum) * running_mean[i] + momentum * avg);
    }
    accscalar_t unbiasedVar = var_n / (n - 1);
    if (running_var.data() != NULL) {
      running_var[i] = static_cast<scalar_t>((1 - momentum) * running_var[i] + momentum * unbiasedVar);
    }
  }

The running_var update therefore depends on count, so it ends up out of sync between GPUs.
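To see how this plays out, here is a rough Python transcription of the reduction loop above (a sketch, not the actual kernel), fed with made-up per-channel statistics. Because each rank calls the kernel with its own local count, the unbiased variance, and hence the running_var update, differs per rank, while the pooled mean stays identical:

def gather_var(means, invstds, count, eps=1e-5):
    # Mirrors the merge loop in Normalization.cuh: `count` is the
    # local count of the calling rank, reused for every peer j.
    avg, var_n, n = 0.0, 0.0, 0
    for m, invstd in zip(means, invstds):
        v = ((1.0 / invstd) ** 2 - eps) * count
        factor = 1.0 / (n + count)
        var_n += v + (avg - m) ** 2 * n * count * factor
        avg = n * factor * avg + count * factor * m
        n += count
    return avg, var_n / (n - 1)   # pooled mean, unbiased variance

# Made-up per-rank statistics for a single channel (illustrative only).
means, invstds = [1.0, 0.1, 0.01], [2.0, 3.0, 4.0]

# With the test below (input shape 2 x 2 x size, sizes 10 / 50 / 100),
# the per-rank counts are 20, 100 and 200:
for count in (20, 100, 200):
    print(count, gather_var(means, invstds, count))

The pooled mean printed on every line is identical; the unbiased variance is not, which matches the test below where running_mean agrees across ranks while running_var diverges.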

Test

"""run.py:"""
#!/usr/bin/env python
import os
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.multiprocessing import Process
from torch.nn.parallel import DistributedDataParallel as DDP


def run(rank, config):
    # One SyncBatchNorm layer wrapped in DDP; every rank feeds it a constant
    # tensor whose length (and therefore per-rank count) comes from config.
    m = nn.SyncBatchNorm(2, momentum=0.99).to(config["device_ids"][rank])
    ddp_model = DDP(m)
    for i in range(100):
        I = torch.ones(2, 2, config["sizes"][rank]) * config["values"][rank]
        O = ddp_model(I)
    print(rank, ddp_model.module.running_mean.sum(), ddp_model.module.running_var.sum())


def init_processes(rank, size, fn, config, backend='gloo'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    torch.manual_seed(1234)
    fn(rank, config)


if __name__ == "__main__":
    
    config = {
        "device_ids": [0, 0, 0],
        "sizes": [10, 10, 10],
        # "sizes": [10, 50, 100],
        "values": [1, 0.1, 0.01],
    }
    processes = []
    world_size = 3
    for rank in range(world_size):
        p = Process(target=init_processes, args=(rank, world_size, run, config))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

The result is:

0 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
1 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
2 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')

But if we change config["sizes"] to [10, 50, 100], the result is:

2 tensor(0.7400, device='cuda:0') tensor(0.4003, device='cuda:0')
1 tensor(0.7400, device='cuda:0') tensor(0.4010, device='cuda:0')
0 tensor(0.7400, device='cuda:0') tensor(0.4064, device='cuda:0')
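One way to make the desync visible programmatically is a check like the one below, called at the end of run() (check_buffers_synced is a hypothetical helper, not part of the original report; it relies on the torch and dist imports from run.py, and gathers on CPU to stay within what the gloo backend reliably supports):

def check_buffers_synced(module, world_size):
    # Gather every rank's running_var on CPU and compare each copy against rank 0's.
    rv = module.running_var.detach().cpu()
    gathered = [torch.empty_like(rv) for _ in range(world_size)]
    dist.all_gather(gathered, rv)
    for r, other in enumerate(gathered):
        if not torch.allclose(other, gathered[0]):
            print("running_var differs on rank %d: %s vs %s" % (r, other, gathered[0]))

Given the results above, check_buffers_synced(ddp_model.module, 3) should stay silent for sizes [10, 10, 10] and should flag the non-zero ranks for [10, 50, 100].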


Labels: module: multi-gpu, module: nn, triaged
