Description
- Call `backward` on something in the local process
- Launch N subprocesses that each call `backward`
- The subprocesses hang
Here's a repro:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.multiprocessing as mp

def train():
    x = torch.randn(1000, 1)
    y = torch.randn(1000, 1)
    model = nn.Linear(1, 1)
    mse = nn.MSELoss()
    model.zero_grad()
    pred = model(Variable(x))
    loss = mse(pred, Variable(y))
    loss.backward()  # hangs here
    return model

def worker(rank):
    print("rank %d start" % rank)
    model = train()
    print("rank %d done" % rank)

def run_distributed(N):
    ps = []
    for rank in range(N):  # was range(10), which ignored N
        p = mp.Process(target=worker, args=(rank,))
        p.start()
        ps.append(p)
    for p in ps:
        p.join()

train()  # run_distributed hangs unless this line is commented out
print("Done main train")
run_distributed(5)
print("Done distributed train")