
Commit 0ff6023

Revert "Retire torch.distributed.pipeline (#127354)"
This reverts commit b9c058c. Reverted #127354 on behalf of https://github.com/huydhn due to Sorry for reverting your change but the doc build failure looks legit https://hud.pytorch.org/pytorch/pytorch/commit/b9c058c203ee38032594f898f27cd8404f113a63 ([comment](#127354 (comment)))
1 parent 627d2cd commit 0ff6023

64 files changed: +7891 -0 lines. (Large commit; only part of the diff is shown below.)

.lintrunner.toml

Lines changed: 24 additions & 0 deletions
@@ -1535,6 +1535,28 @@ exclude_patterns = [
     'torch/distributed/optim/post_localSGD_optimizer.py',
     'torch/distributed/optim/utils.py',
     'torch/distributed/optim/zero_redundancy_optimizer.py',
+    'torch/distributed/pipeline/__init__.py',
+    'torch/distributed/pipeline/sync/__init__.py',
+    'torch/distributed/pipeline/sync/_balance/__init__.py',
+    'torch/distributed/pipeline/sync/_balance/blockpartition.py',
+    'torch/distributed/pipeline/sync/_balance/profile.py',
+    'torch/distributed/pipeline/sync/batchnorm.py',
+    'torch/distributed/pipeline/sync/checkpoint.py',
+    'torch/distributed/pipeline/sync/copy.py',
+    'torch/distributed/pipeline/sync/dependency.py',
+    'torch/distributed/pipeline/sync/microbatch.py',
+    'torch/distributed/pipeline/sync/phony.py',
+    'torch/distributed/pipeline/sync/pipe.py',
+    'torch/distributed/pipeline/sync/pipeline.py',
+    'torch/distributed/pipeline/sync/skip/__init__.py',
+    'torch/distributed/pipeline/sync/skip/layout.py',
+    'torch/distributed/pipeline/sync/skip/namespace.py',
+    'torch/distributed/pipeline/sync/skip/portal.py',
+    'torch/distributed/pipeline/sync/skip/skippable.py',
+    'torch/distributed/pipeline/sync/skip/tracker.py',
+    'torch/distributed/pipeline/sync/stream.py',
+    'torch/distributed/pipeline/sync/utils.py',
+    'torch/distributed/pipeline/sync/worker.py',
     'torch/distributed/remote_device.py',
     'torch/distributed/rendezvous.py',
     'torch/distributed/rpc/__init__.py',
@@ -1828,6 +1850,8 @@ exclude_patterns = [
     'torch/testing/_internal/distributed/nn/__init__.py',
     'torch/testing/_internal/distributed/nn/api/__init__.py',
     'torch/testing/_internal/distributed/nn/api/remote_module_test.py',
+    'torch/testing/_internal/distributed/pipe_with_ddp_test.py',
+    'torch/testing/_internal/distributed/pipeline/__init__.py',
     'torch/testing/_internal/distributed/rpc/__init__.py',
     'torch/testing/_internal/distributed/rpc/dist_autograd_test.py',
     'torch/testing/_internal/distributed/rpc/dist_optimizer_test.py',
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
import torch
from torch.utils.data import Dataset


def collate_sentences_lm(samples):
    if len(samples) == 0:
        return {}

    id = torch.LongTensor([s["id"] for s in samples])
    src_tokens = torch.stack([s["source"] for s in samples], 0)
    tgt_tokens = torch.stack([s["target"] for s in samples], 0)
    ntokens = len(samples) * len(samples[0]["target"])
    src_lengths = torch.LongTensor([len(samples[0]["source"])] * len(samples))

    batch = {
        "id": id,
        "nsentences": len(samples),
        "ntokens": ntokens,
        "input": src_tokens,
        "target": tgt_tokens,
    }
    return batch


class BenchmarkLMDataset(Dataset):
    """
    Dataset to benchmark a translation like seq2seq task.
    Args:
        vocab_size (int, optional): size of the vocabulary (default 10000).
        max_source_positions (int, optional): max number of tokens in the
            source sentence (default: 1024).
        total_samples (int, optional): the total number of rows in the
            dataset (default: 10000).
    """

    def __init__(
        self,
        vocab_size=10000,
        max_source_positions=1024,
        total_samples=10000,
    ):
        self.vocab_size = vocab_size
        self.max_source_positions = max_source_positions
        self.total_samples = total_samples
        self.sizes = [self.max_source_positions] * self.total_samples

    def __getitem__(self, index):
        length = self.sizes[index]
        source = torch.randint(1, self.vocab_size, (length,))
        target = source.clone()
        return {
            "id": index,
            "source": source,
            "target": target,
        }

    def __len__(self):
        return self.total_samples
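
For context, a minimal usage sketch of the dataset and collate function above. It assumes the file is importable as benchmark_dataset, which matches the import in the benchmark script further down; the sizes used here are illustrative only.

from torch.utils.data import DataLoader

from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm

# Small illustrative sizes; the benchmark itself uses the defaults (10000 / 1024 / 10000).
dataset = BenchmarkLMDataset(vocab_size=1000, max_source_positions=128, total_samples=64)
loader = DataLoader(dataset, batch_size=8, collate_fn=collate_sentences_lm)

batch = next(iter(loader))
# "input" and "target" are (batch_size, max_source_positions) LongTensors of token ids;
# "ntokens" is batch_size * sequence_length.
print(batch["input"].shape, batch["target"].shape, batch["ntokens"])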
Lines changed: 296 additions & 0 deletions
@@ -0,0 +1,296 @@
import argparse
import math
import os
import time

from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm

import torch
import torch.nn as nn
from torch.distributed import rpc

from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model
from torch.optim import Adam
from torch.utils.data import DataLoader


def sizeof_fmt(num, suffix="B"):
    for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
        if abs(num) < 1024.0:
            return f"{num:3.2f}{unit}B"
        num /= 1024.0


def init_random_seed(seed: int):
    import numpy

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    numpy.random.seed(seed)


iteration_count = 0


class EmbeddingLayer(nn.Embedding):
    def __init__(self, ntoken, ninp, initrange):
        super().__init__(ntoken, ninp)
        self.ninp = ninp
        nn.init.uniform_(self.weight, -initrange, initrange)

    def forward(self, src):
        return super().forward(src) * math.sqrt(self.ninp)


class PositionalEncodingLayer(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)


class TransformerDecoderLayer(nn.TransformerEncoderLayer):
    """Though this class inherits from torch.nn.TransformerEncoderLayer,
    it functions as a decoder in this model"""

    def __init__(self, ninp, nhead, nhid, droupout):
        super().__init__(ninp, nhead, nhid, droupout)
        self.src_mask = None

    def forward(self, src):
        global iteration_count
        iteration_count += 1

        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        return super().forward(src, self.src_mask)


class LinearLayer(nn.Linear):
    def __init__(self, ninp, ntoken, initrange):
        super().__init__(ninp, ntoken)
        nn.init.zeros_(self.bias)
        nn.init.uniform_(self.weight, -initrange, initrange)


class TransformerLMSequential(nn.Sequential):
    """A small language model based on the design of GPT-2 using nn.Sequential
    for compatibility with Pipe"""

    def __init__(self, ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder):
        layers = [
            EmbeddingLayer(ntokens, ninp, initrange),
            PositionalEncodingLayer(ninp, dropout),
        ]
        for _ in range(ndecoder):
            layers.append(TransformerDecoderLayer(ninp, nhead, nhid, dropout))

        layers.append(LinearLayer(ninp, ntokens, initrange))
        super().__init__(*layers)


def make_model(args, device, ntokens):
    ninp = 2048  # embedding dimension
    nhid = (
        2048  # the dimension of the feedforward network model in nn.TransformerEncoder
    )
    nhead = 32  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1
    ndecoder = args.num_decoder_layers

    model = TransformerLMSequential(
        ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    lr = 0.01  # learning rate

    def make_adam(model):
        return Adam(model.parameters(), lr=lr)

    optimizer = make_adam

    return model, criterion, optimizer


def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
    model.train()

    vocab_size = 10000
    total_loss = 0.0
    start_time = time.time()
    word_counter = 0

    optimizer = optimizer(model)

    def get_first_device(model):
        if model.devices:
            return model.devices[0]
        else:
            return torch.cuda.current_device()

    def get_last_device(model):
        if model.devices:
            return model.devices[-1]
        else:
            return torch.cuda.current_device()

    print(
        f"Number of parameters for model: {sum(p.numel() for p in model.parameters())}"
    )
    for i, batch in enumerate(lm_dataloader):
        bi = batch["input"]
        if args.max_batch and i > args.max_batch:
            break
        optimizer.zero_grad()
        try:
            tmp = batch["input"].to(get_first_device(model))
            output = model(tmp).local_value()
        except Exception as e:
            raise RuntimeError(
                f"training failed on {torch.distributed.get_rank()}"
            ) from e

        target = batch["target"].to(get_last_device(model))
        output = output.to(target.device)

        loss = criterion(output.view(-1, vocab_size), target.view(-1))
        loss.backward()
        del target
        del output

        torch.nn.utils.clip_grad_value_(model.parameters(), 0.05)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 1
        word_counter += batch["ntokens"]
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print(
                f"| batch {i:5d} | wps {word_counter / elapsed:5.2f} | loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}"
            )
            word_counter = 0
            total_loss = 0
            start_time = time.time()

    print("Peak memory usage for GPUs: ", end="")
    for i in range(len(model.devices)):
        print(
            f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ",
            end="",
        )
    print()


def generate_balance(num_devices, num_layers):
    balance = []
    layers_assigned = 0
    for i in range(num_devices):
        x = (num_layers - layers_assigned) / (num_devices - i)
        if x.is_integer():
            balance.append(int(x))
            layers_assigned += x
        else:
            balance.append(math.ceil(x))
            layers_assigned += math.ceil(x)
    return balance


def make_model_and_data(args, device):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    vocab_size = 10000
    model, criterion, optimizer = make_model(args, device, vocab_size)
    lm_dataset = BenchmarkLMDataset()
    lm_dataloader = DataLoader(
        lm_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_sentences_lm,
    )
    return {
        "model": model,
        "criterion": criterion,
        "optimizer": optimizer,
        "data": lm_dataloader,
        "vocab_size": vocab_size,
    }


def bench_single_process(args):
    os.environ.update({"MASTER_ADDR": args.host})
    os.environ.update({"MASTER_PORT": "10638"})

    rpc.init_rpc(
        "worker",
        rank=0,
        world_size=1,
    )

    num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
    num_devices = min(args.num_devices, num_devices)
    assert num_devices > 0
    init_random_seed(0)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    blob = make_model_and_data(args, None)
    model = blob["model"]

    balance = generate_balance(num_devices, len(model))
    model = partition_model(model, balance)
    p = Pipe(model, chunks=args.chunks, checkpoint=args.checkpoint)
    del model
    del blob["model"]

    train(
        blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args
    )


parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
parser.add_argument(
    "--chunks", type=int, default=4, help="number of microbatches per batch"
)
parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
parser.add_argument("--max-batch", type=int, default=10, help="Max number of batches")
parser.add_argument(
    "--num-decoder-layers",
    type=int,
    default=10,
    help="Number of decoder layers in the model",
)
parser.add_argument(
    "--checkpoint",
    default="except_last",
    choices=["always", "except_last", "never"],
    help="Checkpointing strategy for pipe",
)
parser.add_argument(
    "--num-devices", type=int, default=4, help="Number of GPU devices to use"
)

if __name__ == "__main__":
    args = parser.parse_args()
    print(f"Running benchmark with args: {args}")
    bench_single_process(args)
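
For reference, the benchmark is driven entirely by the argparse flags above; a single-process run would look something like python <this script> --num-devices 2 --chunks 4 --batch-size 8 (the script's file name is not shown in the diff header above, so it is left unspecified here). With the default --num-decoder-layers 10, TransformerLMSequential has 13 stages (embedding, positional encoding, 10 decoder layers, output linear), so a quick sanity check of the layer-balancing helper might look like this sketch:

# Worked example of generate_balance, assuming the function defined in the script above is in scope.
# 13 layers over 4 devices: 13/4 = 3.25 -> ceil to 4, then 3.0 -> 3 for each remaining device.
print(generate_balance(4, 13))  # [4, 3, 3, 3]
# 13 layers over 2 devices: 13/2 = 6.5 -> 7, then 6.
print(generate_balance(2, 13))  # [7, 6]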
