48 changes: 48 additions & 0 deletions benchmarks/functional_autograd_benchmark/README.md
@@ -0,0 +1,48 @@
# Benchmarking tool for the autograd API

This folder contains a set of self-contained scripts that let you benchmark the autograd API with different common models.
It is designed so that you can run the benchmark before and after your change and generate a table to share on the PR.

To do so, use `functional_autograd_benchmark.py` to run the benchmarks before your change (writing the output to `before.txt`) and after your change (writing the output to `after.txt`).
You can then use `compare.py` to get a markdown table comparing the two runs.

In general, you should use the default arguments of `functional_autograd_benchmark.py`. You can override them (for example with `--gpu` or `--run-slow-tasks`) to force a given device or to run even the (very) slow settings.

### Sample usage

```bash
# Make sure you compile pytorch in release mode and with the same flags before/after
export DEBUG=0
# When running on CPU, it might be required to limit the number of cores to avoid oversubscription
export OMP_NUM_THREADS=10

# Compile pytorch with the base revision
git checkout master
python setup.py develop

# Run the benchmark for the base
# This will use the GPU if available.
pushd benchmarks/functional_autograd_benchmark
python functional_autograd_benchmark.py --output before.txt

# Compile pytorch with your change
popd
git checkout your_feature_branch
python setup.py develop

# Run the benchmark for the new version
pushd benchmarks/functional_autograd_benchmark
python functional_autograd_benchmark.py --output after.txt

# Get the markdown table that you can paste in your GitHub PR
python compare.py

popd

```

### Files in this folder:
- `functional_autograd_benchmark.py` is the main entry point to run the benchmark.
- `compare.py` is the entry point to run the comparison script that generates a markdown table.
- `torchaudio_models.py` and `torchvision_models.py` contain code extracted from torchaudio and torchvision so that the models can be run without requiring a specific version of these libraries to be installed.
- `ppl_models.py`, `vision_models.py` and `audio_text_models.py` contain all the getter functions used for the benchmark.
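
Each getter returns a `(forward, params)` pair: a forward function that takes the model parameters as inputs, has no side effects, and returns the loss used for the benchmark, together with the tuple of parameter tensors extracted from the model. Below is a minimal sketch of that contract using the `extract_weights`/`load_weights` helpers from `utils.py`; the model, data, and `get_toy_model` name are illustrative only and not part of the benchmark.

```python
import torch
from torch import nn, Tensor

from utils import extract_weights, load_weights, GetterReturnType

def get_toy_model(device: torch.device) -> GetterReturnType:
    # Hypothetical model: any nn.Module can be plugged in the same way.
    model = nn.Linear(10, 1).to(device)
    criterion = nn.MSELoss()
    params, names = extract_weights(model)

    inputs = torch.rand(16, 10, device=device)
    targets = torch.rand(16, 1, device=device)

    def forward(*new_params: Tensor) -> Tensor:
        # Reload the given parameters into the model before running it;
        # the forward itself must not have any other side effect.
        load_weights(model, names, new_params)
        return criterion(model(inputs), targets)

    return forward, params
```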
122 changes: 122 additions & 0 deletions benchmarks/functional_autograd_benchmark/audio_text_models.py
@@ -0,0 +1,122 @@
import torch
from torch import nn, Tensor

import torchaudio_models as models

from utils import extract_weights, load_weights, GetterReturnType

def get_wav2letter(device: torch.device) -> GetterReturnType:
N = 10
input_frames = 700
vocab_size = 28
model = models.Wav2Letter(num_classes=vocab_size)
criterion = torch.nn.NLLLoss()
model.to(device)
params, names = extract_weights(model)

inputs = torch.rand([N, 1, input_frames], device=device)
labels = torch.rand(N, 3, device=device).mul(vocab_size).long()

def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
out = model(inputs)

loss = criterion(out, labels)
return loss

return forward, params

def get_deepspeech(device: torch.device) -> GetterReturnType:
sample_rate = 16000
window_size = 0.02
window = "hamming"
audio_conf = dict(sample_rate=sample_rate,
window_size=window_size,
window=window,
noise_dir=None)

N = 10
num_classes = 10
spectrogram_size = 161
    # The commented values are the original sizes used in the upstream code
seq_length = 500 # 1343
target_length = 10 # 50
labels = torch.rand(num_classes, device=device)
inputs = torch.rand(N, 1, spectrogram_size, seq_length, device=device)
# Sequence length for each input
inputs_sizes = torch.rand(N, device=device).mul(seq_length * 0.1).add(seq_length * 0.8)
targets = torch.rand(N, target_length, device=device)
targets_sizes = torch.full((N,), target_length, dtype=torch.int, device=device)

model = models.DeepSpeech(rnn_type=nn.LSTM, labels=labels, rnn_hidden_size=1024, nb_layers=5,
audio_conf=audio_conf, bidirectional=True)
model = model.to(device)
criterion = nn.CTCLoss()
params, names = extract_weights(model)

def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
out, out_sizes = model(inputs, inputs_sizes)
out = out.transpose(0, 1) # For ctc loss

loss = criterion(out, targets, out_sizes, targets_sizes)
return loss

return forward, params

def get_transformer(device: torch.device) -> GetterReturnType:
    # For most SOTA research, you would want the embedding size to be 720, nhead to be 12, bsz to be 64, and tgt_len/src_len to be 128.
N = 64
seq_length = 128
ntoken = 50
model = models.TransformerModel(ntoken=ntoken, ninp=720, nhead=12, nhid=2048, nlayers=2)
model.to(device)
criterion = nn.NLLLoss()
params, names = extract_weights(model)

data = torch.rand(N, seq_length + 1, device=device).mul(ntoken).long()
inputs = data.narrow(1, 0, seq_length)
targets = data.narrow(1, 1, seq_length)

def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
out = model(inputs)

loss = criterion(out.reshape(N * seq_length, ntoken), targets.reshape(N * seq_length))
return loss

return forward, params

def get_multiheadattn(device: torch.device) -> GetterReturnType:
# From https://github.com/pytorch/text/blob/master/test/data/test_modules.py#L10
embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
# Build torchtext MultiheadAttention module
in_proj = models.InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False))

model = models.MultiheadAttentionContainer(nhead, in_proj,
models.ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
model.to(device)
params, names = extract_weights(model)

query = torch.rand((tgt_len, bsz, embed_dim), device=device)
key = value = torch.rand((src_len, bsz, embed_dim), device=device)
attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len), device=device).to(torch.bool)
bias_k = bias_v = torch.rand((1, 1, embed_dim), device=device)

attn_mask = torch.stack([attn_mask_2D] * (bsz * nhead))
bias_k = bias_k.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1)
bias_v = bias_v.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1)
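    # The reshapes above expand the attention mask and biases to one copy per attention head (bsz * nhead).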

def forward(*new_params: Tensor) -> Tensor:
load_weights(model, names, new_params)
mha_output, attn_weights = model(query, key, value, attn_mask=attn_mask, bias_k=bias_k, bias_v=bias_v)

        # Don't test any specific loss, just backprop through the sum of both outputs
loss = mha_output.sum() + attn_weights.sum()

return loss

return forward, params
45 changes: 45 additions & 0 deletions benchmarks/functional_autograd_benchmark/compare.py
@@ -0,0 +1,45 @@
import argparse
from collections import defaultdict

from utils import to_markdown_table, from_markdown_table

def main():
parser = argparse.ArgumentParser("Main script to compare results from the benchmarks")
parser.add_argument("--before", type=str, default="before.txt", help="Text file containing the times to use as base")
parser.add_argument("--after", type=str, default="after.txt", help="Text file containing the times to use as new version")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
args = parser.parse_args()

with open(args.before, "r") as f:
content = f.read()
res_before = from_markdown_table(content)

with open(args.after, "r") as f:
content = f.read()
res_after = from_markdown_table(content)

diff = defaultdict(defaultdict)
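    # diff maps model -> task -> (speedup, mean before, var before, mean after, var after);
    # when a (model, task) pair is missing from one of the two runs, the corresponding fields are left as None.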
for model in res_before:
for task in res_before[model]:
mean_before, var_before = res_before[model][task]
if task not in res_after[model]:
diff[model][task] = (None, mean_before, var_before, None, None)
else:
mean_after, var_after = res_after[model][task]
diff[model][task] = (mean_before / mean_after, mean_before, var_before, mean_after, var_after)
for model in res_after:
for task in res_after[model]:
if task not in res_before[model]:
mean_after, var_after = res_after[model][task]
diff[model][task] = (None, None, None, mean_after, var_after)

header = ("model", "task", "speedup", "mean (before)", "var (before)", "mean (after)", "var (after)")
out = to_markdown_table(diff, header=header)

print(out)
if args.output:
with open(args.output, "w") as f:
f.write(out)

if __name__ == "__main__":
main()
153 changes: 153 additions & 0 deletions benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py
@@ -0,0 +1,153 @@
import torch
from torch.autograd import functional

import time
from argparse import ArgumentParser
from collections import defaultdict
from typing import NamedTuple, Callable, List, Any

import ppl_models
import vision_models
import audio_text_models

from utils import to_markdown_table, TimingResultType, InputsType, GetterType, VType

# Listing of the different tasks
FAST_TASKS_NO_DOUBLE_BACK = [
"vjp",
]

FAST_TASKS = FAST_TASKS_NO_DOUBLE_BACK + [
"vhp",
"jvp",
]

ALL_TASKS = FAST_TASKS + [
"hvp",
"jacobian",
"hessian"
]

DOUBLE_BACKWARD_TASKS = ["jvp", "hvp", "vhp", "hessian"]
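# Each task name matches a function in torch.autograd.functional (looked up with
# getattr in run_once below). The tasks in DOUBLE_BACKWARD_TASKS require computing
# a double backward, which some models do not support (deepspeech is marked
# accordingly in MODELS below).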

# Model definition which contains:
# - name: a string with the model name.
# - getter: a function to get the model. It takes as input the device on which the model
# will run. It should return the forward function and the parameters (Tensors) used as
# input for the forward function. Note that the forward must *not* have any side effect.
# - tasks: the list of recommended tasks that can run in a reasonable amount of time with this model.
# - unsupported: the list of tasks that this model cannot run.
class ModelDef(NamedTuple):
name: str
getter: GetterType
tasks: List[str]
unsupported: List[str]

MODELS = [
ModelDef("resnet18", vision_models.get_resnet18, FAST_TASKS, []),
ModelDef("fcn_resnet", vision_models.get_fcn_resnet, FAST_TASKS, []),
ModelDef("detr", vision_models.get_detr, FAST_TASKS, []),
ModelDef("ppl_simple_reg", ppl_models.get_simple_regression, ALL_TASKS, []),
ModelDef("ppl_robust_reg", ppl_models.get_robust_regression, ALL_TASKS, []),
ModelDef("wav2letter", audio_text_models.get_wav2letter, FAST_TASKS, []),
ModelDef("deepspeech", audio_text_models.get_deepspeech, FAST_TASKS_NO_DOUBLE_BACK, DOUBLE_BACKWARD_TASKS),
ModelDef("transformer", audio_text_models.get_transformer, FAST_TASKS, []),
ModelDef("multiheadattn", audio_text_models.get_multiheadattn, FAST_TASKS, []),
]
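# Illustrative only: to benchmark a new model, add an entry such as
#   ModelDef("my_model", my_models.get_my_model, FAST_TASKS, []),
# where `my_models.get_my_model` is a hypothetical getter following the contract
# described above (a side-effect-free forward function plus the parameter tensors).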

def get_v_for(model: Callable, inp: InputsType, task: str) -> VType:
v: VType
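    # The auxiliary vector v must have the same shape as the model output for vjp
    # and the same shape(s) as the inputs for jvp/hvp/vhp; jacobian and hessian take no v.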

if task in ["vjp"]:
out = model(*inp)
v = torch.rand_like(out)
elif task in ["jvp", "hvp", "vhp"]:
if isinstance(inp, tuple):
v = tuple(torch.rand_like(i) for i in inp)
else:
v = torch.rand_like(inp)
else:
v = None

return v

def run_once(model: Callable, inp: InputsType, task: str, v: VType) -> None:
func = getattr(functional, task)

if v is not None:
res = func(model, inp, v=v, strict=True)
else:
res = func(model, inp, strict=True)

def run_model(model_getter: GetterType, args: Any, task: str) -> List[float]:
if args.gpu == -1:
device = torch.device("cpu")

def noop():
pass
do_sync = noop
else:
device = torch.device("cuda:{}".format(args.gpu))
do_sync = torch.cuda.synchronize

model, inp = model_getter(device)

v = get_v_for(model, inp, task)
# Warmup
run_once(model, inp, task, v)

elapsed = []
for it in range(args.num_iters):
do_sync()
start = time.time()
run_once(model, inp, task, v)
do_sync()
elapsed.append(time.time() - start)

return elapsed

def main():
    parser = ArgumentParser("Main script to benchmark the functional API of autograd.")
parser.add_argument("--output", type=str, default="", help="Text file where to write the output")
parser.add_argument("--num-iters", type=int, default=10)
parser.add_argument("--gpu", type=int, default=-2, help="GPU to use, -1 for CPU and -2 for auto-detect")
parser.add_argument("--run-slow-tasks", action="store_true", help="Run even the slow tasks")
parser.add_argument("--model-filter", type=str, default="", help="Only run the models in this filter")
parser.add_argument("--task-filter", type=str, default="", help="Only run the tasks in this filter")
parser.add_argument("--num-threads", type=int, default=10,
help="Number of concurrent threads to use when running on cpu")
parser.add_argument("--seed", type=int, default=0, help="The random seed to use.")
args = parser.parse_args()

results: TimingResultType = defaultdict(defaultdict)
torch.set_num_threads(args.num_threads)
torch.set_num_interop_threads(args.num_threads)

    # This automatically seeds CUDA if it is available
torch.manual_seed(args.seed)

if args.gpu == -2:
args.gpu = 0 if torch.cuda.is_available() else -1

for name, model_getter, recommended_tasks, unsupported_tasks in MODELS:
if args.model_filter and name not in args.model_filter:
continue
tasks = ALL_TASKS if args.run_slow_tasks else recommended_tasks
for task in tasks:
if task in unsupported_tasks:
continue
if args.task_filter and task not in args.task_filter:
continue
runtimes = run_model(model_getter, args, task)

runtimes = torch.tensor(runtimes)
mean, var = runtimes.mean(), runtimes.var()
results[name][task] = (mean.item(), var.item())
print("Results for model {} on task {}: {}s (var: {})".format(name, task, mean, var))

if args.output:
with open(args.output, "w") as f:
f.write(to_markdown_table(results))

if __name__ == "__main__":
main()