LambdaLabsML · justinpinkney · Jan 19, 2023 · Dec 20, 2022 · Jan 12, 2023 · Jan 12, 2023
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ import torch
 from diffusers import StableDiffusionPipeline
 from torch import autocast
 
-pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16)  
+pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 
 prompt = "Yoda"
@@ -113,29 +113,21 @@ for idx, im in enumerate(images):
 
 ## Benchmarking inference
 
-Detailed benchmark documentation can be found [here](./docs/benchmark.md).
-
-### Setup
-
-Before running the benchmark, make sure you have completed the repository [installation steps](#installation).
-
-You will then need to set the huggingface access token:
-1. Create a user account on HuggingFace and generate an access token.
-2. Set your huggingface access token as the `ACCESS_TOKEN` environment variable:
-```
-export ACCESS_TOKEN=<hf_...>
-```
+We have updated the original benchmark using xformers and a newer version of Diffusers; see the [new results here](./docs/benchmark-update.md) (the original results can still be found [here](./docs/benchmark.md)).
 
 ### Usage
 
-Launch the benchmark script to append benchmark results to the existing [benchmark.csv](./benchmark.csv) results file:
-```
-python ./scripts/benchmark.py
+Ensure that [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following:
+
+```bash
+git clone https://github.com/LambdaLabsML/lambda-diffusers.git
+cd lambda-diffusers/scripts
+make bench
 ```
 
 ### Results
 
-<img src="./docs/pictures/pretty_benchmark_sd_txt2img_latency.png" alt="Stable Diffusion Text2Image Latency (seconds)" width="850"/>
+![](./docs/pictures/sd_throughput.png)
 
 ## Links
 

diff --git a/benchmark.csv b/benchmark.csv
@@ -1,58 +1,81 @@
-Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,pytorch,1,458.97,0.0
-Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,onnx,1,286.13,0.0
-NVIDIA GeForce RTX 3090,single,pytorch,1,7.96,7.72
-NVIDIA GeForce RTX 3090,half,pytorch,1,4.83,4.54
-NVIDIA GeForce RTX 3090,single,pytorch,2,14.49,11
-NVIDIA GeForce RTX 3090,half,pytorch,2,8.42,8.75
-NVIDIA GeForce RTX 3090,single,pytorch,4,27.94,17.69
-NVIDIA GeForce RTX 3090,half,pytorch,4,15.87,15.36
-NVIDIA GeForce RTX 3090,single,pytorch,8,-1.0,-1.0
-NVIDIA GeForce RTX 3090,half,pytorch,8,-1.0,-1.0
-NVIDIA RTX A5500,single,pytorch,1,8.55,7.69
-NVIDIA RTX A5500,half,pytorch,1,5.05,4.58
-NVIDIA RTX A5500,single,pytorch,2,15.71,11
-NVIDIA RTX A5500,half,pytorch,2,9.37,8.8
-NVIDIA RTX A5500,single,pytorch,4,30.51,17.69
-NVIDIA RTX A5500,half,pytorch,4,16.97,15.33
-NVIDIA RTX A5500,single,pytorch,8,-1.0,-1.0
-NVIDIA RTX A5500,half,pytorch,8,-1.0,-1.0
-AMD EPYC 7352 24-Core Processor,single,pytorch,1,529.93,0.0
-AMD EPYC 7352 24-Core Processor,single,onnx,1,223.19,0.0
-NVIDIA GeForce RTX 3080,single,pytorch,4,-1.0,-1.0
-NVIDIA GeForce RTX 3080,half,pytorch,4,-1.0,-1.0
-NVIDIA GeForce RTX 3080,single,pytorch,1,-1.0,-1.0
-NVIDIA GeForce RTX 3080,half,pytorch,1,5.59,4.52
-NVIDIA GeForce RTX 3080,single,pytorch,2,-1.0,-1.0
-NVIDIA GeForce RTX 3080,half,pytorch,2,-1.0,-1.0
-NVIDIA A100 80GB PCIe,single,pytorch,1,6.39,7.75
-NVIDIA A100 80GB PCIe,half,pytorch,1,3.74,4.55
-NVIDIA A100 80GB PCIe,single,pytorch,2,11.12,11.05
-NVIDIA A100 80GB PCIe,half,pytorch,2,5.72,8.77
-NVIDIA A100 80GB PCIe,single,pytorch,4,20.18,17.63
-NVIDIA A100 80GB PCIe,half,pytorch,4,10.04,15.34
-NVIDIA A100 80GB PCIe,single,pytorch,8,38.88,30.88
-NVIDIA A100 80GB PCIe,half,pytorch,8,18.68,28.47
-NVIDIA A100 80GB PCIe,single,pytorch,16,76.92,57.46
-NVIDIA A100 80GB PCIe,half,pytorch,16,36.67,54.73
-NVIDIA A100 80GB PCIe,half,pytorch,28,63.88,78.78
-NVIDIA RTX A6000,single,pytorch,1,8.09,7.75
-NVIDIA RTX A6000,half,pytorch,1,5.03,4.53
-NVIDIA RTX A6000,single,pytorch,2,14.86,10.98
-NVIDIA RTX A6000,half,pytorch,2,9.03,8.79
-NVIDIA RTX A6000,single,pytorch,4,27.92,17.62
-NVIDIA RTX A6000,half,pytorch,4,17.0,15.34
-NVIDIA RTX A6000,single,pytorch,8,53.95,30.88
-NVIDIA RTX A6000,half,pytorch,8,32.57,28.51
-NVIDIA RTX A6000,half,pytorch,16,63.16,46.11
-Quadro RTX 8000,single,pytorch,1,12.3,7.71
-Quadro RTX 8000,half,pytorch,1,5.93,4.52
-Quadro RTX 8000,single,pytorch,2,24.42,9.16
-Quadro RTX 8000,half,pytorch,2,10.92,7.02
-Quadro RTX 8000,single,pytorch,4,42.56,15.58
-Quadro RTX 8000,half,pytorch,4,21.24,12.39
-Quadro RTX 8000,single,pytorch,8,76.96,23.11
-Quadro RTX 8000,half,pytorch,8,40.52,20.98
-Quadro RTX 8000,single,pytorch,16,152.55,42.47
-Quadro RTX 8000,half,pytorch,16,80.31,38.18
-Quadro RTX 8000,single,pytorch,32,-1.0,-1.0
-Quadro RTX 8000,half,pytorch,32,-1.0,-1.0
+device,precision,autocast,xformers,runtime,n_samples,latency,memory
+NVIDIA A10,half,FALSE,TRUE,pytorch,1,2.01,3.13
+NVIDIA A10,single,FALSE,TRUE,pytorch,1,4.69,6.29
+NVIDIA A10,half,FALSE,TRUE,pytorch,2,3.65,4.3
+NVIDIA A10,single,FALSE,TRUE,pytorch,2,7.75,8.57
+NVIDIA A10,half,FALSE,TRUE,pytorch,4,6.68,6.63
+NVIDIA A10,single,FALSE,TRUE,pytorch,4,14.35,11.24
+NVIDIA A10,half,FALSE,TRUE,pytorch,8,12.93,11.05
+NVIDIA A10,single,FALSE,TRUE,pytorch,8,28.28,17.91
+NVIDIA A10,half,FALSE,TRUE,pytorch,16,24.65,19.86
+NVIDIA A10,single,FALSE,TRUE,pytorch,16,57.5,21.21
+NVIDIA A10,half,FALSE,TRUE,pytorch,32,48.79,7.37
+NVIDIA A10,single,FALSE,TRUE,pytorch,32,108.78,15.88
+NVIDIA A10,half,FALSE,TRUE,pytorch,64,108.26,17.54
+NVIDIA A10,single,FALSE,TRUE,pytorch,64,-1,-1
+NVIDIA A10,half,FALSE,TRUE,pytorch,128,212.94,22.18
+NVIDIA A10,single,FALSE,TRUE,pytorch,128,-1,-1
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,1,1.78,6.1
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,1,1.17,3.19
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,2,3.68,8.03
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,2,1.73,4.33
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,4,5.56,11.53
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,4,3.73,6.62
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,8,10.95,18.12
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,8,5.25,11.12
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,16,21.05,33.04
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,16,9.93,19.81
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,32,41.02,14.41
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,32,18.75,7.34
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,64,80.45,26.17
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,64,36.89,12.46
+NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,128,161.52,48.01
+NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,128,73.72,22.68
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,1,1.79,6.11
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,1,1.18,3.18
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,2,2.97,8.03
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,2,1.66,4.32
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,4,5.35,11.54
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,4,2.68,6.61
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,8,10.16,18.11
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,8,4.85,11.12
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,16,9.13,19.8
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,16,19.71,33.25
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,32,17.72,7.33
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,32,39.03,14.39
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,64,34.92,13.79
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,64,77.05,26.34
+NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,128,69.31,22.68
+NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,128,-1,-1
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,1,3.61,6.35
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,1,1.93,3.15
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,2,5.57,7.73
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,2,2.84,4.37
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,4,9.67,10.7
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,4,4.56,6.64
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,8,18.96,16.87
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,8,8.39,11.19
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,16,37.89,28.82
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,16,15.62,20.01
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,32,71.57,14.26
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,32,31.19,7.65
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,64,143.26,26.42
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,64,65.72,23.84
+NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,128,287.96,47.92
+NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,128,130.38,34.36
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,1,4.42,5.7
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,1,1.84,3.24
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,2,8.33,8.6
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,2,3.08,4.17
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,4,16.56,11.86
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,4,5.62,6.42
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,8,28.71,15.88
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,8,10.64,10.45
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,16,20.96,10.87
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,16,-1,-1
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,32,40.13,7.73
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,32,110.17,15.72
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,64,79.82,13.51
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,64,-1,-1
+Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,128,-1,-1
+Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,128,-1,-1
diff --git a/docs/benchmark-update.md b/docs/benchmark-update.md
@@ -0,0 +1,23 @@
+# Benchmark update
+
+We are currently running benchmarks to update our Stable Diffusion numbers using a more recent version of Diffusers and to take advantage of xformers. The interim results on a limited set of GPUs are presented here.
+
+## Running the benchmark
+
+Ensure that [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following:
+
+```bash
+git clone https://github.com/LambdaLabsML/lambda-diffusers.git
+cd lambda-diffusers/scripts
+make bench
+```
+
+Results will be written to `results.csv`. The benchmark will take different amounts of time depending on the GPU present, but expect it to take at least several minutes.
+
+## Results
+
+The current results for the benchmark are available in [`benchmark.csv`](../benchmark.csv). These results were run with Diffusers 0.11.0 and xformers using Ubuntu 20.04, Python 3.8, PyTorch 1.13, CUDA 11.8 ([NGC PyTorch container 22.11](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html)).
+
+xformers significantly improves performance and reduces memory consumption, allowing large batch sizes that maximise utilisation of GPUs. Our best performance comes from using an NVIDIA A100-SXM4-40GB on [Lambda GPU cloud](https://cloud.lambdalabs.com): at the maximum batch size tested (128) and at half precision, we observe a throughput of 1.85 images/second when using DDIM with 30 steps for sampling.
+
+![](./pictures/sd_throughput.png)
diff --git a/docs/benchmark.md b/docs/benchmark.md
@@ -1,5 +1,7 @@
 # Benchmarking Diffuser Models
 
+__We are currently in the process of updating our Stable Diffusion benchmark using a more recent version of Diffusers and taking advantage of xformers. See the summary of interim results [here](./benchmark-update.md)__
+
 We present a benchmark of [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) model inference.  This text2image model uses a text prompt as input and outputs an image of resolution `512x512`.
 
 Our experiments analyze inference performance in terms of speed, memory consumption, throughput, and quality of the output images. We look at how different choices in hardware (GPU model, GPU vs CPU) and software (single vs half precision, pytorch vs onnxruntime) affect inference performance.
@@ -27,10 +29,10 @@ We run these same inference jobs CPU devices to put in perspective the inference
 
 
 We note that:
-* GPUs are significantly faster -- by one or two orders of magnitudes depending on the precisions. 
+* GPUs are significantly faster -- by one or two orders of magnitudes depending on the precisions.
 * `onnxruntime` can reduce the latency for CPU by about `40%` to `50%`, depending on the type of CPUs.
 
-ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers.  
+ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers.
 We will investigate `onnxruntime-gpu` in future benchmarks.
 
 
@@ -62,10 +64,10 @@ We run a series of throughput experiment in pytorch with half-precision and usin
 
 We note:
 * Once again, A100 80GB is the top performer and has the highest throughput.
-* The gap between A100 80GB and other cards in terms of throughput can be explained by the larger maximum batch size that can be used on this card. 
+* The gap between A100 80GB and other cards in terms of throughput can be explained by the larger maximum batch size that can be used on this card.
 
 
-As a concrete example, the chart below shows how A100 80GB's throughput increases by `64%` when we changed the batch size from 1 to 28 (the largest without causing an out of memory error). It is also interesting to see that the increase is not linear and flattens out when batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in the GPU memory will have to be queued up before getting their own computing resources. 
+As a concrete example, the chart below shows how A100 80GB's throughput increases by `64%` when we changed the batch size from 1 to 28 (the largest without causing an out of memory error). It is also interesting to see that the increase is not linear and flattens out when batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in the GPU memory will have to be queued up before getting their own computing resources.
 
 <img src="./pictures/pretty_benchmark_sd_txt2img_batchsize_vs_throughput.png" alt="Stable Diffusion Text2Image Batch size vs Throughput (images/minute)" width="380"/>
 
@@ -76,7 +78,7 @@ We are curious about whether half-precision introduces degradations to the quali
 
 ![Evolution of precision v degradation across 100 steps](./pictures/benchmark_sd_precision_history.gif)
 
-Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish. 
+Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish.
 
 Interestingly, such a difference may not imply artifacts in half-precision's outputs. For example, in step 70, the picture below shows half-precision didn't produce the artifact in the single-precision output (an extra front leg):
 
@@ -162,7 +164,7 @@ sudo docker run --rm --gpus all nvidia/cuda:11.2.1-base-ubuntu20.04 nvidia-smi
 3. Build the benchmark docker image
 
 ```
-docker build -t benchmark -f ./benchmarking/Dockerfile .   
+docker build -t benchmark -f ./benchmarking/Dockerfile .
 ```
 
 #### Running the benchmark

diff --git a/docs/pictures/sd_throughput.png b/docs/pictures/sd_throughput.png
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
@@ -0,0 +1,17 @@
+# Benchmark image: NGC PyTorch 22.11 plus xformers and Diffusers 0.11.0.
+FROM nvcr.io/nvidia/pytorch:22.11-py3
+
+# Skip .pyc files and keep stdout/stderr unbuffered so output appears immediately.
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+RUN pip install --pre xformers
+RUN pip install diffusers==0.11.0 accelerate transformers
+
+WORKDIR /workspace
+
+COPY benchmark.py /workspace/benchmark.py
+# Write a wrapper that forwards all container arguments to benchmark.py.
+# "$@" sits inside single quotes, so no printf escape is needed for the quotes.
+RUN printf '#!/bin/bash\npython benchmark.py "$@"\n' > /entry.sh && chmod a+x /entry.sh
+ENTRYPOINT ["/entry.sh"]
diff --git a/scripts/Makefile b/scripts/Makefile
@@ -0,0 +1,23 @@
+# Both targets are commands, not files.
+.PHONY: bench clean
+
+# Build the benchmark image and run it on all GPUs; results.csv is written
+# to this directory through the bind mount.
+bench:
+	docker build -t sd-bench .
+	docker run \
+		--rm -it \
+		--gpus all \
+		--shm-size=128g \
+		--net=host \
+		-v "$(CURDIR)":/workspace/results \
+		sd-bench \
+		--steps 30 \
+		--samples 1,2,4,8,16,32,64,128 \
+		--autocast no \
+		--xformers yes \
+		--output_file /workspace/results/results.csv
+
+# Remove the results file; -f keeps this from failing when it does not exist.
+clean:
+	rm -f results.csv