
Commit fe337c5

[SE3Transformer/DGLPyT] 22.08 container update

Parent: 077c388
25 files changed: +497 / −294 lines

DGLPyTorch/DrugDiscovery/SE3Transformer/Dockerfile
Lines changed: 14 additions & 5 deletions

```diff
@@ -24,7 +24,7 @@
 # run docker daemon with --default-runtime=nvidia for GPU detection during build
 # multistage build for DGL with CUDA and FP16
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.08-py3
 
 FROM ${FROM_IMAGE_NAME} AS dgl_builder
 
@@ -33,11 +33,19 @@ RUN apt-get update \
     && apt-get install -y git build-essential python3-dev make cmake \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /dgl
-RUN git clone --branch v0.7.0 --recurse-submodules --depth 1 https://github.com/dmlc/dgl.git .
-RUN sed -i 's/"35 50 60 70"/"60 70 80"/g' cmake/modules/CUDA.cmake
+RUN git clone --branch 0.9.0 --recurse-submodules --depth 1 https://github.com/dmlc/dgl.git .
 WORKDIR build
-RUN cmake -DUSE_CUDA=ON -DUSE_FP16=ON ..
-RUN make -j8
+RUN export NCCL_ROOT=/usr \
+    && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release \
+             -DUSE_CUDA=ON -DCUDA_ARCH_BIN="60 70 80" -DCUDA_ARCH_PTX="80" \
+             -DCUDA_ARCH_NAME="Manual" \
+             -DUSE_FP16=ON \
+             -DBUILD_TORCH=ON \
+             -DUSE_NCCL=ON \
+             -DUSE_SYSTEM_NCCL=ON \
+             -DBUILD_WITH_SHARED_NCCL=ON \
+             -DUSE_AVX=ON \
+    && cmake --build .
 
 
 FROM ${FROM_IMAGE_NAME}
@@ -49,6 +57,7 @@ COPY --from=dgl_builder /dgl ./dgl
 RUN cd dgl/python && python setup.py install && cd ../.. && rm -rf dgl
 
 ADD requirements.txt .
+RUN pip install --no-cache-dir --upgrade --pre pip
 RUN pip install --no-cache-dir -r requirements.txt
 ADD . .
```
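For context, the rebuilt `dgl_builder` stage now compiles DGL 0.9.0 with CUDA kernels for SM 6.0/7.0/8.0 (plus PTX for 8.0), FP16 support, Torch bindings, and the system NCCL shipped in the NGC base image. A minimal sketch for smoke-testing the resulting image is shown below; the `se3-transformer` image tag and the script itself are illustrative assumptions, not part of this commit:

```python
# check_dgl.py - hypothetical smoke test, run inside the built container, e.g.:
#   docker run --rm --gpus all se3-transformer python check_dgl.py
import dgl    # built from the dmlc/dgl 0.9.0 branch in the dgl_builder stage
import torch

print("DGL:", dgl.__version__)                 # expect a 0.9.x version string
print("CUDA available:", torch.cuda.is_available())

# Move a random graph to the GPU to confirm the CUDA build works end to end.
g = dgl.rand_graph(num_nodes=10, num_edges=20).to("cuda")
print(g.device)                                # expect cuda:0
```

If the image was built with GPU detection enabled (see the `--default-runtime=nvidia` note at the top of the Dockerfile), this should print a 0.9.x version and place the toy graph on `cuda:0`.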

DGLPyTorch/DrugDiscovery/SE3Transformer/LICENSE
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-Copyright 2021 NVIDIA CORPORATION & AFFILIATES
+Copyright 2021-2022 NVIDIA CORPORATION & AFFILIATES
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 
```
DGLPyTorch/DrugDiscovery/SE3Transformer/NOTICE
Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
+
 SE(3)-Transformer PyTorch
 
 This repository includes software from https://github.com/FabianFuchsML/se3-transformer-public
```

DGLPyTorch/DrugDiscovery/SE3Transformer/README.md
Lines changed: 51 additions & 47 deletions

```diff
@@ -161,11 +161,11 @@ Competitive training results and analysis are provided for the following hyperpa
 
 This model supports the following features::
 
-| Feature | SE(3)-Transformer
-|-----------------------|--------------------------
-|Automatic mixed precision (AMP) | Yes
-|Distributed data parallel (DDP) | Yes
-
+| Feature                         | SE(3)-Transformer |
+|---------------------------------|-------------------|
+| Automatic mixed precision (AMP) | Yes               |
+| Distributed data parallel (DDP) | Yes               |
+
 #### Features
 
 
@@ -476,20 +476,20 @@ The following sections provide details on how we achieved our performance and ac
 
 Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
 
-| GPUs | Batch size / GPU | Absolute error - TF32 | Absolute error - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (mixed precision to TF32) |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-| 1 | 240 | 0.03456 | 0.03460 | 1h23min | 1h03min | 1.32x |
-| 8 | 240 | 0.03417 | 0.03424 | 15min | 12min | 1.25x |
+| GPUs | Batch size / GPU | Absolute error - TF32 | Absolute error - mixed precision | Time to train - TF32 | Time to train - mixed precision | Time to train speedup (mixed precision to TF32) |
+|:----:|:----------------:|:---------------------:|:--------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------------------------:|
+| 1    | 240              | 0.03038               | 0.02987                          | 1h02min              | 50min                           | 1.24x                                           |
+| 8    | 240              | 0.03466               | 0.03436                          | 13min                | 10min                           | 1.27x                                           |
 
 
 ##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
 
 Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
 
-| GPUs | Batch size / GPU | Absolute error - FP32 | Absolute error - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (mixed precision to FP32) |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-| 1 | 240 | 0.03432 | 0.03439 | 2h25min | 1h33min | 1.56x |
-| 8 | 240 | 0.03380 | 0.03495 | 29min | 20min | 1.45x |
+| GPUs | Batch size / GPU | Absolute error - FP32 | Absolute error - mixed precision | Time to train - FP32 | Time to train - mixed precision | Time to train speedup (mixed precision to FP32) |
+|:----:|:----------------:|:---------------------:|:--------------------------------:|:--------------------:|:-------------------------------:|:-----------------------------------------------:|
+| 1    | 240              | 0.03044               | 0.03076                          | 2h07min              | 1h22min                         | 1.55x                                           |
+| 8    | 240              | 0.03435               | 0.03495                          | 27min                | 19min                           | 1.42x                                           |
 
 
@@ -499,12 +499,12 @@ Our results were obtained by running the `scripts/train.sh` training script in t
 
 Our results were obtained by running the `scripts/benchmark_train.sh` and `scripts/benchmark_train_multi_gpu.sh` benchmarking scripts in the PyTorch 21.07 NGC container on NVIDIA DGX A100 with 8x A100 80GB GPUs. Performance numbers (in molecules per millisecond) were averaged over five entire training epochs after a warmup epoch.
 
-| GPUs | Batch size / GPU | Throughput - TF32 [mol/ms] | Throughput - mixed precision [mol/ms] | Throughput speedup (mixed precision - TF32) | Weak scaling - TF32 | Weak scaling - mixed precision |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-| 1 | 240 | 2.21 | 2.92 | 1.32x | | |
-| 1 | 120 | 1.81 | 2.04 | 1.13x | | |
-| 8 | 240 | 15.88 | 21.02 | 1.32x | 7.18 | 7.20 |
-| 8 | 120 | 12.68 | 13.99 | 1.10x | 7.00 | 6.86 |
+| GPUs | Batch size / GPU | Throughput - TF32 [mol/ms] | Throughput - mixed precision [mol/ms] | Throughput speedup (mixed precision - TF32) | Weak scaling - TF32 | Weak scaling - mixed precision |
+|:----:|:----------------:|:--------------------------:|:-------------------------------------:|:-------------------------------------------:|:-------------------:|:------------------------------:|
+| 1    | 240              | 2.61                       | 3.35                                  | 1.28x                                       |                     |                                |
+| 1    | 120              | 1.94                       | 2.07                                  | 1.07x                                       |                     |                                |
+| 8    | 240              | 18.80                      | 23.90                                 | 1.27x                                       | 7.20                | 7.13                           |
+| 8    | 120              | 14.10                      | 14.52                                 | 1.03x                                       | 7.27                | 7.01                           |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -514,12 +514,12 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_train.sh` and `scripts/benchmark_train_multi_gpu.sh` benchmarking scripts in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with 8x V100 16GB GPUs. Performance numbers (in molecules per millisecond) were averaged over five entire training epochs after a warmup epoch.
 
-| GPUs | Batch size / GPU | Throughput - FP32 [mol/ms] | Throughput - mixed precision [mol/ms] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
-|:------------------:|:----------------------:|:--------------------:|:------------------------------------:|:---------------------------------:|:----------------------:|:----------------------------------------------:|
-| 1 | 240 | 1.25 | 1.88 | 1.50x | | |
-| 1 | 120 | 1.03 | 1.41 | 1.37x | | |
-| 8 | 240 | 8.68 | 12.75 | 1.47x | 6.94 | 6.78 |
-| 8 | 120 | 6.64 | 8.58 | 1.29x | 6.44 | 6.08 |
+| GPUs | Batch size / GPU | Throughput - FP32 [mol/ms] | Throughput - mixed precision [mol/ms] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
+|:----:|:----------------:|:--------------------------:|:-------------------------------------:|:-------------------------------------------:|:-------------------:|:------------------------------:|
+| 1    | 240              | 1.33                       | 2.12                                  | 1.59x                                       |                     |                                |
+| 1    | 120              | 1.11                       | 1.45                                  | 1.31x                                       |                     |                                |
+| 8    | 240              | 9.32                       | 13.40                                 | 1.44x                                       | 7.01                | 6.32                           |
+| 8    | 120              | 6.90                       | 8.39                                  | 1.22x                                       | 6.21                | 5.79                           |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -532,21 +532,21 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_inference.sh` inferencing benchmarking script in the PyTorch 21.07 NGC container on NVIDIA DGX A100 with 1x A100 80GB GPU.
 
-FP16
+AMP
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 11.60 | 140.94 | 138.29 | 140.12 | 386.40 |
-| 800 | 10.74 | 75.69 | 75.74 | 76.50 | 79.77 |
-| 400 | 8.86 | 45.57 | 46.11 | 46.60 | 49.97 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+| 1600       | 13.54                   | 121.44           | 118.07           | 119.00           | 366.64           |
+| 800        | 12.63                   | 64.11            | 63.78            | 64.37            | 68.19            |
+| 400        | 10.65                   | 37.97            | 39.02            | 39.67            | 42.87            |
 
 TF32
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 8.58 | 189.20 | 186.39 | 187.71 | 420.28 |
-| 800 | 8.28 | 97.56 | 97.20 | 97.73 | 101.13 |
-| 400 | 7.55 | 53.38 | 53.72 | 54.48 | 56.62 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+| 1600       | 8.97                    | 180.85           | 178.31           | 178.92           | 375.33           |
+| 800        | 8.86                    | 90.76            | 90.77            | 91.11            | 92.96            |
+| 400        | 8.49                    | 47.42            | 47.65            | 48.15            | 50.74            |
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -556,21 +556,21 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 Our results were obtained by running the `scripts/benchmark_inference.sh` inferencing benchmarking script in the PyTorch 21.07 NGC container on NVIDIA DGX-1 with 1x V100 16GB GPU.
 
-FP16
+AMP
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 6.42 | 254.54 | 247.97 | 249.29 | 721.15 |
-| 800 | 6.13 | 132.07 | 131.90 | 132.70 | 140.15 |
-| 400 | 5.37 | 75.12 | 76.01 | 76.66 | 79.90 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+| 1600       | 6.59                    | 248.02           | 242.11           | 242.62           | 674.60           |
+| 800        | 6.38                    | 126.49           | 125.96           | 126.31           | 127.72           |
+| 400        | 5.90                    | 68.24            | 68.53            | 69.02            | 70.87            |
 
 FP32
 
-| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] |Latency 95% [ms] |Latency 99% [ms] |
-|:------------:|:------:|:-----:|:-----:|:-----:|:-----:|
-| 1600 | 3.39 | 475.86 | 473.82 | 475.64 | 891.18 |
-| 800 | 3.36 | 239.17 | 240.64 | 241.65 | 243.70 |
-| 400 | 3.17 | 126.67 | 128.19 | 128.82 | 130.54 |
+| Batch size | Throughput Avg [mol/ms] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+|:----------:|:-----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
+| 1600       | 3.33                    | 482.20           | 483.50           | 485.28           | 754.84           |
+| 800        | 3.35                    | 239.09           | 242.21           | 243.13           | 244.91           |
+| 400        | 3.27                    | 122.68           | 123.60           | 124.18           | 125.85           |
 
 
 To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@@ -580,6 +580,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ### Changelog
 
+August 2022:
+- Slight performance improvements
+- Upgraded base container
+
 November 2021:
 - Improved low memory mode to give further 6x memory savings
 - Disabled W&B logging by default
```

DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/data_module.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 import torch.distributed as dist
```

DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/data_loading/qm9.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 from typing import Tuple
 
```
DGLPyTorch/DrugDiscovery/SE3Transformer/se3_transformer/model/basis.py
Lines changed: 5 additions & 2 deletions

```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -18,7 +18,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 #
-# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
+# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES
 # SPDX-License-Identifier: MIT
 
 
@@ -33,6 +33,9 @@
 
 from se3_transformer.runtime.utils import degree_to_dim
 
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+
 
 @lru_cache(maxsize=None)
 def get_clebsch_gordon(J: int, d_in: int, d_out: int, device) -> Tensor:
```
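The two `torch._C` lines added above the Clebsch-Gordon helper switch TorchScript from the profiling executor to the legacy executor before any `@torch.jit.script` function in the module is compiled, which avoids the profile-then-recompile warmup on the first calls. A minimal sketch of the pattern, assuming a PyTorch release where these private switches still exist (they are internal APIs, not a stable contract):

```python
# Hedged sketch of the pattern basis.py relies on: flip the internal
# TorchScript switches *before* any scripted function runs, so PyTorch
# uses the legacy (non-profiling) executor and compiles each function once.
import torch

torch._C._jit_set_profiling_executor(False)
torch._C._jit_set_profiling_mode(False)

@torch.jit.script
def fused_scale_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Small elementwise chain: a candidate for fusion by the legacy fuser.
    return x * 2.0 + y

out = fused_scale_add(torch.ones(4), torch.zeros(4))
print(out)  # tensor([2., 2., 2., 2.])
```

Because the switches are process-global, basis.py flips them at import time, so every scripted helper below them is affected.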
