
Commit 2cfd880

Merge pull request NVIDIA#103 from lxp121/master
Adding FasterTransformer
2 parents f89dcca + 75502be


44 files changed (+7239, -0 lines)

FasterTransformer/CMakeLists.txt

Lines changed: 121 additions & 0 deletions
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(FasterTransformer LANGUAGES CXX CUDA)

find_package(CUDA 10.0 REQUIRED)

option(BUILD_TRT "Build in TensorRT mode" OFF)
option(BUILD_TF "Build in TensorFlow mode" OFF)

set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

set(TF_PATH "" CACHE STRING "TensorFlow path")
#set(TF_PATH "/usr/local/lib/python3.5/dist-packages/tensorflow")

if(BUILD_TF AND NOT TF_PATH)
  message(FATAL_ERROR "TF_PATH must be set if BUILD_TF (TensorFlow mode) is on.")
endif()

set(TRT_PATH "" CACHE STRING "TensorRT path")
#set(TRT_PATH "/myspace/TensorRT-5.1.5.0")

if(BUILD_TRT AND NOT TRT_PATH)
  message(FATAL_ERROR "TRT_PATH must be set if BUILD_TRT (TensorRT mode) is on.")
endif()

list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)
find_package(CUDA REQUIRED)

# setting compiler flags
if (SM STREQUAL 70 OR
    SM STREQUAL 75 OR
    SM STREQUAL 61 OR
    SM STREQUAL 60)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM},code=\\\"sm_${SM},compute_${SM}\\\" -rdc=true")
  if (SM STREQUAL 70 OR SM STREQUAL 75)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
  endif()

  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall")
  message("-- Assign GPU architecture (sm=${SM})")
else()
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=\\\"sm_60,compute_60\\\" -rdc=true")
  message("-- Unknown or unsupported GPU architecture (set sm=60)")
endif()

set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall")

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

if(CMAKE_CXX_STANDARD STREQUAL "11")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -O3")

set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

set(COMMON_HEADER_DIRS
  ${PROJECT_SOURCE_DIR}
  ${CUDA_PATH}/include
)

set(COMMON_LIB_DIRS
  ${CUDA_PATH}/lib64
)

if(BUILD_TF)
  list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
  list(APPEND COMMON_LIB_DIRS ${TF_PATH})
endif()

if(BUILD_TRT)
  list(APPEND COMMON_HEADER_DIRS ${TRT_PATH}/include)
  list(APPEND COMMON_LIB_DIRS ${TRT_PATH}/lib)
endif()

include_directories(
  ${COMMON_HEADER_DIRS}
)

link_directories(
  ${COMMON_LIB_DIRS}
)

add_subdirectory(tools)
add_subdirectory(fastertransformer)
add_subdirectory(sample)

if(BUILD_TF)
  add_custom_target(copy ALL COMMENT "Copying tensorflow test scripts")
  add_custom_command(TARGET copy
    POST_BUILD
    COMMAND cp ${PROJECT_SOURCE_DIR}/sample/tensorflow/*.py ${PROJECT_SOURCE_DIR}/build/
  )
endif()
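The TF_PATH and TRT_PATH cache variables above must point at your TensorFlow package directory and TensorRT install directory. As a hint (a sketch, assuming TensorFlow is importable from the Python you intend to build against), the package directory can be printed with:

```shell
$ # prints the directory to pass as -DTF_PATH=...
$ python -c "import os, tensorflow as tf; print(os.path.dirname(tf.__file__))"
```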

FasterTransformer/README.md

Lines changed: 110 additions & 0 deletions
Faster Transformer
===================
## What is it?
The Faster Transformer implements an equivalent but highly optimized BERT transformer layer for inference. On Volta and Turing GPUs, FP16 precision is used automatically to access the computing power of Tensor Cores.

Faster Transformer is built on top of CUDA and cuBLAS. It supports three sequence lengths: 32, 64, and 128. Two key parameters of the transformer layer, the number of heads and the size of each head, are passed at runtime. Thus, not only BERT Base (12 heads * 64 per head) but also customized models such as 4 heads * 32 per head or 8 heads * 96 per head are well supported. Our implementation shows good speedups for both small and large batch sizes.

A C++ API, a TensorRT plugin, and a TensorFlow OP wrapper are available. You can easily integrate this optimized transformer layer into TensorFlow, or into other inference services built in native C++ or TensorRT. In addition to code that illustrates the API invocations, we also provide a simple end-to-end BERT TensorFlow inference sample.

## Environment requirements
* CMake >= 3.8
* CUDA 10.0
* Python 2.7
* TensorFlow 1.13
* TensorRT 5.1.5
* The project is tested in the nvidia/cuda 10.0-cudnn7-devel-ubuntu16.04 docker image. If you encounter compilation errors, try compiling inside this docker image (see the sketch below).

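For reference, a minimal sketch of pulling and entering the tested image; the run flags assume nvidia-docker 2.x, and any workspace mounts are omitted:

```shell
$ docker pull nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
$ nvidia-docker run -ti --rm nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 bash
```
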
## Performance ##
* CPU: Intel(R) Xeon(R) Gold 6132 CPU @ 2.60GHz
* T4 (with mclk 5000MHz, pclk 1590MHz)
* P4 (with mclk 2999MHz, pclk 1531MHz)
* V100 (with mclk 877MHz, pclk 1380MHz)

When the batch size equals 1, the TensorFlow execution time depends heavily on the CPU you are using, so we only report the Faster Transformer performance here. The performance of the Faster Transformer mainly depends on the GPU, and its execution time is stable.

| <batch_size, layers, seq_len, head_num, size_per_head> | P4 FP32 (in ms) | T4 FP32 (in ms) | T4 FP16 (in ms) |
|:-------------:|:-------------:|:---------:|:-----------:|
| (1, 12, 32, 12, 64) | 3.43 | 2.74 | 1.56 |
| (1, 12, 64, 12, 64) | 4.04 | 3.64 | 1.77 |
| (1, 12, 128, 12, 64) | 6.22 | 5.93 | 2.23 |

For large batch sizes, we report both TensorFlow XLA and Faster Transformer performance.

| <batch_size, layers, seq_len, head_num, size_per_head> | TensorFlow XLA on V100 FP16 (in ms) | Faster Transformer V100 FP16 (in ms) | Speedup |
|:-------------:|:-------------:|:---------:|:-----------:|
| (100, 12, 32, 12, 64) | 13.96 | 9.57 | 1.459 |
| (200, 12, 32, 12, 64) | 26.47 | 18.37 | 1.44 |
| (300, 12, 32, 12, 64) | 38.4 | 27.41 | 1.401 |
| (400, 12, 32, 12, 64) | 49.65 | 35.63 | 1.393 |
| (500, 12, 32, 12, 64) | 62.2 | 44.57 | 1.396 |

| <batch_size, layers, seq_len, head_num, size_per_head> | TensorFlow XLA on V100 FP16 (in ms) | Faster Transformer V100 FP16 (in ms) | Speedup |
|:-------------:|:-------------:|:---------:|:-----------:|
| (100, 12, 32, 4, 32) | 3.49 | 1.73 | 2.017 |
| (200, 12, 32, 4, 32) | 4.9 | 2.55 | 1.922 |
| (300, 12, 32, 4, 32) | 6.35 | 3.356 | 1.892 |
| (400, 12, 32, 4, 32) | 8 | 4.31 | 1.856 |
| (500, 12, 32, 4, 32) | 9.93 | 5.13 | 1.936 |

## Directory Structure
```
/fastertransformer: source code of the transformer
 |--/cuda: CUDA kernels and the multi-head attention implementation, both compiled with CUDA/cuBLAS
 |--/tf_op: custom TensorFlow OP implementation
 |--/trt_plugin: TensorRT plugin implementation
/sample: C++ and TensorFlow transformer interface samples
 |--/cpp: FP16 and FP32 C++ interface samples
 |--/fastertransformer_bert: samples that show how to integrate our TensorFlow OP into the open-source BERT model for sentence (and sentence-pair) classification tasks (GLUE); both FP16 and FP32 are supported, see the readme file within this folder for details
 |--/tensorflow: FP16 and FP32 TensorFlow OP samples
 |--/tensorRT: FP16 and FP32 TensorRT plugin samples
/tools/gemm_test: loops over all GEMM algorithms to pick the best one
```

## How to build?
### Init Git ###
```shell
$ git submodule init
$ git submodule update
```

### Build with Release ###
```shell
$ mkdir -p build
$ cd build
$ cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release .. # C++ only
$ cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=/myspace/TensorRT-5.1.5.0 .. # TensorRT mode
$ cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python2.7/dist-packages/tensorflow .. # TensorFlow mode
$ cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TRT=ON -DTRT_PATH=/myspace/TensorRT-5.1.5.0 -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python2.7/dist-packages/tensorflow .. # C++, TensorRT and TensorFlow
$ make
```

Note: xx is the compute capability of your GPU, for example 60 (P100), 61 (P4/P40), 70 (V100), or 75 (T4).

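If you are unsure of the compute capability, one way to look it up is the deviceQuery CUDA sample; the sample path below assumes a default CUDA 10.0 toolkit install:

```shell
$ # prints "CUDA Capability Major/Minor version number", e.g. 7.0 -> use -DSM=70
$ cd /usr/local/cuda-10.0/samples/1_Utilities/deviceQuery
$ make && ./deviceQuery | grep Capability
```
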
### Execute demos ###
```shell
# Step 1: generate the gemm_config.in file under the build path
#         to pick the best-performing GEMM algorithms.
$ ./build/bin/gemm_fp16(32) <batch_size> <seq_len> <head_num> <size_per_head>
# Step 2: execute a demo.
# 1. TensorFlow demo:
$ python build/transformer_fp16(32).py <batch_size> <num_layers> <seq_len> <head_num> <size_per_head>
# 2. C++ demo:
$ ./build/bin/transformer_fp16(32) <batch_size> <num_layers> <seq_len> <head_num> <size_per_head>
# 3. TensorRT demo:
$ ./build/bin/transformer_trt <batch_size> <num_layers> <seq_len> <head_num> <size_per_head> fp16(fp32)
```

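As a concrete instance of the command templates above (assuming an FP32 build and BERT-Base-like settings: batch size 1, 12 layers, sequence length 32, 12 heads, 64 per head):

```shell
$ ./build/bin/gemm_fp32 1 32 12 64                 # writes gemm_config.in under build
$ python build/transformer_fp32.py 1 12 32 12 64   # runs the TensorFlow OP demo
```
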
### Useful sample code ###
1. sample/tensorflow/transformer_fp32.py: transformer_layer TensorFlow FP32 OP call, time measurement, timeline generation
2. sample/tensorflow/transformer_fp16.py: transformer_layer TensorFlow FP16 OP call, time measurement, timeline generation
3. sample/tensorflow/error_check.py: how to catch custom OP runtime errors
4. sample/cpp/transformer_fp32.cc: transformer layer C++ FP32 sample
5. sample/cpp/transformer_fp16.cc: transformer layer C++ FP16 sample
6. sample/tensorRT/transformer_trt.cc: transformer layer TensorRT FP32/FP16 sample
7. tools/gemm_test/gemm_fp16.cu: loop over all cuBLAS FP16 GEMM algorithms and pick the best one
8. tools/gemm_test/gemm_fp32.cu: loop over all cuBLAS FP32 GEMM algorithms and pick the best one
