
Commit b04f609

[aten] Call fbgemm functions for embedding prepack/unpack
Pull Request resolved: #44845

fbgemm functions are vectorized and faster.

```
Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/6473924484856786
Summary (total time 15.08s):
  PASS: 7
  FAIL: 0
  SKIP: 0
  FATAL: 0
  TIMEOUT: 0
  OMIT: 0
```

Performance before and after (PyTorch/Caffe2 operator micro-benchmarks, tag: short, mode: Eager, num_embeddings = 80; forward execution time in microseconds):

| Operator | embedding_dim | Before (us) | After (us) |
| --- | ---: | ---: | ---: |
| qembeddingbag_byte_prepack | 128 | 68.727 | 10.782 |
| qembeddingbag_byte_prepack | 256 | 131.500 | 17.443 |
| qembeddingbag_byte_prepack | 512 | 248.190 | 25.898 |
| qembeddingbag_4bit_prepack | 128 | 172.742 | 13.903 |
| qembeddingbag_4bit_prepack | 256 | 333.008 | 18.575 |
| qembeddingbag_4bit_prepack | 512 | 652.423 | 30.650 |
| qembeddingbag_2bit_prepack | 128 | 167.282 | 14.158 |
| qembeddingbag_2bit_prepack | 256 | 398.901 | 19.818 |
| qembeddingbag_2bit_prepack | 512 | 785.254 | 30.852 |
| qembeddingbag_byte_unpack | 128 | 122.653 | 47.596 |
| qembeddingbag_byte_unpack | 256 | 230.617 | 91.025 |
| qembeddingbag_byte_unpack | 512 | 408.807 | 131.425 |
| qembeddingbag_4bit_unpack | 128 | 176.087 | 12.637 |
| qembeddingbag_4bit_unpack | 256 | 337.514 | 20.856 |
| qembeddingbag_4bit_unpack | 512 | 659.716 | 33.944 |
| qembeddingbag_2bit_unpack | 128 | 342.529 | 21.181 |
| qembeddingbag_2bit_unpack | 256 | 665.197 | 34.213 |
| qembeddingbag_2bit_unpack | 512 | 1307.923 | 59.622 |

ghstack-source-id: 112812505
Differential Revision: [D23675777](https://our.internmc.facebook.com/intern/diff/D23675777/)
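As a quick illustration of what these operators do, here is a minimal round-trip sketch, assuming the byte kernels are exposed as `torch.ops.quantized.embedding_bag_byte_prepack` and `torch.ops.quantized.embedding_bag_byte_unpack` (op names and expected shapes are assumptions for illustration, not taken from this diff):

```python
import torch

# num_embeddings=80, embedding_dim=128, matching the benchmark inputs above.
weight = torch.randn(80, 128, dtype=torch.float32)

# Prepack: each row becomes uint8 values plus 8 trailing bytes (fp32 scale, fp32 zero_point).
packed = torch.ops.quantized.embedding_bag_byte_prepack(weight)
print(packed.shape)  # expected (80, 136): embedding_dim + 8 bytes per row

# Unpack: dequantize back to fp32; round-trip error is bounded by the per-row quantization step.
unpacked = torch.ops.quantized.embedding_bag_byte_unpack(packed)
print((weight - unpacked).abs().max())
```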

2 files changed (+30, -4 lines)

aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp

Lines changed: 15 additions & 2 deletions
```diff
@@ -104,8 +104,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
       embedding_rows,
       embedding_cols +
           8}; // extra 8 bytes to store FP scale and zero_point per row.
-  size_t output_columns = output_shape[1];
-  constexpr float kEpsilon = 1e-8f;
 
   // Allocate output packed weights
   auto output = at::empty(
@@ -114,6 +112,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
       weight_contig.suggest_memory_format());
   auto* output_data = output.data_ptr<uint8_t>();
 
+#ifdef USE_FBGEMM
+  fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat(
+      weight_data, embedding_rows, embedding_cols, output_data);
+#else
+  size_t output_columns = output_shape[1];
+  constexpr float kEpsilon = 1e-8f;
   for (std::size_t row = 0; row < embedding_rows; ++row) {
     const float* input_row = weight_data + row * embedding_cols;
     std::uint8_t* output_row = output_data + row * output_columns;
@@ -134,6 +138,8 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
           lrintf((input_row[col] - minimum_element) * inverse_scale);
     } // embedding_cols
   } // embedding_rows
+#endif // USE_FBGEMM
+
   return output;
 }
 
@@ -175,6 +181,11 @@ Tensor _qembeddingbag_nbit_prepack_helper(
       weight_contig.options().dtype(at::kByte),
       weight_contig.suggest_memory_format());
   auto* output_data = output.data_ptr<uint8_t>();
+
+#ifdef USE_FBGEMM
+  fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf(
+      BIT_RATE, weight_data, embedding_rows, embedding_cols, output_data);
+#else
   const auto output_columns = output.size(output.dim() - 1);
 
   for (int row = 0; row < embedding_rows; ++row) {
@@ -226,6 +237,8 @@ Tensor _qembeddingbag_nbit_prepack_helper(
       }
     } // embedding_cols
   } // embedding_rows
+#endif // USE_FBGEMM
+
   return output;
 }
```
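The fallback loop above writes the fused 8-bit row-wise layout that `fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat` produces on the FBGEMM path: per row, the uint8-quantized values followed by 8 bytes holding the fp32 scale and fp32 zero_point (the row minimum). A rough NumPy sketch of that layout, assuming scale = (max - min) / 255, round-to-nearest, and native byte order:

```python
import numpy as np

def byte_rowwise_prepack(weight: np.ndarray) -> np.ndarray:
    """Sketch of the fused 8-bit row-wise layout: per row, embedding_cols uint8
    values followed by 8 bytes holding the fp32 scale and fp32 zero_point."""
    rows, cols = weight.shape
    out = np.zeros((rows, cols + 8), dtype=np.uint8)
    eps = 1e-8  # mirrors kEpsilon guarding a zero range
    for r in range(rows):
        row = weight[r].astype(np.float32)
        minimum, maximum = row.min(), row.max()
        scale = (maximum - minimum) / 255.0
        inverse_scale = 255.0 / (maximum - minimum + eps)
        out[r, :cols] = np.rint((row - minimum) * inverse_scale).astype(np.uint8)
        # Trailing 8 bytes: fp32 scale, then fp32 zero_point (the row minimum).
        out[r, cols:cols + 4] = np.frombuffer(np.float32(scale).tobytes(), dtype=np.uint8)
        out[r, cols + 4:] = np.frombuffer(np.float32(minimum).tobytes(), dtype=np.uint8)
    return out
```
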
aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp

Lines changed: 15 additions & 2 deletions
```diff
@@ -73,6 +73,10 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) {
       packed_weight.suggest_memory_format());
   float* output_data = output.data_ptr<float>();
 
+#ifdef USE_FBGEMM
+  fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat(
+      input, input_rows, input_columns, output_data);
+#else
   for (std::size_t row = 0; row < input_rows; ++row) {
     const std::uint8_t* input_row = input + row * input_columns;
     const float* input_row_scale_zp =
@@ -84,14 +88,17 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) {
           input_row[col] * input_row_scale_zp[0] + input_row_scale_zp[1];
     } // output_columns
   } // input_rows
+#endif // USE_FBGEMM
   return output;
 }
 
-Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) {
+Tensor _qembeddingbag_nbit_unpack_helper(
+    const Tensor& packed_weight,
+    int BIT_RATE) {
   const auto input_rows = packed_weight.size(0);
   const auto input_columns = packed_weight.size(1);
   const auto* input_data = packed_weight.data_ptr<uint8_t>();
-  int NUM_ELEM_PER_BYTE = 8/BIT_RATE;
+  int NUM_ELEM_PER_BYTE = 8 / BIT_RATE;
 
   // The last 4 bytes per row are two fp16 scale and zero_point.
   // The rest of input_columns is the number of values in the original row.
@@ -105,6 +112,10 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) {
       packed_weight.options().dtype(kFloat),
       packed_weight.suggest_memory_format());
   float* output_data = output.data_ptr<float>();
+#ifdef USE_FBGEMM
+  fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat(
+      BIT_RATE, input_data, input_rows, input_columns, output_data);
+#else
   auto output_columns = output_dimensions[1];
   for (size_t row = 0; row < input_rows; ++row) {
     float* output_row = output_data + row * output_columns;
@@ -122,6 +133,8 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) {
       output_row[col] = scale * quantized + zero_point;
     } // output_columns
   } // input_rows
+#endif // USE_FBGEMM
+
   return output;
 }
```
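The fallback loop above reverses the n-bit packed layout: the last 4 bytes of each row hold an fp16 scale and fp16 zero_point, every other byte stores 8/BIT_RATE quantized elements, and each element is dequantized as scale * q + zero_point. A rough NumPy sketch of that loop (the low-bits-first sub-byte ordering is an assumption based on my reading of the reference path):

```python
import numpy as np

def nbit_rowwise_unpack(packed: np.ndarray, bit_rate: int) -> np.ndarray:
    """Sketch of the fallback n-bit unpack: scale * q + zero_point per element,
    with two fp16 values (scale, zero_point) in the last 4 bytes of each row."""
    num_elem_per_byte = 8 // bit_rate
    input_rows, input_columns = packed.shape
    # Everything except the trailing 4 bytes holds packed sub-byte elements.
    output_columns = (input_columns - 4) * num_elem_per_byte
    out = np.empty((input_rows, output_columns), dtype=np.float32)
    mask = (1 << bit_rate) - 1
    for r in range(input_rows):
        scale, zero_point = np.frombuffer(packed[r, -4:].tobytes(), dtype=np.float16)
        for col in range(output_columns):
            byte = int(packed[r, col // num_elem_per_byte])
            q = (byte >> (bit_rate * (col % num_elem_per_byte))) & mask
            out[r, col] = np.float32(scale) * q + np.float32(zero_point)
    return out
```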