Commit fa6b34b

Radhakrishnan Venkataramani authored and facebook-github-bot committed
2 Bit Embedding Conversion Operator support. (#43077)
Summary:
Pull Request resolved: #43077

2-bit embedding weight conversion is quite similar to 4-bit embedding weight conversion. This diff contains both:
1. The 2-bit packing op `embedding_bag_2bit_prepack`.
2. The 2-bit unpacking op `embedding_bag_2bit_unpack`.

Comments about each op are inline with its definition.

Test Plan: buck test caffe2/test:quantization -- test_embedding_bag_2bit_unpack

Reviewed By: supriyar

Differential Revision: D23143262

fbshipit-source-id: fd8877f049ac1f7eb4bc580e588dc95f8b1edef0
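For reference, a minimal usage sketch of the two new ops (assuming a PyTorch build that includes this commit; the table size below is illustrative):

import torch

# Illustrative float embedding table: 10 rows, 16 columns. For 2-bit packing
# the number of columns must be a multiple of 4 (i.e. 8 / BIT_RATE).
weight = torch.rand(10, 16, dtype=torch.float32)

# Pack: each row stores 2-bit quantized values followed by a 2-byte fp16 scale
# and a 2-byte zero_offset.
packed = torch.ops.quantized.embedding_bag_2bit_prepack(weight)

# Unpack: de-quantizes back to float; values only approximate the originals.
unpacked = torch.ops.quantized.embedding_bag_2bit_unpack(packed)
print(weight.shape, packed.shape, unpacked.shape)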
1 parent ab366d0 commit fa6b34b

File tree

4 files changed: +80 -10 lines changed


aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp

Lines changed: 35 additions & 5 deletions
@@ -137,19 +137,24 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
   return output;
 }
 
-Tensor qembeddingbag_4bit_prepack(const Tensor& weight) {
+Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) {
   int64_t embedding_rows = weight.size(0);
   int64_t embedding_cols = weight.size(1);
 
   Tensor weight_contig = weight.contiguous(weight.suggest_memory_format());
 
   const auto weight_data = weight.data_ptr<float>();
-  constexpr int BIT_RATE = 4;
-  constexpr int NUM_ELEM_PER_BYTE = 8 / BIT_RATE;
+  TORCH_CHECK(
+      BIT_RATE == 4 || BIT_RATE == 2,
+      "BIT_RATE must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'."
+      "For 8bit, consider using 'embedding_bag_byte_prepack'.");
+
+  int NUM_ELEM_PER_BYTE = 8 / BIT_RATE;
   TORCH_CHECK(
       weight_contig.size(weight.dim() - 1) % NUM_ELEM_PER_BYTE == 0,
-      "FloatToFused4BitRowwiseQuantizedOp only works for the number of "
-      "columns a multiple of 2");
+      "qembeddingbag_" + c10::to_string(BIT_RATE) +
+          "bit_prepack only works for the number of columns a multiple of "
+          + c10::to_string(NUM_ELEM_PER_BYTE));
 
   // The "fused" representation stores the scale and bias with the
   // row-wise quantized data in one tensor.
@@ -219,6 +224,29 @@ Tensor qembeddingbag_4bit_prepack(const Tensor& weight) {
   return output;
 }
 
+// Applies 4-bit row-wise quantization by determining the range
+// (maximum - minimum) and bias (minimum value) of each row in the input
+// matrix, and then scaling each element to a 4-bit number between 0 and
+// 15.
+// To later de-quantize values, the scale (range / 15) and zero_point
+// are stored alongside the data. More precisely, each row first has quantized
+// values, and then 2-byte fp16 scale and 2-byte zero_offset.
+Tensor qembeddingbag_4bit_prepack(const Tensor& weight) {
+  return _qembeddingbag_nbit_prepack_helper(weight, 4 /*BIT_RATE*/);
+}
+
+// Applies 2-bit row-wise quantization by determining the range
+// (maximum - minimum) and bias (minimum value) of each row in the input
+// matrix, and then scaling each element to a 2-bit number between 0 and
+// 3.
+// To later de-quantize values, the scale (range / 3) and zero_point
+// are stored alongside the data. More precisely, each row first has quantized
+// values, and then 2-byte fp16 scale and 2-byte zero_offset.
+// TODO() - Add 2Bit Embedding Lookup operator.
+Tensor qembeddingbag_2bit_prepack(const Tensor& weight) {
+  return _qembeddingbag_nbit_prepack_helper(weight, 2 /*BIT_RATE*/);
+}
+
 class QEmbeddingPackWeights final {
  public:
   static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(at::Tensor weight) {
@@ -229,7 +257,9 @@ class QEmbeddingPackWeights final {
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl("embedding_bag_byte_prepack", qembeddingbag_byte_prepack);
   m.impl("embedding_bag_4bit_prepack", qembeddingbag_4bit_prepack);
+  m.impl("embedding_bag_2bit_prepack", qembeddingbag_2bit_prepack);
 }
+
 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
   m.impl("embedding_bag_prepack", TORCH_FN(QEmbeddingPackWeights::run));
 }
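As a sanity check on the fused layout described in the comments above (quantized values first, then a 2-byte fp16 scale and a 2-byte zero_offset per row), the packed width can be predicted directly. This is a sketch, assuming the 2-bit op follows the same output-shape convention as the existing 4-bit op; the dimensions are illustrative:

import torch

def expected_packed_cols(embedding_cols, bit_rate):
    # Each byte holds 8 / bit_rate quantized values; every row is followed by
    # 2 bytes of fp16 scale and 2 bytes of zero_offset.
    num_elem_per_byte = 8 // bit_rate
    return embedding_cols // num_elem_per_byte + 4

weight = torch.rand(3, 8, dtype=torch.float32)  # 8 columns, a multiple of 4
packed = torch.ops.quantized.embedding_bag_2bit_prepack(weight)
assert packed.shape == (3, expected_packed_cols(8, bit_rate=2))  # (3, 6)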

aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp

Lines changed: 27 additions & 3 deletions
@@ -87,12 +87,11 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) {
   return output;
 }
 
-Tensor qembeddingbag_4bit_unpack(const Tensor& packed_weight) {
+Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) {
   const auto input_rows = packed_weight.size(0);
   const auto input_columns = packed_weight.size(1);
   const auto* input_data = packed_weight.data_ptr<uint8_t>();
-  constexpr int NUM_ELEM_PER_BYTE = 2;
-  constexpr int BIT_RATE = 4;
+  int NUM_ELEM_PER_BYTE = 8 / BIT_RATE;
 
   // The last 4 bytes per row are two fp16 scale and zero_point.
   // The rest of input_columns is the number of values in the original row.
@@ -126,6 +125,30 @@ Tensor qembeddingbag_4bit_unpack(const Tensor& packed_weight) {
   return output;
 }
 
+// De-quantizes the result of the qembeddingbag_4bit_prepack operator.
+// The input is expected to first have quantized values,
+// then 2-byte fp16 scale and 2-byte zero_offset.
+// The output is a matrix containing only the values, but de-quantized.
+// De-quantization is performed using each row's scale and
+// zero_point parameters. The de-quantized values
+// will thus not be exactly equal to the original, un-quantized
+// floating point values.
+Tensor qembeddingbag_4bit_unpack(const Tensor& packed_weight) {
+  return _qembeddingbag_nbit_unpack_helper(packed_weight, 4 /*BIT_RATE*/);
+}
+
+// De-quantizes the result of the qembeddingbag_2bit_prepack operator.
+// The input is expected to first have quantized values,
+// then 2-byte fp16 scale and 2-byte zero_offset.
+// The output is a matrix containing only the values, but de-quantized.
+// De-quantization is performed using each row's scale and
+// zero_point parameters. The de-quantized values
+// will thus not be exactly equal to the original, un-quantized
+// floating point values.
+Tensor qembeddingbag_2bit_unpack(const Tensor& packed_weight) {
+  return _qembeddingbag_nbit_unpack_helper(packed_weight, 2 /*BIT_RATE*/);
+}
+
 class QEmbeddingUnpackWeights final {
  public:
   static at::Tensor run(
@@ -137,6 +160,7 @@ class QEmbeddingUnpackWeights final {
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
   m.impl("embedding_bag_byte_unpack", qembeddingbag_byte_unpack);
   m.impl("embedding_bag_4bit_unpack", qembeddingbag_4bit_unpack);
+  m.impl("embedding_bag_2bit_unpack", qembeddingbag_2bit_unpack);
 }
 
 TORCH_LIBRARY_IMPL(quantized, CatchAll, m) {
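The comments above stress that de-quantized values only approximate the original floats. Below is a quick round-trip sketch; the expectation that the error is on the order of the per-row range divided by 3 follows from the stated scale = range / 3, not from the kernel itself:

import torch

torch.manual_seed(0)
weight = torch.rand(5, 12, dtype=torch.float32)

packed = torch.ops.quantized.embedding_bag_2bit_prepack(weight)
unpacked = torch.ops.quantized.embedding_bag_2bit_unpack(packed)

# With 2-bit quantization each row is mapped onto 4 levels, so the per-element
# round-trip error should be roughly bounded by the row range / 3 (plus a small
# contribution from storing the scale and zero_point in fp16).
row_range = weight.max(dim=1).values - weight.min(dim=1).values
max_err = (unpacked - weight).abs().max()
print("max round-trip error:", max_err.item(), "max row range / 3:", (row_range.max() / 3).item())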

aten/src/ATen/native/quantized/library.cpp

Lines changed: 2 additions & 0 deletions
@@ -95,6 +95,8 @@ TORCH_LIBRARY(quantized, m) {
   m.def("embedding_bag_byte_unpack(Tensor weight) -> Tensor");
   m.def("embedding_bag_4bit_prepack(Tensor weight) -> Tensor");
   m.def("embedding_bag_4bit_unpack(Tensor weight) -> Tensor");
+  m.def("embedding_bag_2bit_prepack(Tensor weight) -> Tensor");
+  m.def("embedding_bag_2bit_unpack(Tensor weight) -> Tensor");
   m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor");
   m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor");
   m.def("embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor");

test/quantization/test_quantized_op.py

Lines changed: 16 additions & 2 deletions
@@ -2743,8 +2743,13 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe
         # compare against C2 to ensure numerical equivalency.
         from caffe2.python import core, workspace
         conversion_op = "FloatToFused8BitRowwiseQuantized"
+        reverse_conversion_op = None
         if bit_rate == 4:
             conversion_op = "FloatToFused4BitRowwiseQuantized"
+            reverse_conversion_op = "Fused4BitRowwiseQuantizedToFloat"
+        elif bit_rate == 2:
+            conversion_op = "FloatToFused2BitRowwiseQuantized"
+            reverse_conversion_op = "Fused2BitRowwiseQuantizedToFloat"
 
         def get_c2_weights(weights):
             workspace.ResetWorkspace()
@@ -2756,10 +2761,10 @@ def get_c2_weights(weights):
                 )
             )
             emb_q = workspace.FetchBlob("quantized_weights")
-            if bit_rate == 4:
+            if bit_rate == 4 or bit_rate == 2:
                 workspace.RunOperatorOnce(
                     core.CreateOperator(
-                        "Fused4BitRowwiseQuantizedToFloat", ["quantized_weights"], ["dequantized_weights"]
+                        reverse_conversion_op, ["quantized_weights"], ["dequantized_weights"]
                     )
                 )
             dequantized_data = torch.from_numpy(workspace.FetchBlob("dequantized_weights"))
@@ -2794,6 +2799,15 @@ def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim):
 
         self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=4)
 
+    """ Tests the correctness of the embedding_bag_2bit pack/unpack op against C2 """
+    @given(num_embeddings=st.integers(10, 100),
+           embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),)
+    def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim):
+        pack_fn = torch.ops.quantized.embedding_bag_2bit_prepack
+        unpack_fn = torch.ops.quantized.embedding_bag_2bit_unpack
+
+        self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=2)
+
     def embedding_bag_rowwise_offsets_run(
         self, bit_rate, num_embeddings,
         embedding_dim, num_offsets, enable_per_sample_weights,
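For readers who want to reproduce the C2 comparison outside the test harness, here is a condensed sketch of what `_test_embedding_bag_unpack_fn` does for `bit_rate=2`. It assumes a Caffe2-enabled build; the operator names come from the diff above, while the blob names, table size, and comparison are illustrative rather than the exact checks in the test:

import numpy as np
import torch
from caffe2.python import core, workspace

weights = np.random.uniform(size=(10, 16)).astype(np.float32)

# Reference packing via the Caffe2 operator used by the test.
workspace.ResetWorkspace()
workspace.FeedBlob("weights", weights)
workspace.RunOperatorOnce(
    core.CreateOperator(
        "FloatToFused2BitRowwiseQuantized", ["weights"], ["quantized_weights"]
    )
)
c2_packed = workspace.FetchBlob("quantized_weights")

# Packing via the new PyTorch op.
pt_packed = torch.ops.quantized.embedding_bag_2bit_prepack(torch.from_numpy(weights))

# The unit test checks numerical equivalency between the two paths
# (it also compares the de-quantized floats).
np.testing.assert_allclose(pt_packed.numpy(), c2_packed)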
