Commit 43335cd

jerryzh168 authored and facebook-github-bot committed
Fold quantize op into module (#25625)
Summary:
Pull Request resolved: #25625

We want to fold the quantize op for weights/bias into the module, to avoid quantizing the weights on the fly.

Test Plan:
python test/test_jit.py

Imported from OSS

Differential Revision: D17208889

fbshipit-source-id: 1854b8953b065855d210bc1166533c08ca264354
1 parent 27b5a6c commit 43335cd
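In effect, the pass turns a module that re-quantizes its weight on every forward call into one that reads a precomputed quantized buffer. Below is a minimal eager-mode sketch of the before/after behavior, based on the toy module from the new test; the Before/After class names are illustrative only, and torch.quantize_linear is the per-tensor quantize API this commit uses.

import torch

class Before(torch.nn.Module):
    # Quantizes the weight on the fly, once per forward call.
    def __init__(self):
        super(Before, self).__init__()
        self.weight = torch.nn.Parameter(torch.tensor([2], dtype=torch.float))

    def forward(self, x):
        return torch.quantize_linear(self.weight, 2.0, 0, torch.quint8)

class After(torch.nn.Module):
    # What the folded module computes: the weight is quantized once,
    # stored as the '_quantized_weight' buffer, and only read in forward.
    def __init__(self):
        super(After, self).__init__()
        weight = torch.tensor([2], dtype=torch.float)
        self.register_buffer(
            '_quantized_weight',
            torch.quantize_linear(weight, 2.0, 0, torch.quint8))

    def forward(self, x):
        return self._quantized_weight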

File tree

4 files changed (+70, -0 lines)


test/test_jit.py

Lines changed: 17 additions & 0 deletions
@@ -1331,6 +1331,23 @@ def test_fuse_linear(self):
         torch._C._jit_pass_fuse_linear(graph)
         FileCheck().run(input_str, graph)
 
+    @_tmp_donotuse_dont_inline_everything
+    def test_fold_quantize(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.weight = torch.nn.Parameter(torch.tensor([2], dtype=torch.float))
+
+            def forward(self, x):
+                return torch.quantize_linear(self.weight, 2.0, 0, torch.quint8)
+
+        m = torch.jit.script(M())
+        torch._C._jit_pass_fold_quantize(m._c, 'forward')
+        self.assertTrue(m._c._has_attribute('_quantized_weight'))
+        FileCheck().check_not('GetAttr[name="weight"]') \
+                   .check('GetAttr[name="_quantized_weight"]') \
+                   .run(m._c._get_method('forward').graph)
+
     def test_pattern_based_rewrite(self):
         # mul(mul(mul(mul(x,y),z),x),y) --> mul(mul(mulmul(x,y,z), x), y) -->
         # --> mulmul(mulmul(x,y,z), x, y)

torch/csrc/jit/init.cpp

Lines changed: 4 additions & 0 deletions
@@ -168,6 +168,10 @@ void initJITBindings(PyObject* module) {
           [](std::shared_ptr<Graph>& g) { return QuantFusion(g); })
       .def("_jit_pass_fold_convbn", &FoldConvBatchNorm2d)
       .def("_jit_pass_fuse_linear", &FuseLinear)
+      .def("_jit_pass_fold_quantize",
+           [](script::Module& module, const std::string& method_name) {
+             FoldQuantizeCallIntoBuffer(module, method_name);
+           })
       .def(
           "_jit_pass_quantlint",
           [](std::shared_ptr<Graph>& g) { return QuantLinting(g); })
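With this binding registered, the pass can be driven from Python on a scripted module, mirroring the new test above (M here is the toy module defined in test_fold_quantize):

import torch

m = torch.jit.script(M())  # M as defined in test_fold_quantize
torch._C._jit_pass_fold_quantize(m._c, 'forward')

# The 'forward' graph now reads the precomputed buffer via
# prim::GetAttr[name="_quantized_weight"] instead of quantizing 'weight'.
print(m._c._get_method('forward').graph)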

torch/csrc/jit/passes/quantization.cpp

Lines changed: 38 additions & 0 deletions
@@ -825,5 +825,43 @@ graph(%self, %x):
     }
   }
 }
+
+void FoldQuantizeCallIntoBuffer(
+    script::Module& module,
+    const std::string& method_name) {
+  // TODO: extra filter on scale/zero_point/dtype to make sure they are Constant
+  const std::string pattern = R"(
+graph(%self, %scale, %zero_point, %dtype):
+   %weight = prim::GetAttr[name="weight"](%self)
+   %weight_quant = aten::quantize_linear(%weight, %scale, %zero_point, %dtype)
+   return (%weight_quant))";
+  Graph pattern_graph;
+  std::unordered_map<std::string, Value*> vmap;
+  script::parseIR(pattern, &pattern_graph, vmap);
+  auto method = module.get_method(method_name);
+  auto graph = method.graph();
+  auto matches = findPatternMatches(pattern_graph, *graph);
+  for (const auto& match : matches) {
+    auto match_vmap = match.values_map;
+    auto* weight = match_vmap.at(vmap.at("weight"));
+    auto float_weight = module.get_parameter("weight").variable_data();
+    auto scale = toIValue(match_vmap.at(vmap.at("scale"))).value().toDouble();
+    auto zero_point =
+        toIValue(match_vmap.at(vmap.at("zero_point"))).value().toInt();
+    auto dtype =
+        toIValue(match_vmap.at(vmap.at("dtype"))).value().toScalarType();
+    module.register_buffer(
+        "_quantized_weight",
+        at::quantize_linear(float_weight, scale, zero_point, dtype));
+  }
+
+  std::string replacement = R"(
+graph(%self, %scale, %zero_point, %dtype):
+   %weight_quant = prim::GetAttr[name="_quantized_weight"](%self)
+   return (%weight_quant))";
+  SubgraphRewriter rewriter;
+  rewriter.RegisterRewritePattern(pattern, replacement);
+  rewriter.runOnGraph(graph);
+}
 } // namespace jit
 } // namespace torch
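Note the TODO in the pass: the fold is only sound when scale, zero_point, and dtype are constants in the graph, because the pass evaluates them once (via toIValue) and bakes the result into the buffer. A small sketch of the equivalence being relied on, using the values from the test:

import torch

w = torch.tensor([2], dtype=torch.float)

# What the pass stores in the '_quantized_weight' buffer at fold time ...
folded = torch.quantize_linear(w, 2.0, 0, torch.quint8)

# ... is exactly what the original forward would compute on every call,
# since scale (2.0), zero_point (0), and dtype (quint8) never change.
on_the_fly = torch.quantize_linear(w, 2.0, 0, torch.quint8)
assert torch.equal(folded.int_repr(), on_the_fly.int_repr())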

torch/csrc/jit/passes/quantization.h

Lines changed: 11 additions & 0 deletions
@@ -105,5 +105,16 @@ TORCH_API void QuantFusion(std::shared_ptr<Graph>& graph);
  */
 TORCH_API void FoldConvBatchNorm2d(const script::Module& module);
 
+/** \brief Fold quantize function call into module.
+ *
+ * For the graph in the specified method of the module, if we find a
+ * quantize_linear call on an attribute ("weight") of the module, we quantize
+ * the attribute directly, register a new buffer "_quantized_weight" on the
+ * module, remove the quantize_linear call, and replace uses of the weight
+ * with "_quantized_weight".
+ */
+TORCH_API void FoldQuantizeCallIntoBuffer(script::Module& module, const std::string& method_name);
+
+
 } // namespace jit
 } // namespace torch
