Commit d456401

redo of add quantized layer norm implementation
Summary: This is a redo of #35329 with a better test. Adds a quantized implementation of LayerNorm for server. A future PR will add the Python wrapper.

Test Plan: numerics match the floating point implementation.

Benchmarks by input size:
v1 (mean+var non-vectorized): https://gist.github.com/vkuzo/f6d72c04742608112f4c2e612c74bd13
v2 (mean+var vectorized in float): https://gist.github.com/vkuzo/4dd95657c5b5f3654e0965db00eff8d2
v3 (mean+var vectorized in int, current): https://gist.github.com/vkuzo/57a75f75629da9f23b64b38ca0e3d34b

ghstack-source-id: 9bb87ea
Pull Request resolved: #36593
1 parent fb70b4f commit d456401

File tree

10 files changed: +410 additions, −6 deletions

aten/src/ATen/native/layer_norm.cpp

Lines changed: 83 additions & 4 deletions

@@ -12,6 +12,7 @@
 #include <ATen/Config.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
+#include <ATen/core/op_registration/op_registration.h>

 namespace at {
 namespace native {
@@ -60,13 +61,12 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cpu(
   return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
 }

-Tensor layer_norm(
+std::tuple<Tensor, Tensor, Tensor, int64_t, int64_t> _prepare_layer_norm_inputs(
     const Tensor& input,
     IntArrayRef normalized_shape,
     const Tensor& weight /* optional */,
-    const Tensor& bias /* optional */,
-    double eps,
-    bool /* cudnn_enable, deprecated */) {
+    const Tensor& bias /* optional */) {
+
   const int normalized_ndim = normalized_shape.size();
   TORCH_CHECK(
       normalized_ndim >= 1,
@@ -119,11 +119,90 @@ Tensor layer_norm(
   const auto& X = input.is_contiguous() ? input : input.contiguous();
   const auto& gamma = weight.is_contiguous() ? weight : weight.contiguous();
   const auto& beta = bias.is_contiguous() ? bias : bias.contiguous();
+
+  return std::make_tuple(X, gamma, beta, M, N);
+}
+
+Tensor layer_norm(
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */,
+    double eps,
+    bool /* cudnn_enable, deprecated */) {
+
+  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto X = std::get<0>(inputs);
+  auto gamma = std::get<1>(inputs);
+  auto beta = std::get<2>(inputs);
+  auto M = std::get<3>(inputs);
+  auto N = std::get<4>(inputs);
+
   return std::get<0>(at::native_layer_norm(X, gamma, beta, M, N, eps));
 }

+Tensor quantized_layer_norm_impl(
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point) {
+
+  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto X = std::get<0>(inputs);
+  auto gamma = std::get<1>(inputs);
+  auto beta = std::get<2>(inputs);
+  auto M = std::get<3>(inputs);
+  auto N = std::get<4>(inputs);
+
+  Tensor Y = at::_empty_affine_quantized(
+      X.sizes(),
+      X.scalar_type(),
+      output_scale,
+      output_zero_point,
+      X.suggest_memory_format());
+
+  if (M > 0) {
+    quantized_layer_norm_stub(kCPU, X, gamma, beta, M, N, eps, &Y);
+  }
+  return Y;
+}
+
+// Keep the registry in the anonymous namespace.
+namespace {
+class QLayerNorm2d final : public torch::OperatorKernel {
+ public:
+  Tensor operator()(
+      Tensor input,
+      std::vector<int64_t> normalized_shape,
+      Tensor weight /* optional */,
+      Tensor bias /* optional */,
+      double eps,
+      double output_scale,
+      int64_t output_zero_point) {
+    return quantized_layer_norm_impl(
+        input, normalized_shape, weight, bias, eps, output_scale, output_zero_point);
+  }
+};
+
+static auto registry = torch::RegisterOperators().op(
+    "quantized::layer_norm(Tensor input, "
+    "int[] normalized_shape, "
+    "Tensor weight, "
+    "Tensor bias, "
+    "float eps, "
+    "float output_scale, "
+    "int output_zero_point) -> Tensor",
+    torch::RegisterOperators::options().kernel<QLayerNorm2d>(
+        DispatchKey::QuantizedCPU));
+
+} // namespace
+
 DEFINE_DISPATCH(LayerNormKernel);
 DEFINE_DISPATCH(LayerNormBackwardKernel);
+DEFINE_DISPATCH(quantized_layer_norm_stub);

 } // namespace native
 } // namespace at
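
The Python wrapper only lands in a follow-up PR, so for now the newly registered schema can be exercised directly through torch.ops. A minimal sketch of a call site, mirroring the qlayernorm benchmark added below (the shapes and the scale/zero-point values are arbitrary placeholders):

    import torch

    # Quantize a random float tensor per-tensor to qint8.
    X = torch.rand(8, 8, 16)
    qX = torch.quantize_per_tensor(X, scale=1.0, zero_point=0, dtype=torch.qint8)

    # gamma/beta stay in float; normalize over the last two dimensions.
    weight = torch.rand(8, 16)
    bias = torch.rand(8, 16)

    qY = torch.ops.quantized.layer_norm(
        qX, [8, 16], weight=weight, bias=bias,
        eps=1e-5, output_scale=0.1, output_zero_point=0)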

aten/src/ATen/native/layer_norm.h

Lines changed: 10 additions & 0 deletions

@@ -29,8 +29,18 @@ using backward_fn = void (*)(
     Tensor* /* dgamma */,
     Tensor* /* dbeta */);

+using forward_quantized_fn = void (*)(
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */);
+
 DECLARE_DISPATCH(forward_fn, LayerNormKernel);
 DECLARE_DISPATCH(backward_fn, LayerNormBackwardKernel);
+DECLARE_DISPATCH(forward_quantized_fn, quantized_layer_norm_stub);

 } // namespace native
 } // namespace at

aten/src/ATen/native/native_functions.yaml

Lines changed: 5 additions & 0 deletions

@@ -1645,6 +1645,11 @@
     CPU: layer_norm_backward_cpu
     CUDA: layer_norm_backward_cuda

+- func: quantized_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor
+  requires_tensor: True
+  dispatch:
+    QuantizedCPU: quantized_layer_norm_impl
+
 - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
   python_module: nn

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp

Lines changed: 115 additions & 0 deletions

@@ -1889,6 +1889,120 @@ void fake_quant_grad_per_channel_cpu(TensorIterator &iter, int64_t quant_min, in
   });
 }

+template <typename T>
+void quantized_layer_norm_kernel_impl(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    float eps,
+    Tensor* Y) {
+
+}
+
+void quantized_layer_norm_kernel(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    double eps,
+    Tensor* Y) {
+  AT_DISPATCH_QINT_TYPES(X.scalar_type(), "quantized_layer_norm_kernel_impl_cpu", [&]() {
+    using qVec = vec256::Vec256<scalar_t>;
+    using fVec = vec256::Vec256<float>;
+
+    TORCH_INTERNAL_ASSERT(X.numel() == M * N, "Unexpected num elements in X");
+    TORCH_INTERNAL_ASSERT(!gamma.defined() || gamma.numel() == N,
+        "Unexpected size of gamma");
+    TORCH_INTERNAL_ASSERT(!beta.defined() || beta.numel() == N,
+        "Unexpected size of beta");
+    scalar_t* X_data = X.data_ptr<scalar_t>();
+    const float* gamma_data = gamma.defined() ? gamma.data_ptr<float>() : nullptr;
+    const float* beta_data = beta.defined() ? beta.data_ptr<float>() : nullptr;
+    scalar_t* Y_data = Y->data_ptr<scalar_t>();
+    const bool gamma_null = gamma_data == nullptr;
+    const bool beta_null = beta_data == nullptr;
+    int64_t x_zp = X.q_zero_point();
+    float x_scale = X.q_scale();
+    fVec x_zp_vec((float)x_zp);
+    fVec one_vec(1.0f);
+    fVec zero_vec(0.0f);
+    float x_fake_scale = 1.0f;
+    fVec x_fake_scale_vec(x_fake_scale);
+    fVec x_fake_scale_zp_neg_premul_vec = x_fake_scale_vec * x_zp_vec.neg();
+    int64_t y_zp = Y->q_zero_point();
+    float y_scale = Y->q_scale();
+    float y_inv_scale = 1.0f / y_scale;
+
+    constexpr int kFloatVLen = 8;
+    int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
+    int64_t kNumIntVecInLayer = N / kIntVLen;
+    int64_t kNonVecRemInLayer = N % kIntVLen;
+
+    at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) {
+      for (int64_t i = start; i < end; ++i) {
+
+        scalar_t* X_ptr = X_data + i * N;
+        scalar_t* Y_ptr = Y_data + i * N;
+
+        // First pass: calculate mean and variance.
+
+        scalar_t::underlying* X_ptr_underlying = reinterpret_cast<scalar_t::underlying*>(X_ptr);
+        auto l_sum_shifted = hsum(X_ptr_underlying, N);
+        auto l_sum_sq_shifted = hsum_sq(X_ptr_underlying, N);
+        float l_mean_shifted_div_scale_x = static_cast<float>(l_sum_shifted) / N;
+        // mean(dqX) / scale_x
+        float layer_mean_div_scale_x = l_mean_shifted_div_scale_x - x_zp;
+        // var(dqX) / scale_x^2
+        float layer_var_div_scale_x_sq =
+            std::max(static_cast<float>(l_sum_sq_shifted) / N -
+                l_mean_shifted_div_scale_x * l_mean_shifted_div_scale_x, 0.0f);
+        // scale_x / sqrt(var(dqX) + eps)
+        float scale_x_div_layer_std = x_scale /
+            std::sqrt(layer_var_div_scale_x_sq * x_scale * x_scale + eps);
+        fVec layer_mean_div_scale_xVec(layer_mean_div_scale_x);
+        fVec scale_x_div_layer_stdVec(scale_x_div_layer_std);

+        // Second pass: normalize

+        // TODO replace with TensorIterator implementation once #33166 is fixed.
+        for (int64_t vecIdx = 0; vecIdx < kNumIntVecInLayer; vecIdx++) {
+          int64_t vecStartIdx = vecIdx * kIntVLen;
+          auto qXVec = qVec::loadu(X_ptr + vecStartIdx);
+          auto dqXVec = qXVec.dequantize(x_fake_scale_vec, x_zp_vec,
+              x_fake_scale_zp_neg_premul_vec);
+          for (int dqXVecIdx = 0; dqXVecIdx < dqXVec.size(); dqXVecIdx++) {
+            int64_t vecVecStartIdx = vecStartIdx + dqXVecIdx * kFloatVLen;
+            auto gammaVec = gamma_null
+                ? one_vec
+                : fVec::loadu(gamma_data + vecVecStartIdx);
+            auto betaVec = beta_null
+                ? zero_vec
+                : fVec::loadu(beta_data + vecVecStartIdx);
+            dqXVec[dqXVecIdx] =
+                (dqXVec[dqXVecIdx] - layer_mean_div_scale_xVec) *
+                scale_x_div_layer_stdVec * gammaVec + betaVec;
+            qVec::quantize(dqXVec, y_scale, y_zp, y_inv_scale)
+                .store(Y_ptr + vecStartIdx);
+          }
+        }
+        for (int64_t remIdx = N - kNonVecRemInLayer; remIdx < N; remIdx++) {
+          const float gamma_v = gamma_null ? 1.0f : gamma_data[remIdx];
+          const float beta_v = beta_null ? 0.0f : beta_data[remIdx];
+          auto qXVal = X_ptr[remIdx];
+          float dqXVal = at::dequantize_val(x_fake_scale, x_zp, qXVal);
+          float dqY =
+              ((dqXVal - layer_mean_div_scale_x) * scale_x_div_layer_std) * gamma_v + beta_v;
+          Y_ptr[remIdx] = at::quantize_val<scalar_t>(y_scale, y_zp, dqY);
+        }
+      }
+    }); // parallel_for

+  });
+}
+
 } // namespace

 REGISTER_DISPATCH(qrelu_stub, &qrelu_kernel);
@@ -1924,6 +2038,7 @@ REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel);
 REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel);
 REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu);
 REGISTER_DISPATCH(fake_quant_grad_per_channel_stub, &fake_quant_grad_per_channel_cpu);
+REGISTER_DISPATCH(quantized_layer_norm_stub, &quantized_layer_norm_kernel);

 } // namespace native
 } // namespace at
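
The kernel's first pass computes hsum/hsum_sq over the raw integer values and only afterwards factors out the zero point and scale; this is the "mean+var vectorized in int" v3 variant from the commit message. Below is a minimal NumPy sketch of the same per-row algebra, assuming qint8 input and output (a reference for the numerics only, not the shipped kernel):

    import numpy as np

    def qlayernorm_row(q, x_scale, x_zp, gamma, beta, eps, y_scale, y_zp):
        # q: one row of N raw int8 values (scalar_t::underlying).
        N = q.size
        l_sum_shifted = q.astype(np.int64).sum()            # hsum
        l_sum_sq_shifted = (q.astype(np.int64) ** 2).sum()  # hsum_sq
        l_mean_shifted = l_sum_shifted / N
        # mean(dqX) / x_scale: the zero point is subtracted after averaging.
        mean_div_scale = l_mean_shifted - x_zp
        # var(dqX) / x_scale^2: the zero point cancels in the variance.
        var_div_scale_sq = max(l_sum_sq_shifted / N - l_mean_shifted ** 2, 0.0)
        # x_scale / sqrt(var(dqX) + eps)
        scale_div_std = x_scale / np.sqrt(var_div_scale_sq * x_scale ** 2 + eps)
        # Dequantize with a "fake" scale of 1.0, exactly as the kernel does.
        dq = q.astype(np.float32) - x_zp
        y = (dq - mean_div_scale) * scale_div_std * gamma + beta
        # Requantize into the output scale/zero_point.
        return np.clip(np.round(y / y_scale) + y_zp, -128, 127).astype(np.int8)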

benchmarks/operator_benchmark/benchmark_all_other_test.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
     add_test, as_strided_test, batchnorm_test, binary_test, cat_test, # noqa
     chunk_test, conv_test, diag_test, embeddingbag_test, fill_test, # noqa
     gather_test, linear_test, matmul_test, pool_test, # noqa
-    softmax_test, hardsigmoid_test, hardswish_test # noqa
+    softmax_test, hardsigmoid_test, hardswish_test, layernorm_test # noqa
 )

 if __name__ == "__main__":

benchmarks/operator_benchmark/benchmark_all_quantized_test.py

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@
     qcomparators_test,
     qconv_test,
     qinterpolate_test,
+    qlayernorm_test,
     qlinear_test,
     qobserver_test,
     qpool_test,

benchmarks/operator_benchmark/pt/layernorm_test.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+
+import operator_benchmark as op_bench
+import torch
+import torch.nn.functional as F
+
+
+"""Microbenchmarks for layernorm operator."""
+
+layernorm_configs_short = op_bench.cross_product_configs(
+    dims=(
+        (1, 8, 16),
+        (8, 8, 16),
+        (32, 8, 16),
+        (64, 128, 56, 56),
+    ),
+    tags=["short"],
+)
+
+
+class LayerNormBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, dims):
+        self.X = (torch.rand(*dims) - 0.5) * 256
+        self.weight = torch.rand(*self.X.size()[1:], dtype=torch.float)
+        self.bias = torch.rand(*self.X.size()[1:], dtype=torch.float)
+        self.eps = 1e-5
+
+    def forward(self):
+        return F.layer_norm(
+            self.X, self.X.size()[1:], weight=self.weight, bias=self.bias, eps=self.eps)
+
+
+op_bench.generate_pt_test(layernorm_configs_short, LayerNormBenchmark)
+
+
+if __name__ == "__main__":
+    op_bench.benchmark_runner.main()
benchmarks/operator_benchmark/pt/qlayernorm_test.py

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+
+import operator_benchmark as op_bench
+import torch
+
+
+"""Microbenchmarks for quantized layernorm operator."""
+
+layernorm_configs_short = op_bench.cross_product_configs(
+    dims=(
+        (1, 8, 16),
+        (8, 8, 16),
+        (32, 8, 16),
+        (64, 128, 56, 56),
+    ),
+    dtype=(torch.qint8,),
+    tags=["short"],
+)
+
+
+class QLayerNormBenchmark(op_bench.TorchBenchmarkBase):
+
+    def init(self, dims, dtype):
+        X = (torch.rand(*dims) - 0.5) * 256
+        scale = 1.0
+        zero_point = 0
+        self.qX = torch.quantize_per_tensor(
+            X, scale=scale, zero_point=zero_point, dtype=dtype)
+        self.weight = torch.rand(*self.qX.size()[1:], dtype=torch.float)
+        self.bias = torch.rand(*self.qX.size()[1:], dtype=torch.float)
+        self.eps = 1e-5
+        self.Y_scale = 0.1
+        self.Y_zero_point = 0
+
+    def forward(self):
+        return torch.ops.quantized.layer_norm(
+            self.qX, self.qX.size()[1:], weight=self.weight, bias=self.bias,
+            eps=self.eps, output_scale=self.Y_scale,
+            output_zero_point=self.Y_zero_point)
+
+
+op_bench.generate_pt_test(layernorm_configs_short, QLayerNormBenchmark)
+
+
+if __name__ == "__main__":
+    op_bench.benchmark_runner.main()
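
The test plan's "numerics match" claim can be sanity-checked by hand: dequantize the op's output and compare against the float reference on the same dequantized input. A rough sketch (the output scale/zero_point and the one-quantization-step tolerance are placeholders; the real check lives in the commit's test suite):

    import torch
    import torch.nn.functional as F

    X = (torch.rand(4, 8, 16) - 0.5) * 256
    qX = torch.quantize_per_tensor(X, scale=1.0, zero_point=0, dtype=torch.qint8)
    w, b = torch.rand(8, 16), torch.rand(8, 16)

    qY = torch.ops.quantized.layer_norm(
        qX, [8, 16], weight=w, bias=b,
        eps=1e-5, output_scale=0.1, output_zero_point=0)
    Y_ref = F.layer_norm(qX.dequantize(), [8, 16], weight=w, bias=b, eps=1e-5)

    # Expect agreement within one output quantization step.
    assert (qY.dequantize() - Y_ref).abs().max() <= 0.1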
