Commit 15d19f7

add quantized layer norm implementation

Summary: Adds a quantized implementation of LayerNorm for server.
Relevant PRs:
* #20345 (floating point LN)
* #33080 (quantized BN)
A future PR will add the Python wrapper.

Test Plan: numerics match the floating point implementation. TODO: benchmarks.

Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned]
1 parent eff68bc commit 15d19f7

File tree

7 files changed: +290 −6 lines changed


aten/src/ATen/native/layer_norm.cpp

Lines changed: 83 additions & 4 deletions
@@ -12,6 +12,7 @@
 #include <ATen/Config.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
+#include <ATen/core/op_registration/op_registration.h>

 namespace at {
 namespace native {
@@ -60,13 +61,12 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cpu(
   return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
 }

-Tensor layer_norm(
+std::tuple<Tensor, Tensor, Tensor, int64_t, int64_t> _prepare_layer_norm_inputs(
     const Tensor& input,
     IntArrayRef normalized_shape,
     const Tensor& weight /* optional */,
-    const Tensor& bias /* optional */,
-    double eps,
-    bool /* cudnn_enable, deprecated */) {
+    const Tensor& bias /* optional */) {
+
   const int normalized_ndim = normalized_shape.size();
   TORCH_CHECK(
       normalized_ndim >= 1,
@@ -119,11 +119,90 @@ Tensor layer_norm(
   const auto& X = input.is_contiguous() ? input : input.contiguous();
   const auto& gamma = weight.is_contiguous() ? weight : weight.contiguous();
   const auto& beta = bias.is_contiguous() ? bias : bias.contiguous();
+
+  return std::make_tuple(X, gamma, beta, M, N);
+}
+
+Tensor layer_norm(
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */,
+    double eps,
+    bool /* cudnn_enable, deprecated */) {
+
+  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto X = std::get<0>(inputs);
+  auto gamma = std::get<1>(inputs);
+  auto beta = std::get<2>(inputs);
+  auto M = std::get<3>(inputs);
+  auto N = std::get<4>(inputs);
+
   return std::get<0>(at::native_layer_norm(X, gamma, beta, M, N, eps));
 }

+Tensor quantized_layer_norm_impl(
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point) {
+
+  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto X = std::get<0>(inputs);
+  auto gamma = std::get<1>(inputs);
+  auto beta = std::get<2>(inputs);
+  auto M = std::get<3>(inputs);
+  auto N = std::get<4>(inputs);
+
+  Tensor Y = at::_empty_affine_quantized(
+      X.sizes(),
+      X.scalar_type(),
+      output_scale,
+      output_zero_point,
+      X.suggest_memory_format());
+
+  if (M > 0) {
+    LayerNormKernelQuantized(kCPU, X, gamma, beta, M, N, eps, &Y);
+  }
+  return Y;
+}
+
+// Keep the registry in the anonymous namespace.
+namespace {
+class QLayerNorm2d final : public torch::OperatorKernel {
+ public:
+  Tensor operator()(
+      Tensor input,
+      std::vector<int64_t> normalized_shape,
+      Tensor weight /* optional */,
+      Tensor bias /* optional */,
+      double eps,
+      double output_scale,
+      int64_t output_zero_point) {
+    return quantized_layer_norm_impl(
+        input, normalized_shape, weight, bias, eps, output_scale, output_zero_point);
+  }
+};
+
+static auto registry = torch::RegisterOperators().op(
+    "quantized::layer_norm(Tensor input, "
+    "int[] normalized_shape, "
+    "Tensor weight, "
+    "Tensor bias, "
+    "float eps, "
+    "float output_scale, "
+    "int output_zero_point) -> Tensor",
+    torch::RegisterOperators::options().kernel<QLayerNorm2d>(
+        DispatchKey::QuantizedCPUTensorId));
+
+} // namespace
+
 DEFINE_DISPATCH(LayerNormKernel);
 DEFINE_DISPATCH(LayerNormBackwardKernel);
+DEFINE_DISPATCH(LayerNormKernelQuantized);

 } // namespace native
 } // namespace at
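
For reference, the schema registered above exposes the new kernel to Python as torch.ops.quantized.layer_norm; the test added in test/test_quantized.py below exercises exactly that path. A minimal usage sketch (input shapes and quantization parameters here are illustrative, not taken from the commit):

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 8, 16)
    # Quantize the input; scale / zero_point are illustrative values.
    qx = torch.quantize_per_tensor(x, scale=0.05, zero_point=64, dtype=torch.quint8)
    weight = torch.ones(qx.size()[1:])
    bias = torch.zeros(qx.size()[1:])

    # New op: normalizes over the trailing dims of a quantized CPU tensor and
    # returns a quantized output with the requested scale / zero point.
    qy = torch.ops.quantized.layer_norm(
        qx, qx.size()[1:], weight=weight, bias=bias, eps=1e-5,
        output_scale=0.1, output_zero_point=0)

    # Floating-point reference path, per the test plan.
    y_ref = F.layer_norm(qx.dequantize(), qx.size()[1:],
                         weight=weight, bias=bias, eps=1e-5)
    qy_ref = torch.quantize_per_tensor(y_ref, scale=0.1, zero_point=0,
                                       dtype=torch.quint8)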

aten/src/ATen/native/layer_norm.h

Lines changed: 10 additions & 0 deletions
@@ -29,8 +29,18 @@ using backward_fn = void (*)(
     Tensor* /* dgamma */,
     Tensor* /* dbeta */);

+using forward_quantized_fn = void (*)(
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */);
+
 DECLARE_DISPATCH(forward_fn, LayerNormKernel);
 DECLARE_DISPATCH(backward_fn, LayerNormBackwardKernel);
+DECLARE_DISPATCH(forward_quantized_fn, LayerNormKernelQuantized);

 } // namespace native
 } // namespace at

aten/src/ATen/native/native_functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -1598,6 +1598,11 @@
     CPU: layer_norm_backward_cpu
     CUDA: layer_norm_backward_cuda

+- func: quantized_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor
+  requires_tensor: True
+  dispatch:
+    QuantizedCPU: quantized_layer_norm_impl
+
 - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
   python_module: nn

aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp

Lines changed: 152 additions & 0 deletions
@@ -6,6 +6,7 @@
 #include <ATen/native/quantized/cpu/quantized_ops.h>
 #include <ATen/quantized/Quantizer.h>
 #include <ATen/native/SortingUtils.h>
+#include <ATen/cpu/vec256/functional.h>

 #include <cmath>
 #ifdef USE_FBGEMM
@@ -1497,6 +1498,156 @@ void fake_quant_grad_per_channel_cpu(TensorIterator &iter, int64_t quant_min, in
   });
 }

+template <typename T>
+void LayerNormKernelQuantizedImplInternal(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    float eps,
+    Tensor* Y) {
+
+  using qVec = vec256::Vec256<T>;
+  using fVec = vec256::Vec256<float>;
+
+  DCHECK_EQ(X.numel(), M * N);
+  DCHECK(!gamma.defined() || gamma.numel() == N);
+  DCHECK(!beta.defined() || beta.numel() == N);
+  T* X_data = X.data_ptr<T>();
+  const float* gamma_data = gamma.defined() ? gamma.data_ptr<float>() : nullptr;
+  const float* beta_data = beta.defined() ? beta.data_ptr<float>() : nullptr;
+  T* Y_data = Y->data_ptr<T>();
+  const float c = 1.0f / static_cast<float>(N);
+  const bool gamma_null = gamma_data == nullptr;
+  const bool beta_null = beta_data == nullptr;
+
+  int64_t x_zp = X.q_zero_point();
+  float x_scale = X.q_scale();
+
+  fVec x_zp_vec = fVec((float)x_zp);
+  fVec one_vec = fVec(1.0f);
+  fVec zero_vec = fVec(0.0f);
+
+  float x_fake_scale = 1.0f;
+  fVec x_fake_scale_vec = fVec(x_fake_scale);
+  fVec x_fake_scale_zp_neg_premul_vec = x_fake_scale_vec * x_zp_vec.neg();
+
+  int64_t y_zp = Y->q_zero_point();
+  float y_scale = Y->q_scale();
+  float y_inv_scale = 1.0f / y_scale;
+
+  // 8 floats in a 256 bit Vec256
+  constexpr int kFloatVLen = 8;
+  // N ints in a qVec
+  int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
+  // portion of layer that can be vectorized
+  int64_t kNumIntVecInLayer = N / kIntVLen;
+  // remainder of layer that cannot be vectorized
+  int64_t kNonVecRemInLayer = N % kIntVLen;
+
+  at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) {
+    for (int64_t i = start; i < end; ++i) {
+
+      T* X_ptr = X_data + i * N;
+      T* Y_ptr = Y_data + i * N;
+
+      // First pass: calculate mean and variance.
+      // Note: Fake dequant using scale=1.0f because scale_x cancels out
+      // during normalization, with the exception of epsilon
+
+      // TODO replace with TensorIterator implementation once #33166 is fixed.
+      float layerSum = 0.0f;
+      float layerSumSquares = 0.0f;
+      for (int64_t vecIdx = 0; vecIdx < kNumIntVecInLayer; vecIdx++) {
+        auto qXVec = qVec::loadu(X_ptr + vecIdx * kIntVLen);
+        auto dqXVec = qXVec.dequantize(x_fake_scale_vec, x_zp_vec,
+            x_fake_scale_zp_neg_premul_vec);
+        // sum of vals
+        float thisLayerSum = vec256::reduce_all<float>(
+            [](fVec& x, fVec& y) { return x + y; },
+            (float*)dqXVec.data(),
+            kFloatVLen * dqXVec.size()
+        );
+        layerSum += thisLayerSum;
+        // sum of squares
+        float thisLayerSumSquares = vec256::map_reduce_all<float>(
+            [](fVec x) { return x * x; },
+            [](fVec x, fVec y) { return x + y; },
+            (float*)dqXVec.data(),
+            kFloatVLen * dqXVec.size()
+        );
+        layerSumSquares += thisLayerSumSquares;
+      }
+      for (int64_t remIdx = N - kNonVecRemInLayer; remIdx < N; remIdx++) {
+        auto qXVal = X_ptr[remIdx];
+        float dqXVal = at::dequantize_val(x_fake_scale, x_zp, qXVal);
+        layerSum += dqXVal;
+        layerSumSquares += dqXVal * dqXVal;
+      }
+
+      // mean(dqX) / scale_x
+      float layerMeanDivScaleX = layerSum / N;
+      // var(dqX) / scale_x^2
+      float layerVarDivScaleXSq =
+          std::max(layerSumSquares / N - layerMeanDivScaleX * layerMeanDivScaleX, 0.0f);
+      // scale_x / std(dqX), scale epsilon properly
+      float scaleXDivLayerStd = 1.0f /
+          std::sqrt(layerVarDivScaleXSq + (eps * x_scale * x_scale));
+      fVec layerMeanDivScaleXVec(layerMeanDivScaleX);
+      fVec scaleXDivLayerStdVec(scaleXDivLayerStd);

+      // Second pass: normalize
+
+      // TODO replace with TensorIterator implementation once #33166 is fixed.
+      for (int64_t vecIdx = 0; vecIdx < kNumIntVecInLayer; vecIdx++) {
+        int64_t vecStartIdx = vecIdx * kIntVLen;
+        auto qXVec = qVec::loadu(X_ptr + vecStartIdx);
+        auto dqXVec = qXVec.dequantize(x_fake_scale_vec, x_zp_vec,
+            x_fake_scale_zp_neg_premul_vec);
+        for (int dqXVecIdx = 0; dqXVecIdx < dqXVec.size(); dqXVecIdx++) {
+          int64_t vecVecStartIdx = vecStartIdx + dqXVecIdx * kFloatVLen;
+          auto gammaVec = gamma_null
+              ? one_vec
+              : fVec::loadu(gamma_data + vecVecStartIdx);
+          auto betaVec = beta_null
+              ? zero_vec
+              : fVec::loadu(beta_data + vecVecStartIdx);
+          dqXVec[dqXVecIdx] =
+              (dqXVec[dqXVecIdx] - layerMeanDivScaleXVec) *
+              scaleXDivLayerStdVec * gammaVec + betaVec;
+          qVec::quantize(dqXVec, y_scale, y_zp, y_inv_scale)
+              .store(Y_ptr + vecStartIdx);
+        }
+      }
+      for (int64_t remIdx = N - kNonVecRemInLayer; remIdx < N; remIdx++) {
+        const float gamma_v = gamma_null ? 1.0f : gamma_data[remIdx];
+        const float beta_v = beta_null ? 0.0f : beta_data[remIdx];
+        auto qXVal = X_ptr[remIdx];
+        float dqXVal = at::dequantize_val(x_fake_scale, x_zp, qXVal);
+        float dqY =
+            ((dqXVal - layerMeanDivScaleX) * scaleXDivLayerStd) * gamma_v + beta_v;
+        Y_ptr[remIdx] = at::quantize_val<T>(y_scale, y_zp, dqY);
+      }
+
+    }
+  }); // parallel_for
+}
+
+void LayerNormKernelQuantizedImpl(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    double eps,
+    Tensor* Y) {
+  AT_DISPATCH_QINT_TYPES(X.scalar_type(), "LayerNormKernelImpl", [&]() {
+    LayerNormKernelQuantizedImplInternal<scalar_t>(
+        X, gamma, beta, M, N, static_cast<float>(eps), Y);
+  });
+}
+
 } // namespace

 REGISTER_DISPATCH(qrelu_stub, &qrelu_kernel);
@@ -1531,6 +1682,7 @@ REGISTER_DISPATCH(fake_quant_tensor_stub, &fake_quantize_tensor_kernel);
 REGISTER_DISPATCH(fake_quant_grad_tensor_stub, &fake_quantize_grad_tensor_kernel);
 REGISTER_DISPATCH(fake_quant_per_channel_stub, &fake_quant_per_channel_cpu);
 REGISTER_DISPATCH(fake_quant_grad_per_channel_stub, &fake_quant_grad_per_channel_cpu);
+REGISTER_DISPATCH(LayerNormKernelQuantized, &LayerNormKernelQuantizedImpl);

 } // namespace native
 } // namespace at
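
As the in-kernel note above says, the first pass accumulates sums over "fake-dequantized" values: the zero point is subtracted but the scale is treated as 1.0, because a common factor of x_scale cancels between the numerator and the denominator of (x - mean) / std; only the epsilon term has to be combined with x_scale before the square root. A rough NumPy sketch of that cancellation for one row, leaving epsilon aside (function and variable names here are illustrative, not part of the commit):

    import numpy as np

    def row_norm_reference(q_row, x_scale, x_zp):
        # Straightforward path: dequantize with the real scale, then normalize.
        dq = x_scale * (q_row.astype(np.float32) - x_zp)
        ref = (dq - dq.mean()) / dq.std()

        # Kernel's path: operate on the raw codes ("fake" scale of 1.0).
        # The x_scale factor cancels, so the results agree (epsilon aside).
        u = q_row.astype(np.float32) - x_zp
        fast = (u - u.mean()) / u.std()

        assert np.allclose(ref, fast, atol=1e-5)
        return fast

    row_norm_reference(np.random.randint(0, 256, size=64), x_scale=0.05, x_zp=64)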

test/test_quantized.py

Lines changed: 38 additions & 0 deletions
@@ -271,6 +271,44 @@ def test_qhardsigmoid(self, X):
                              message="Hardsigmoid failed: {} vs. {}".format(qY, qY_hat))


+    """Tests the correctness of the quantized::qlayer_norm op."""
+    @given(X=hu.tensor(shapes=hu.array_shapes(3, 5, 1, 32),
+                       elements=hu.floats(-1e3, 1e3, allow_nan=False, allow_infinity=False),
+                       qparams=hu.qparams()),
+           Y_scale=st.floats(0.2, 2.6),
+           Y_zero_point=st.integers(0, 5),
+           qengine=st.sampled_from(("qnnpack", "fbgemm")))
+    def test_qlayer_norm(self, X, Y_scale, Y_zero_point, qengine):
+        if qengine not in torch.backends.quantized.supported_engines:
+            return
+
+        with override_quantized_engine(qengine):
+            X, (scale, zero_point, torch_type) = X
+            X = torch.from_numpy(X)
+            qX = torch.quantize_per_tensor(X, scale=scale,
+                                           zero_point=zero_point,
+                                           dtype=torch_type)
+            dqX = qX.dequantize()
+
+            weight = torch.rand(*qX.size()[1:], dtype=torch.float)
+            bias = torch.rand(*qX.size()[1:], dtype=torch.float)
+            epsilon = 1e-5
+
+            qY = torch.ops.quantized.layer_norm(
+                qX, qX.size()[1:], weight=weight, bias=bias, eps=epsilon,
+                output_scale=Y_scale, output_zero_point=Y_zero_point)
+
+            Y_hat = F.layer_norm(
+                dqX, dqX.size()[1:], weight=weight, bias=bias, eps=epsilon)
+            qY_hat = torch.quantize_per_tensor(
+                Y_hat, scale=Y_scale, zero_point=Y_zero_point, dtype=torch_type)
+
+            self.assertEqual(
+                qY,
+                qY_hat,
+                message="LayerNorm failed:\n {} input vs\n {} actual vs \n{} expected".format(X, qY, qY_hat))
+
+
     """Tests the correctness of the quantized::qnnpack_tanh op."""
     @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5),
                        qparams=hu.qparams()))

third_party/protobuf

Submodule protobuf updated 1548 files
