
Commit c9da011

xiaomengy authored and facebook-github-bot committed
Optimize pytorch layer_norm forward (#20345)
Summary: Pull Request resolved: #20345

Separate from D15194600. Optimize the PyTorch layer_norm op, part 1: optimize layer_norm_forward_cpu and use Eigen Maps to improve the performance of the reductions.

Reviewed By: zheng-xq

Differential Revision: D15290608

fbshipit-source-id: cf2c208dfd6fbcbc4c69db3ed60278d9bee156b5
1 parent 9cec8ae commit c9da011


4 files changed: +227 −58 lines changed
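For orientation, the operator touched by this commit keeps the signature layer_norm(input, normalized_shape, weight, bias, eps, cudnn_enabled) (see the new implementation in the last file below). A minimal, hypothetical call from C++ might look like the following; the shapes are made up for illustration, and it assumes the native function is exposed as at::layer_norm, as is usual for ATen native ops:

#include <ATen/ATen.h>

int main() {
  // Hypothetical shapes, for illustration only.
  at::Tensor input = at::randn({20, 5, 10});
  at::Tensor weight = at::ones({10});
  at::Tensor bias = at::zeros({10});
  // layer_norm(input, normalized_shape, weight, bias, eps, cudnn_enabled)
  at::Tensor out =
      at::layer_norm(input, {10}, weight, bias, 1e-5, /*cudnn_enabled=*/false);
  return 0;
}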

aten/src/ATen/native/Normalization.cpp

Lines changed: 0 additions & 58 deletions
@@ -462,64 +462,6 @@ Tensor instance_norm(
   return out.view(input.sizes());
 }
 
-Tensor layer_norm(const Tensor& input, IntArrayRef normalized_shape,
-    const Tensor& weight /* optional */, const Tensor& bias /* optional */,
-    double eps, bool cudnn_enabled) {
-
-  int64_t normalized_ndim = normalized_shape.size();
-
-  TORCH_CHECK(normalized_ndim >= 1,
-      "Expected normalized_shape to be at least 1-dimensional, i.e., ",
-      "containing at least one element, but got normalized_shape=",
-      normalized_shape);
-
-  TORCH_CHECK(!weight.defined() || weight.sizes().equals(normalized_shape),
-      "Expected weight to be of same shape as normalized_shape, but got ",
-      "weight of shape ", weight.sizes(), " and normalized_shape=",
-      normalized_shape);
-  TORCH_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape),
-      "Expected bias to be of same shape as normalized_shape, but got ",
-      "bias of shape ", bias.sizes(), " and normalized_shape=",
-      normalized_shape);
-
-  auto input_shape = input.sizes();
-  auto input_ndim = input.dim();
-
-  if (input_ndim < normalized_ndim ||
-      !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
-    std::stringstream ss;
-    ss << "Given normalized_shape=" << normalized_shape
-       << ", expected input with shape [*";
-    for (auto size : normalized_shape) {
-      ss << ", " << size;
-    }
-    ss << "], but got input of size" << input_shape;
-    AT_ERROR(ss.str());
-  }
-
-  int64_t n = 1;
-  for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) {
-    n *= input_shape[i];
-  }
-
-  // Apply layer norm
-  auto input_reshaped = input.contiguous().view({1, n, -1});
-
-  auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps,
-      cudnn_enabled);
-  out = out.view(input_shape);
-
-  if (weight.defined() && bias.defined()) {
-    return bias.addcmul(out, weight, 1);
-  } else if (weight.defined()) {
-    return out.mul(weight);
-  } else if (bias.defined()) {
-    return out.add(bias);
-  } else {
-    return out;
-  }
-}
-
 Tensor group_norm(const Tensor& input, int64_t num_groups,
     const Tensor& weight /* optional */, const Tensor& bias /* optional */,
     double eps, bool cudnn_enabled) {
Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+#include <ATen/native/cpu/layer_norm_kernel.h>
+
+#include <ATen/ATen.h>
+#include <ATen/CPUApplyUtils.h>
+#include <ATen/Dispatch.h>
+
+namespace at {
+namespace native {
+
+namespace {
+
+template <typename T>
+void LayerNormKernelImplInternal(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    T eps,
+    Tensor* Y,
+    Tensor* mean,
+    Tensor* rstd) {
+  DCHECK_EQ(X.numel(), M * N);
+  DCHECK(!gamma.defined() || gamma.numel() == N);
+  DCHECK(!beta.defined() || beta.numel() == N);
+  const T* X_data = X.data<T>();
+  const T* gamma_data = gamma.defined() ? gamma.data<T>() : nullptr;
+  const T* beta_data = beta.defined() ? beta.data<T>() : nullptr;
+  T* Y_data = Y->data<T>();
+  T* mean_data = mean->data<T>();
+  T* rstd_data = rstd->data<T>();
+  const T c = T(1) / static_cast<T>(N);
+  const bool gamma_null = gamma_data == nullptr;
+  const bool beta_null = beta_data == nullptr;
+  for (int64_t i = 0; i < M; ++i) {
+    const T* X_ptr = X_data + i * N;
+    T* Y_ptr = Y_data + i * N;
+    T mean_val = T(0);
+    T rstd_val = T(0);
+    for (int64_t j = 0; j < N; ++j) {
+      mean_val += X_ptr[j];
+      rstd_val += X_ptr[j] * X_ptr[j];
+    }
+    mean_val *= c;
+    rstd_val = T(1) / std::sqrt(rstd_val * c - mean_val * mean_val + eps);
+    const T scale = rstd_val;
+    const T bias = -rstd_val * mean_val;
+    for (int64_t j = 0; j < N; ++j) {
+      const T gamma_v = gamma_null ? T(1) : gamma_data[j];
+      const T beta_v = beta_null ? T(0) : beta_data[j];
+      Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v;
+    }
+    mean_data[i] = mean_val;
+    rstd_data[i] = rstd_val;
+  }
+}
+
+void LayerNormKernelImpl(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    double eps,
+    Tensor* Y,
+    Tensor* mean,
+    Tensor* rstd) {
+  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "LayerNormKernelImpl", [&]() {
+    LayerNormKernelImplInternal<scalar_t>(
+        X, gamma, beta, M, N, static_cast<scalar_t>(eps), Y, mean, rstd);
+  });
+}
+
+} // namespace
+
+REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl);
+
+} // namespace native
+} // namespace at
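The commit summary mentions Eigen Maps for the reduction, but the kernel added above computes the per-row mean and inverse standard deviation with plain loops, using Var(x) = E[x^2] - E[x]^2 and folding the normalization into a per-row scale and bias. As a rough, hypothetical sketch only (not part of this diff, and assuming Eigen is available on the include path), the per-row reduction expressed with Eigen::Map might look like this:

// Hypothetical illustration; not part of this commit. Assumes Eigen is available.
#include <cmath>
#include <cstdint>
#include <Eigen/Core>

template <typename T>
void RowLayerNormSketch(const T* X_ptr, int64_t N, T eps, T* Y_ptr) {
  using Arr = Eigen::Array<T, Eigen::Dynamic, 1>;
  // Wrap the existing row buffers without copying.
  Eigen::Map<const Arr> x(X_ptr, N);
  Eigen::Map<Arr> y(Y_ptr, N);
  // Vectorized reductions: Var(x) = E[x^2] - E[x]^2.
  const T mean = x.mean();
  const T var = x.square().mean() - mean * mean;
  const T rstd = T(1) / std::sqrt(var + eps);
  // Same folding as the kernel above: (x - mean) * rstd == x * rstd - rstd * mean.
  y = x * rstd - rstd * mean;
}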
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+#ifndef ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_
+#define ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_
+
+#include <ATen/ATen.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+namespace native {
+
+using forward_fn = void (*)(
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */,
+    Tensor* /* mean */,
+    Tensor* /* rstd */);
+
+DECLARE_DISPATCH(forward_fn, LayerNormKernel);
+
+} // namespace native
+} // namespace at
+
+#endif // ATEN_SRC_NATIVE_CPU_LAYER_NORM_KERNEL_H_
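Taken together, the new files follow ATen's DispatchStub pattern: this header declares the stub with the forward_fn signature, the CPU kernel file registers an implementation for it, and the operator file defines the stub and invokes it with an explicit device type. A condensed view of how the three pieces in this commit fit together (names as they appear in the diff; not a standalone program):

// Header: declare the stub with the forward_fn signature.
DECLARE_DISPATCH(forward_fn, LayerNormKernel);

// CPU kernel translation unit: register the CPU implementation.
REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl);

// Operator translation unit: define the stub once, then call it,
// selecting the implementation by device type.
DEFINE_DISPATCH(LayerNormKernel);
LayerNormKernel(kCPU, X, gamma, beta, M, N, eps, &Y, &mean, &rstd);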
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+#include <ATen/NativeFunctions.h>
+
+#include <functional>
+#include <numeric>
+#include <tuple>
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/CPUApplyUtils.h>
+#include <ATen/Config.h>
+#include <ATen/Parallel.h>
+#include <ATen/native/cpu/layer_norm_kernel.h>
+
+namespace at {
+namespace native {
+
+namespace {
+
+std::tuple<Tensor, Tensor, Tensor> layer_norm_forward_cpu(
+    const Tensor& X,
+    const Tensor& gamma /* optional */,
+    const Tensor& beta /* optional */,
+    int64_t M,
+    int64_t N,
+    double eps) {
+  Tensor Y = at::native::empty_like(X);
+  Tensor mean = at::empty({M}, X.options());
+  Tensor rstd = at::empty({M}, X.options());
+  LayerNormKernel(kCPU, X, gamma, beta, M, N, eps, &Y, &mean, &rstd);
+  return std::make_tuple(Y, mean, rstd);
+}
+
+} // namespace
+
+Tensor layer_norm(
+    const Tensor& input,
+    IntArrayRef normalized_shape,
+    const Tensor& weight /* optional */,
+    const Tensor& bias /* optional */,
+    double eps,
+    bool cudnn_enabled) {
+  const int normalized_ndim = normalized_shape.size();
+  TORCH_CHECK(
+      normalized_ndim >= 1,
+      "Expected normalized_shape to be at least 1-dimensional, i.e., ",
+      "containing at least one element, but got normalized_shape = ",
+      normalized_shape);
+  TORCH_CHECK(
+      !weight.defined() || weight.sizes().equals(normalized_shape),
+      "Expected weight to be of same shape as normalized_shape, but got ",
+      "weight of shape ",
+      weight.sizes(),
+      " and normalized_shape = ",
+      normalized_shape);
+  TORCH_CHECK(
+      !bias.defined() || bias.sizes().equals(normalized_shape),
+      "Expected bias to be of same shape as normalized_shape, but got ",
+      "bias of shape ",
+      bias.sizes(),
+      " and normalized_shape = ",
+      normalized_shape);
+
+  const auto input_shape = input.sizes();
+  const auto input_ndim = input.dim();
+
+  if (input_ndim < normalized_ndim ||
+      !input_shape.slice(input_ndim - normalized_ndim)
+           .equals(normalized_shape)) {
+    std::stringstream ss;
+    ss << "Given normalized_shape=" << normalized_shape
+       << ", expected input with shape [*";
+    for (auto size : normalized_shape) {
+      ss << ", " << size;
+    }
+    ss << "], but got input of size" << input_shape;
+    AT_ERROR(ss.str());
+  }
+
+  const int axis = input_ndim - normalized_ndim;
+  const int64_t M = std::accumulate(
+      input_shape.cbegin(),
+      input_shape.cbegin() + axis,
+      1LL,
+      std::multiplies<int64_t>());
+  const int64_t N = std::accumulate(
+      input_shape.cbegin() + axis,
+      input_shape.cend(),
+      1LL,
+      std::multiplies<int64_t>());
+
+  // TODO(yangxm): Remove this check after backward pass landed.
+  const auto is_forward = [](const Tensor& tensor) {
+    return tensor.is_variable() && !tensor.requires_grad();
+  };
+  if (input.device().is_cpu() && is_forward(input) && is_forward(weight) &&
+      is_forward(bias)) {
+    return std::get<0>(layer_norm_forward_cpu(
+        input.contiguous(), weight.contiguous(), bias.contiguous(), M, N, eps));
+  }
+
+  // Apply layer norm
+  auto input_reshaped = input.contiguous().view({1, M, -1});
+  auto out = at::batch_norm(
+      input_reshaped, {}, {}, {}, {}, true, 0, eps, cudnn_enabled);
+  out = out.view(input_shape);
+
+  if (weight.defined() && bias.defined()) {
+    return bias.addcmul(out, weight, 1);
+  } else if (weight.defined()) {
+    return out.mul(weight);
+  } else if (bias.defined()) {
+    return out.add(bias);
+  } else {
+    return out;
+  }
+}
+
+DEFINE_DISPATCH(LayerNormKernel);
+
+} // namespace native
+} // namespace at
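As a worked example of the M/N split in layer_norm above (the shape is hypothetical): for an input of shape [20, 5, 10] with normalized_shape = [10], axis = 3 - 1 = 2, so M = 20 * 5 = 100 rows are each normalized over N = 10 elements. The same arithmetic with std::accumulate, as used in the diff:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical shape: input [20, 5, 10], normalized_shape [10] -> axis = 2.
  const std::vector<int64_t> input_shape = {20, 5, 10};
  const int axis = 2;
  const int64_t M = std::accumulate(
      input_shape.cbegin(), input_shape.cbegin() + axis, 1LL,
      std::multiplies<int64_t>());
  const int64_t N = std::accumulate(
      input_shape.cbegin() + axis, input_shape.cend(), 1LL,
      std::multiplies<int64_t>());
  assert(M == 100 && N == 10);  // 20 * 5 rows, each of length 10.
  return 0;
}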
