Commit baeb0b8

xiaomengy authored and facebook-github-bot committed
Add gelu activation in pytorch (#20665)
Summary: Pull Request resolved: #20665

Add gelu activation forward on CPU in PyTorch. Compared to the current Python implementation of gelu used in the BERT model, e.g.

    def gelu(self, x):
        return x * 0.5 * (1.0 + torch.erf(x / self.sqrt_two))

torch.nn.functional.gelu reduces the forward time from 333ms to 109ms (with MKL) / 112ms (without MKL) for input size [64, 128, 56, 56] on a devvm.

Reviewed By: zheng-xq

Differential Revision: D15400974

fbshipit-source-id: 78399123aef803376a2459d487d44557126070ac
Parent: aac424a · Commit: baeb0b8
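The comparison in the summary can be sanity-checked with a small timing script. This is a minimal sketch, not part of the PR: the gelu_python helper, the warm-up call, and the 10-iteration timing loop are illustrative choices, F.gelu assumes a PyTorch build that already includes this change, and absolute numbers depend on the machine (the quoted figures came from a devvm).

    import math
    import time

    import torch
    import torch.nn.functional as F

    def gelu_python(x):
        # erf-based formulation quoted in the commit summary
        return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

    x = torch.rand(64, 128, 56, 56)

    for name, fn in [("python erf gelu", gelu_python), ("F.gelu", F.gelu)]:
        fn(x)  # warm up
        start = time.time()
        for _ in range(10):
            fn(x)
        print(name, "avg forward:", (time.time() - start) / 10, "s")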

File tree

10 files changed: +193, -17 lines

10 files changed

+193
-17
lines changed

aten/src/ATen/native/Activation.cpp

Lines changed: 16 additions & 0 deletions
@@ -371,4 +371,20 @@ Tensor hardshrink_backward_cpu(const Tensor & grad, const Tensor & self, Scalar
   return out_tensor;
 }
 
+
+Tensor gelu_cpu(const Tensor& self) {
+  const auto X = self.contiguous();
+  Tensor Y = at::native::empty_like(X);
+  GeluKernel(kCPU, X, &Y);
+  return Y;
+}
+
+Tensor gelu_cuda(const Tensor& self) {
+  Tensor Y = at::native::empty_like(self);
+  GeluKernel(kCUDA, self, &Y);
+  return Y;
+}
+
+DEFINE_DISPATCH(GeluKernel);
+
 }} // namespace at::native

aten/src/ATen/native/Activation.h

Lines changed: 11 additions & 5 deletions
@@ -1,15 +1,21 @@
 #pragma once
 
-#include <c10/core/Scalar.h>
+#include <ATen/ATen.h>
 #include <ATen/native/DispatchStub.h>
+#include <c10/core/Scalar.h>
+
+namespace at {
 
-namespace at { struct TensorIterator; }
+struct TensorIterator;
 
-namespace at { namespace native {
+namespace native {
 
-using threshold_fn = void(*)(TensorIterator&, Scalar, Scalar);
+using threshold_fn = void (*)(TensorIterator&, Scalar, Scalar);
+using activation_fn = void (*)(const Tensor& /* X */, Tensor* /* Y */);
 
 DECLARE_DISPATCH(threshold_fn, threshold_stub);
+DECLARE_DISPATCH(activation_fn, GeluKernel);
 
+} // namespace native
 
-}} // namespace at::native
+} // namespace at
aten/src/ATen/native/cpu/Activation.cpp

Lines changed: 84 additions & 11 deletions
@@ -1,31 +1,104 @@
+#define _USE_MATH_DEFINES
+
 #include <ATen/native/Activation.h>
 
+#include <math.h>
+
 #include <ATen/ATen.h>
+#include <ATen/Config.h>
 #include <ATen/cpu/vec256/vec256.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cpu/Loops.h>
 
-namespace at { namespace native {
+#if AT_MKL_ENABLED()
+#include <mkl.h>
+#endif // AT_MKL_ENABLED()
+
+namespace at {
+namespace native {
+
 namespace {
 
-static void threshold_kernel(TensorIterator& iter, Scalar threshold_scalar, Scalar value_scalar) {
+static void threshold_kernel(
+    TensorIterator& iter,
+    Scalar threshold_scalar,
+    Scalar value_scalar) {
   AT_DISPATCH_ALL_TYPES(iter.dtype(), "threshold_cpu", [&] {
     using Vec = Vec256<scalar_t>;
     scalar_t threshold = threshold_scalar.to<scalar_t>();
     scalar_t value = value_scalar.to<scalar_t>();
     binary_kernel_vec(
-      iter,
-      [&](scalar_t x, scalar_t other) -> scalar_t {
-        return x <= threshold ? value : other;
-      },
-      [&](Vec x, Vec other) -> Vec {
-        return Vec::blendv(other, Vec(value), x <= Vec(threshold));
-      });
+        iter,
+        [&](scalar_t x, scalar_t other) -> scalar_t {
+          return x <= threshold ? value : other;
+        },
+        [&](Vec x, Vec other) -> Vec {
+          return Vec::blendv(other, Vec(value), x <= Vec(threshold));
+        });
   });
 }
 
-} // anonymous namespace
+#if AT_MKL_ENABLED()
+
+// TODO(yangxm): Consider to use TensorIterator here.
+template <typename T>
+void GeluKernelMKLImpl(const Tensor& X, Tensor* Y);
+
+#define DELEGATE_GELU_KERNEL_MKL_IMPL(T, CdfNormFunc, MulFunc) \
+  template <>                                                  \
+  void GeluKernelMKLImpl<T>(const Tensor& X, Tensor* Y) {      \
+    const int64_t N = X.numel();                               \
+    const T* X_data = X.data<T>();                             \
+    T* Y_data = Y->data<T>();                                  \
+    CdfNormFunc(N, X_data, Y_data);                            \
+    MulFunc(N, X_data, Y_data, Y_data);                        \
+  }
+DELEGATE_GELU_KERNEL_MKL_IMPL(float, vsCdfNorm, vsMul)
+DELEGATE_GELU_KERNEL_MKL_IMPL(double, vdCdfNorm, vdMul)
+#undef DELEGATE_GELU_KERNEL_MKL_IMPL
+
+#else // AT_MKL_ENABLED()
+
+template <typename T>
+void GeluKernelMKLImpl(const Tensor& X, Tensor* Y) {
+  AT_ASSERTM(false, "ATen not compiled with MKL");
+}
+
+#endif // AT_MKL_ENABLED()
+
+template <typename T>
+void GeluKernelImplInternal(const Tensor& X, Tensor* Y) {
+  const int64_t N = X.numel();
+  const T* X_data = X.data<T>();
+  T* Y_data = Y->data<T>();
+  for (int64_t i = 0; i < N; ++i) {
+    Y_data[i] = X_data[i] * M_SQRT1_2;
+  }
+  Y->erf_();
+  for (int64_t i = 0; i < N; ++i) {
+    Y_data[i] = (Y_data[i] + T(1)) * X_data[i] * T(0.5);
+  }
+}
+
+// TODO(yangxm): Add another fast kernel using formula
+// y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
+// and the fast tanh impl from Eigen.
+void GeluKernelImpl(const Tensor& X, Tensor* Y) {
+  if (at::hasMKL()) {
+    AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GeluKernelImpl", [&]() {
+      GeluKernelMKLImpl<scalar_t>(X, Y);
+    });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GeluKernelImpl", [&]() {
+      GeluKernelImplInternal<scalar_t>(X, Y);
+    });
+  }
+}
+
+} // namespace
 
 REGISTER_DISPATCH(threshold_stub, &threshold_kernel);
+REGISTER_DISPATCH(GeluKernel, &GeluKernelImpl);
 
-}} // namespace at::native
+} // namespace native
+} // namespace at
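The non-MKL CPU path above computes the exact form y = 0.5 * x * (1 + erf(x / sqrt(2))), while the TODO mentions a faster tanh-based approximation. As a side note, here is a small NumPy/SciPy sketch (not part of the commit) comparing the two formulas; the sample grid and the printed max-difference check are illustrative choices.

    import numpy as np
    from scipy.special import erf

    def gelu_exact(x):
        # what GeluKernelImplInternal computes: 0.5 * x * (1 + erf(x / sqrt(2)))
        return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

    def gelu_tanh(x):
        # the approximation from the TODO comment above
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

    x = np.linspace(-6.0, 6.0, 1001)
    print(np.abs(gelu_exact(x) - gelu_tanh(x)).max())  # small; the two curves nearly coincide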

aten/src/ATen/native/cuda/Activation.cu

Lines changed: 21 additions & 1 deletion
@@ -1,10 +1,12 @@
+#include <ATen/native/Activation.h>
+
 #include <ATen/ATen.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/Dispatch.h>
 #include <ATen/cuda/CUDAApplyUtils.cuh>
 #include <ATen/cuda/detail/IndexUtils.cuh>
-#include <ATen/native/Activation.h>
 #include <ATen/native/cuda/Loops.cuh>
+#include <c10/cuda/CUDAMathCompat.h>
 
 
 namespace at { namespace native {
@@ -291,6 +293,24 @@ static void threshold_kernel(TensorIterator& iter, Scalar threshold, Scalar valu
   });
 }
 
+namespace {
+
+template <typename T>
+void GeluCUDAKernelImplInternal(const Tensor& X, Tensor* Y) {
+  at::cuda::CUDA_tensor_apply2<T, T>(X, *Y, [] __device__(const T& x, T& y) {
+    y = x * c10::cuda::compat::normcdf(x);
+  });
+}
+
+void GeluCUDAKernelImpl(const Tensor& X, Tensor* Y) {
+  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "GeluCUDAKernelImpl", [&]() {
+    GeluCUDAKernelImplInternal<scalar_t>(X, Y);
+  });
+}
+
+} // namespace
+
 REGISTER_DISPATCH(threshold_stub, &threshold_kernel);
+REGISTER_DISPATCH(GeluKernel, &GeluCUDAKernelImpl);
 
 }} // namespace at::native

aten/src/ATen/native/native_functions.yaml

Lines changed: 6 additions & 0 deletions
@@ -1587,6 +1587,12 @@
     CPU: prelu_backward_cpu
     CUDA: prelu_backward_cuda
 
+- func: gelu(Tensor self) -> Tensor
+  python_module: nn
+  dispatch:
+    CPU: gelu_cpu
+    CUDA: gelu_cuda
+
 - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
   variants: function, method
   dispatch:

c10/cuda/CUDAMathCompat.h

Lines changed: 7 additions & 0 deletions
@@ -84,6 +84,13 @@ __MATH_FUNCTIONS_DECL__ double tan(double x) {
   return ::tan(x);
 }
 
+__MATH_FUNCTIONS_DECL__ float normcdf(float x) {
+  return ::normcdff(x);
+}
+__MATH_FUNCTIONS_DECL__ double normcdf(double x) {
+  return ::normcdf(x);
+}
+
 } // namespace compat
 } // namespace cuda
 } // namespace c10
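The CUDA kernel computes y = x * normcdf(x) through the wrappers added above, while the CPU fallback uses erf; the two agree because Phi(x) = 0.5 * (1 + erf(x / sqrt(2))). A quick SciPy sketch of that identity (not part of the commit; np.allclose with its default tolerance is an illustrative check):

    import numpy as np
    from scipy.special import erf
    from scipy.stats import norm

    x = np.linspace(-6.0, 6.0, 1001)
    cuda_style = x * norm.cdf(x)                         # x * normcdf(x), as in GeluCUDAKernelImplInternal
    cpu_style = 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))  # erf form, as in GeluKernelImplInternal
    assert np.allclose(cuda_style, cpu_style)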

docs/source/nn.rst

Lines changed: 5 additions & 0 deletions
@@ -1046,6 +1046,11 @@ Non-linear activation functions
 
 .. autofunction:: glu
 
+:hidden:`gelu`
+~~~~~~~~~~~~~~~
+
+.. autofunction:: gelu
+
 :hidden:`logsigmoid`
 ~~~~~~~~~~~~~~~~~~~~
 
test/test_nn.py

Lines changed: 27 additions & 0 deletions
@@ -6059,6 +6059,33 @@ def test_PReLU_backward_requires_grad_false(self):
         y.mean().backward()
         self.assertEqual(x.grad, None)
 
+    @unittest.skipIf(
+        not TEST_NUMPY or not TEST_SCIPY, "Numpy or Scipy not found")
+    def test_gelu(self):
+        def _test_gelu(n, m, dtype, contiguous):
+            def _gelu_ref(X):
+                return X * stats.norm.cdf(X)
+
+            if contiguous:
+                X = torch.rand(n, m, dtype=dtype)
+            else:
+                X = torch.rand(n, m, dtype=dtype)[:, ::2]
+            res = F.gelu(X)
+            ref = _gelu_ref(X.numpy())
+            self.assertEqual(res, ref)
+
+            if TEST_CUDA:
+                res_cuda = F.gelu(X.cuda())
+                self.assertEqual(res_cuda.cpu(), ref)
+
+        for n in range(1, 10):
+            for m in range(1, 10):
+                _test_gelu(n, m, torch.float32, True)
+                _test_gelu(n, m, torch.float32, False)
+                _test_gelu(n, m, torch.float64, True)
+                _test_gelu(n, m, torch.float64, False)
+
+
     def test_bce_loss_always_nonnegative(self):
         target = torch.ones(5)
         input = torch.ones(5)

tools/autograd/derivatives.yaml

Lines changed: 3 additions & 0 deletions
@@ -1034,6 +1034,9 @@
 - name: glu(Tensor self, int64_t dim)
   self: glu_backward(grad, self, dim)
 
+- name: gelu(Tensor self)
+  self: not_implemented("gelu")
+
 - name: hardshrink(Tensor self, Scalar lambd)
   self: hardshrink_backward(grad, self, lambd)
 
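Registering the derivative as not_implemented("gelu") means autograd can record the forward op but is expected to error out when a gradient through it is actually requested. The sketch below is an assumption about that behavior at this commit; the exact exception type and message are not taken from the diff.

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, requires_grad=True)
    out = F.gelu(x).sum()
    try:
        out.backward()  # expected to fail: derivative registered as not_implemented
    except RuntimeError as err:
        print("gelu backward not available at this commit:", err)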

torch/nn/functional.py

Lines changed: 13 additions & 0 deletions
@@ -1148,6 +1148,19 @@ def rrelu(input, lower=1. / 8, upper=1. / 3, training=False, inplace=False):
 See :class:`~torch.nn.LogSigmoid` for more details.
 """)
 
+@weak_script
+def gelu(input):
+    r"""gelu(input) -> Tensor
+
+    Applies element-wise the function
+    :math:`\text{GeLU}(x) = x * \Phi(x)`
+
+    where `\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution.
+
+    See :`Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`.
+    """
+    return torch._C._nn.gelu(input)
+
 
 @weak_script
 def hardshrink(input, lambd=0.5):
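A short usage sketch for the new functional (not part of the commit), checking the result against the x * Phi(x) definition from the docstring via SciPy, in the same spirit as the new test above:

    import torch
    import torch.nn.functional as F
    from scipy.stats import norm

    x = torch.randn(3, 5, dtype=torch.float64)
    y = F.gelu(x)
    ref = torch.from_numpy(x.numpy() * norm.cdf(x.numpy()))
    print(torch.allclose(y, ref))  # True up to floating-point tolerance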
