Commit 75a4862

Added SiLU activation function (#41034)
Summary: Implemented the SiLU activation function as discussed in #3169.

Pull Request resolved: #41034
Reviewed By: glaringlee
Differential Revision: D22465203
Pulled By: heitorschueroff
fbshipit-source-id: b27d064529fc99600c586ad49b594b52b718b0d2
1 parent: f6eb92a · commit: 75a4862

File tree

11 files changed: +124 -2 lines changed

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 1 addition & 0 deletions
@@ -607,6 +607,7 @@ _(aten, selu) \
 _(aten, set) \
 _(aten, sigmoid) \
 _(aten, sign) \
+_(aten, silu) \
 _(aten, sin) \
 _(aten, sinh) \
 _(aten, size) \

aten/src/ATen/native/Activation.cpp

Lines changed: 16 additions & 0 deletions
@@ -190,6 +190,22 @@ Tensor & celu_(Tensor & self, Scalar alpha) {
   return at::elu_(self, alpha, Scalar(1.0), Scalar(inv_alpha));
 }
 
+Tensor silu(const Tensor& self) {
+  return self * at::sigmoid(self);
+}
+
+Tensor& silu_(Tensor& self) {
+  return self.mul_(at::sigmoid(self));
+}
+
+Tensor& silu_out(Tensor& result, const Tensor& self) {
+  return at::mul_out(result, self, at::sigmoid(self));
+}
+
+Tensor silu_backward(const Tensor& grad, const Tensor& self) {
+  auto self_sigmoid = at::sigmoid(self);
+  return grad * (self_sigmoid * (1 + self * (1 - self_sigmoid)));
+}
 
 template <typename scalar_t>
 inline void _rrelu_with_noise_train(
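
The backward formula follows from the product rule: d/dx [x * sigmoid(x)] = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x)) = sigmoid(x) * (1 + x * (1 - sigmoid(x))), which is the factored form used in silu_backward above. A minimal Python sketch (variable names illustrative, assuming a build that includes this commit) checking the analytic gradient against autograd:

import torch

x = torch.randn(8, dtype=torch.double, requires_grad=True)
grad_out = torch.ones(8, dtype=torch.double)

# Gradient computed by autograd through the new op.
(auto_grad,) = torch.autograd.grad(torch.nn.functional.silu(x), x, grad_out)

# Analytic gradient in the same factored form as silu_backward.
xd = x.detach()
s = torch.sigmoid(xd)
manual_grad = grad_out * (s * (1 + xd * (1 - s)))

print(torch.allclose(auto_grad, manual_grad))  # expected: True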

aten/src/ATen/native/native_functions.yaml

Lines changed: 14 additions & 0 deletions
@@ -2428,6 +2428,20 @@
 
 - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
 
+- func: silu(Tensor self) -> Tensor
+  use_c10_dispatcher: full
+  python_module: nn
+
+- func: silu_(Tensor(a!) self) -> Tensor(a!)
+  python_module: nn
+
+- func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  python_module: nn
+
+- func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
+  use_c10_dispatcher: full
+  python_module: nn
+
 - func: sigmoid(Tensor self) -> Tensor
   use_c10_dispatcher: full
   variants: function, method
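
Because each schema carries python_module: nn, the generated Python bindings land under torch._C._nn, which is what the functional wrapper added below calls into. A small sketch (assuming a build with these schemas registered) exercising the binding directly against the definition x * sigmoid(x):

import torch

x = torch.randn(4)

# Binding generated from the silu schema above; torch.nn.functional.silu
# dispatches to this same entry point.
y = torch._C._nn.silu(x)

print(torch.allclose(y, x * torch.sigmoid(x)))  # expected: True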

docs/source/nn.functional.rst

Lines changed: 5 additions & 0 deletions
@@ -278,6 +278,11 @@ Non-linear activation functions
 
 .. autofunction:: hardsigmoid
 
+:hidden:`silu`
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: silu
+
 
 Normalization functions
 -----------------------

docs/source/nn.rst

Lines changed: 1 addition & 0 deletions
@@ -125,6 +125,7 @@ Non-linear Activations (weighted sum, nonlinearity)
     nn.CELU
     nn.GELU
     nn.Sigmoid
+    nn.SiLU
     nn.Softplus
     nn.Softshrink
     nn.Softsign

test/test_torch.py

Lines changed: 17 additions & 0 deletions
@@ -16577,6 +16577,23 @@ def test_hardsigmoid(self, device, dtype):
                          torch.tensor(expectedOutput, dtype=dtype, device=device),
                          atol=precision_4dps, rtol=0)
 
+    @dtypes(torch.float, torch.double)
+    def test_silu(self, device, dtype):
+        inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000]
+        expectedOutput = [0.0000, -0.2689, 0, 0.3112, 0.7312, 1.7616, 1000]
+        precision_4dps = 0.0002
+
+        input_tensor = torch.tensor(inputValues, dtype=dtype, device=device)
+        expected_output_tensor = torch.tensor(expectedOutput, dtype=dtype, device=device)
+
+        self.assertEqual(torch.nn.functional.silu(input_tensor),
+                         expected_output_tensor,
+                         atol=precision_4dps, rtol=0)
+
+        self.assertEqual(torch.nn.functional.silu(input_tensor, inplace=True),
+                         expected_output_tensor,
+                         atol=precision_4dps, rtol=0)
+
     @onlyCPU
     @dtypes(torch.float)
     def test_diag_embed(self, device, dtype):
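
The expected outputs are just x * sigmoid(x) evaluated at the listed inputs: for example silu(-1) = -1 / (1 + e), roughly -0.2689, while the saturating ends give roughly 0 at -1000 and roughly 1000 at 1000. A quick sketch (illustrative, needing only a stock torch install) reproducing the reference values the test compares against:

import torch

inputs = torch.tensor([-1000., -1, 0, 0.5, 1, 2, 1000], dtype=torch.double)

# Reference definition of SiLU; agrees with expectedOutput above to within
# the test's tolerance of 2e-4.
reference = inputs * torch.sigmoid(inputs)
print(reference)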

tools/autograd/derivatives.yaml

Lines changed: 6 additions & 0 deletions
@@ -1181,6 +1181,12 @@
 - name: relu_(Tensor(a!) self) -> Tensor(a!)
   self: threshold_backward(grad, result, 0)
 
+- name: silu(Tensor self) -> Tensor
+  self: silu_backward(grad, self)
+
+- name: silu_(Tensor(a!) self) -> Tensor(a!)
+  self: not_implemented("silu_ cannot compute gradient of inplace version, use silu instead")
+
 - name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
   self: elu_backward(grad, alpha, scale, input_scale, result)
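
The first entry routes autograd through silu_backward, while the in-place variant is deliberately marked not_implemented, so backpropagating through silu_ is expected to fail loudly rather than produce wrong gradients. A brief sketch of the intended behavior (assuming a build with this commit; the exact exception type and message may differ):

import torch

x = torch.randn(3, requires_grad=True)

# Out-of-place silu: differentiable via silu_backward.
torch.nn.functional.silu(x).sum().backward()
print(x.grad is not None)  # expected: True

# In-place silu_: its derivative is declared not_implemented above, so the
# backward pass through it should raise instead of silently succeeding.
y = x.clone()  # non-leaf tensor, safe target for an in-place op
try:
    torch.nn.functional.silu(y, inplace=True).sum().backward()
except (RuntimeError, NotImplementedError) as err:
    print("backward through silu_ raised:", err)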

torch/_overrides.py

Lines changed: 1 addition & 0 deletions
@@ -528,6 +528,7 @@ def get_testing_overrides():
         torch.nn.functional.relu6: lambda input, inplace=False: -1,
         torch.nn.functional.rrelu: lambda input, lower=0.125, upper=0.3333333333333333, training=False, inplace=False: -1,
         torch.nn.functional.selu: lambda input, inplace=False: -1,
+        torch.nn.functional.silu: lambda input, inplace=False: -1,
         torch.nn.functional.smooth_l1_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
         torch.nn.functional.soft_margin_loss: lambda input, target, size_average=None, reduce=None, reduction='mean': -1,
         torch.nn.functional.softmax: lambda input, dim=None, _stacklevel=3, dtype=None: -1,

torch/nn/functional.py

Lines changed: 23 additions & 0 deletions
@@ -1700,6 +1700,29 @@ def bilinear(input1, input2, weight, bias=None):
     """
     return torch.bilinear(input1, input2, weight, bias)
 
+def silu(input, inplace=False):
+    # type: (Tensor, bool) -> Tensor
+    r"""Applies the silu function, element-wise.
+
+    .. math::
+        \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.}
+
+    .. note::
+        See `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_
+        where the SiLU (Sigmoid Linear Unit) was originally coined, and see
+        `Sigmoid-Weighted Linear Units for Neural Network Function Approximation
+        in Reinforcement Learning <https://arxiv.org/abs/1702.03118>`_ and `Swish:
+        a Self-Gated Activation Function <https://arxiv.org/abs/1710.05941v1>`_
+        where the SiLU was experimented with later.
+
+    See :class:`~torch.nn.SiLU` for more details.
+    """
+    if not torch.jit.is_scripting():
+        if type(input) is not Tensor and has_torch_function((input,)):
+            return handle_torch_function(silu, (input,), input, inplace=inplace)
+    if inplace:
+        return torch._C._nn.silu_(input)
+    return torch._C._nn.silu(input)
 
 def hardswish(input, inplace=False):
     # type: (Tensor, bool) -> Tensor
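
Typical usage of the new functional interface, as a small sketch (tensor shapes illustrative, assuming a build that includes this commit):

import torch
import torch.nn.functional as F

x = torch.randn(2, 3)

y = F.silu(x)              # out-of-place, equivalent to x * torch.sigmoid(x)
F.silu(x, inplace=True)    # in-place, overwrites x with silu(x)

print(torch.allclose(y, x))  # expected: True, x now holds the same values as y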

torch/nn/modules/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \
     Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, GELU, Hardshrink, LeakyReLU, LogSigmoid, \
     Softplus, Softshrink, MultiheadAttention, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU, \
-    Hardsigmoid, Hardswish
+    Hardsigmoid, Hardswish, SiLU
 from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \
     CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \
     MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \
@@ -54,5 +54,5 @@
     'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold',
     'AdaptiveLogSoftmaxWithLoss', 'TransformerEncoder', 'TransformerDecoder',
     'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Transformer',
-    'Flatten', 'Hardsigmoid', 'Hardswish',
+    'Flatten', 'Hardsigmoid', 'Hardswish', 'SiLU',
 ]
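
With SiLU re-exported from torch.nn.modules, the module form can be dropped into a model like any other activation layer. A brief sketch (layer sizes illustrative, assuming a build that includes this commit):

import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Linear(16, 32),
    nn.SiLU(),          # module wrapper around the new functional silu
    nn.Linear(32, 4),
)

x = torch.randn(8, 16)
print(model(x).shape)  # expected: torch.Size([8, 4])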
