
Commit 7fcaf3b

tonybeltramelli authored and ezyang committed
Update torch.nn.init and torch.nn.utils.clip_grad (#6173)
Introducing two updates.

1. Add a parameter for the non-linearity to the He initialization scheme in torch.nn.init.
   Problem solved: the function calculate_gain can take an argument to specify the type of non-linearity used, but it was not possible to pass this argument directly to the He / Kaiming weight initialization functions.

2. Add a utility to clip gradients by value in torch.nn.utils.clip_grad.
   Problem solved: deep learning libraries typically give users easy access to functions for clipping gradients both by norm and by a fixed value, but clip_grad.py only had a function to clip the gradient norm.

* add param to He initialization scheme in torch.nn.init
* add util to clip gradient value in torch/nn/utils/clip_grad.py
* update doc in torch.nn.utils.clip_grad
* update and add test for torch.nn.utils.clip_grad
* update function signature in torch.nn.utils.clip_grad to match suffix_ convention
* ensure backward compatibility in torch.nn.utils.clip_grad
* remove DeprecationWarning in torch.nn.utils.clip_grad
* extend test and implementation of torch.nn.utils.clip_grad
* update test and implementation of torch.nn.utils.clip_grad
1 parent 1e34493 commit 7fcaf3b
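
Taken together, the diffs below add two user-facing capabilities. A minimal usage sketch, assuming only the APIs introduced by this commit (the model, input shapes, and clipping threshold here are illustrative, not part of the change):

import torch
import torch.nn as nn
from torch.nn import init
from torch.nn.utils import clip_grad_value_

# Illustrative two-layer model; any module with parameters behaves the same.
model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 1))

# 1. He/Kaiming init now accepts the non-linearity that follows the layer,
#    instead of always assuming 'leaky_relu'.
init.kaiming_normal_(model[0].weight, mode='fan_in', nonlinearity='relu')

# 2. Gradients can now be clipped to a fixed value, not only by norm.
model(torch.randn(4, 10)).sum().backward()
clip_grad_value_(model.parameters(), clip_value=0.5)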

File tree

5 files changed: +67 -14 lines changed


docs/source/nn.rst

Lines changed: 7 additions & 2 deletions
@@ -700,10 +700,15 @@ DataParallel layers (multi-GPU, distributed)
 Utilities
 ---------
 
-:hidden:`clip_grad_norm`
+:hidden:`clip_grad_norm_`
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: torch.nn.utils.clip_grad_norm
+.. autofunction:: torch.nn.utils.clip_grad_norm_
+
+:hidden:`clip_grad_value_`
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: torch.nn.utils.clip_grad_value_
 
 :hidden:`weight_norm`
 ~~~~~~~~~~~~~~~~~~~~~

test/test_nn.py

Lines changed: 17 additions & 3 deletions
@@ -20,7 +20,7 @@
 import torch.nn.init as init
 import torch.nn.utils.rnn as rnn_utils
 import torch.legacy.nn as legacy
-from torch.nn.utils import clip_grad_norm
+from torch.nn.utils import clip_grad_norm_, clip_grad_value_
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
 from torch.autograd import Variable, gradcheck
 from torch.autograd.gradcheck import gradgradcheck
@@ -1238,7 +1238,7 @@ def compare_scaling(grads):
             for p, g in zip(l.parameters(), grads):
                 p._grad = Variable(g.clone().view_as(p.data))
             norm_before = compute_norm(norm_type)
-            norm = clip_grad_norm(l.parameters(), max_norm, norm_type=norm_type)
+            norm = clip_grad_norm_(l.parameters(), max_norm, norm_type=norm_type)
             norm_after = compute_norm(norm_type)
             self.assertEqual(norm, norm_before)
             self.assertEqual(norm_after, max_norm)
@@ -1251,14 +1251,28 @@ def compare_scaling(grads):
             for p, g in zip(l.parameters(), grads):
                 p.grad.data.copy_(g)
             norm_before = compute_norm(norm_type)
-            norm = clip_grad_norm(l.parameters(), max_norm, norm_type=norm_type)
+            norm = clip_grad_norm_(l.parameters(), max_norm, norm_type=norm_type)
             norm_after = compute_norm(norm_type)
             self.assertEqual(norm, norm_before)
             self.assertEqual(norm_before, norm_after)
             self.assertLessEqual(norm_after, max_norm)
             scale = compare_scaling(grads)
             self.assertEqual(scale, 1)
 
+    def test_clip_grad_value(self):
+        l = nn.Linear(10, 10)
+        clip_value = 2.5
+
+        grad_w, grad_b = torch.arange(-50, 50).view(10, 10).div(5), torch.ones(10).mul(2)
+        for grad_list in [[grad_w, grad_b], [grad_w, None]]:
+            for p, g in zip(l.parameters(), grad_list):
+                p._grad = Variable(g.clone().view_as(p.data)) if g is not None else g
+
+            clip_grad_value_(l.parameters(), clip_value)
+            for p in filter(lambda p: p.grad is not None, l.parameters()):
+                self.assertLessEqual(p.grad.data.max(), clip_value)
+                self.assertGreaterEqual(p.grad.data.min(), -clip_value)
+
     def test_parameters_to_vector(self):
         conv1 = nn.Conv2d(3, 10, 5)
         fc1 = nn.Linear(10, 20)
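
The [grad_w, None] case in the new test exercises parameters whose .grad was never populated. A standalone sketch of that behavior outside the test harness (the Linear shape and clip value are arbitrary; setting .grad directly is only for illustration):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_value_

lin = nn.Linear(10, 10)

# Populate only the weight gradient; leave lin.bias.grad as None.
lin.weight.grad = torch.arange(-50., 50.).view(10, 10).div(5)

# Parameters without gradients are skipped rather than raising an error.
clip_grad_value_(lin.parameters(), clip_value=2.5)
assert lin.weight.grad.abs().max() <= 2.5
assert lin.bias.grad is None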

torch/nn/init.py

Lines changed: 10 additions & 6 deletions
@@ -230,7 +230,7 @@ def _calculate_correct_fan(tensor, mode):
     return fan_in if mode == 'fan_in' else fan_out
 
 
-def kaiming_uniform_(tensor, a=0, mode='fan_in'):
+def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
     r"""Fills the input `Tensor` with values according to the method
     described in "Delving deep into rectifiers: Surpassing human-level
     performance on ImageNet classification" - He, K. et al. (2015), using a
@@ -250,20 +250,22 @@ def kaiming_uniform_(tensor, a=0, mode='fan_in'):
             preserves the magnitude of the variance of the weights in the
             forward pass. Choosing `fan_out` preserves the magnitudes in the
             backwards pass.
+        nonlinearity: the non-linear function (`nn.functional` name),
+            recommended to use only with 'relu' or 'leaky_relu' (default).
 
     Examples:
         >>> w = torch.Tensor(3, 5)
-        >>> nn.init.kaiming_uniform_(w, mode='fan_in')
+        >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')
     """
     fan = _calculate_correct_fan(tensor, mode)
-    gain = calculate_gain('leaky_relu', a)
+    gain = calculate_gain(nonlinearity, a)
     std = gain / math.sqrt(fan)
     bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
     with torch.no_grad():
         return tensor.uniform_(-bound, bound)
 
 
-def kaiming_normal_(tensor, a=0, mode='fan_in'):
+def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
     r"""Fills the input `Tensor` with values according to the method
     described in "Delving deep into rectifiers: Surpassing human-level
     performance on ImageNet classification" - He, K. et al. (2015), using a
@@ -283,13 +285,15 @@ def kaiming_normal_(tensor, a=0, mode='fan_in'):
             preserves the magnitude of the variance of the weights in the
             forward pass. Choosing `fan_out` preserves the magnitudes in the
             backwards pass.
+        nonlinearity: the non-linear function (`nn.functional` name),
+            recommended to use only with 'relu' or 'leaky_relu' (default).
 
     Examples:
         >>> w = torch.Tensor(3, 5)
-        >>> nn.init.kaiming_normal_(w, mode='fan_out')
+        >>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
     """
     fan = _calculate_correct_fan(tensor, mode)
-    gain = calculate_gain('leaky_relu', a)
+    gain = calculate_gain(nonlinearity, a)
     std = gain / math.sqrt(fan)
     with torch.no_grad():
         return tensor.normal_(0, std)
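
Since the gain now depends on the nonlinearity argument, the resulting std / uniform bound changes accordingly. A quick numeric sketch using the formulas visible in the diff above (fan_in = 5 for a 3x5 weight; printed values are rounded):

import math
from torch.nn import init

# calculate_gain('relu') = sqrt(2); calculate_gain('leaky_relu', a) = sqrt(2 / (1 + a**2))
print(init.calculate_gain('relu'))             # ~1.4142
print(init.calculate_gain('leaky_relu', 0.2))  # ~1.3868

# kaiming_uniform_ then samples from U(-bound, bound) with
# bound = sqrt(3) * gain / sqrt(fan), here fan = fan_in = 5.
bound = math.sqrt(3.0) * init.calculate_gain('relu') / math.sqrt(5)
print(round(bound, 4))                         # 1.0954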

torch/nn/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from . import rnn
-from .clip_grad import clip_grad_norm
+from .clip_grad import clip_grad_norm, clip_grad_norm_, clip_grad_value_
 from .weight_norm import weight_norm, remove_weight_norm
 from .convert_parameters import parameters_to_vector, vector_to_parameters

torch/nn/utils/clip_grad.py

Lines changed: 32 additions & 2 deletions
@@ -1,12 +1,14 @@
+import warnings
 
-def clip_grad_norm(parameters, max_norm, norm_type=2):
+
+def clip_grad_norm_(parameters, max_norm, norm_type=2):
     r"""Clips gradient norm of an iterable of parameters.
 
     The norm is computed over all gradients together, as if they were
     concatenated into a single vector. Gradients are modified in-place.
 
     Arguments:
-        parameters (Iterable[Variable]): an iterable of Variables that will have
+        parameters (Iterable[Tensor]): an iterable of Tensors that will have
             gradients normalized
         max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
@@ -31,3 +33,31 @@ def clip_grad_norm(parameters, max_norm, norm_type=2):
         for p in parameters:
             p.grad.data.mul_(clip_coef)
     return total_norm
+
+
+def clip_grad_norm(parameters, max_norm, norm_type=2):
+    r"""Clips gradient norm of an iterable of parameters.
+
+    .. warning::
+        This method is now deprecated in favor of
+        :func:`torch.nn.utils.clip_grad_norm_`.
+    """
+    warnings.warn("torch.nn.utils.clip_grad_norm is now deprecated in favor "
+                  "of torch.nn.utils.clip_grad_norm_.", stacklevel=2)
+    return clip_grad_norm_(parameters, max_norm, norm_type)
+
+
+def clip_grad_value_(parameters, clip_value):
+    r"""Clips gradient of an iterable of parameters at specified value.
+
+    Gradients are modified in-place.
+
+    Arguments:
+        parameters (Iterable[Tensor]): an iterable of Tensors that will have
+            gradients normalized
+        clip_value (float or int): maximum allowed value of the gradients
+            The gradients are clipped in the range [-clip_value, clip_value]
+    """
+    clip_value = float(clip_value)
+    for p in filter(lambda p: p.grad is not None, parameters):
+        p.grad.data.clamp_(min=-clip_value, max=clip_value)
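
A short usage sketch of the reworked module, assuming only what the diff above defines (the module, loss, and thresholds are illustrative); note that calling the old clip_grad_norm name still works but emits a warning and forwards to clip_grad_norm_:

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

# Illustrative module and loss; only the clipping calls matter here.
lin = nn.Linear(10, 10)
lin(torch.randn(8, 10)).pow(2).sum().backward()

# Norm-based clipping (renamed with a trailing underscore to mark it as in-place).
total_norm = clip_grad_norm_(lin.parameters(), max_norm=2, norm_type=2)

# Value-based clipping: every gradient entry ends up in [-0.25, 0.25].
clip_grad_value_(lin.parameters(), clip_value=0.25)
assert all(p.grad.data.abs().max() <= 0.25 for p in lin.parameters())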
