Merged
9 changes: 7 additions & 2 deletions docs/source/nn.rst
@@ -676,10 +676,15 @@ DataParallel layers (multi-GPU, distributed)
Utilities
---------

:hidden:`clip_grad_norm`
:hidden:`clip_grad_norm_`
~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: torch.nn.utils.clip_grad_norm
.. autofunction:: torch.nn.utils.clip_grad_norm_

:hidden:`clip_grad_value_`
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: torch.nn.utils.clip_grad_value_

:hidden:`weight_norm`
~~~~~~~~~~~~~~~~~~~~~
20 changes: 17 additions & 3 deletions test/test_nn.py
@@ -20,7 +20,7 @@
import torch.nn.init as init
import torch.nn.utils.rnn as rnn_utils
import torch.legacy.nn as legacy
from torch.nn.utils import clip_grad_norm
from torch.nn.utils import clip_grad_norm_, clip_grad_value_
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from torch.autograd import Variable, gradcheck
from torch.autograd.gradcheck import gradgradcheck
@@ -1167,7 +1167,7 @@ def compare_scaling(grads):
for p, g in zip(l.parameters(), grads):
p._grad = Variable(g.clone().view_as(p.data))
norm_before = compute_norm(norm_type)
norm = clip_grad_norm(l.parameters(), max_norm, norm_type=norm_type)
norm = clip_grad_norm_(l.parameters(), max_norm, norm_type=norm_type)
norm_after = compute_norm(norm_type)
self.assertEqual(norm, norm_before)
self.assertEqual(norm_after, max_norm)
@@ -1180,14 +1180,28 @@ def compare_scaling(grads):
for p, g in zip(l.parameters(), grads):
p.grad.data.copy_(g)
norm_before = compute_norm(norm_type)
norm = clip_grad_norm(l.parameters(), max_norm, norm_type=norm_type)
norm = clip_grad_norm_(l.parameters(), max_norm, norm_type=norm_type)
norm_after = compute_norm(norm_type)
self.assertEqual(norm, norm_before)
self.assertEqual(norm_before, norm_after)
self.assertLessEqual(norm_after, max_norm)
scale = compare_scaling(grads)
self.assertEqual(scale, 1)

def test_clip_grad_value(self):
l = nn.Linear(10, 10)
clip_value = 2.5

grad_w, grad_b = torch.arange(-50, 50).view(10, 10).div(5), torch.ones(10).mul(2)
for grad_list in [[grad_w, grad_b], [grad_w, None]]:
for p, g in zip(l.parameters(), grad_list):
p._grad = Variable(g.clone().view_as(p.data)) if g is not None else g

clip_grad_value_(l.parameters(), clip_value)
for p in filter(lambda p: p.grad is not None, l.parameters()):
self.assertLessEqual(p.grad.data.max(), clip_value)
self.assertGreaterEqual(p.grad.data.min(), -clip_value)

def test_parameters_to_vector(self):
conv1 = nn.Conv2d(3, 10, 5)
fc1 = nn.Linear(10, 20)
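
The deprecated `clip_grad_norm` wrapper (see torch/nn/utils/clip_grad.py below) keeps old call sites working while emitting a warning. A minimal, hypothetical check of that behavior, not part of this PR's test suite, could look like:

import warnings

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm  # deprecated alias kept for backwards compatibility

l = nn.Linear(10, 10)
l(torch.randn(2, 10)).sum().backward()  # populate .grad on the weight and bias

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    clip_grad_norm(l.parameters(), max_norm=1.0)  # forwards to clip_grad_norm_
assert any("deprecated" in str(w.message) for w in caught)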
16 changes: 10 additions & 6 deletions torch/nn/init.py
@@ -230,7 +230,7 @@ def _calculate_correct_fan(tensor, mode):
return fan_in if mode == 'fan_in' else fan_out


def kaiming_uniform_(tensor, a=0, mode='fan_in'):
def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Fills the input `Tensor` with values according to the method
described in "Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification" - He, K. et al. (2015), using a
@@ -250,20 +250,22 @@ def kaiming_uniform_(tensor, a=0, mode='fan_in'):
preserves the magnitude of the variance of the weights in the
forward pass. Choosing `fan_out` preserves the magnitudes in the
backwards pass.
nonlinearity: the non-linear function (`nn.functional` name),
recommended to use only with 'relu' or 'leaky_relu' (default).

Examples:
>>> w = torch.Tensor(3, 5)
>>> nn.init.kaiming_uniform_(w, mode='fan_in')
>>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')
"""
fan = _calculate_correct_fan(tensor, mode)
gain = calculate_gain('leaky_relu', a)
gain = calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
with torch.no_grad():
return tensor.uniform_(-bound, bound)


def kaiming_normal_(tensor, a=0, mode='fan_in'):
def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
r"""Fills the input `Tensor` with values according to the method
described in "Delving deep into rectifiers: Surpassing human-level
performance on ImageNet classification" - He, K. et al. (2015), using a
@@ -283,13 +285,15 @@ def kaiming_normal_(tensor, a=0, mode='fan_in'):
preserves the magnitude of the variance of the weights in the
forward pass. Choosing `fan_out` preserves the magnitudes in the
backwards pass.
nonlinearity: the non-linear function (`nn.functional` name),
recommended to use only with 'relu' or 'leaky_relu' (default).

Examples:
>>> w = torch.Tensor(3, 5)
>>> nn.init.kaiming_normal_(w, mode='fan_out')
>>> nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
"""
fan = _calculate_correct_fan(tensor, mode)
gain = calculate_gain('leaky_relu', a)
gain = calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
with torch.no_grad():
return tensor.normal_(0, std)
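
As a usage sketch of the new `nonlinearity` argument (assuming only the public torch.nn.init API shown above), the gain returned by `calculate_gain` is what changes: sqrt(2) for 'relu' versus sqrt(2 / (1 + a**2)) for 'leaky_relu' with negative slope a, and `kaiming_uniform_` then draws from [-bound, bound] with bound = gain * sqrt(3 / fan):

import math

import torch
import torch.nn as nn

w = torch.Tensor(3, 5)  # fan_in is 5 when mode='fan_in'
nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')

# Reproduce the bound by hand: gain('relu') == sqrt(2),
# std = gain / sqrt(fan_in), bound = sqrt(3) * std.
gain = nn.init.calculate_gain('relu')
bound = math.sqrt(3.0) * gain / math.sqrt(5)
assert w.abs().max().item() <= bound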
2 changes: 1 addition & 1 deletion torch/nn/utils/__init__.py
@@ -1,4 +1,4 @@
from . import rnn
from .clip_grad import clip_grad_norm
from .clip_grad import clip_grad_norm, clip_grad_norm_, clip_grad_value_
from .weight_norm import weight_norm, remove_weight_norm
from .convert_parameters import parameters_to_vector, vector_to_parameters
34 changes: 32 additions & 2 deletions torch/nn/utils/clip_grad.py
@@ -1,12 +1,14 @@
import warnings

def clip_grad_norm(parameters, max_norm, norm_type=2):

def clip_grad_norm_(parameters, max_norm, norm_type=2):
r"""Clips gradient norm of an iterable of parameters.

The norm is computed over all gradients together, as if they were
concatenated into a single vector. Gradients are modified in-place.

Arguments:
parameters (Iterable[Variable]): an iterable of Variables that will have
parameters (Iterable[Tensor]): an iterable of Tensors that will have
gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
@@ -31,3 +33,31 @@ def clip_grad_norm(parameters, max_norm, norm_type=2):
for p in parameters:
p.grad.data.mul_(clip_coef)
return total_norm


def clip_grad_norm(parameters, max_norm, norm_type=2):
r"""Clips gradient norm of an iterable of parameters.

.. warning::
This method is now deprecated in favor of
:func:`torch.nn.utils.clip_grad_norm_`.
"""
warnings.warn("torch.nn.utils.clip_grad_norm is now deprecated in favor "
"of torch.nn.utils.clip_grad_norm_.", stacklevel=2)
return clip_grad_norm_(parameters, max_norm, norm_type)


def clip_grad_value_(parameters, clip_value):
r"""Clips gradient of an iterable of parameters at specified value.

Gradients are modified in-place.

Arguments:
parameters (Iterable[Tensor]): an iterable of Tensors that will have
gradients clipped
clip_value (float or int): maximum allowed value of the gradients.
The gradients are clipped in the range [-clip_value, clip_value]
"""
clip_value = float(clip_value)
for p in filter(lambda p: p.grad is not None, parameters):
p.grad.data.clamp_(min=-clip_value, max=clip_value)
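
Taken together, a minimal training-step sketch of the renamed `clip_grad_norm_` and the new `clip_grad_value_` (the model, data, and thresholds here are illustrative, not part of this PR):

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

model = nn.Linear(10, 2)  # illustrative model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
inputs, targets = torch.randn(8, 10), torch.randn(8, 2)

optimizer.zero_grad()
loss = nn.functional.mse_loss(model(inputs), targets)
loss.backward()

# Rescale all gradients together so their combined 2-norm is at most 1.0;
# the pre-clipping norm is returned.
total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)

# Alternatively (or additionally), clamp each gradient element into [-0.5, 0.5].
clip_grad_value_(model.parameters(), clip_value=0.5)

optimizer.step()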