8 changes: 4 additions & 4 deletions aten/src/ATen/cudnn/README.md
@@ -1,4 +1,4 @@
All files living in this directory are written with the assumption that cuDNN is available,
which means that this code is not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever
you need to use definitions from here, please guard the `#include <ATen/cudnn/*.h>` and
definition usages with the `#if AT_CUDNN_ENABLED()` macro, e.g. [BatchNorm.cpp](native/cudnn/BatchNorm.cpp).
All files living in this directory are written with the assumption that cuDNN is available,
which means that this code is not guarded by `#if AT_CUDNN_ENABLED()`. Therefore, whenever
you need to use definitions from here, please guard the `#include <ATen/cudnn/*.h>` and
definition usages with the `#if AT_CUDNN_ENABLED()` macro, e.g. [native/cudnn/BatchNorm.cpp](native/cudnn/BatchNorm.cpp).
(changes to a second file, the ATen native normalization source; per-file header missing)
@@ -6,6 +6,7 @@
#include "THC/THC.h"
#include "ATen/cudnn/cudnn-wrapper.h"
#endif
#include <vector>

namespace at { namespace native {

@@ -69,4 +70,58 @@ Tensor batch_norm(
running_mean, running_var, training, momentum, eps);
}

Tensor group_norm(const Tensor& input, int64_t num_groups,
const Tensor& weight /* optional */, const Tensor& bias /* optional */,
double eps) {

auto input_shape = input.sizes();
int64_t b = input.size(0);
int64_t c = input.size(1);

if (c % num_groups != 0) {
std::stringstream ss;
ss << "Expected number of channels in input to be divisible by "
<< "num_groups, but got " << input.sizes() << " input and num_groups="
<< num_groups;
throw std::runtime_error(ss.str());
}

if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) {
std::stringstream ss;
ss << "Expected weight to be a vector of size equal to the number of "
<< "channels in input, but got " << weight.sizes() << " weight and "
<< input.sizes() << " input";
throw std::runtime_error(ss.str());
}

if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) {
std::stringstream ss;
ss << "Expected bias to be a vector of size equal to the number of "
<< "channels in input, but got " << bias.sizes() << " bias and "
<< input.sizes() << " input";
throw std::runtime_error(ss.str());
}

// Apply group norm
auto input_reshaped = input.contiguous().view({1, b * num_groups, -1});

auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, true);
out = out.view(input_shape);

if (!weight.defined() && !bias.defined()) {
return out;
}

std::vector<int64_t> affine_param_shape(input.dim(), 1);
affine_param_shape[1] = c;

if (weight.defined() && bias.defined()) {
return bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1);
} else if (weight.defined()) {
return out.mul(weight.view(affine_param_shape));
} else {
return out.add(bias.view(affine_param_shape));
}
}

}} // at::native
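To make the reshape trick above easier to follow: the implementation folds the group dimension into the batch dimension, runs `batch_norm` in training mode with no running statistics so that each of the `b * num_groups` slices is whitened independently, and then applies the optional per-channel affine parameters. Below is a minimal Python reference of the same computation using plain tensor ops; the helper name `group_norm_reference` is ours and not part of the PR.

```python
import torch

def group_norm_reference(x, num_groups, weight=None, bias=None, eps=1e-5):
    # x has shape (N, C, *); C must be divisible by num_groups.
    n, c = x.shape[0], x.shape[1]
    assert c % num_groups == 0, "channels must be divisible by num_groups"

    # Whiten each of the N * num_groups groups independently, which is what
    # the view into (1, b * num_groups, -1) plus training-mode batch_norm does.
    grouped = x.contiguous().view(n, num_groups, -1)
    mean = grouped.mean(-1, keepdim=True)
    var = grouped.var(-1, unbiased=False, keepdim=True)
    out = ((grouped - mean) / (var + eps).sqrt()).view(x.shape)

    # Optional per-channel affine transform, broadcast over all other dims.
    if weight is None and bias is None:
        return out
    affine_shape = [1] * x.dim()
    affine_shape[1] = c
    if weight is not None:
        out = out * weight.view(affine_shape)
    if bias is not None:
        out = out + bias.view(affine_shape)
    return out
```

Up to floating-point differences this should agree with the ATen path; it documents the algorithm rather than replacing it.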
3 changes: 3 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -291,6 +291,9 @@
- func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor
variants: function

- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? bias={}, double eps=1e-5) -> Tensor
variants: function

- func: index(Tensor self, TensorList indices) -> Tensor
# NB: This function is special-cased in tools/autograd/gen_variable_type.py

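Assuming the usual code generation for `native_functions.yaml` entries, a declaration with `variants: function` makes the native implementation above callable as `torch.group_norm`, which is what the Python wrapper in `torch/nn/functional.py` later in this diff relies on. A small illustrative call, with arbitrary shapes:

```python
import torch

x = torch.randn(2, 6, 4)        # (N, C, L) with C divisible by num_groups
weight = torch.ones(6)          # optional per-channel scale
bias = torch.zeros(6)           # optional per-channel shift

# Mirrors the call made by torch.nn.functional.group_norm further down.
y = torch.group_norm(x, 3, weight, bias, 1e-5)
y_plain = torch.group_norm(x, 3)  # weight, bias and eps fall back to their defaults
```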
116 changes: 113 additions & 3 deletions test/test_nn.py
@@ -1732,7 +1732,7 @@ def test_InstanceNorm3d_general_cuda(self):
def _test_LayerNorm_general(self, type):
for i in range(2, 6):
shape = torch.LongTensor(i).random_(3, 6).tolist()
x = Variable(type(*shape).uniform_(0, 10))
x = type(*shape).uniform_(0, 10)

normalized_ndim = random.randint(1, i - 1) # inclusive
normalized_shape = shape[-normalized_ndim:]
unnormalized_shape = shape[:-normalized_ndim]
@@ -1779,8 +1779,7 @@ def _test_LayerNorm_general(self, type):
self.assertEqual(old_running_var, ln.running_var)

def _test_LayerNorm_cuda_half(self):
# just THNN, LayerNorm has no cuDNN path
input = Variable(torch.rand(2, 3, 3, 2).cuda().half().random_(1, 10), requires_grad=True)
input = torch.zeros(2, 3, 3, 2, requires_grad=True).cuda().half().random_(1, 10)

m = nn.LayerNorm([3, 2]).cuda().half()
output = m(input)
output.sum().backward()
@@ -1794,6 +1793,69 @@ def test_LayerNorm_general_cuda(self):
self._test_LayerNorm_general(torch.cuda.FloatTensor)
self._test_LayerNorm_cuda_half()

def _test_GroupNorm_general(self, type):
good_shape_g = {
(1, 2, 3, 4): 2,
(2, 3, 10): 3,
(3, 1, 1, 1, 2): 1,
(2, 6, 4, 2, 2): 3,
}
for shape, g in good_shape_g.items():
x = type(*shape).uniform_(0, 10)
b = shape[0]
c = shape[1]

# test that GN normalizes to mean 0 and stddev 1
gn = nn.GroupNorm(g, c, eps=0).type(type)
gn.weight.data.fill_(1)
gn.bias.data.fill_(0)
output = gn(x)
out_reshaped = output.view(b, g, -1)
mean = out_reshaped.mean(-1)
var = out_reshaped.var(-1, unbiased=False)
self.assertAlmostEqual(torch.abs(mean).mean(), 0, delta=1e-5)
self.assertAlmostEqual(torch.abs(var).mean(), 1, delta=1e-5)

# test that GN applies weight and bias correctly
scale = type(c).uniform_(0.2, 2)
bias = type(c).uniform_(0.2, 2)
gn.weight.data.copy_(scale)
gn.bias.data.copy_(bias)
output = gn(x)
out_reshaped = output.view(b, c, -1)
out_normed = (out_reshaped - bias.view(c, 1)) / scale.view(c, 1)
out_normed_reshaped = out_normed.view(b, g, -1)
mean = out_normed_reshaped.mean(-1)
var = out_normed_reshaped.var(-1, unbiased=False)
self.assertAlmostEqual(torch.abs(mean).mean(), 0, delta=1e-5)
self.assertAlmostEqual(torch.abs(var).mean(), 1, delta=1e-5)

bad_shape_g = {
(1, 2, 3, 4): 3,
(2, 3, 10): 2,
(3, 1, 1, 1, 2): 10,
(2, 6, 4, 2, 2): 4,
}
for shape, g in bad_shape_g.items():
gn = nn.GroupNorm(g, shape[1])
input = type(*shape).uniform_(0, 10)
self.assertRaises(RuntimeError, lambda: gn(input))

def _test_GroupNorm_cuda_half(self):
input = torch.zeros(2, 4, 3, 2, requires_grad=True).cuda().half().random_(1, 10)
m = nn.GroupNorm(2, 4).cuda().half()
output = m(input)
output.sum().backward()
self.assertEqual(output.type(), input.type())

def test_GroupNorm_general(self):
self._test_GroupNorm_general(torch.FloatTensor)

@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_GroupNorm_general_cuda(self):
self._test_GroupNorm_general(torch.cuda.FloatTensor)
self._test_GroupNorm_cuda_half()

def test_pad(self):
inputs = Variable(torch.randn(1, 3, 4, 4), requires_grad=True)
_assertGradAndGradgradChecks(self, lambda x: F.pad(x, (1, 1, 1, 1)), (inputs,))
@@ -5880,6 +5942,54 @@ def multimarginloss_weights_no_reduce_test():
check_eval=True,
desc='3d_elementwise_affine_tracking_stats',
),
dict(
module_name='GroupNorm',
constructor_args=(3, 6, 1e-3),
input_size=(4, 6, 5),
cudnn=True,
check_eval=True,
desc='1d_affine',
),
dict(
module_name='GroupNorm',
constructor_args=(5, 5, 1e-3, False),
input_size=(4, 5, 5),
cudnn=True,
check_eval=True,
desc='1d_no_affine_IN', # this setting is equivalent with InstanceNorm
),
dict(
module_name='GroupNorm',
constructor_args=(1, 5, 1e-3, False),
input_size=(4, 5, 5),
cudnn=True,
check_eval=True,
desc='1d_no_affine_LN', # this setting is equivalent with LayerNorm
),
dict(
module_name='GroupNorm',
constructor_args=(3, 6, 1e-3),
input_size=(4, 6, 2, 3),
cudnn=True,
check_eval=True,
desc='2d_affine',
),
dict(
module_name='GroupNorm',
constructor_args=(3, 3, 1e-3, False),
input_size=(4, 3, 2, 3),
cudnn=True,
check_eval=True,
desc='2d_no_affine_IN', # this setting is equivalent with InstanceNorm
),
dict(
module_name='GroupNorm',
constructor_args=(1, 3, 1e-3, False),
input_size=(4, 3, 2, 3),
cudnn=True,
check_eval=True,
desc='2d_no_affine_LN', # this setting is equivalent with LayerNorm
),
dict(
module_name='Conv1d',
constructor_args=(4, 5, 3),
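As a usage note for the `GroupNorm` test entries above: the constructor arguments are `(num_groups, num_channels, eps, affine)`, and the `_IN` / `_LN` descriptions refer to the degenerate settings where each group holds a single channel (InstanceNorm-like) or all channels share one group (LayerNorm-like). A short sketch consistent with those configurations, with an input shape chosen to match the 2d cases:

```python
import torch
import torch.nn as nn

x = torch.randn(4, 6, 2, 3)                 # (N, C, H, W); C must be divisible by num_groups

gn = nn.GroupNorm(3, 6, eps=1e-3)           # 3 groups of 2 channels, learnable affine
out = gn(x)

# Degenerate settings exercised by the test entries above:
in_like = nn.GroupNorm(6, 6, affine=False)  # one channel per group, InstanceNorm-like
ln_like = nn.GroupNorm(1, 6, affine=False)  # a single group over C, H, W, LayerNorm-like
```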
26 changes: 22 additions & 4 deletions torch/nn/functional.py
@@ -1207,8 +1207,8 @@ def batch_norm(input, running_mean, running_var, weight=None, bias=None,
)


def instance_norm(input, running_mean, running_var, weight=None, bias=None,
use_input_stats=True, momentum=0.1, eps=1e-5):
def instance_norm(input, running_mean=None, running_var=None, weight=None,
bias=None, use_input_stats=True, momentum=0.1, eps=1e-5):
r"""Applies Instance Normalization for each channel in each data sample in a
batch.

@@ -1244,7 +1244,7 @@ def _instance_norm(input, running_mean=None, running_var=None, weight=None,
input_reshaped, running_mean, running_var, weight=weight, bias=bias,
training=use_input_stats, momentum=momentum, eps=eps)

# Reshape back
# Reshape and copy back
if running_mean is not None:
running_mean_orig.copy_(running_mean.view(b, c).mean(0, keepdim=False))
if running_var is not None:
@@ -1257,7 +1257,7 @@ def _instance_norm(input, running_mean=None, running_var=None, weight=None,
eps=eps)


def layer_norm(input, normalized_shape, running_mean, running_var,
def layer_norm(input, normalized_shape, running_mean=None, running_var=None,
weight=None, bias=None, use_input_stats=True,
momentum=0.1, eps=1e-5):
r"""Applies Layer Normalization for last certain number of dimensions.
@@ -1267,6 +1267,16 @@ def layer_norm(input, normalized_shape, running_mean, running_var,
if not use_input_stats and (running_mean is None or running_var is None):
raise ValueError('Expected running_mean and running_var to be not None when use_input_stats=False')

if weight is not None and weight.size() != normalized_shape:
raise ValueError('Expected weight to be of same shape as '
'normalized_shape, but got {} weight and '
'normalized_shape={}'.format(weight.size(), normalized_shape))

if bias is not None and bias.size() != normalized_shape:
raise ValueError('Expected bias to be of same shape as '
'normalized_shape, but got {} bias and '
'normalized_shape={}'.format(bias.size(), normalized_shape))

normalized_ndim = len(normalized_shape)
input_shape = input.size()

@@ -1309,6 +1319,14 @@ def layer_norm(input, normalized_shape, running_mean, running_var,
return out


def group_norm(input, num_groups, weight=None, bias=None, eps=1e-5):
r"""Applies Group Normalization for last certain number of dimensions.

See :class:`~torch.nn.GroupNorm` for details.
"""
return torch.group_norm(input, num_groups, weight, bias, eps)


def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1):
r"""Applies local response normalization over an input signal composed of
several input planes, where channels occupy the second dimension.
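Finally, a brief sketch of the functional entry point added above; when given, `weight` and `bias` are per-channel vectors of length C, matching the shape checks in the ATen implementation earlier in this diff:

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 6, 5)        # (N, C, L)
weight = torch.rand(6) + 0.5    # per-channel scale
bias = torch.rand(6)            # per-channel shift

y = F.group_norm(x, 3, weight=weight, bias=bias, eps=1e-5)
y_plain = F.group_norm(x, 3)    # normalization only, no affine parameters
```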
4 changes: 2 additions & 2 deletions torch/nn/modules/__init__.py
@@ -15,7 +15,7 @@
AdaptiveMaxPool2d, AdaptiveMaxPool3d, AdaptiveAvgPool1d, AdaptiveAvgPool2d, AdaptiveAvgPool3d
from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d
from .instancenorm import InstanceNorm1d, InstanceNorm2d, InstanceNorm3d
from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm
from .normalization import LocalResponseNorm, CrossMapLRN2d, LayerNorm, GroupNorm
from .dropout import Dropout, Dropout2d, Dropout3d, AlphaDropout
from .padding import ReflectionPad1d, ReflectionPad2d, ReplicationPad1d, ReplicationPad2d, \
ReplicationPad3d, ZeroPad2d, ConstantPad1d, ConstantPad2d, ConstantPad3d
@@ -39,7 +39,7 @@
'ParameterList', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d',
'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'FractionalMaxPool2d',
'LPPool1d', 'LPPool2d', 'LocalResponseNorm', 'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d',
'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout',
'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'GroupNorm', 'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout',
'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad2d', 'ReplicationPad1d', 'ReplicationPad3d',
'CrossMapLRN2d', 'Embedding', 'EmbeddingBag', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell',
'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'PairwiseDistance',
9 changes: 6 additions & 3 deletions torch/nn/modules/batchnorm.py
@@ -105,7 +105,8 @@ class BatchNorm1d(_BatchNorm):
- Input: :math:`(N, C)` or :math:`(N, C, L)`
- Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

Examples:
Examples::

>>> # With Learnable Parameters
>>> m = nn.BatchNorm1d(100)
>>> # Without Learnable Parameters
@@ -174,7 +175,8 @@ class BatchNorm2d(_BatchNorm):
- Input: :math:`(N, C, H, W)`
- Output: :math:`(N, C, H, W)` (same shape as input)

Examples:
Examples::

>>> # With Learnable Parameters
>>> m = nn.BatchNorm2d(100)
>>> # Without Learnable Parameters
@@ -244,7 +246,8 @@ class BatchNorm3d(_BatchNorm):
- Input: :math:`(N, C, D, H, W)`
- Output: :math:`(N, C, D, H, W)` (same shape as input)

Examples:
Examples::

>>> # With Learnable Parameters
>>> m = nn.BatchNorm3d(100)
>>> # Without Learnable Parameters
9 changes: 6 additions & 3 deletions torch/nn/modules/instancenorm.py
@@ -64,7 +64,8 @@ class InstanceNorm1d(_InstanceNorm):
- Input: :math:`(N, C, L)`
- Output: :math:`(N, C, L)` (same shape as input)

Examples:
Examples::

>>> # Without Learnable Parameters
>>> m = nn.InstanceNorm1d(100)
>>> # With Learnable Parameters
@@ -127,7 +128,8 @@ class InstanceNorm2d(_InstanceNorm):
- Input: :math:`(N, C, H, W)`
- Output: :math:`(N, C, H, W)` (same shape as input)

Examples:
Examples::

>>> # Without Learnable Parameters
>>> m = nn.InstanceNorm2d(100)
>>> # With Learnable Parameters
@@ -190,7 +192,8 @@ class InstanceNorm3d(_InstanceNorm):
- Input: :math:`(N, C, D, H, W)`
- Output: :math:`(N, C, D, H, W)` (same shape as input)

Examples:
Examples::

>>> # Without Learnable Parameters
>>> m = nn.InstanceNorm3d(100)
>>> # With Learnable Parameters