
Commit 48ad454

ssnl authored and soumith committed
Move LayerNorm to ATen; remove tracking_running_stats functionality (#5983)
* Move LayerNorm to ATen; remove tracking_stats functionality
* Address comments about error message and respect cudnn flag for LayerNorm and GroupNorm
1 parent bc1b4c8 commit 48ad454

File tree: 5 files changed, +106 −142 lines

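To make the scope of the change concrete, here is a rough sketch of how `nn.LayerNorm` is constructed after this commit. The positional arguments follow the updated test cases further down (`normalized_shape`, `eps`, `elementwise_affine`), and the running-stats arguments are gone; treat this as illustrative, not as the module's documented signature.

```python
import torch
import torch.nn as nn

# Positional args per the updated tests: (normalized_shape, eps, elementwise_affine).
# momentum / track_running_stats no longer exist.
ln = nn.LayerNorm([5], 1e-3)               # with learnable elementwise weight/bias
ln_plain = nn.LayerNorm([5], 1e-3, False)  # no elementwise affine parameters

x = torch.randn(4, 5, 5)
y = ln(x)  # normalizes over the trailing dimension of size 5
```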

aten/src/ATen/native/Normalization.cpp

Lines changed: 77 additions & 8 deletions
@@ -70,9 +70,77 @@ Tensor batch_norm(
       running_mean, running_var, training, momentum, eps);
 }

+Tensor layer_norm(const Tensor& input, IntList normalized_shape,
+    const Tensor& weight /* optional */, const Tensor& bias /* optional */,
+    double eps, bool cudnn_enabled) {
+
+  int64_t normalized_ndim = normalized_shape.size();
+
+  if (normalized_ndim < 1) {
+    std::stringstream ss;
+    ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
+       << "containing at least one element, but got normalized_shape="
+       << normalized_shape;
+    throw std::runtime_error(ss.str());
+  }
+
+  if (weight.defined() && !weight.sizes().equals(normalized_shape)) {
+    std::stringstream ss;
+    ss << "Expected weight to be of same shape as normalized_shape, but got "
+       << "weight of shape " << weight.sizes() << " and normalized_shape="
+       << normalized_shape;
+    throw std::runtime_error(ss.str());
+  }
+
+  if (bias.defined() && !bias.sizes().equals(normalized_shape)) {
+    std::stringstream ss;
+    ss << "Expected bias to be of same shape as normalized_shape, but got "
+       << "bias of shape " << bias.sizes() << " and normalized_shape="
+       << normalized_shape;
+    throw std::runtime_error(ss.str());
+  }
+
+  auto input_shape = input.sizes();
+  auto input_ndim = input.dim();
+
+  if (input_ndim < normalized_ndim ||
+      !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
+    std::stringstream ss;
+    ss << "Given normalized_shape=" << normalized_shape
+       << ", expected input with shape [*";
+    for (auto size : normalized_shape) {
+      ss << ", " << size;
+    }
+    ss << "], but got input of size" << input_shape;
+    throw std::runtime_error(ss.str());
+  }
+
+  int64_t n = 1;
+  for (int64_t i = 0; i < input_ndim - normalized_ndim; i++) {
+    n *= input_shape[i];
+  }
+
+  // Apply layer norm
+  auto input_reshaped = input.contiguous().view({1, n, -1});
+
+  auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps,
+                            cudnn_enabled);
+  out = out.view(input_shape);
+
+  if (weight.defined() && bias.defined()) {
+    return bias.addcmul(out, weight, 1);
+  } else if (weight.defined()) {
+    return out.mul(weight);
+  } else if (bias.defined()) {
+    return out.add(bias);
+  } else {
+    return out;
+  }
+}
+
 Tensor group_norm(const Tensor& input, int64_t num_groups,
     const Tensor& weight /* optional */, const Tensor& bias /* optional */,
-    double eps) {
+    double eps, bool cudnn_enabled) {

   auto input_shape = input.sizes();
   int64_t b = input.size(0);
@@ -81,31 +149,32 @@ Tensor group_norm(const Tensor& input, int64_t num_groups,
   if (c % num_groups != 0) {
     std::stringstream ss;
     ss << "Expected number of channels in input to be divisible by "
-       << "num_groups, but got " << input.sizes() << " input and num_groups="
-       << num_groups;
+       << "num_groups, but got input of shape " << input.sizes() << " and "
+       << "num_groups=" << num_groups;
     throw std::runtime_error(ss.str());
   }

   if (weight.defined() && (weight.dim() != 1 || weight.numel() != c)) {
     std::stringstream ss;
     ss << "Expected weight to be a vector of size equal to the number of "
-       << "channels in input, but got " << weight.sizes() << " weight and "
-       << input.sizes() << " input";
+       << "channels in input, but got weight of shape " << weight.sizes()
+       << " and input of shape " << input.sizes();
     throw std::runtime_error(ss.str());
   }

   if (bias.defined() && (bias.dim() != 1 || bias.numel() != c)) {
     std::stringstream ss;
     ss << "Expected bias to be a vector of size equal to the number of "
-       << "channels in input, but got " << bias.sizes() << " bias and "
-       << input.sizes() << " input";
+       << "channels in input, but got bias of shape " << weight.sizes()
+       << " and input of shape " << input.sizes();
     throw std::runtime_error(ss.str());
   }

   // Apply group norm
   auto input_reshaped = input.contiguous().view({1, b * num_groups, -1});

-  auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps, true);
+  auto out = at::batch_norm(input_reshaped, {}, {}, {}, {}, true, 0, eps,
+                            cudnn_enabled);
   out = out.view(input_shape);

   if (!weight.defined() && !bias.defined()) {
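The new ATen kernel implements layer norm by reusing batch norm: the input is flattened to shape {1, n, -1}, so every leading index becomes a batch-norm "channel" that is normalized over its trailing `normalized_shape` elements (training mode, no running stats), and the elementwise affine transform is applied afterwards. The sketch below is not the ATen code, just a hypothetical Python re-derivation of that equivalence; `layer_norm_via_batch_norm` is an illustrative name.

```python
import torch
import torch.nn.functional as F

def layer_norm_via_batch_norm(x, normalized_shape, weight=None, bias=None, eps=1e-5):
    """Illustrative only: mimic the reshape-to-batch_norm trick used above."""
    normalized_ndim = len(normalized_shape)
    n = 1
    for s in x.shape[:x.dim() - normalized_ndim]:
        n *= s
    # Each of the n leading indices becomes one "channel" of a 3-d tensor.
    reshaped = x.contiguous().view(1, n, -1)
    out = F.batch_norm(reshaped, None, None, training=True, momentum=0.0, eps=eps)
    out = out.view(x.shape)
    if weight is not None:
        out = out * weight   # weight/bias broadcast over the leading dims
    if bias is not None:
        out = out + bias
    return out

x = torch.randn(4, 2, 2, 5)
diff = (layer_norm_via_batch_norm(x, [2, 2, 5]) - F.layer_norm(x, [2, 2, 5])).abs().max()
print(diff)  # expected to be tiny (~1e-7): the two formulations agree numerically
```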

aten/src/ATen/native/native_functions.yaml

Lines changed: 4 additions & 1 deletion
@@ -353,7 +353,7 @@
 - func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor
   variants: function

-- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? bias={}, double eps=1e-5) -> Tensor
+- func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor
   variants: function

 # FFT
@@ -393,6 +393,9 @@

 - func: is_sparse(Tensor self) -> bool

+- func: layer_norm(Tensor input, IntList normalized_shape, Tensor? weight={}, Tensor? bias={}, double eps=1e-5, bool cudnn_enable=True) -> Tensor
+  variants: function
+
 - func: linspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor
   variants: function
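This declaration is what exposes the ATen kernel as `torch.layer_norm` with the positional signature above; the functional wrapper later in this commit calls it exactly this way. A minimal, illustrative call:

```python
import torch
import torch.backends.cudnn

x = torch.randn(4, 5, 5)
weight = torch.ones(5)
bias = torch.zeros(5)

# layer_norm(input, normalized_shape, weight, bias, eps, cudnn_enable)
y = torch.layer_norm(x, [5], weight, bias, 1e-5, torch.backends.cudnn.enabled)
```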

test/test_nn.py

Lines changed: 15 additions & 38 deletions
@@ -1759,24 +1759,17 @@ def _test_LayerNorm_general(self, type):
         self.assertAlmostEqual(torch.abs(mean.data).mean(), bias, delta=1e-5)
         self.assertAlmostEqual(torch.abs(var.data).mean(), scale ** 2, delta=1e-5)

-        # test that LN with track_running_stats=True
-        ln = nn.LayerNorm(normalized_shape, momentum=1, eps=0,
-                          elementwise_affine=False, track_running_stats=True).type(type)
-        output_ref = ln(x).data.clone()
-        input_reshaped = x.view(*(unnormalized_shape + [-1]))
-        # make sure that running mean and var update correctly when training
-        mean = input_reshaped.mean(-1).mean()
-        var = input_reshaped.var(-1, unbiased=True).mean()
-        self.assertAlmostEqual(torch.abs(mean.data - ln.running_mean).mean(), 0, delta=1e-5)
-        self.assertAlmostEqual(torch.abs(var.data - ln.running_var).mean(), 0, delta=1e-5)
-        ln.eval()
-        old_running_mean = ln.running_mean.clone()
-        old_running_var = ln.running_var.clone()
-        output_new = ln(x + ln.running_var.sqrt()[0] * scale).data
-        self.assertAlmostEqual((output_new - output_ref).mean(), scale, delta=1e-5)
-        # make sure that running mean and var don't change in eval
-        self.assertEqual(old_running_mean, ln.running_mean)
-        self.assertEqual(old_running_var, ln.running_var)
+        bad_norm_shape_input_shape = {
+            (): (),
+            (2, 3): (3,),
+            (2,): (1, 2, 3),
+            (10,): (2, 3),
+            10: (2, 3),
+        }
+        for norm_shape, input_shape in bad_norm_shape_input_shape.items():
+            ln = nn.LayerNorm(norm_shape)
+            input = type(*input_shape).uniform_(0, 10)
+            self.assertRaises(RuntimeError, lambda: ln(input))

     def _test_LayerNorm_cuda_half(self):
         input = torch.zeros(2, 3, 3, 2, requires_grad=True).cuda().half().random_(1, 10)
@@ -5963,52 +5956,36 @@ def multimarginloss_weights_no_reduce_test():
     ),
     dict(
         module_name='LayerNorm',
-        constructor_args=([5], 1e-3, 0.3),
+        constructor_args=([5], 1e-3),
         input_size=(4, 5, 5),
         cudnn=True,
         check_eval=True,
         desc='1d_elementwise_affine',
     ),
     dict(
         module_name='LayerNorm',
-        constructor_args=([5], 1e-3, 0.3, False),
+        constructor_args=([5], 1e-3, False),
         input_size=(4, 5, 5),
         cudnn=True,
         check_eval=True,
         desc='1d_no_elementwise_affine',
     ),
     dict(
         module_name='LayerNorm',
-        constructor_args=([5], 1e-3, 0.3, True, True),
-        input_size=(4, 5, 5),
-        cudnn=True,
-        check_eval=True,
-        desc='1d_elementwise_affine_tracking_stats',
-    ),
-    dict(
-        module_name='LayerNorm',
-        constructor_args=([2, 2, 5], 1e-3, 0.3),
+        constructor_args=([2, 2, 5], 1e-3),
         input_size=(4, 2, 2, 5),
         cudnn=True,
         check_eval=True,
         desc='3d_elementwise_affine',
     ),
     dict(
         module_name='LayerNorm',
-        constructor_args=([2, 2, 5], 1e-3, 0.3, False),
+        constructor_args=([2, 2, 5], 1e-3, False),
         input_size=(4, 2, 2, 5),
         cudnn=True,
         check_eval=True,
         desc='3d_no_elementwise_affine',
     ),
-    dict(
-        module_name='LayerNorm',
-        constructor_args=([2, 2, 5], 1e-3, 0.3, True, True),
-        input_size=(4, 2, 2, 5),
-        cudnn=True,
-        check_eval=True,
-        desc='3d_elementwise_affine_tracking_stats',
-    ),
     dict(
         module_name='GroupNorm',
         constructor_args=(3, 6, 1e-3),
torch/nn/functional.py

Lines changed: 5 additions & 57 deletions
@@ -1257,74 +1257,22 @@ def _instance_norm(input, running_mean=None, running_var=None, weight=None,
         eps=eps)


-def layer_norm(input, normalized_shape, running_mean=None, running_var=None,
-               weight=None, bias=None, use_input_stats=True,
-               momentum=0.1, eps=1e-5):
+def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5):
     r"""Applies Layer Normalization for last certain number of dimensions.

     See :class:`~torch.nn.LayerNorm` for details.
     """
-    if not use_input_stats and (running_mean is None or running_var is None):
-        raise ValueError('Expected running_mean and running_var to be not None when use_input_stats=False')
-
-    if weight is not None and weight.size() != normalized_shape:
-        raise ValueError('Expected weight to be of same shape as '
-                         'normalized_shape, but got {} weight and '
-                         'normalized_shape={}'.format(weight.size(), normalized_shape))
-
-    if bias is not None and bias.size() != normalized_shape:
-        raise ValueError('Expected bias to be of same shape as '
-                         'normalized_shape, but got {} bias and '
-                         'normalized_shape={}'.format(bias.size(), normalized_shape))
-
-    normalized_ndim = len(normalized_shape)
-    input_shape = input.size()
-
-    if input_shape[-normalized_ndim:] != torch.Size(normalized_shape):
-        raise ValueError('Expected input with shape [*, {}], but got {} input'
-                         .format(', '.join(normalized_shape), list(input_shape)))
-
-    n = reduce(mul, input_shape[:-normalized_ndim], 1)
-
-    # Repeat stored stats if necessary
-    if running_mean is not None:
-        running_mean_orig = running_mean
-        running_mean = running_mean_orig.repeat(n)
-    if running_var is not None:
-        running_var_orig = running_var
-        running_var = running_var_orig.repeat(n)
-
-    # Apply layer norm
-    input_reshaped = input.contiguous().view(1, n, -1)
-
-    out = batch_norm(
-        input_reshaped, running_mean, running_var, None, None,
-        use_input_stats, momentum, eps)
-
-    # Copy back
-    if running_mean is not None:
-        running_mean_orig.fill_(running_mean.mean())
-    if running_var is not None:
-        running_var_orig.fill_(running_var.mean())
-
-    out = out.view(*input_shape)
-
-    if weight is not None and bias is not None:
-        return torch.addcmul(bias, 1, out, weight)
-    elif weight is not None:
-        return torch.mul(out, weight)
-    elif bias is not None:
-        return torch.add(out, bias)
-    else:
-        return out
+    return torch.layer_norm(input, normalized_shape, weight, bias, eps,
+                            torch.backends.cudnn.enabled)


 def group_norm(input, num_groups, weight=None, bias=None, eps=1e-5):
     r"""Applies Group Normalization for last certain number of dimensions.

     See :class:`~torch.nn.GroupNorm` for details.
     """
-    return torch.group_norm(input, num_groups, weight, bias, eps)
+    return torch.group_norm(input, num_groups, weight, bias, eps,
+                            torch.backends.cudnn.enabled)


 def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1):
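After this change the Python-side implementation is gone and `F.layer_norm` is a thin call into the ATen kernel. A brief usage sketch of the simplified functional signature (illustrative values):

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 2, 2, 5)

# New signature: layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-5)
y = F.layer_norm(x, [2, 2, 5])

# Every [2, 2, 5] slice is normalized to roughly zero mean and unit variance.
print(y.view(4, -1).mean(dim=1))                 # ~0
print(y.view(4, -1).var(dim=1, unbiased=False))  # ~1
```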
