Commit bf29abd

propagate nan in some activations (#8033)
* propagate nan in some activations
* fix py2 not having math.nan
* flake8

1 parent: 8b447fa

6 files changed: +58 −18 lines


aten/src/TH/generic/THTensorMath.cpp

Lines changed: 2 additions & 2 deletions
@@ -2803,13 +2803,13 @@ void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) {
 void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value) {
   THTensor_(resizeAs)(r, t);
   TH_TENSOR_APPLY2(real, r, real, t,
-                   *r_data = *t_data > value ? *t_data : value;);
+                   *r_data = *t_data < value ? value : *t_data;); // this order propagates NaN
 }
 
 void THTensor_(cminValue)(THTensor *r, THTensor *t, real value) {
   THTensor_(resizeAs)(r, t);
   TH_TENSOR_APPLY2(real, r, real, t,
-                   *r_data = *t_data < value ? *t_data : value;);
+                   *r_data = *t_data > value ? value : *t_data;); // this order propagates NaN
 }
 
 void THTensor_(zeros)(THTensor *r_, THLongStorage *size)
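
The change hinges on IEEE 754 semantics: any ordered comparison involving NaN evaluates to false, so the operand placed in the false branch of the ternary decides whether a NaN input survives. A minimal Python sketch of the before/after behavior (function names are mine, for illustration only):

    import math

    nan = float('nan')

    def cmax_old(x, value):
        # old order: `nan > value` is False, so NaN falls through to `value`
        return x if x > value else value

    def cmax_new(x, value):
        # new order: `nan < value` is False, so NaN falls through to `x`
        return value if x < value else x

    assert cmax_old(nan, 0.0) == 0.0       # NaN silently replaced
    assert math.isnan(cmax_new(nan, 0.0))  # NaN propagated
    assert cmax_old(2.0, 0.0) == cmax_new(2.0, 0.0) == 2.0  # ordinary inputs agree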

aten/src/THC/THCTensorMathPointwise.cuh

Lines changed: 4 additions & 4 deletions
@@ -655,11 +655,11 @@ struct TensorMaxValueOp {
   TensorMaxValueOp(T v) : val(v) {}
 
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::gt(*out, val) ? *out : val;
+    *out = THCNumerics<T>::lt(*out, val) ? val : *out; // this order propagates NaN
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::gt(*in, val) ? *in : val;
+    *out = THCNumerics<T>::lt(*in, val) ? val : *in; // this order propagates NaN
   }
 
   T val;
@@ -670,11 +670,11 @@ struct TensorMinValueOp {
   TensorMinValueOp(T v) : val(v) {}
 
   __device__ __forceinline__ void operator()(T* out) {
-    *out = THCNumerics<T>::lt(*out, val) ? *out : val;
+    *out = THCNumerics<T>::gt(*out, val) ? val : *out; // this order propagates NaN
   }
 
   __device__ __forceinline__ void operator()(T* out, T* in) {
-    *out = THCNumerics<T>::lt(*in, val) ? *in : val;
+    *out = THCNumerics<T>::gt(*in, val) ? val : *in; // this order propagates NaN
   }
 
   T val;
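
The CUDA functors get the mirror-image rewrite via THCNumerics, keeping CPU and GPU behavior in sync. Assuming clamp_min/clamp_max route through cmaxValue/cminValue on CPU and these functors on CUDA (a routing I am inferring, not something the diff states), the effect should be observable from Python on either device:

    import math
    import torch

    devices = ['cpu'] + (['cuda'] if torch.cuda.is_available() else [])
    for device in devices:
        x = torch.tensor([float('nan')], device=device)
        # assumes clamp dispatches to the kernels patched above
        assert math.isnan(x.clamp(min=0.0).item())
        assert math.isnan(x.clamp(max=0.0).item())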

aten/src/THCUNN/HardTanh.cu

Lines changed: 3 additions & 3 deletions
@@ -18,10 +18,10 @@ struct hardtanhupdateOutput_functor
   {
     if (*input < min_val_)
       *output = min_val_;
-    else if (*input <= max_val_)
-      *output = *input;
-    else
+    else if (*input > max_val_)
       *output = max_val_;
+    else
+      *output = *input;
   }
 
   __device__ void operator()(T *input) const
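
For HardTanh the fix is branch reordering rather than a ternary swap: previously a NaN input failed both comparisons and fell into the max_val_ branch; now it still fails both comparisons but lands on the final else, which returns the input unchanged. A Python sketch of the two control flows:

    import math

    def hardtanh_old(x, min_val=-1.0, max_val=1.0):
        if x < min_val:
            return min_val
        elif x <= max_val:
            return x
        else:
            return max_val  # NaN ends up here: both comparisons are False

    def hardtanh_new(x, min_val=-1.0, max_val=1.0):
        if x < min_val:
            return min_val
        elif x > max_val:
            return max_val
        else:
            return x        # NaN ends up here and passes through

    nan = float('nan')
    assert hardtanh_old(nan) == 1.0
    assert math.isnan(hardtanh_new(nan))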

aten/src/THNN/generic/HardShrink.c

Lines changed: 6 additions & 6 deletions
@@ -14,10 +14,10 @@ void THNN_(HardShrink_updateOutput)(
   TH_TENSOR_APPLY2(real, output, real, input,
     if (*input_data > lambda)
      *output_data = *input_data;
-    else if (*input_data < -lambda)
-      *output_data = *input_data;
-    else
+    else if (*input_data >= -lambda)
       *output_data = 0;
+    else
+      *output_data = *input_data; // let NaN case pass through here
   );
 }
 
@@ -32,10 +32,10 @@ void THNN_(HardShrink_updateGradInput)(
   THNN_CHECK_NELEMENT(input, gradOutput);
   THTensor_(resizeAs)(gradInput, input);
   TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
-    if (*input_data > lambda || *input_data < -lambda)
-      *gradInput_data = *gradOutput_data;
-    else
+    if (*input_data >= -lambda && *input_data <= lambda)
       *gradInput_data = 0;
+    else
+      *gradInput_data = *gradOutput_data; // let NaN case pass through here
   );
 }
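
HardShrink inverts its predicates so the zeroing branch tests membership in [-lambda, lambda] and the catch-all else keeps the input; a NaN fails the membership test and passes through. The updateGradInput hunk applies the same inversion. Sketching the forward pass in Python:

    import math

    def hardshrink_old(x, lambd=0.5):
        if x > lambd:
            return x
        elif x < -lambd:
            return x
        else:
            return 0.0  # NaN fails both tests and gets zeroed

    def hardshrink_new(x, lambd=0.5):
        if x > lambd:
            return x
        elif x >= -lambd:
            return 0.0
        else:
            return x    # NaN fails both tests and passes through

    nan = float('nan')
    assert hardshrink_old(nan) == 0.0
    assert math.isnan(hardshrink_new(nan))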

aten/src/THNN/generic/HardTanh.c

Lines changed: 3 additions & 3 deletions
@@ -33,10 +33,10 @@ void THNN_(HardTanh_updateOutput)(
     TH_TENSOR_APPLY2(real, output, real, input,
       if (*input_data < min_val)
         *output_data = min_val;
-      else if (*input_data <= max_val)
-        *output_data = *input_data;
-      else
+      else if (*input_data > max_val)
         *output_data = max_val;
+      else
+        *output_data = *input_data;
     );
   }
 }
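
This is the CPU twin of the HardTanh.cu change above, with identical branch logic. From Python, the observable effect matches what the new test asserts:

    import math
    import torch
    import torch.nn.functional as F

    x = torch.tensor([float('nan')])
    assert math.isnan(F.hardtanh(x).item())    # final else now passes NaN through
    assert math.isnan(F.hardshrink(x).item())  # same for the HardShrink change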

test/test_nn.py

Lines changed: 40 additions & 0 deletions
@@ -1338,6 +1338,46 @@ def test_vector_to_parameters(self):
         sample = next(model.parameters())[0, 0, 0]
         self.assertTrue(torch.equal(sample.data, vec.data[:5]))
 
+    # We don't want to make propagating NaN a hard requirement on ops, but for
+    # these easy ones, we should make them do so.
+    def _test_nonlinearity_propagate_nan(self, device):
+        nan = float('nan')
+
+        def test(nonlinearity, *args, **kwargs):
+            x = torch.tensor([nan], device=device)
+            fn = getattr(F, nonlinearity)
+            try:
+                self.assertTrue(math.isnan(fn(x, *args, **kwargs).item()))
+            except Exception as e:
+                if 'not implemented' not in str(e):
+                    raise
+
+        test('relu')
+        test('relu', inplace=True)
+        test('relu6')
+        test('elu')
+        test('selu')
+        test('rrelu')
+        test('rrelu', inplace=True)
+        test('hardtanh')
+        test('tanh')
+        test('sigmoid')
+        test('logsigmoid')
+        test('hardshrink')
+        test('tanhshrink')
+        test('softsign')
+        test('softmin', 0)
+        test('softmax', 0)
+        test('log_softmax', 0)
+        test('leaky_relu', 0.2)
+
+    def test_nonlinearity_propagate_nan(self):
+        self._test_nonlinearity_propagate_nan('cpu')
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_nonlinearity_propagate_nan_cuda(self):
+        self._test_nonlinearity_propagate_nan('cuda')
+
     def test_weight_norm(self):
         input = torch.randn(3, 5)
         m = nn.Linear(5, 7)
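
Note the helper is deliberately lenient: a backend that raises 'not implemented' for some nonlinearity is skipped rather than failed, and the bare numbers passed to softmin/softmax/log_softmax and leaky_relu are extra positional arguments (a dim, a negative slope). The same check is easy to reproduce by hand outside the test harness:

    import math
    import torch
    import torch.nn.functional as F

    x = torch.tensor([float('nan')])
    for fn, args in [(F.relu, ()), (F.softmax, (0,)), (F.leaky_relu, (0.2,))]:
        assert math.isnan(fn(x, *args).item()), fn.__name__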
