rsokl · rsokl · Jul 24, 2021 · Jul 23, 2021 · Jul 23, 2021 · Jul 24, 2021
diff --git a/src/mygrad/linalg/funcs.py b/src/mygrad/linalg/funcs.py
@@ -22,6 +22,7 @@ def norm(
     axis: Optional[Union[int, Tuple[int]]] = None,
     keepdims: bool = False,
     *,
+    nan_to_num: bool = True,
     constant: Optional[bool] = None,
 ) -> Tensor:
     r"""Vector norm.
@@ -53,6 +54,10 @@ def norm(
         result as dimensions with size one.  With this option the result will
         broadcast correctly against the original `x`.
 
+    nan_to_num : bool, optional (default=True)
+        If `True` then gradients that would store nans due to the presence of
+        zeros in `x` will instead store zeros in those places.
+
     constant : Optional[bool]
         If ``True``, this tensor is treated as a constant, and thus does not
         facilitate back propagation (i.e. ``constant.grad`` will always return
@@ -113,14 +118,26 @@ def norm(
     >>> l2_norms
     Tensor([3.74165739, 1.        ])
 
-    The presence of the elementwise absolute values in the norm means that zero-valued
-    entries in a vectors have an undefined derivative.
+    The presence of the elementwise absolute values in the norm operation means that zero-valued entries in any of
+    the input vectors have an undefined derivative. When `nan_to_num=False` is specified these derivatives will be
+    reported as `nan`, otherwise they will be made to be 0.0.
 
+    >>> l2_norms = mg.linalg.norm(x, axis=1, ord=2, nan_to_num=False)
     >>> l2_norms.backward()
     >>> x.grad
     array([[0.26726124, 0.53452248, 0.80178373],
            [1.        ,        nan,        nan]])
 
+    This is rigorously true, but is often not the desired behavior in autodiff applications.
+    Rather, it can be preferable to use `0.0` to fill these undefined derivatives.
+    This is the default behavior, when `nan_to_num` is not specified.
+
+    >>> l2_norms = mg.linalg.norm(x, axis=1, ord=2, nan_to_num=True)  # default setting: `nan_to_num=True`
+    >>> l2_norms.backward()
+    >>> x.grad
+    array([[0.26726124, 0.53452248, 0.80178373],
+           [1.        , 0.        , 0.        ]])
+
     L1 norms along each of the three columns:
 
     >>> mg.linalg.norm(x, axis=0, ord=1)
@@ -143,7 +160,12 @@ def norm(
     return Tensor._op(
         Norm,
         x,
-        op_kwargs={"axis": axis, "keepdims": keepdims, "ord": ord},
+        op_kwargs={
+            "axis": axis,
+            "keepdims": keepdims,
+            "ord": ord,
+            "nan_to_num": nan_to_num,
+        },
         constant=constant,
     )
 

diff --git a/src/mygrad/linalg/ops.py b/src/mygrad/linalg/ops.py
@@ -257,8 +257,17 @@ def _expand_dims(x, axis, original_ndmin):
 
 
 class Norm(Operation):
-    def __call__(self, tensor, ord=None, axis=None, keepdims=False):
+    def __call__(
+        self,
+        tensor,
+        ord=None,
+        axis=None,
+        keepdims: bool = False,
+        *,
+        nan_to_num: bool = True
+    ):
         self.variables = (tensor,)
+        self._nan_to_num = nan_to_num
         out = np.linalg.norm(tensor.data, ord=ord, axis=axis, keepdims=keepdims)
 
         if isinstance(ord, Real) and np.isinf(ord):  # pragma: no cover
@@ -304,7 +313,8 @@ def backward_var(self, grad: np.ndarray, index: int, **kwargs) -> np.ndarray:
             # is broadcast-compatible with `tensor`
             grad = _expand_dims(grad, axis=self.axis, original_ndmin=tensor.ndim)
 
-        invalid_derivative = np.where(x == 0)
+        if not self._nan_to_num:
+            invalid_derivative = np.where(x == 0)
 
         if self.ord == 1:
             out = np.sign(x)
@@ -327,5 +337,7 @@ def backward_var(self, grad: np.ndarray, index: int, **kwargs) -> np.ndarray:
             out *= np.sign(x)
             out *= _norm
             out *= grad
-        out[invalid_derivative] = np.nan
+
+        if not self._nan_to_num:
+            out[invalid_derivative] = np.nan
         return out
diff --git a/src/mygrad/math/misc/funcs.py b/src/mygrad/math/misc/funcs.py
@@ -32,6 +32,7 @@ def absolute(
     where: Mask = True,
     dtype: DTypeLikeReals = None,
     constant: Optional[bool] = None,
+    nan_to_num: bool = True,
 ) -> Tensor:  # pragma: no cover
     """The absolute value, computed elementwise.
 
@@ -57,6 +58,10 @@ def absolute(
 
         Integer-type tensors must be constant.
 
+    nan_to_num : bool, optional (default=True)
+        If `True` then gradients that would store nans due to the presence of
+        zeros in `x` will instead store zeros in those places.
+
     where : Mask
         This condition is broadcast over the input. At locations where the
         condition is True, the ``out`` tensor will be set to the ufunc result.
@@ -85,6 +90,22 @@ def absolute(
     >>> mg.absolute([-1.2, 1.2])
     Tensor([ 1.2,  1.2])
 
+    The absolute-value function is not differentiable at `x=0.0`.
+    By default the derivative at this point is treated as 0.
+
+    >>> x = mg.tensor([-2.0, 0.0, 2.0])
+    >>> mg.absolute(x).backward()
+    >>> x.grad
+    array([-1.,  0.,  1.])
+
+    However a more rigorous behavior can be enabled such that the
+    undefined derivative will be returned as `nan`.
+
+    >>> x = mg.tensor([-2.0, 0.0, 2.0])
+    >>> mg.absolute(x, nan_to_num=False).backward()
+    >>> x.grad
+    array([-1., nan,  1.])
+
     Plot the function and its derivate over ``[-10, 10]``:
 
     .. plot::

diff --git a/src/mygrad/math/misc/ops.py b/src/mygrad/math/misc/ops.py
@@ -11,10 +11,17 @@
 class Abs(UnaryUfunc):
     numpy_ufunc = np.absolute
 
+    def __call__(self, *args, nan_to_num: bool = True, **kwargs):
+        self._nan_to_num = nan_to_num  # consumed here (not forwarded to the parent); read by `backward_var`
+        return super().__call__(*args, **kwargs)
+
     def backward_var(self, grad, index, **kwargs):
         (a,) = self.variables
+
         return grad * np.piecewise(
-            a.data, [a.data < 0, a.data == 0, a.data > 0], [-1, np.nan, 1]
+            a.data,
+            [a.data < 0, a.data == 0, a.data > 0],
+            [-1, (0 if self._nan_to_num else np.nan), 1],  # d|a|/da for a<0, a==0 (undefined: 0 or nan), a>0
         )
 
 
@@ -117,4 +124,4 @@ def backward_var(self, grad, index, **kwargs):
                 dfdx = a[:, np.newaxis] * np.expand_dims(grad, -2)
             return dfdx
         else:  # pragma: no cover
-            raise ValueError()
+            raise ValueError()
diff --git a/src/mygrad/tensor_creation/funcs.py b/src/mygrad/tensor_creation/funcs.py
@@ -675,14 +675,14 @@ def full_like(
 
 
 def arange(
-    start: Real,
-    stop: Real = None,
-    step: int = None,
-    dtype: Optional[DTypeLikeReals] = None,
-    *,
+    *args,
     constant: Optional[bool] = None,
+    **kwargs,
 ) -> Tensor:
-    """Return a Tensor with evenly-spaced values within a given interval.
+    """
+    arange([start,] stop[, step,], dtype=None, *, constant=None)
+
+    Return a Tensor with evenly-spaced values within a given interval.
 
     Values are generated within [start, stop). Note that for non-integer steps, results may be
     inconsistent; you are better off using `linspace` instead.
@@ -726,19 +726,14 @@ def arange(
     >>> import mygrad as mg
     >>> mg.arange(3)
     Tensor([0, 1, 2])
-    >>> mg.arange(3.0, constant=True)
-    Tensor([ 0.,  1.,  2.])  # resulting tensor will not back-propagate a gradient
+    >>> mg.arange(3.0, constant=True)  # resulting tensor will not back-propagate a gradient
+    Tensor([ 0.,  1.,  2.])
     >>> mg.arange(3,7)
     Tensor([3, 4, 5, 6])
     >>> mg.arange(3,7,2)
     Tensor([3, 5])
     """
-    if stop is None:
-        arr = np.arange(start, step=step, dtype=dtype)
-    else:
-        arr = np.arange(start, stop, step=step, dtype=dtype)
-
-    return Tensor(arr, constant=constant, copy=False)
+    return Tensor(np.arange(*args, **kwargs), constant=constant, copy=False)
 
 
 def linspace(

diff --git a/src/mygrad/ufuncs/_ufunc_creators.py b/src/mygrad/ufuncs/_ufunc_creators.py
@@ -183,6 +183,7 @@ def __call__(
         where: Mask = True,
         dtype: DTypeLikeReals = None,
         constant: Optional[bool] = None,
+        **kwargs,
     ) -> Tensor:
         # it is fastest to check if out is None, which is likely the
         # most common scenario, and this is a very "hot path" in the
@@ -191,15 +192,15 @@ def __call__(
             out._in_place_op(
                 cls._wrapped_op,
                 x,
-                op_kwargs={"where": where, "dtype": dtype},
+                op_kwargs={"where": where, "dtype": dtype, **kwargs},
                 constant=constant,
             )
             return out
         else:
             return Tensor._op(
                 cls._wrapped_op,
                 x,
-                op_kwargs={"where": where, "dtype": dtype},
+                op_kwargs={"where": where, "dtype": dtype, **kwargs},
                 constant=constant,
                 out=out,
             )

diff --git a/tests/linalg/test_norm.py b/tests/linalg/test_norm.py
@@ -161,3 +161,17 @@ def test_norm_backward_1d(x, data, ord):
 
     assert_allclose(o1, o2)
     assert_allclose(t1.grad, t2.grad, atol=1e-7, rtol=1e-7)
+
+
+def test_nan_to_num_behavior():
+    x = mg.tensor([[1.0, 2.0, 3.0], [1.0, 0.0, 0.0]])  # second row holds two zero-valued entries
+    y = x.copy()
+    z = x.copy()
+
+    mg.linalg.norm(x, axis=1, nan_to_num=False).backward()
+    mg.linalg.norm(y, axis=1, nan_to_num=True).backward()
+    mg.linalg.norm(z, axis=1).backward()  # default behavior should be `nan_to_num=True`
+
+    assert np.isnan(x.grad).sum() == 2  # one nan per zero-valued entry of `x`
+    assert_allclose(np.nan_to_num(x.grad), y.grad)  # the two modes differ only where nans were written
+    assert_allclose(z.grad, y.grad)
diff --git a/tests/ufuncs/test_fwd_prop_and_backprop.py b/tests/ufuncs/test_fwd_prop_and_backprop.py
@@ -331,3 +331,17 @@ def test_arctan2_bkwd_pos_x():
 )
 def test_arctan2_bkwd_neg_x():
     pass
+
+
+def test_abs_nan_to_num():
+    x = mg.arange(-2.0, 3.0)  # [-2., -1., 0., 1., 2.]: a single zero, at index 2
+    y = x.copy()
+    z = x.copy()
+
+    mg.abs(x, nan_to_num=False).backward()
+    mg.abs(y, nan_to_num=True).backward()
+    mg.abs(z).backward()  # default behavior should match `nan_to_num=True`
+
+    assert np.all(np.isnan(x.grad) == np.array([False, False, True, False, False]))
+    assert_allclose(np.nan_to_num(x.grad), y.grad)  # the two modes differ only at the zero entry
+    assert_allclose(y.grad, z.grad)