
Commit 78e3259

albanD authored and ezyang committed
Add autograd automatic anomaly detection (#7677)
* add autograd automatic anomaly detection
* python 3 string support
* Fix non python build
* fix typo in doc
* better test and naming fix
* fix no python build and python object handling
* fix missing checks
* clean NO_PYTHON build
* Remove unwanted changes
1 parent 38362fa commit 78e3259

20 files changed: +344, -3 lines

docs/source/autograd.rst

Lines changed: 7 additions & 0 deletions
@@ -98,3 +98,10 @@ and nvprof based (registers both CPU and GPU activity) using
     :members:
 
 .. autofunction:: torch.autograd.profiler.load_nvprof
+
+Anomaly detection
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: detect_anomaly
+
+.. autoclass:: set_detect_anomaly

setup.py

Lines changed: 2 additions & 0 deletions
@@ -737,6 +737,8 @@ def run(self):
     "torch/csrc/autograd/init.cpp",
     "torch/csrc/autograd/aten_variable_hooks.cpp",
     "torch/csrc/autograd/grad_mode.cpp",
+    "torch/csrc/autograd/anomaly_mode.cpp",
+    "torch/csrc/autograd/python_anomaly_mode.cpp",
     "torch/csrc/autograd/engine.cpp",
     "torch/csrc/autograd/function.cpp",
     "torch/csrc/autograd/variable.cpp",

test/test_autograd.py

Lines changed: 36 additions & 1 deletion
@@ -15,7 +15,7 @@
 from torch.autograd.profiler import profile
 from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \
     suppress_warnings
-from torch.autograd import Variable, Function
+from torch.autograd import Variable, Function, detect_anomaly
 from torch.autograd.function import InplaceFunction
 from torch.testing import make_non_contiguous, randn_like
 
@@ -2306,6 +2306,41 @@ def test_rnn_backward_to_input_but_not_parameters_cuda(self):
         out.sum().backward()
         self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0)
 
+    def test_anomaly_detect_nan(self):
+        size = 10
+
+        class MyFunc(Function):
+            @staticmethod
+            def forward(ctx, inp1, inp2, fail_0th):
+                ctx.fail_0th = fail_0th
+                return inp1.sum(0, keepdim=True)
+
+            @staticmethod
+            def backward(ctx, gO):
+                gI = gO.clone().expand(size)
+                gI[0] = 0
+                gI[0] /= 0  # Generate a nan
+                if ctx.fail_0th:
+                    return gI, None, None
+                else:
+                    return None, gI, None
+
+        inp = torch.rand(size, requires_grad=True)
+        out = MyFunc.apply(inp, inp, True)
+        out.backward()  # Should not fail
+
+        inp = torch.rand(size, requires_grad=True)
+        out = MyFunc.apply(inp, inp, True)
+        with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 0th output."):
+            with detect_anomaly():
+                out.backward()
+
+        inp = torch.rand(size, requires_grad=True)
+        out = MyFunc.apply(inp, inp, False)
+        with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 1th output."):
+            with detect_anomaly():
+                out.backward()
+
 
 def index_variable(shape, max_indices):
     if not isinstance(shape, tuple):
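
Outside the test harness, the same failure mode can be reproduced with a short sketch like the one below. ``BadGrad`` is a hypothetical function written for illustration, not part of this commit; the error message shape ("returned nan values in its 0th output") is the one asserted in the test above.

    import torch
    from torch.autograd import Function, detect_anomaly


    class BadGrad(Function):
        # Hypothetical Function whose backward corrupts the gradient with nan.
        @staticmethod
        def forward(ctx, inp):
            return inp.clone()

        @staticmethod
        def backward(ctx, gO):
            return gO * float('nan')


    inp = torch.rand(10, requires_grad=True)

    BadGrad.apply(inp).sum().backward()   # without detection: nan gradients, no error

    try:
        with detect_anomaly():
            BadGrad.apply(inp).sum().backward()
    except RuntimeError as e:
        print(e)   # e.g. Function 'BadGradBackward' returned nan values in its 0th output.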

tools/cpp_build/libtorch/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -201,6 +201,7 @@ set(TORCH_SRCS
   ${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
+  ${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/function.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
   ${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp

torch/autograd/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 from .function import Function, NestedIOFunction
 from .gradcheck import gradcheck, gradgradcheck
 from .grad_mode import no_grad, enable_grad, set_grad_enabled
+from .anomaly_mode import detect_anomaly, set_detect_anomaly
 from . import profiler
 
 __all__ = ['Variable', 'Function', 'backward', 'grad_mode']

torch/autograd/anomaly_mode.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+import torch
+
+
+class detect_anomaly(object):
+    r"""Context-manager that enable anomaly detection for the autograd engine.
+
+    This does two things:
+    - Running the forward pass with detection enabled will allow the backward
+      pass to print the traceback of the forward operation that created the failing
+      backward function.
+    - Any backward computation that generate "nan" value will raise an error.
+
+    Example:
+
+        >>> import torch
+        >>> from torch import autograd
+        >>> class MyFunc(autograd.Function):
+        ...     @staticmethod
+        ...     def forward(ctx, inp):
+        ...         return inp.clone()
+        ...     @staticmethod
+        ...     def backward(ctx, gO):
+        ...         # Error during the backward pass
+        ...         raise RuntimeError("Some error in backward")
+        ...         return gO.clone()
+        >>> def run_fn(a):
+        ...     out = MyFunc.apply(a)
+        ...     return out.sum()
+        >>> inp = torch.rand(10, 10, requires_grad=True)
+        >>> out = run_fn(inp)
+        >>> out.backward()
+            Traceback (most recent call last):
+              File "<stdin>", line 1, in <module>
+              File "/your/pytorch/install/torch/tensor.py", line 93, in backward
+                torch.autograd.backward(self, gradient, retain_graph, create_graph)
+              File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
+                allow_unreachable=True)  # allow_unreachable flag
+              File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
+                return self._forward_cls.backward(self, *args)
+              File "<stdin>", line 8, in backward
+            RuntimeError: Some error in backward
+        >>> with autograd.detect_anomaly():
+        ...     inp = torch.rand(10, 10, requires_grad=True)
+        ...     out = run_fn(inp)
+        ...     out.backward()
+            Traceback of forward call that caused the error:
+              File "tmp.py", line 53, in <module>
+                out = run_fn(inp)
+              File "tmp.py", line 44, in run_fn
+                out = MyFunc.apply(a)
+            Traceback (most recent call last):
+              File "<stdin>", line 4, in <module>
+              File "/your/pytorch/install/torch/tensor.py", line 93, in backward
+                torch.autograd.backward(self, gradient, retain_graph, create_graph)
+              File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
+                allow_unreachable=True)  # allow_unreachable flag
+              File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
+                return self._forward_cls.backward(self, *args)
+              File "<stdin>", line 8, in backward
+            RuntimeError: Some error in backward
+
+    """
+
+    def __init__(self):
+        self.prev = torch.is_anomaly_enabled()
+
+    def __enter__(self):
+        torch.set_anomaly_enabled(True)
+
+    def __exit__(self, *args):
+        torch.set_anomaly_enabled(self.prev)
+        return False
+
+
+class set_detect_anomaly(object):
+    r"""Context-manager that sets the anomaly detection for the autograd engine on or off.
+
+    ``set_detect_anomaly`` will enable or disable the autograd anomaly detection
+    based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    See ``detect_anomaly`` above for details of the anomaly detection behaviour.
+
+    Arguments:
+        mode (bool): Flag whether to enable anomaly detection (``True``),
+                     or disable (``False``).
+
+    """
+
+    def __init__(self, mode):
+        self.prev = torch.is_anomaly_enabled()
+        torch.set_anomaly_enabled(mode)
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, *args):
+        torch.set_anomaly_enabled(self.prev)
+        return False
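
A brief usage sketch (not part of the diff) of the two entry points defined above. As the docstring notes, ``set_detect_anomaly`` can be used either as a plain function call or as a context manager that restores the previous setting on exit:

    import torch
    from torch.autograd import set_detect_anomaly

    # As a function: detection stays enabled for everything that follows.
    set_detect_anomaly(True)
    x = torch.rand(4, requires_grad=True)
    x.sum().backward()           # runs with anomaly checks on
    set_detect_anomaly(False)

    # As a context manager: the previous state is restored on exit.
    with set_detect_anomaly(True):
        y = torch.rand(4, requires_grad=True)
        y.sum().backward()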

torch/csrc/autograd/anomaly_mode.cpp

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+#include "torch/csrc/autograd/anomaly_mode.h"
+
+namespace torch { namespace autograd {
+
+bool AnomalyMode::_enabled = 0;
+
+}}

torch/csrc/autograd/anomaly_mode.h

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#pragma once
+
+namespace torch { namespace autograd {
+
+struct AnomalyMode {
+  static bool is_enabled() {
+    return _enabled;
+  }
+  static void set_enabled(bool enabled) {
+    _enabled = enabled;
+  }
+
+private:
+  static bool _enabled;
+};
+
+
+struct AnomalyMetadata {
+  virtual void store_stack() = 0;
+  virtual void print_stack() = 0;
+};
+
+}}
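
This C++ flag is what the Python-level ``torch.is_anomaly_enabled()`` / ``torch.set_anomaly_enabled()`` calls used in ``anomaly_mode.py`` above read and write. A minimal sketch of toggling it by hand, which is essentially what the two context managers do for you:

    import torch

    prev = torch.is_anomaly_enabled()    # reads AnomalyMode::is_enabled()
    torch.set_anomaly_enabled(True)      # AnomalyMode::set_enabled(true)
    try:
        x = torch.rand(3, requires_grad=True)
        x.sum().backward()               # backward runs with anomaly checks on
    finally:
        torch.set_anomaly_enabled(prev)  # restore the previous state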

torch/csrc/autograd/engine.cpp

Lines changed: 18 additions & 0 deletions
@@ -3,6 +3,7 @@
 #include "torch/csrc/autograd/function.h"
 #include "torch/csrc/autograd/functions/basic_ops.h"
 #include "torch/csrc/autograd/grad_mode.h"
+#include "torch/csrc/autograd/anomaly_mode.h"
 #include "torch/csrc/autograd/variable.h"
 #include "torch/csrc/utils/auto_gpu.h"
 
@@ -269,6 +270,9 @@ auto Engine::thread_main(GraphTask *graph_task) -> void {
 auto Engine::thread_on_exception(FunctionTask& task, std::exception& e) -> void {
   std::lock_guard<std::mutex> lock(task.base->mutex);
   if (!task.base->has_error.load()) {
+    if (AnomalyMode::is_enabled()) {
+      task.fn->metadata()->print_stack();
+    }
     task.base->exception = std::current_exception();
     task.base->has_error = true;
   }
@@ -373,6 +377,20 @@ auto Engine::evaluate_function(FunctionTask& task) -> void {
 
   int num_outputs = outputs.size();
   if (num_outputs == 0) return; // Don't even acquire the mutex
+
+  if (AnomalyMode::is_enabled()) {
+    AutoGradMode grad_mode(false);
+    for (int i = 0; i < num_outputs; ++i) {
+      auto& output = outputs[i];
+      AutoGPU guard(output);
+      if (output.defined() && output.ne(output).any().toCByte()) {
+        std::stringstream ss;
+        ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output.";
+        throw std::runtime_error(ss.str());
+      }
+    }
+  }
+
   std::lock_guard<std::mutex> lock(task.base->mutex);
   for (int i = 0; i < num_outputs; ++i) {
     auto& output = outputs[i];
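
The nan check above relies on the fact that a value compares unequal to itself only when it is nan, which is what ``output.ne(output).any()`` tests (with grad mode turned off so the comparison itself is not recorded). A rough Python illustration of that predicate, not the engine code itself:

    import torch

    def contains_nan(t):
        # Mirrors output.ne(output).any(): only nan values compare unequal to themselves.
        return bool(t.ne(t).any())

    print(contains_nan(torch.tensor([1.0, 2.0])))           # False
    print(contains_nan(torch.tensor([1.0, float('nan')])))  # True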

torch/csrc/autograd/engine.h

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,7 @@
 
 #include "torch/csrc/autograd/function.h"
 #include "torch/csrc/autograd/input_buffer.h"
+#include "torch/csrc/autograd/anomaly_mode.h"
 
 #include <deque>
 #include <exception>
@@ -41,6 +42,9 @@ struct Engine {
       bool keep_graph,
       bool create_graph,
       const edge_list& outputs = {});
+  virtual std::unique_ptr<AnomalyMetadata> make_anomaly_metadata() {
+    return nullptr;
+  }
 
   void queue_callback(std::function<void()> callback);
 
