7 changes: 7 additions & 0 deletions docs/source/autograd.rst
@@ -98,3 +98,10 @@ and nvprof based (registers both CPU and GPU activity) using
:members:

.. autofunction:: torch.autograd.profiler.load_nvprof

Anomaly detection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: detect_anomaly

.. autoclass:: set_detect_anomaly
2 changes: 2 additions & 0 deletions setup.py
@@ -737,6 +737,8 @@ def run(self):
"torch/csrc/autograd/init.cpp",
"torch/csrc/autograd/aten_variable_hooks.cpp",
"torch/csrc/autograd/grad_mode.cpp",
"torch/csrc/autograd/anomaly_mode.cpp",
"torch/csrc/autograd/python_anomaly_mode.cpp",
"torch/csrc/autograd/engine.cpp",
"torch/csrc/autograd/function.cpp",
"torch/csrc/autograd/variable.cpp",
37 changes: 36 additions & 1 deletion test/test_autograd.py
@@ -15,7 +15,7 @@
from torch.autograd.profiler import profile
from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \
suppress_warnings
from torch.autograd import Variable, Function
from torch.autograd import Variable, Function, detect_anomaly
from torch.autograd.function import InplaceFunction
from torch.testing import make_non_contiguous, randn_like

@@ -2306,6 +2306,41 @@ def test_rnn_backward_to_input_but_not_parameters_cuda(self):
out.sum().backward()
self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0)

def test_anomaly_detect_nan(self):
size = 10

class MyFunc(Function):
@staticmethod
def forward(ctx, inp1, inp2, fail_0th):
ctx.fail_0th = fail_0th
return inp1.sum(0, keepdim=True)

@staticmethod
def backward(ctx, gO):
gI = gO.clone().expand(size)
gI[0] = 0
gI[0] /= 0 # Generate a nan
if ctx.fail_0th:
return gI, None, None
else:
return None, gI, None

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
out.backward() # Should not fail

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, True)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 0th output."):
with detect_anomaly():
out.backward()

inp = torch.rand(size, requires_grad=True)
out = MyFunc.apply(inp, inp, False)
with self.assertRaisesRegexp(RuntimeError, "Function 'MyFuncBackward' returned nan values in its 1th output."):
with detect_anomaly():
out.backward()


def index_variable(shape, max_indices):
if not isinstance(shape, tuple):
1 change: 1 addition & 0 deletions tools/cpp_build/libtorch/CMakeLists.txt
@@ -201,6 +201,7 @@ set(TORCH_SRCS
${TORCH_SRC_DIR}/csrc/autograd/profiler.cpp
${TORCH_SRC_DIR}/csrc/autograd/saved_variable.cpp
${TORCH_SRC_DIR}/csrc/autograd/grad_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/anomaly_mode.cpp
${TORCH_SRC_DIR}/csrc/autograd/function.cpp
${TORCH_SRC_DIR}/csrc/autograd/input_buffer.cpp
${TORCH_SRC_DIR}/csrc/autograd/functions/utils.cpp
1 change: 1 addition & 0 deletions torch/autograd/__init__.py
@@ -11,6 +11,7 @@
from .function import Function, NestedIOFunction
from .gradcheck import gradcheck, gradgradcheck
from .grad_mode import no_grad, enable_grad, set_grad_enabled
from .anomaly_mode import detect_anomaly, set_detect_anomaly
from . import profiler

__all__ = ['Variable', 'Function', 'backward', 'grad_mode']
99 changes: 99 additions & 0 deletions torch/autograd/anomaly_mode.py
@@ -0,0 +1,99 @@
import torch


class detect_anomaly(object):
r"""Context-manager that enable anomaly detection for the autograd engine.

This does two things:
- Running the forward pass with detection enabled will allow the backward
pass to print the traceback of the forward operation that created the failing
backward function.
- Any backward computation that generates "nan" values will raise an error.

Example:

>>> import torch
>>> from torch import autograd
>>> class MyFunc(autograd.Function):
... @staticmethod
... def forward(ctx, inp):
... return inp.clone()
... @staticmethod
... def backward(ctx, gO):
... # Error during the backward pass
... raise RuntimeError("Some error in backward")
... return gO.clone()
>>> def run_fn(a):
... out = MyFunc.apply(a)
... return out.sum()
>>> inp = torch.rand(10, 10, requires_grad=True)
>>> out = run_fn(inp)
>>> out.backward()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward
>>> with autograd.detect_anomaly():
... inp = torch.rand(10, 10, requires_grad=True)
... out = run_fn(inp)
... out.backward()
Traceback of forward call that caused the error:

File "tmp.py", line 53, in <module>
out = run_fn(inp)
File "tmp.py", line 44, in run_fn
out = MyFunc.apply(a)
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "/your/pytorch/install/torch/tensor.py", line 93, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
allow_unreachable=True) # allow_unreachable flag
File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
return self._forward_cls.backward(self, *args)
File "<stdin>", line 8, in backward
RuntimeError: Some error in backward

"""

def __init__(self):
self.prev = torch.is_anomaly_enabled()

def __enter__(self):
torch.set_anomaly_enabled(True)

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False


class set_detect_anomaly(object):
r"""Context-manager that sets the anomaly detection for the autograd engine on or off.

``set_detect_anomaly`` will enable or disable the autograd anomaly detection
based on its argument :attr:`mode`.
It can be used as a context-manager or as a function.

See ``detect_anomaly`` above for details of the anomaly detection behaviour.

Arguments:
mode (bool): Flag whether to enable anomaly detection (``True``)
or disable it (``False``).

"""

def __init__(self, mode):
self.prev = torch.is_anomaly_enabled()
torch.set_anomaly_enabled(mode)

def __enter__(self):
pass

def __exit__(self, *args):
torch.set_anomaly_enabled(self.prev)
return False
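
Since ``set_detect_anomaly`` flips the flag in its constructor and only restores it in ``__exit__``, it works both as a plain function call and as a context manager. A minimal sketch of both uses, relying only on the ``torch.is_anomaly_enabled`` binding added in this PR:

    import torch
    from torch import autograd

    # As a function: the flag stays set for the rest of the program.
    autograd.set_detect_anomaly(True)
    assert torch.is_anomaly_enabled()

    # As a context manager: the previous value is restored on exit.
    with autograd.set_detect_anomaly(False):
        assert not torch.is_anomaly_enabled()
    assert torch.is_anomaly_enabled()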
7 changes: 7 additions & 0 deletions torch/csrc/autograd/anomaly_mode.cpp
@@ -0,0 +1,7 @@
#include "torch/csrc/autograd/anomaly_mode.h"

namespace torch { namespace autograd {

bool AnomalyMode::_enabled = false;

}}
23 changes: 23 additions & 0 deletions torch/csrc/autograd/anomaly_mode.h
@@ -0,0 +1,23 @@
#pragma once

namespace torch { namespace autograd {

struct AnomalyMode {
static bool is_enabled() {
return _enabled;
}
static void set_enabled(bool enabled) {
_enabled = enabled;
}

private:
static bool _enabled;

};


struct AnomalyMetadata {
virtual void store_stack() = 0;
virtual void print_stack() = 0;
};

}}
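
The interface is intentionally small: a concrete implementation records the forward-pass stack when a Function is constructed and replays it if that node's backward later fails. A hedged Python illustration of that contract (the real hook lives in torch/csrc/autograd/python_anomaly_mode.cpp, added to setup.py above but not shown in this diff):

    import traceback

    class IllustrativeAnomalyMetadata:
        """Conceptual stand-in for the C++ AnomalyMetadata interface."""

        def __init__(self):
            self._stack = None

        def store_stack(self):
            # Called from the Function constructor when anomaly mode is on.
            self._stack = traceback.format_stack()

        def print_stack(self):
            # Called by the engine when this node's backward raises.
            print("Traceback of forward call that caused the error:")
            print("".join(self._stack or []))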
18 changes: 18 additions & 0 deletions torch/csrc/autograd/engine.cpp
@@ -3,6 +3,7 @@
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/functions/basic_ops.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/utils/auto_gpu.h"

@@ -269,6 +270,9 @@ auto Engine::thread_main(GraphTask *graph_task) -> void {
auto Engine::thread_on_exception(FunctionTask& task, std::exception& e) -> void {
std::lock_guard<std::mutex> lock(task.base->mutex);
if (!task.base->has_error.load()) {
if (AnomalyMode::is_enabled()) {
task.fn->metadata()->print_stack();
}
task.base->exception = std::current_exception();
task.base->has_error = true;
}
@@ -373,6 +377,20 @@ auto Engine::evaluate_function(FunctionTask& task) -> void {

int num_outputs = outputs.size();
if (num_outputs == 0) return; // Don't even acquire the mutex

if (AnomalyMode::is_enabled()) {
AutoGradMode grad_mode(false);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
AutoGPU guard(output);
if (output.defined() && output.ne(output).any().toCByte()) {
std::stringstream ss;
ss << "Function '" << fn.name() << "' returned nan values in its " << i << "th output.";
throw std::runtime_error(ss.str());
}
}
}

std::lock_guard<std::mutex> lock(task.base->mutex);
for (int i = 0; i < num_outputs; ++i) {
auto& output = outputs[i];
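
The engine's check relies on NaN being the only floating-point value that compares unequal to itself, which is exactly what output.ne(output) tests. A quick hedged Python sanity check of the same idiom:

    import torch

    t = torch.tensor([1.0, float('nan'), 3.0])
    # Self-inequality flags exactly the NaN entries.
    print(t.ne(t).any().item())  # truthy -> the engine would raise for this output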
4 changes: 4 additions & 0 deletions torch/csrc/autograd/engine.h
@@ -5,6 +5,7 @@

#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/input_buffer.h"
#include "torch/csrc/autograd/anomaly_mode.h"

#include <deque>
#include <exception>
@@ -41,6 +42,9 @@ struct Engine {
bool keep_graph,
bool create_graph,
const edge_list& outputs = {});
virtual std::unique_ptr<AnomalyMetadata> make_anomaly_metadata() {
return nullptr;
}

void queue_callback(std::function<void()> callback);

8 changes: 8 additions & 0 deletions torch/csrc/autograd/function.cpp
@@ -1,5 +1,6 @@
#include "torch/csrc/autograd/function.h"

#include "torch/csrc/autograd/engine.h"
#include "torch/csrc/autograd/functions/special.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/jit/ir.h"
@@ -99,6 +100,13 @@ void Function::set_up_context_edge(
backward_eval->forward_ctx_select = ctx_select;
}

AnomalyMetadata* Function::metadata() noexcept {
if (!anomaly_metadata_) {
anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
}
return anomaly_metadata_.get();
}

/*
* Fix for #5534: prevent stack overflow on deletion of deep computation graph
*
12 changes: 11 additions & 1 deletion torch/csrc/autograd/function.h
@@ -3,6 +3,7 @@
#include "torch/csrc/assertions.h"
#include "torch/csrc/autograd/edge.h"
#include "torch/csrc/autograd/grad_mode.h"
#include "torch/csrc/autograd/anomaly_mode.h"
#include "torch/csrc/autograd/profiler.h"
#include "torch/csrc/autograd/saved_variable.h"
#include "torch/csrc/autograd/type_and_shape.h"
@@ -95,7 +96,11 @@
uint64_t sequence_nr,
edge_list&& next_edges = edge_list())
: sequence_nr_(sequence_nr),
next_edges_(std::move(next_edges)) {}
next_edges_(std::move(next_edges)) {
if (AnomalyMode::is_enabled()) {
metadata()->store_stack();
}
}

explicit Function(
edge_list&& next_edges = edge_list())
@@ -236,6 +241,10 @@
pyobj_ = pyobj;
}

/// Returns the anomaly metadata stored for this `Function`.
/// If none exist, creates a new empty one.
AnomalyMetadata* metadata() noexcept;

/// Create a context edge for the JIT.
static void set_up_context_edge(
jit::Node* this_node,
@@ -329,6 +338,7 @@

edge_list next_edges_;
PyObject* pyobj_ = nullptr; // weak reference
std::unique_ptr<AnomalyMetadata> anomaly_metadata_ = nullptr;
std::vector<std::unique_ptr<FunctionPreHook>> pre_hooks_;
std::vector<std::unique_ptr<FunctionPostHook>> post_hooks_;
auto_unique_ptr<jit::tracer::FunctionTracingState> tracing_state_;
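
Because the forward stack is captured in the Function constructor, anomaly mode has to be active while the graph is built for a forward traceback to be available; wrapping only backward() still gets the NaN check but has no stored stack to print. A hedged sketch of the distinction (the tensors are illustrative):

    import torch

    x = torch.rand(4, requires_grad=True)

    # Graph built outside the context: backward is checked for NaNs,
    # but no forward traceback was recorded for its nodes.
    y = (x * 2).sum()
    with torch.autograd.detect_anomaly():
        y.backward()

    # Graph built inside the context: the failing op's forward
    # traceback can also be printed on error.
    with torch.autograd.detect_anomaly():
        z = (x * 2).sum()
        z.backward()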
22 changes: 22 additions & 0 deletions torch/csrc/autograd/init.cpp
@@ -77,10 +77,32 @@ static PyObject * is_grad_enabled(PyObject* _unused, PyObject *arg) {
END_HANDLE_TH_ERRORS
}

static PyObject * set_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (!PyBool_Check(arg)) {
throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
}
AnomalyMode::set_enabled(arg == Py_True);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

static PyObject * is_anomaly_mode_enabled(PyObject* _unused, PyObject *arg) {
HANDLE_TH_ERRORS
if (AnomalyMode::is_enabled()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
END_HANDLE_TH_ERRORS
}

// autograd methods on torch._C
static PyMethodDef methods[] = {
{"set_grad_enabled", (PyCFunction)set_grad_enabled, METH_O, nullptr},
{"is_grad_enabled", (PyCFunction)is_grad_enabled, METH_NOARGS, nullptr},
{"set_anomaly_enabled", (PyCFunction)set_anomaly_mode_enabled, METH_O, nullptr},
{"is_anomaly_enabled", (PyCFunction)is_anomaly_mode_enabled, METH_NOARGS, nullptr},
{nullptr, nullptr, 0, nullptr}
};

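
These two bindings are the raw switch that detect_anomaly and set_detect_anomaly wrap from Python. A minimal hedged round trip through them:

    import torch

    prev = torch.is_anomaly_enabled()
    torch.set_anomaly_enabled(True)   # METH_O binding: requires a real bool, else TypeError
    assert torch.is_anomaly_enabled()
    torch.set_anomaly_enabled(prev)   # restore, mirroring detect_anomaly.__exit__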