Skip to content

Commit 8c3a94e

Browse files
apaszke authored and facebook-github-bot committed
Improve autograd profiler performance (#11773)
Summary: To illustrate the benefits of this commit, I'll use the time/iter I got from one of the JIT benchmarks on my machine.

| Run                                          | Time                    |
|----------------------------------------------|-------------------------|
| No profiler                                  | 45ms                    |
| With profiler                                | 56ms                    |
| Use `clock_gettime` instead of `std::chrono` | 48ms                    |
| Touch all pages on block allocation          | 48ms (less jitter)      |
| Use `const char*` instead of `std::string`   | 47ms (even less jitter) |

Pull Request resolved: #11773

Differential Revision: D9886858

Pulled By: apaszke

fbshipit-source-id: 58f926f09e95df0b11ec687763a72b06b66991d0
1 parent b3a2665 commit 8c3a94e

File tree

5 files changed

+85
-72
lines changed

5 files changed

+85
-72
lines changed

torch/autograd/profiler.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -428,22 +428,9 @@ def __repr__(self):
428428
################################################################################
429429
# Utilities
430430

431-
def demangle(name):
432-
"""Demangle a C++ identifier using c++filt"""
433-
try:
434-
with open(os.devnull, 'w') as devnull:
435-
is_win = sys.platform == 'win32'
436-
filt_cmd = ['undname', name] if is_win else ['c++filt', '-n', name]
437-
orig_name = subprocess.check_output(filt_cmd, stderr=devnull).rstrip().decode("ascii")
438-
orig_name = re.search('is :- \"(.*)"', orig_name).group(1) if is_win else orig_name
439-
return orig_name
440-
except (subprocess.CalledProcessError, AttributeError, FileNotFoundError, OSError):
441-
return name
442-
443-
444431
class StringTable(defaultdict):
445432
def __missing__(self, key):
446-
self[key] = demangle(key)
433+
self[key] = torch._C._demangle(key)
447434
return self[key]
448435

449436

@@ -526,7 +513,7 @@ def parse_nvprof_trace(path):
526513
# Parse strings table
527514
strings = {}
528515
for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
529-
strings[r["id"]] = demangle(r["value"])
516+
strings[r["id"]] = torch._C._demangle(r["value"])
530517

531518
# First, find all functions and create FunctionEvents for them
532519
marker_query = """

torch/csrc/Module.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,10 @@ static PyObject* initModule() {
605605
// setting up TH Errors so that they throw C++ exceptions
606606
at::init();
607607

608+
609+
py::reinterpret_borrow<py::module>(module)
610+
.def("_demangle", &at::demangle);
611+
608612
// Set ATen warnings to issue Python warnings
609613
at::Warning::set_warning_handler(&warning_handler);
610614

torch/csrc/autograd/init.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ PyObject * THPAutograd_initExtension(PyObject *_unused)
4545
m.def("_enable_profiler", torch::autograd::profiler::enableProfiler);
4646
m.def("_disable_profiler", torch::autograd::profiler::disableProfiler);
4747

48-
m.def("_push_range", [](const char* name) {
49-
torch::autograd::profiler::pushRange(name);
48+
m.def("_push_range", [](std::string name) {
49+
torch::autograd::profiler::pushRange(std::move(name));
5050
});
5151
m.def("_pop_range", []() { torch::autograd::profiler::popRange(); });
5252

torch/csrc/autograd/profiler.cpp

Lines changed: 27 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
namespace torch { namespace autograd { namespace profiler {
77

88
ProfilerState state = ProfilerState::Disabled;
9-
uint32_t next_thread_id = 0;
9+
uint16_t next_thread_id = 0;
1010
std::mutex all_event_lists_mutex;
1111
std::list<std::shared_ptr<RangeEventList>> all_event_lists;
1212
thread_local std::shared_ptr<RangeEventList> event_list;
13-
thread_local int32_t thread_id;
13+
thread_local uint16_t thread_id;
1414

1515
RangeEventList& getEventList() {
1616
if (!event_list) {
@@ -23,6 +23,9 @@ RangeEventList& getEventList() {
2323
}
2424

2525
void mark(std::string name, bool include_cuda /* = true */) {
26+
if (state == ProfilerState::Disabled) {
27+
return;
28+
}
2629
if (state == ProfilerState::NVTX) {
2730
#ifdef USE_CUDA
2831
nvtxMarkA(name.c_str());
@@ -39,7 +42,12 @@ void mark(std::string name, bool include_cuda /* = true */) {
3942
}
4043
}
4144

42-
void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= -1*/) {
45+
const char* c_str(const char *str) { return str; }
46+
// NB: non-const to disallow temporaries (lifetime issues)
47+
const char* c_str(std::string& str) { return str.c_str(); }
48+
49+
template<typename T>
50+
void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
4351
if (state == ProfilerState::Disabled) {
4452
return;
4553
}
@@ -49,9 +57,9 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*=
4957
std::stringstream s;
5058
s << name << msg << sequence_nr;
5159
nvtxRangePushA(s.str().c_str());
52-
}
53-
else
54-
nvtxRangePushA(name.c_str());
60+
} else {
61+
nvtxRangePushA(c_str(name));
62+
}
5563
#else
5664
throw std::logic_error(
5765
"pushRange called with NVTX tracing, but compiled without CUDA");
@@ -65,6 +73,10 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*=
6573
}
6674
}
6775

76+
void pushRange(std::string name) {
77+
pushRangeImpl(std::move(name));
78+
}
79+
6880
void popRange() {
6981
if (state == ProfilerState::Disabled) {
7082
return;
@@ -79,45 +91,30 @@ void popRange() {
7991
} else {
8092
getEventList().record(
8193
EventKind::PopRange,
82-
std::string(),
94+
"",
8395
thread_id,
8496
state == ProfilerState::CUDA);
8597
}
8698
}
8799

88100
RecordFunction::RecordFunction(Function* fn) {
89-
if (state == ProfilerState::Disabled)
90-
return;
91-
pushFunctionRange(fn);
101+
// NB: we don't use fn->name() here, because it will unnecessarily allocate
102+
// a string. We will run a demangler on all the names anyway, so it's ok to
103+
// avoid doing it now.
104+
pushRangeImpl(typeid(*fn).name(), ", stashed seq=", fn->sequence_nr());
92105
}
93106

94107
RecordFunction::RecordFunction(std::string name) {
95-
if (state == ProfilerState::Disabled)
96-
return;
97-
pushRange(std::move(name));
108+
pushRangeImpl(std::move(name));
98109
}
99110

100111
RecordFunction::RecordFunction(const char* name) {
101-
if (state == ProfilerState::Disabled)
102-
return;
103-
pushRange(name);
112+
pushRangeImpl<const char*>(name);
104113
}
105114

106-
RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
115+
RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
107116
{
108-
if (state == ProfilerState::Disabled)
109-
return;
110-
pushRange(name, ", seq=", current_sequence_nr);
111-
}
112-
113-
RecordFunction::~RecordFunction() {
114-
if (state == ProfilerState::Disabled)
115-
return;
116-
popRange();
117-
}
118-
119-
void RecordFunction::pushFunctionRange(Function* fn) {
120-
pushRange(fn->name(), ", stashed seq=", fn->sequence_nr());
117+
pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
121118
}
122119

123120
#ifdef USE_CUDA

torch/csrc/autograd/profiler.h

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
#include "ATen/cuda/CUDAContext.h"
2222
#include <cuda_runtime.h>
2323
#endif
24+
#ifndef _WIN32
25+
#include <time.h>
26+
#endif
2427

2528
namespace torch { namespace autograd {
2629

@@ -32,36 +35,48 @@ constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
3235
return ((a + b - 1) / b) * b;
3336
}
3437

35-
inline uint64_t getTime() {
38+
inline int64_t getTime() {
39+
#ifdef _WIN32
3640
using namespace std::chrono;
3741
using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
3842
return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
43+
#else
44+
// clock_gettime is *much* faster than std::chrono implementation on Linux
45+
struct timespec t;
46+
clock_gettime(CLOCK_MONOTONIC, &t);
47+
return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
48+
#endif
3949
}
4050

41-
enum class EventKind {
51+
enum class EventKind : uint16_t {
4252
Mark,
4353
PushRange,
4454
PopRange
4555
};
4656

47-
struct Event {
48-
Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda)
49-
: kind_(kind)
50-
, name_(std::move(name))
51-
, thread_id_(thread_id) {
57+
struct Event final {
58+
Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
59+
: owned_name_(new std::string(std::move(name)))
60+
, name_ptr_(owned_name_->c_str())
61+
, kind_(kind)
62+
, thread_id_(thread_id) { record(record_cuda); }
63+
Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
64+
: name_ptr_(name)
65+
, kind_(kind)
66+
, thread_id_(thread_id) { record(record_cuda); }
67+
68+
void record(bool record_cuda) {
5269
#ifdef USE_CUDA
53-
if(record_cuda) {
70+
if (record_cuda) {
5471
TORCH_CUDA_CHECK(cudaGetDevice(&device_));
5572
TORCH_CUDA_CHECK(cudaEventCreate(&event));
5673
auto stream = at::cuda::getCurrentCUDAStream();
5774
cpu_ns_ = getTime();
5875
TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
59-
} else {
60-
cpu_ns_ = getTime();
76+
return;
6177
}
62-
#else
63-
cpu_ns_ = getTime();
6478
#endif
79+
cpu_ns_ = getTime();
6580
}
6681
std::string kind() const {
6782
switch(kind_) {
@@ -71,10 +86,10 @@ struct Event {
7186
}
7287
throw std::runtime_error("unknown EventKind");
7388
}
74-
const std::string & name() const {
75-
return name_;
89+
const char* name() const {
90+
return name_ptr_;
7691
}
77-
uint32_t thread_id() const {
92+
uint16_t thread_id() const {
7893
return thread_id_;
7994
}
8095
double cpu_elapsed_us(const Event & e) {
@@ -108,14 +123,18 @@ struct Event {
108123
return device_;
109124
}
110125
private:
111-
EventKind kind_;
112-
std::string name_;
113-
uint32_t thread_id_;
114126
int64_t cpu_ns_; // signed to allow for negative intervals
127+
// std::string is a very large object (usually around 32B),
128+
// and this field is used only for user-created ranges, so
129+
// it's better to save on size of Events.
130+
std::unique_ptr<std::string> owned_name_;
131+
const char * name_ptr_;
132+
EventKind kind_;
133+
uint16_t thread_id_;
134+
int device_ = -1;
115135
#ifdef USE_CUDA
116136
cudaEvent_t event = nullptr;
117137
#endif
118-
int device_ = -1;
119138
};
120139

121140
// a linked-list of fixed sized vectors, to avoid
@@ -132,7 +151,14 @@ struct RangeEventList {
132151

133152
void allocBlock() {
134153
blocks.emplace_front();
135-
blocks.front().reserve(num_block_elements);
154+
auto & new_block = blocks.front();
155+
new_block.reserve(num_block_elements);
156+
// Materialize all pages in the new block to release jitter when recording events.
157+
const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
158+
for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
159+
ptr < end_ptr; ptr += 4 * 1024) {
160+
(*ptr);
161+
}
136162
}
137163

138164
template<typename... Args>
@@ -166,7 +192,7 @@ enum class ProfilerState {
166192

167193
TORCH_API RangeEventList& getEventList();
168194
TORCH_API void mark(std::string name, bool include_cuda = true);
169-
TORCH_API void pushRange(std::string name, const char* msg = "", int64_t sequence_nr = -1);
195+
TORCH_API void pushRange(std::string name);
170196
TORCH_API void popRange();
171197

172198
struct TORCH_API RecordFunction {
@@ -178,10 +204,9 @@ struct TORCH_API RecordFunction {
178204

179205
explicit RecordFunction(const char* name, int64_t current_sequence_nr);
180206

181-
~RecordFunction();
182-
183-
// Needed only because we don't have Function defined yet.
184-
void pushFunctionRange(Function *fn);
207+
~RecordFunction() {
208+
popRange();
209+
}
185210
};
186211

187212
using thread_event_lists = std::vector<std::vector<Event>>;

0 commit comments

Comments (0)