Skip to content

Commit 8c3a94e

Browse files
apaszke authored and facebook-github-bot committed
Improve autograd profiler performance (#11773)
Summary: To illustrate the benefits of this commit, I'll use the time/iter I got from one of the JIT benchmarks on my machine.

| Run                                          | Time                    |
|----------------------------------------------|-------------------------|
| No profiler                                  | 45ms                    |
| With profiler                                | 56ms                    |
| Use `clock_gettime` instead of `std::chrono` | 48ms                    |
| Touch all pages on block allocation          | 48ms (less jitter)      |
| Use `const char*` instead of `std::string`   | 47ms (even less jitter) |

Pull Request resolved: #11773

Differential Revision: D9886858

Pulled By: apaszke

fbshipit-source-id: 58f926f09e95df0b11ec687763a72b06b66991d0
1 parent b3a2665 commit 8c3a94e

File tree

5 files changed

+85
-72
lines changed

5 files changed

+85
-72
lines changed

torch/autograd/profiler.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -428,22 +428,9 @@ def __repr__(self):
428428
################################################################################
429429
# Utilities
430430

431-
def demangle(name):
432-
"""Demangle a C++ identifier using c++filt"""
433-
try:
434-
with open(os.devnull, 'w') as devnull:
435-
is_win = sys.platform == 'win32'
436-
filt_cmd = ['undname', name] if is_win else ['c++filt', '-n', name]
437-
orig_name = subprocess.check_output(filt_cmd, stderr=devnull).rstrip().decode("ascii")
438-
orig_name = re.search('is :- \"(.*)"', orig_name).group(1) if is_win else orig_name
439-
return orig_name
440-
except (subprocess.CalledProcessError, AttributeError, FileNotFoundError, OSError):
441-
return name
442-
443-
444431
class StringTable(defaultdict):
445432
def __missing__(self, key):
446-
self[key] = demangle(key)
433+
self[key] = torch._C._demangle(key)
447434
return self[key]
448435

449436

@@ -526,7 +513,7 @@ def parse_nvprof_trace(path):
526513
# Parse strings table
527514
strings = {}
528515
for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
529-
strings[r["id"]] = demangle(r["value"])
516+
strings[r["id"]] = torch._C._demangle(r["value"])
530517

531518
# First, find all functions and create FunctionEvents for them
532519
marker_query = """

torch/csrc/Module.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,10 @@ static PyObject* initModule() {
605605
// setting up TH Errors so that they throw C++ exceptions
606606
at::init();
607607

608+
609+
py::reinterpret_borrow<py::module>(module)
610+
.def("_demangle", &at::demangle);
611+
608612
// Set ATen warnings to issue Python warnings
609613
at::Warning::set_warning_handler(&warning_handler);
610614

torch/csrc/autograd/init.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ PyObject * THPAutograd_initExtension(PyObject *_unused)
4545
m.def("_enable_profiler", torch::autograd::profiler::enableProfiler);
4646
m.def("_disable_profiler", torch::autograd::profiler::disableProfiler);
4747

48-
m.def("_push_range", [](const char* name) {
49-
torch::autograd::profiler::pushRange(name);
48+
m.def("_push_range", [](std::string name) {
49+
torch::autograd::profiler::pushRange(std::move(name));
5050
});
5151
m.def("_pop_range", []() { torch::autograd::profiler::popRange(); });
5252

torch/csrc/autograd/profiler.cpp

Lines changed: 27 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
namespace torch { namespace autograd { namespace profiler {
77

88
ProfilerState state = ProfilerState::Disabled;
9-
uint32_t next_thread_id = 0;
9+
uint16_t next_thread_id = 0;
1010
std::mutex all_event_lists_mutex;
1111
std::list<std::shared_ptr<RangeEventList>> all_event_lists;
1212
thread_local std::shared_ptr<RangeEventList> event_list;
13-
thread_local int32_t thread_id;
13+
thread_local uint16_t thread_id;
1414

1515
RangeEventList& getEventList() {
1616
if (!event_list) {
@@ -23,6 +23,9 @@ RangeEventList& getEventList() {
2323
}
2424

2525
void mark(std::string name, bool include_cuda /* = true */) {
26+
if (state == ProfilerState::Disabled) {
27+
return;
28+
}
2629
if (state == ProfilerState::NVTX) {
2730
#ifdef USE_CUDA
2831
nvtxMarkA(name.c_str());
@@ -39,7 +42,12 @@ void mark(std::string name, bool include_cuda /* = true */) {
3942
}
4043
}
4144

42-
void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*= -1*/) {
45+
const char* c_str(const char *str) { return str; }
46+
// NB: non-const to disallow temporaries (lifetime issues)
47+
const char* c_str(std::string& str) { return str.c_str(); }
48+
49+
template<typename T>
50+
void pushRangeImpl(T name, const char* msg="", int64_t sequence_nr=-1) {
4351
if (state == ProfilerState::Disabled) {
4452
return;
4553
}
@@ -49,9 +57,9 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*=
4957
std::stringstream s;
5058
s << name << msg << sequence_nr;
5159
nvtxRangePushA(s.str().c_str());
52-
}
53-
else
54-
nvtxRangePushA(name.c_str());
60+
} else {
61+
nvtxRangePushA(c_str(name));
62+
}
5563
#else
5664
throw std::logic_error(
5765
"pushRange called with NVTX tracing, but compiled without CUDA");
@@ -65,6 +73,10 @@ void pushRange(std::string name, const char* msg/*= ""*/, int64_t sequence_nr/*=
6573
}
6674
}
6775

76+
void pushRange(std::string name) {
77+
pushRangeImpl(std::move(name));
78+
}
79+
6880
void popRange() {
6981
if (state == ProfilerState::Disabled) {
7082
return;
@@ -79,45 +91,30 @@ void popRange() {
7991
} else {
8092
getEventList().record(
8193
EventKind::PopRange,
82-
std::string(),
94+
"",
8395
thread_id,
8496
state == ProfilerState::CUDA);
8597
}
8698
}
8799

88100
RecordFunction::RecordFunction(Function* fn) {
89-
if (state == ProfilerState::Disabled)
90-
return;
91-
pushFunctionRange(fn);
101+
// NB: we don't use fn->name() here, because it will unnecessarily allocate
102+
// a string. We will run a demangler on all the names anyway, so it's ok to
103+
// avoid doing it now.
104+
pushRangeImpl(typeid(*fn).name(), ", stashed seq=", fn->sequence_nr());
92105
}
93106

94107
RecordFunction::RecordFunction(std::string name) {
95-
if (state == ProfilerState::Disabled)
96-
return;
97-
pushRange(std::move(name));
108+
pushRangeImpl(std::move(name));
98109
}
99110

100111
RecordFunction::RecordFunction(const char* name) {
101-
if (state == ProfilerState::Disabled)
102-
return;
103-
pushRange(name);
112+
pushRangeImpl<const char*>(name);
104113
}
105114

106-
RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
115+
RecordFunction::RecordFunction(const char* name, int64_t current_sequence_nr)
107116
{
108-
if (state == ProfilerState::Disabled)
109-
return;
110-
pushRange(name, ", seq=", current_sequence_nr);
111-
}
112-
113-
RecordFunction::~RecordFunction() {
114-
if (state == ProfilerState::Disabled)
115-
return;
116-
popRange();
117-
}
118-
119-
void RecordFunction::pushFunctionRange(Function* fn) {
120-
pushRange(fn->name(), ", stashed seq=", fn->sequence_nr());
117+
pushRangeImpl<const char*>(name, ", seq=", current_sequence_nr);
121118
}
122119

123120
#ifdef USE_CUDA

torch/csrc/autograd/profiler.h

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
#include "ATen/cuda/CUDAContext.h"
2222
#include <cuda_runtime.h>
2323
#endif
24+
#ifndef _WIN32
25+
#include <time.h>
26+
#endif
2427

2528
namespace torch { namespace autograd {
2629

@@ -32,36 +35,48 @@ constexpr inline size_t ceilToMultiple(size_t a, size_t b) {
3235
return ((a + b - 1) / b) * b;
3336
}
3437

35-
inline uint64_t getTime() {
38+
inline int64_t getTime() {
39+
#ifdef _WIN32
3640
using namespace std::chrono;
3741
using clock = std::conditional<high_resolution_clock::is_steady, high_resolution_clock, steady_clock>::type;
3842
return duration_cast<nanoseconds>(clock::now().time_since_epoch()).count();
43+
#else
44+
// clock_gettime is *much* faster than std::chrono implementation on Linux
45+
struct timespec t;
46+
clock_gettime(CLOCK_MONOTONIC, &t);
47+
return static_cast<int64_t>(t.tv_sec) * 1000000000 + static_cast<int64_t>(t.tv_nsec);
48+
#endif
3949
}
4050

41-
enum class EventKind {
51+
enum class EventKind : uint16_t {
4252
Mark,
4353
PushRange,
4454
PopRange
4555
};
4656

47-
struct Event {
48-
Event(EventKind kind, std::string name, uint32_t thread_id, bool record_cuda)
49-
: kind_(kind)
50-
, name_(std::move(name))
51-
, thread_id_(thread_id) {
57+
struct Event final {
58+
Event(EventKind kind, std::string name, uint16_t thread_id, bool record_cuda)
59+
: owned_name_(new std::string(std::move(name)))
60+
, name_ptr_(owned_name_->c_str())
61+
, kind_(kind)
62+
, thread_id_(thread_id) { record(record_cuda); }
63+
Event(EventKind kind, const char* name, uint16_t thread_id, bool record_cuda)
64+
: name_ptr_(name)
65+
, kind_(kind)
66+
, thread_id_(thread_id) { record(record_cuda); }
67+
68+
void record(bool record_cuda) {
5269
#ifdef USE_CUDA
53-
if(record_cuda) {
70+
if (record_cuda) {
5471
TORCH_CUDA_CHECK(cudaGetDevice(&device_));
5572
TORCH_CUDA_CHECK(cudaEventCreate(&event));
5673
auto stream = at::cuda::getCurrentCUDAStream();
5774
cpu_ns_ = getTime();
5875
TORCH_CUDA_CHECK(cudaEventRecord(event, stream));
59-
} else {
60-
cpu_ns_ = getTime();
76+
return;
6177
}
62-
#else
63-
cpu_ns_ = getTime();
6478
#endif
79+
cpu_ns_ = getTime();
6580
}
6681
std::string kind() const {
6782
switch(kind_) {
@@ -71,10 +86,10 @@ struct Event {
7186
}
7287
throw std::runtime_error("unknown EventKind");
7388
}
74-
const std::string & name() const {
75-
return name_;
89+
const char* name() const {
90+
return name_ptr_;
7691
}
77-
uint32_t thread_id() const {
92+
uint16_t thread_id() const {
7893
return thread_id_;
7994
}
8095
double cpu_elapsed_us(const Event & e) {
@@ -108,14 +123,18 @@ struct Event {
108123
return device_;
109124
}
110125
private:
111-
EventKind kind_;
112-
std::string name_;
113-
uint32_t thread_id_;
114126
int64_t cpu_ns_; // signed to allow for negative intervals
127+
// std::string is a very large object (usually around 32B),
128+
// and this field is used only for user-created ranges, so
129+
// it's better to save on size of Events.
130+
std::unique_ptr<std::string> owned_name_;
131+
const char * name_ptr_;
132+
EventKind kind_;
133+
uint16_t thread_id_;
134+
int device_ = -1;
115135
#ifdef USE_CUDA
116136
cudaEvent_t event = nullptr;
117137
#endif
118-
int device_ = -1;
119138
};
120139

121140
// a linked-list of fixed sized vectors, to avoid
@@ -132,7 +151,14 @@ struct RangeEventList {
132151

133152
void allocBlock() {
134153
blocks.emplace_front();
135-
blocks.front().reserve(num_block_elements);
154+
auto & new_block = blocks.front();
155+
new_block.reserve(num_block_elements);
156+
// Materialize all pages in the new block to release jitter when recording events.
157+
const char * const end_ptr = reinterpret_cast<char*>(new_block.data() + num_block_elements);
158+
for (volatile const char * ptr = reinterpret_cast<char*>(new_block.data());
159+
ptr < end_ptr; ptr += 4 * 1024) {
160+
(*ptr);
161+
}
136162
}
137163

138164
template<typename... Args>
@@ -166,7 +192,7 @@ enum class ProfilerState {
166192

167193
TORCH_API RangeEventList& getEventList();
168194
TORCH_API void mark(std::string name, bool include_cuda = true);
169-
TORCH_API void pushRange(std::string name, const char* msg = "", int64_t sequence_nr = -1);
195+
TORCH_API void pushRange(std::string name);
170196
TORCH_API void popRange();
171197

172198
struct TORCH_API RecordFunction {
@@ -178,10 +204,9 @@ struct TORCH_API RecordFunction {
178204

179205
explicit RecordFunction(const char* name, int64_t current_sequence_nr);
180206

181-
~RecordFunction();
182-
183-
// Needed only because we don't have Function defined yet.
184-
void pushFunctionRange(Function *fn);
207+
~RecordFunction() {
208+
popRange();
209+
}
185210
};
186211

187212
using thread_event_lists = std::vector<std::vector<Event>>;

0 commit comments

Comments (0)