Commit 9903d85

Update on "Caching allocator tracing"
We can currently take snapshots of the state of allocated CUDA memory, but we have no way to correlate these snapshots with the actions the allocator took between them. This PR adds a simple fixed-size buffer that records the major actions the allocator takes (ALLOC, FREE, SEGMENT_ALLOC, SEGMENT_FREE, OOM, SNAPSHOT) and includes these with the snapshot information. Capturing periodic snapshots with a big enough trace buffer makes it possible to see how the allocator state changes over time.

We plan to use this functionality to guide how settings in the allocator can be adjusted and, eventually, to build a more robust overall algorithm.

As a component of this functionality, we also add the ability to get a callback when the allocator is about to throw an OOM, primarily so that snapshots can be taken immediately to see why the program ran out of memory (most programs have some C++ state that would free tensors before the OutOfMemory exception can be caught).

This PR also updates the _memory_viz.py script to pretty-print the trace information and to provide a better textual summary of snapshots, distinguishing between internal and external fragmentation.

[ghstack-poisoned]
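The fixed-size trace buffer described above can be illustrated with a minimal Python sketch. All names here are hypothetical; the real buffer lives inside the C++ CUDA caching allocator and records richer per-action data:

```python
from collections import deque

# Action names taken from the commit message above.
ACTIONS = ("ALLOC", "FREE", "SEGMENT_ALLOC", "SEGMENT_FREE", "OOM", "SNAPSHOT")

class TraceBuffer:
    """Hypothetical sketch of a fixed-size allocator action trace."""

    def __init__(self, max_entries):
        # A deque with maxlen drops the oldest entry once full, so the
        # buffer always holds the most recent allocator actions.
        self.entries = deque(maxlen=max_entries)

    def record(self, action, addr, size):
        assert action in ACTIONS
        self.entries.append((action, addr, size))

    def snapshot(self):
        # A snapshot marker goes into the trace itself, so successive
        # snapshots can be correlated with the actions between them.
        self.record("SNAPSHOT", 0, 0)
        return list(self.entries)

buf = TraceBuffer(max_entries=4)
buf.record("ALLOC", 0x1000, 512)
buf.record("FREE", 0x1000, 512)
trace = buf.snapshot()  # [ALLOC, FREE, SNAPSHOT]
```

The key design point mirrored here is that the buffer is bounded: with a large enough `max_entries`, every action between two snapshots is retained; with a small one, only the most recent actions survive.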
2 parents 06a84d7 + d255818 commit 9903d85

File tree

2 files changed: 18 additions, 9 deletions

torch/csrc/cuda/Module.cpp

Lines changed: 16 additions & 8 deletions
```diff
@@ -646,7 +646,7 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) {
       history_entry[addr_s] = (int64_t)h.addr;
       history_entry[real_size_s] = h.real_size;
       if (h.context) {
-        auto sc = (StackContext*) h.context.get();
+        auto sc = (StackContext*)h.context.get();
         history_entry[frames_s] = get_frames(sc);
         if (!sc->cpp_frames.empty()) {
           history_entry[cpp_frames_s] = py::cast(sc->cpp_frames);
@@ -829,13 +829,21 @@ static void registerCudaDeviceProperties(PyObject* module) {
         return stream.str();
       });

-  m.def("_cuda_recordMemoryHistory", [](bool enabled, bool record_context, bool record_context_cpp, Py_ssize_t alloc_trace_max_entries, bool alloc_trace_record_context) {
-    c10::cuda::CUDACachingAllocator::recordHistory(
-        enabled,
-        record_context ? (record_context_cpp ? StackContext::gather_with_cpp : StackContext::gather) : nullptr,
-        alloc_trace_max_entries,
-        alloc_trace_record_context);
-  });
+  m.def(
+      "_cuda_recordMemoryHistory",
+      [](bool enabled,
+         bool record_context,
+         bool record_context_cpp,
+         Py_ssize_t alloc_trace_max_entries,
+         bool alloc_trace_record_context) {
+        c10::cuda::CUDACachingAllocator::recordHistory(
+            enabled,
+            record_context ? (record_context_cpp ? StackContext::gather_with_cpp
+                                                 : StackContext::gather)
+                           : nullptr,
+            alloc_trace_max_entries,
+            alloc_trace_record_context);
+      });
 }

 static void bindGetDeviceProperties(PyObject* module) {
```
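The binding passes a context-gathering callback to `recordHistory` only when `record_context` is set, and gathers C++ frames only on top of that. A minimal Python sketch of that nested-ternary selection logic (the stub functions stand in for `StackContext::gather` and `StackContext::gather_with_cpp`):

```python
# Stubs standing in for the C++ StackContext gather functions.
def gather():
    return ["python-frames"]

def gather_with_cpp():
    return ["python-frames", "cpp-frames"]

def pick_context_recorder(record_context, record_context_cpp):
    """Mirror of the selection in the _cuda_recordMemoryHistory binding."""
    if not record_context:
        return None  # corresponds to passing nullptr: no stack capture
    return gather_with_cpp if record_context_cpp else gather
```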

torch/cuda/memory.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -615,7 +615,8 @@ def _record_memory_history(enabled: bool, record_context=True,
     stack trace collection; file an issue with us if you need it.
     """
     with torch.cuda.device(device):
-        _C._cuda_recordMemoryHistory(enabled, record_context, _enable_expensive_cpp, trace_alloc_max_entries, trace_alloc_record_context)
+        _C._cuda_recordMemoryHistory(enabled, record_context, _enable_expensive_cpp,
+                                     trace_alloc_max_entries, trace_alloc_record_context)

 def _snapshot(device: Union[Device, int] = None):
     with torch.cuda.device(device):
```
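Once recording is enabled, snapshots carry the trace alongside segment state, which is what `_memory_viz.py` pretty-prints. A rough sketch of how such trace entries could be summarized, using a hypothetical `(action, size)` entry format (the real snapshot schema from `_snapshot()` is richer and may differ):

```python
from collections import Counter

def summarize_trace(trace):
    """Count actions and compute net bytes allocated from (action, size) pairs."""
    counts = Counter(action for action, _ in trace)
    net = sum(size if action == "ALLOC" else -size
              for action, size in trace
              if action in ("ALLOC", "FREE"))
    return counts, net

trace = [("SEGMENT_ALLOC", 2 << 20),
         ("ALLOC", 1024),
         ("ALLOC", 2048),
         ("FREE", 1024)]
counts, net = summarize_trace(trace)
# counts["ALLOC"] == 2; net allocated bytes == 2048
```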
