Skip to content

Commit 91b1bae

Browse files
zdevito authored and pytorchmergebot committed
Caching allocator tracing (#86241)
We currently can take snapshots of the state of the allocated cuda memory, but we do not have a way to correlate these snapshots with the actions the allocator that were taken between snapshots. This PR adds a simple fixed-sized buffer that records the major actions that the allocator takes (ALLOC, FREE, SEGMENT_ALLOC, SEGMENT_FREE, OOM, SNAPSHOT) and includes these with the snapshot information. Capturing period snapshots with a big enough trace buffer makes it possible to see how the allocator state changes over time. We plan to use this functionality to guide how settings in the allocator can be adjusted and eventually have a more robust overall algorithm. As a component of this functionality, we also add the ability to get a callback when the allocator will throw an OOM, primarily so that snapshots can be taken immediately to see why the program ran out of memory (most programs have some C++ state that would free tensors before the OutOfMemory exception can be caught). This PR also updates the _memory_viz.py script to pretty-print the trace information and provide a better textual summary of snapshots distinguishing between internal and external fragmentation. Pull Request resolved: #86241 Approved by: https://github.com/ngimel
1 parent 8a3a54e commit 91b1bae

File tree

9 files changed

+694
-135
lines changed

9 files changed

+694
-135
lines changed

c10/cuda/CUDACachingAllocator.cpp

Lines changed: 182 additions & 29 deletions
Large diffs are not rendered by default.

c10/cuda/CUDACachingAllocator.h

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,12 @@ struct Context {
9898
virtual ~Context() {}
9999
};
100100

101-
typedef std::unique_ptr<Context> (*CreateContextFn)(void);
101+
typedef std::shared_ptr<Context> (*CreateContextFn)(void);
102102

103103
struct History {
104104
void* addr;
105105
size_t real_size; // unrounded, actually requested size
106-
std::unique_ptr<Context> context; // per-watcher context
107-
std::unique_ptr<History> next; // when blocks are merged we keep records of
108-
// what used to be in the block
106+
std::shared_ptr<Context> context; // per-watcher context
109107
};
110108

111109
// Struct containing info of an allocation block (i.e. a fractional part of a
@@ -115,8 +113,7 @@ struct BlockInfo {
115113
int32_t gc_counter = 0;
116114
bool allocated = false;
117115
bool active = false;
118-
History* history =
119-
nullptr; // borrowed reference because it is owned by the allocator
116+
std::vector<History> history;
120117
};
121118

122119
// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
@@ -131,6 +128,44 @@ struct SegmentInfo {
131128
std::vector<BlockInfo> blocks;
132129
};
133130

131+
struct TraceEntry {
132+
enum Action {
133+
ALLOC, // API made to the caching allocator for new memory
134+
FREE_REQUESTED, // API call made to the caching allocator to free memory
135+
FREE_COMPLETED, // The allocator might have to delay a free because
136+
// it is still in use on another stream via record_stream
137+
// This event is generated when a free actually completes.
138+
SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
139+
SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
140+
// defragment or empty_caches)
141+
SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
142+
// events
143+
OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
144+
// bytes reported by cuda)
145+
};
146+
TraceEntry(
147+
Action action,
148+
int64_t addr,
149+
size_t size,
150+
cudaStream_t stream,
151+
std::shared_ptr<Context> context = nullptr)
152+
: action_(action),
153+
addr_(addr),
154+
context_(context),
155+
stream_(stream),
156+
size_(size) {}
157+
Action action_;
158+
int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
159+
std::shared_ptr<Context> context_;
160+
cudaStream_t stream_;
161+
int64_t size_;
162+
};
163+
164+
struct SnapshotInfo {
165+
std::vector<SegmentInfo> segments;
166+
std::vector<std::vector<TraceEntry>> device_traces;
167+
};
168+
134169
C10_CUDA_API void* raw_alloc(size_t nbytes);
135170
C10_CUDA_API void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream);
136171
C10_CUDA_API void raw_delete(void* ptr);
@@ -149,7 +184,7 @@ C10_CUDA_API void recordStream(const DataPtr&, CUDAStream stream);
149184
C10_CUDA_API DeviceStats getDeviceStats(int device);
150185
C10_CUDA_API void resetAccumulatedStats(int device);
151186
C10_CUDA_API void resetPeakStats(int device);
152-
C10_CUDA_API std::vector<SegmentInfo> snapshot();
187+
C10_CUDA_API SnapshotInfo snapshot();
153188

154189
// CUDAGraph interactions
155190
C10_CUDA_API void notifyCaptureBegin(
@@ -161,7 +196,17 @@ C10_CUDA_API void notifyCaptureDestroy(int device, MempoolId_t mempool_id);
161196

162197
C10_CUDA_API std::mutex* getFreeMutex();
163198

164-
C10_CUDA_API void setContextRecorder(CreateContextFn recorder);
199+
C10_CUDA_API void recordHistory(
200+
bool enabled,
201+
CreateContextFn context_recorder,
202+
size_t alloc_trace_max_entries,
203+
bool alloc_trace_record_context);
204+
using OutOfMemoryObserver = std::function<void(
205+
int64_t device,
206+
int64_t allocated,
207+
int64_t device_total,
208+
int64_t device_free)>;
209+
C10_CUDA_API void attachOutOfMemoryObserver(OutOfMemoryObserver observer);
165210

166211
C10_CUDA_API std::shared_ptr<void> getIpcDevPtr(std::string handle);
167212
} // namespace CUDACachingAllocator

test/test_cuda.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4619,20 +4619,26 @@ def test_memory_snapshot(self):
46194619

46204620
ss = torch.cuda.memory._snapshot()
46214621
found_it = False
4622-
for seg in ss:
4622+
for seg in ss['segments']:
46234623
for b in seg['blocks']:
46244624
if 'history' in b:
46254625
for h in b['history']:
46264626
if h['real_size'] == 311 * 411 * 4:
46274627
self.assertTrue('test_cuda' in h['frames'][0]['filename'])
46284628
found_it = True
46294629
self.assertTrue(found_it)
4630+
46304631
if not IS_WINDOWS:
46314632
with tempfile.NamedTemporaryFile() as f:
46324633
torch.cuda.memory._save_segment_usage(f.name)
46334634
with open(f.name, 'r') as f2:
46344635
self.assertTrue('test_cuda.py' in f2.read())
46354636

4637+
del x
4638+
torch.cuda.empty_cache()
4639+
ss = torch.cuda.memory._snapshot()
4640+
self.assertTrue(ss['device_traces'][0][-1]['action'] == 'segment_free')
4641+
46364642
finally:
46374643
torch.cuda.memory._record_memory_history(False)
46384644

@@ -4643,7 +4649,7 @@ def test_memory_snapshot_with_cpp(self):
46434649
torch.cuda.memory._record_memory_history(True, _enable_expensive_cpp=True)
46444650
x = torch.rand(311, 411, device='cuda')
46454651

4646-
ss = torch.cuda.memory._snapshot()
4652+
ss = torch.cuda.memory._snapshot()['segments']
46474653
found_it = False
46484654
for seg in ss:
46494655
for b in seg['blocks']:
@@ -4734,16 +4740,31 @@ def test_cpp_memory_snapshot_pickle(self):
47344740
t = torch.rand(311, 411, device='cuda')
47354741
mem = pickle.loads(m.do_snapshot())
47364742
found = False
4737-
for s in mem:
4743+
for s in mem['segments']:
47384744
for b in s['blocks']:
47394745
if b['state'] == 'active_allocated' and 'history' in b:
47404746
history = b['history']
47414747
if history and history[0]['real_size'] == 311 * 411 * 4:
47424748
found = True
4749+
last_action = mem['device_traces'][0][-1]
4750+
self.assertTrue(last_action['action'] == 'alloc')
4751+
self.assertTrue(last_action['size'] == 311 * 411 * 4)
47434752
self.assertTrue(found)
47444753
finally:
47454754
m.record(False)
47464755

4756+
def test_notifies_oom(self):
4757+
x = False
4758+
4759+
def cb(device, alloc, device_alloc, device_free):
4760+
nonlocal x
4761+
x = True
4762+
torch._C._cuda_attach_out_of_memory_observer(cb)
4763+
with self.assertRaises(torch.cuda.OutOfMemoryError):
4764+
torch.empty(1024 * 1024 * 1024 * 1024, device='cuda')
4765+
self.assertTrue(x)
4766+
4767+
47474768
instantiate_parametrized_tests(TestCuda)
47484769

47494770
if __name__ == '__main__':

torch/_C/__init__.pyi.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,8 +1178,8 @@ def _cuda_emptyCache() -> None: ...
11781178
def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ...
11791179
def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ...
11801180
def _cuda_resetPeakMemoryStats(device: _int) -> None: ...
1181-
def _cuda_memorySnapshot() -> List[Dict[str, Any]]: ...
1182-
def _cuda_recordMemoryHistory(enabled: _bool, cpp: _bool) -> None: ...
1181+
def _cuda_memorySnapshot() -> Dict[str, Any]: ...
1182+
def _cuda_recordMemoryHistory(enabled: _bool, record_context: _bool, record_context_cpp: _bool, alloc_trace_max_entries: _int, alloc_trace_record_context: _bool) -> None: ...
11831183
def _cuda_lock_mutex() -> None: ...
11841184
def _cuda_unlock_mutex() -> None: ...
11851185
def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ...

0 commit comments

Comments
 (0)