Commit 95a69a7

Natalia Gimelshein authored and facebook-github-bot committed
adds list_gpu_processes function (#44616)
Summary: per title, to make it easier to track the creation of stray contexts:

```
python -c "import torch; a=torch.randn(1, device='cuda'); print(torch.cuda.memory.list_gpu_processes(0)); print(torch.cuda.memory.list_gpu_processes(1))"
GPU:0
process      79749 uses      601.000 MB GPU memory
GPU:1
no processes are running
```

Pull Request resolved: #44616
Reviewed By: mruberry
Differential Revision: D23675739
Pulled By: ngimel
fbshipit-source-id: ffa14cad9d7144e883de13b1c2c6817bd432f53a
1 parent 105132b commit 95a69a7

File tree

1 file changed (+35, -0 lines)

torch/cuda/memory.py

Lines changed: 35 additions & 0 deletions
```diff
@@ -467,3 +467,38 @@ def _format_count(cnt, pref_cnt):
     for k, v in stats.items():
         fmt_dict[k.replace(".", "-")] = v
     return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
+
+
+def list_gpu_processes(device: Union[Device, int] = None) -> str:
+    r"""Returns a human-readable printout of the running processes
+    and their GPU memory use for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Arguments:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    try:
+        import pynvml  # type: ignore
+    except ModuleNotFoundError:
+        return("pynvml module not found, please install pynvml")
+    from pynvml import NVMLError_DriverNotLoaded
+    try:
+        pynvml.nvmlInit()
+    except NVMLError_DriverNotLoaded:
+        return ("cuda driver can't be loaded, is cuda enabled?")
+    device = _get_device_index(device, optional=True)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        mem = p.usedGpuMemory / (1024 * 1024)
+        lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
```
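The per-process line in the patch converts NVML's byte count to megabytes and pads the fields with fixed f-string widths. A minimal sketch of that formatting step, factored out so it runs without a GPU or pynvml (the helper name `format_gpu_process_line` is hypothetical, not part of torch):

```python
def format_gpu_process_line(pid: int, used_bytes: int) -> str:
    """Format one process line the way the patch does.

    Mirrors the f-string in list_gpu_processes: pid right-aligned to 10
    columns, memory in MB right-aligned to 12 columns with 3 decimals.
    """
    # 1 MB = 1024 * 1024 bytes, matching the division in the patch.
    mem = used_bytes / (1024 * 1024)
    return f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory"


# Reproduces the sample line from the commit summary (601 MB for pid 79749).
print(format_gpu_process_line(79749, 601 * 1024 * 1024))
```

The fixed widths keep the columns aligned when several processes are listed, which is why the function pads rather than printing the raw numbers.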
