
Commit 8e33451

ssnl authored and facebook-github-bot committed
Make torch.cuda.* take device objects; Update distributed docs (#10833)
Summary:
Commits:
1. Make `torch.cuda.*` take device objects
2. Update `torch.distributed` docs to emphasize calling `torch.cuda.set_device` before `init_process_group`

Pull Request resolved: #10833
Differential Revision: D9514241
Pulled By: SsnL
fbshipit-source-id: 2497464305fb1e63d6c495291a5744aaa7e2696e
1 parent 58b145f commit 8e33451
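With this change, `torch.cuda.*` functions that previously accepted only integer device indices also accept `torch.device` objects (and, where documented, ``None`` for the current device). A minimal sketch of the updated calling convention; the device index used here is illustrative:

    import torch

    dev = torch.device('cuda:0')
    torch.cuda.set_device(dev)                      # previously required an int index
    name = torch.cuda.get_device_name(dev)
    major, minor = torch.cuda.get_device_capability(dev)
    allocated = torch.cuda.memory_allocated(dev)    # same as torch.cuda.memory_allocated(0)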

File tree: 8 files changed (+107, −62 lines)


docs/source/distributed.rst

Lines changed: 6 additions & 6 deletions
@@ -88,8 +88,8 @@ TCP initialization

 There are two ways to initialize using TCP, both requiring a network address
 reachable from all processes and a desired ``world_size``. The first way
-requires specifying an address that belongs to the rank 0 process. This first way of
-initialization requires that all processes have manually specified ranks.
+requires specifying an address that belongs to the rank 0 process. This
+initialization method requires that all processes have manually specified ranks.

 Alternatively, the address has to be a valid IP multicast address, in which case
 ranks can be assigned automatically. Multicast initialization also supports
@@ -101,10 +101,10 @@ jobs, as long as they use different group names.
 import torch.distributed as dist

 # Use address of one of the machines
-dist.init_process_group(init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4)
+dist.init_process_group(backend, init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4)

 # or a multicast address - rank will be assigned automatically if unspecified
-dist.init_process_group(init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456',
+dist.init_process_group(backend, init_method='tcp://[ff15:1e18:5d4c:4cf0:d02d:b659:53ba:b0a7]:23456',
                         world_size=4)

 Shared file-system initialization
@@ -126,8 +126,8 @@ multiple jobs, as long as they use different group names.
 import torch.distributed as dist

 # Rank will be assigned automatically if unspecified
-dist.init_process_group(init_method='file:///mnt/nfs/sharedfile', world_size=4,
-                        group_name=args.group)
+dist.init_process_group(backend, init_method='file:///mnt/nfs/sharedfile',
+                        world_size=4, group_name=args.group)

 Environment variable initialization
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
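Taken together with the commit summary, the recommended pattern is to pass the backend explicitly and to pin each process to its GPU with `torch.cuda.set_device` before `init_process_group`. A hedged sketch of that ordering, assuming one GPU per process and an illustrative backend, address, and `args` namespace:

    import torch
    import torch.distributed as dist

    # select this process's GPU *before* initializing the process group
    torch.cuda.set_device(torch.device('cuda', args.local_rank))

    dist.init_process_group('gloo',                   # backend is now an explicit argument
                            init_method='tcp://10.1.1.20:23456',
                            rank=args.rank, world_size=4)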

docs/source/index.rst

Lines changed: 1 addition & 1 deletion
@@ -31,9 +31,9 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
 nn
 optim
 torch.autograd <autograd>
+torch.distributed <distributed>
 torch.distributions <distributions>
 torch.multiprocessing <multiprocessing>
-torch.distributed <distributed>
 bottleneck
 checkpoint
 cpp_extension

test/test_cuda.py

Lines changed: 2 additions & 2 deletions
@@ -773,7 +773,7 @@ def advance(gen, end):
         # interlace
         torch.cuda.empty_cache()
         gen0 = self._test_memory_stats_generator(self, device=0, N=35)
-        gen1 = self._test_memory_stats_generator(self, device=1, N=35)
+        gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
         end0 = end1 = False
         while not (end0 and end1):
             end0 = advance(gen0, end0)
@@ -782,7 +782,7 @@ def advance(gen, end):
         # semi-random order
         torch.cuda.empty_cache()
         gen0 = self._test_memory_stats_generator(self, device=0, N=35)
-        gen1 = self._test_memory_stats_generator(self, device=1, N=35)
+        gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
         end0 = end1 = False

         while not (end0 and end1):
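The test change exercises the point that an integer index and the corresponding `torch.device` now behave interchangeably. A rough standalone check, assuming at least two visible GPUs:

    import torch

    a = torch.cuda.memory_allocated(1)
    b = torch.cuda.memory_allocated(torch.device('cuda:1'))
    assert a == b   # both forms report stats for the same physical device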

torch/cuda/__init__.py

Lines changed: 35 additions & 34 deletions
@@ -18,6 +18,7 @@
 from torch._six import raise_from
 from subprocess import Popen, PIPE
 from multiprocessing.util import register_after_fork as _register_after_fork
+from ._utils import _get_device_index

 _initialized = False
 _queued_calls = []  # don't invoke these until initialization occurs
@@ -211,12 +212,12 @@ class device(object):
     r"""Context-manager that changes the selected device.

     Arguments:
-        idx (int): device index to select. It's a no-op if this argument
-            is negative.
+        device (torch.device or int): device index to select. It's a no-op if
+            this argument is a negative integer or ``None``.
     """

-    def __init__(self, idx):
-        self.idx = int(idx)
+    def __init__(self, device):
+        self.idx = _get_device_index(device, optional=True)
         self.prev_idx = -1

     def __enter__(self):
@@ -255,9 +256,10 @@ def set_device(device):
     cases it's better to use ``CUDA_VISIBLE_DEVICES`` environmental variable.

     Arguments:
-        device (int): selected device. This function is a no-op if this
-            argument is negative.
+        device (torch.device or int): selected device. This function is a no-op
+            if this argument is negative.
     """
+    device = _get_device_index(device)
     if device >= 0:
         torch._C._cuda_setDevice(device)
@@ -266,8 +268,10 @@ def get_device_name(device):
     r"""Gets the name of a device.

     Arguments:
-        device (int): device for which to return the name. This function is a
-            no-op if this argument is negative.
+        device (torch.device or int, optional): device for which to return the
+            name. This function is a no-op if this argument is a negative
+            integer. Uses the current device, given by :meth:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
     """
     return get_device_properties(device).name
@@ -276,8 +280,12 @@ def get_device_capability(device):
     r"""Gets the cuda capability of a device.

     Arguments:
-        device (int): device for which to return the name. This function is a
-            no-op if this argument is negative.
+        device (torch.device or int, optional): device for which to return the
+            device capability. This function is a no-op if this argument is
+            a negative integer. Uses the current device, given by
+            :meth:`~torch.cuda.current_device`, if :attr:`device` is ``None``
+            (default).
+
     Returns:
         tuple(int, int): the major and minor cuda capability of the device
     """
@@ -288,6 +296,7 @@ def get_device_capability(device):
 def get_device_properties(device):
     if not _initialized:
         init()  # will define _get_device_properties and _CudaDeviceProperties
+    device = _get_device_index(device, optional=True)
     if device < 0 or device >= device_count():
         raise AssertionError("Invalid device id")
     return _get_device_properties(device)
@@ -370,19 +379,17 @@ def memory_allocated(device=None):
     device.

     Arguments:
-        device (int, optional): selected device. Returns statistic for the
-            current device, given by
-            :meth:`~torch.cuda.current_device`, if
-            :attr:`device` is ``None`` (default).
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :meth:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).

     .. note::
         This is likely less than the amount shown in `nvidia-smi` since some
         unused memory can be held by the caching allocator and some context
         needs to be created on GPU. See :ref:`cuda-memory-management` for more
         details about GPU memory management.
     """
-    if device is None:
-        device = current_device()
+    device = _get_device_index(device, optional=True)
     return torch._C._cuda_memoryAllocated(device)
@@ -391,17 +398,15 @@ def max_memory_allocated(device=None):
     device.

     Arguments:
-        device (int, optional): selected device. Returns statistic for the
-            current device, given by
-            :meth:`~torch.cuda.current_device`, if
-            :attr:`device` is ``None`` (default).
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :meth:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).

     .. note::
         See :ref:`cuda-memory-management` for more details about GPU memory
         management.
     """
-    if device is None:
-        device = current_device()
+    device = _get_device_index(device, optional=True)
     return torch._C._cuda_maxMemoryAllocated(device)
@@ -410,17 +415,15 @@ def memory_cached(device=None):
     for a given device.

     Arguments:
-        device (int, optional): selected device. Returns statistic for the
-            current device, given by
-            :meth:`~torch.cuda.current_device`, if
-            :attr:`device` is ``None`` (default).
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :meth:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).

     .. note::
         See :ref:`cuda-memory-management` for more details about GPU memory
         management.
     """
-    if device is None:
-        device = current_device()
+    device = _get_device_index(device, optional=True)
     return torch._C._cuda_memoryCached(device)
@@ -429,17 +432,15 @@ def max_memory_cached(device=None):
     for a given device.

     Arguments:
-        device (int, optional): selected device. Returns statistic for the
-            current device, given by
-            :meth:`~torch.cuda.current_device`, if
-            :attr:`device` is ``None`` (default).
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :meth:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).

     .. note::
         See :ref:`cuda-memory-management` for more details about GPU memory
         management.
     """
-    if device is None:
-        device = current_device()
+    device = _get_device_index(device, optional=True)
     return torch._C._cuda_maxMemoryCached(device)
445446

torch/cuda/_utils.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+import torch
+
+
+def _get_device_index(device, optional=False):
+    r"""Gets the device index from :attr:`device`, which can be a torch.device
+    object, a Python integer, or ``None``.
+
+    If :attr:`device` is a torch.device object, returns the device index if it
+    is a CUDA device. Note that for a CUDA device without a specified index, i.e.,
+    ``torch.device('cuda')``, this will return the current default CUDA device if
+    :attr:`optional` is ``True``.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If :attr:`device` is ``None``, this will return the current default CUDA
+    device if :attr:`optional` is ``True``.
+    """
+    if isinstance(device, torch.device):
+        dev_type = device.type
+        if device.type != 'cuda':
+            raise ValueError('Expected a cuda device, but got: {}'.format(device))
+        device_idx = device.index
+    else:
+        device_idx = device
+    if device_idx is None:
+        if optional:
+            # default cuda device index
+            return torch.cuda.current_device()
+        else:
+            raise ValueError('Expected a cuda device with a specified index or '
+                             'an integer, but got: {}'.format(device))
+    return device_idx
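The helper is private, but a few illustrative calls show the resolution rules implemented above (device indices are examples only):

    import torch
    from torch.cuda._utils import _get_device_index

    _get_device_index(torch.device('cuda:1'))                # -> 1
    _get_device_index(3)                                     # integers pass through -> 3
    _get_device_index(None, optional=True)                   # -> torch.cuda.current_device()
    _get_device_index(torch.device('cuda'), optional=True)   # unindexed cuda device -> current device
    # _get_device_index(torch.device('cpu'))                 # raises ValueError: expected a cuda device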

torch/cuda/streams.py

Lines changed: 18 additions & 15 deletions
@@ -1,27 +1,30 @@
 import ctypes
 import torch
 from . import cudart, check_error, cudaStatus
+from ._utils import _get_device_index


 class Stream(torch._C._CudaStreamBase):
-    """Wrapper around a CUDA stream.
+    r"""Wrapper around a CUDA stream.

     A CUDA stream is a linear sequence of execution that belongs to a specific
     device, independent from other streams. See :ref:`cuda-semantics` for
     details.

     Arguments:
-        device(int, optional): a device on which to allocate the Stream.
+        device(torch.device or int, optional): a device on which to allocate
+            the stream. If :attr:`device` is ``None`` (default) or a negative
+            integer, this will use the current device.
         priority(int, optional): priority of the stream. Lower numbers
                                  represent higher priorities.
     """

-    def __new__(cls, device=-1, priority=0, **kwargs):
+    def __new__(cls, device=None, priority=0, **kwargs):
         with torch.cuda.device(device):
             return super(Stream, cls).__new__(cls, priority=priority, **kwargs)

     def wait_event(self, event):
-        """Makes all future work submitted to the stream wait for an event.
+        r"""Makes all future work submitted to the stream wait for an event.

         Arguments:
             event (Event): an event to wait for.
@@ -38,7 +41,7 @@ def wait_event(self, event):
         check_error(cudart().cudaStreamWaitEvent(self, event, ctypes.c_int(0)))

     def wait_stream(self, stream):
-        """Synchronizes with another stream.
+        r"""Synchronizes with another stream.

         All future work submitted to this stream will wait until all kernels
         submitted to a given stream at the time of call complete.
@@ -52,7 +55,7 @@ def wait_stream(self, stream):
         self.wait_event(stream.record_event())

     def record_event(self, event=None):
-        """Records an event.
+        r"""Records an event.

         Arguments:
             event (Event, optional): event to record. If not given, a new one
@@ -67,7 +70,7 @@ def record_event(self, event=None):
         return event

     def query(self):
-        """Checks if all the work submitted has been completed.
+        r"""Checks if all the work submitted has been completed.

         Returns:
             A boolean indicating if all kernels in this stream are completed.
@@ -79,7 +82,7 @@ def query(self):
         return True

     def synchronize(self):
-        """Wait for all the kernels in this stream to complete.
+        r"""Wait for all the kernels in this stream to complete.

         .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see
            `CUDA documentation`_ for more info.
@@ -126,7 +129,7 @@ class EventHandle(ctypes.Structure):


 class Event(object):
-    """Wrapper around CUDA event.
+    r"""Wrapper around CUDA event.

     Arguments:
         enable_timing (bool): indicates if the event should measure time
@@ -165,19 +168,19 @@ def __del__(self):
         del self._as_parameter_

     def record(self, stream=None):
-        """Records the event in a given stream."""
+        r"""Records the event in a given stream."""
         if stream is None:
             stream = torch.cuda.current_stream()
         stream.record_event(self)

     def wait(self, stream=None):
-        """Makes a given stream wait for the event."""
+        r"""Makes a given stream wait for the event."""
         if stream is None:
             stream = torch.cuda.current_stream()
         stream.wait_event(self)

     def query(self):
-        """Checks if the event has been recorded.
+        r"""Checks if the event has been recorded.

         Returns:
             A boolean indicating if the event has been recorded.
@@ -189,18 +192,18 @@ def query(self):
         return True

     def elapsed_time(self, end_event):
-        """Returns the time elapsed before the event was recorded."""
+        r"""Returns the time elapsed before the event was recorded."""
         time_ms = ctypes.c_float()
         check_error(cudart().cudaEventElapsedTime(
             ctypes.byref(time_ms), self, end_event))
         return time_ms.value

     def synchronize(self):
-        """Synchronizes with the event."""
+        r"""Synchronizes with the event."""
         check_error(cudart().cudaEventSynchronize(self))

     def ipc_handle(self):
-        """Returns an IPC handle of this event."""
+        r"""Returns an IPC handle of this event."""
         handle = EventHandle()
         check_error(cudart().cudaIpcGetEventHandle(ctypes.byref(handle), self))
         return handle
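`Stream` now defaults to ``device=None`` and accepts a `torch.device` for its device argument; the rest of the changes here only switch docstrings to raw strings. A brief usage sketch (device index illustrative):

    import torch

    s = torch.cuda.Stream(device=torch.device('cuda:0'))   # an int or None also works
    with torch.cuda.stream(s):
        y = torch.randn(3, device='cuda')                  # work enqueued on stream s
    e = s.record_event()                                   # Event recorded on s
    e.synchronize()                                        # block until that work completes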
