Commit 4549d42

Merge branch 'optimizeAliasAnalysis' of github.com:Chillee/pytorch into optimizeAliasAnalysis

2 parents: d0297c5 + 49602ce

15 files changed (+65, -46 lines)

.gitmodules

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@
 [submodule "third_party/protobuf"]
     ignore = dirty
     path = third_party/protobuf
-    url = https://github.com/google/protobuf.git
+    url = https://github.com/protocolbuffers/protobuf.git
 [submodule "third_party/ios-cmake"]
     ignore = dirty
     path = third_party/ios-cmake
@@ -57,7 +57,7 @@
 [submodule "third-party/cpuinfo"]
     ignore = dirty
     path = third_party/cpuinfo
-    url = https://github.com/Maratyszcza/cpuinfo.git
+    url = https://github.com/pytorch/cpuinfo.git
 [submodule "third_party/python-enum"]
     ignore = dirty
     path = third_party/python-enum

aten/src/TH/generic/THTensor.cpp

Lines changed: 1 addition & 1 deletion

@@ -200,7 +200,7 @@ THTensor *THTensor_(newView)(THTensor *tensor, at::IntArrayRef size)
                              inferred_size);
   THArgCheck(stride.has_value(), 2, "view size is "
     "not compatible with input tensor's size and stride (at least one dimension spans "
-    "across two contiguous subspaces). Call .contiguous() before .view().");
+    "across two contiguous subspaces). Use .reshape(...) instead.");
   auto stride_value = *stride;
   THTensor_setStorage(self, THTensor_getStoragePtr(tensor), tensor->storage_offset(), inferred_size, stride_value);
   return self;

aten/src/THC/generic/THCTensor.cpp

Lines changed: 1 addition & 1 deletion

@@ -206,7 +206,7 @@ THCTensor *THCTensor_(newView)(THCState *state, THCTensor *tensor, at::IntArrayRef size)
                               inferred_size);
   THArgCheck(stride.has_value(), 2, "view size is "
     "not compatible with input tensor's size and stride (at least one dimension spans "
-    "across two contiguous subspaces). Call .contiguous() before .view().");
+    "across two contiguous subspaces). Use .reshape(...) instead.");
   auto stride_value = *stride;

   // NOTE: This path of constructing the Tensor directly with the viewed Storage is necessary
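Context for the message change in both files: ``view`` fails when the requested shape cannot be expressed with the tensor's existing strides, while ``reshape`` falls back to a copy. A minimal sketch, not part of the commit:

import torch

t = torch.randn(2, 3, 4).transpose(0, 1)  # non-contiguous, shape (3, 2, 4)
# t.view(6, 4) would raise the "view size is not compatible ..." error above;
# .contiguous().view(6, 4) still works, but reshape handles both cases:
u = t.reshape(6, 4)  # copies only when the strides make a view impossible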

docs/source/multiprocessing.rst

Lines changed: 10 additions & 2 deletions

@@ -19,6 +19,9 @@ Strategy management
 .. autofunction:: get_sharing_strategy
 .. autofunction:: set_sharing_strategy
 
+
+.. _multiprocessing-cuda-sharing-details:
+
 Sharing CUDA tensors
 --------------------
 
@@ -28,8 +31,13 @@ Python 2 can only create subprocesses using ``fork``, and it's not supported
 by the CUDA runtime.
 
 Unlike CPU tensors, the sending process is required to keep the original tensor
-as long as the receiving process retains a copy of the tensor. It is implemented
-under the hood but requires users to follow the next best practices.
+as long as the receiving process retains a copy of the tensor. The refcounting is
+implemented under the hood but requires users to follow the best practices below.
+
+.. warning::
+    If the consumer process dies abnormally due to a fatal signal, the shared
+    tensor could be kept in memory forever as long as the sending process is
+    running.
 
 1. Release memory ASAP in the consumer.
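A minimal sketch (not part of the commit) of practice 1 on the consumer side; ``queue`` is a ``torch.multiprocessing`` queue and ``work`` is a hypothetical user function:

def consumer(queue):
    tensor = queue.get()   # refers to memory owned by the sending process
    result = work(tensor)  # hypothetical processing function
    del tensor             # drop the reference as soon as it is unneeded
    return result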

docs/source/notes/cuda.rst

Lines changed: 3 additions & 2 deletions

@@ -277,8 +277,9 @@ memory. CPU tensors and storages expose a :meth:`~torch.Tensor.pin_memory`
 method, that returns a copy of the object, with data put in a pinned region.
 
 Also, once you pin a tensor or storage, you can use asynchronous GPU copies.
-Just pass an additional ``non_blocking=True`` argument to a :meth:`~torch.Tensor.cuda`
-call. This can be used to overlap data transfers with computation.
+Just pass an additional ``non_blocking=True`` argument to a
+:meth:`~torch.Tensor.to` or a :meth:`~torch.Tensor.cuda` call. This can be used
+to overlap data transfers with computation.
 
 You can make the :class:`~torch.utils.data.DataLoader` return batches placed in
 pinned memory by passing ``pin_memory=True`` to its constructor.
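A minimal sketch of the pattern described above (not part of this commit); ``dataset`` and ``model`` are hypothetical stand-ins:

import torch
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=64, pin_memory=True)  # pinned host batches
for x, y in loader:
    # asynchronous host-to-device copies; can overlap with GPU computation
    x = x.to('cuda', non_blocking=True)
    y = y.to('cuda', non_blocking=True)
    out = model(x)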

docs/source/notes/multiprocessing.rst

Lines changed: 18 additions & 14 deletions

@@ -20,22 +20,26 @@ memory and will only send a handle to another process.
 This allows to implement various training methods, like Hogwild, A3C, or any
 others that require asynchronous operation.
 
-Sharing CUDA tensors
---------------------
-
-Sharing CUDA tensors between processes is supported only in Python 3, using
-a ``spawn`` or ``forkserver`` start methods. :mod:`python:multiprocessing` in
-Python 2 can only create subprocesses using ``fork``, and it's not supported
-by the CUDA runtime.
+CUDA in multiprocessing
+-----------------------
 
-.. warning::
+The CUDA runtime does not support the ``fork`` start method. However,
+:mod:`python:multiprocessing` in Python 2 can only create subprocesses using
+``fork``, so Python 3 and either the ``spawn`` or ``forkserver`` start method
+are required to use CUDA in subprocesses.
 
-    CUDA API requires that the allocation exported to other processes remains
-    valid as long as it's used by them. You should be careful and ensure that
-    CUDA tensors you shared don't go out of scope as long as it's necessary.
-    This shouldn't be a problem for sharing model parameters, but passing other
-    kinds of data should be done with care. Note that this restriction doesn't
-    apply to shared CPU memory.
+.. note::
+    The start method can be set via either creating a context with
+    ``multiprocessing.get_context(...)`` or directly using
+    ``multiprocessing.set_start_method(...)``.
+
+Unlike CPU tensors, the sending process is required to keep the original tensor
+as long as the receiving process retains a copy of the tensor. The refcounting
+is implemented under the hood but requires users to follow best practices for
+the program to run correctly: for example, the sending process must stay alive
+as long as the consumer process has references to the tensor, and refcounting
+cannot save you if the consumer process exits abnormally via a fatal signal. See
+:ref:`this section <multiprocessing-cuda-sharing-details>`.
 
 See also: :ref:`cuda-nn-dataparallel-instead`
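A minimal sketch (not part of this commit) of sharing a CUDA tensor under the ``spawn`` start method; it assumes a CUDA-capable machine:

import torch
import torch.multiprocessing as mp

def consumer(queue):
    t = queue.get()  # a handle into the producer's CUDA allocation
    print(t.sum().item())
    del t            # release the reference as soon as possible

if __name__ == '__main__':
    ctx = mp.get_context('spawn')  # CUDA requires spawn or forkserver
    queue = ctx.Queue()
    producer_tensor = torch.ones(4, device='cuda')
    p = ctx.Process(target=consumer, args=(queue,))
    p.start()
    queue.put(producer_tensor)  # producer must keep this alive while shared
    p.join()                    # joining ensures the tensor outlives the consumer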

setup.py

Lines changed: 4 additions & 0 deletions

@@ -288,6 +288,10 @@ def check_file(f):
     check_file(os.path.join(third_party_path, 'foxi', 'CMakeLists.txt'))
     check_file(os.path.join(third_party_path, 'QNNPACK', 'CMakeLists.txt'))
     check_file(os.path.join(third_party_path, 'fbgemm', 'CMakeLists.txt'))
+    check_file(os.path.join(third_party_path, 'fbgemm', 'third_party',
+                            'asmjit', 'CMakeLists.txt'))
+    check_file(os.path.join(third_party_path, 'onnx', 'third_party',
+                            'benchmark', 'CMakeLists.txt'))
 
     check_pydep('yaml', 'pyyaml')
     check_pydep('typing', 'typing')
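For context, ``check_file`` is defined earlier in setup.py and only named in this hunk's header; a hedged sketch of the kind of guard these calls rely on (the real body may differ):

import os
import sys

def check_file(f):
    # Fail the build early when a required submodule file is missing.
    if not os.path.exists(f):
        print('Could not find {}'.format(f))
        print("Did you run 'git submodule update --init --recursive'?")
        sys.exit(1)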

third_party/nccl/nccl

Submodule nccl updated 81 files

torch/_torch_docs.py

Lines changed: 4 additions & 4 deletions

@@ -3583,13 +3583,13 @@ def merge_dicts(*dicts):
 
 add_docstr(torch.ones,
            r"""
-ones(*sizes, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+ones(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
 
 Returns a tensor filled with the scalar value `1`, with the shape defined
 by the variable argument :attr:`sizes`.
 
 Args:
-    sizes (int...): a sequence of integers defining the shape of the output tensor.
+    size (int...): a sequence of integers defining the shape of the output tensor.
         Can be a variable number of arguments or a collection like a list or tuple.
     {out}
     {dtype}
@@ -5632,13 +5632,13 @@ def merge_dicts(*dicts):
 
 add_docstr(torch.zeros,
            r"""
-zeros(*sizes, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+zeros(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
 
 Returns a tensor filled with the scalar value `0`, with the shape defined
 by the variable argument :attr:`sizes`.
 
 Args:
-    sizes (int...): a sequence of integers defining the shape of the output tensor.
+    size (int...): a sequence of integers defining the shape of the output tensor.
         Can be a variable number of arguments or a collection like a list or tuple.
     {out}
     {dtype}
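A quick illustration (not part of the commit) of the two calling conventions the renamed ``size`` parameter supports:

>>> torch.ones(2, 3)        # size as a variable number of arguments
tensor([[1., 1., 1.],
        [1., 1., 1.]])
>>> torch.zeros((2, 3))     # size as a single collection
tensor([[0., 0., 0.],
        [0., 0., 0.]])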
