
Commit c0a419e

Add non_blocking to Tensor/Module.to (#7312)
* Add non_blocking to Tensor/Module.to
* flake8
* Add argparse tests
* cpp parse
* Use C++ parser
* use a common parse function with Tensor.to
* fix test_jit
* use THPObjectPtr
* increase refcount for None, True, and False
* address comments
* address comments
1 parent ec4a0f3 commit c0a419e
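
The resulting Python surface, as a quick sketch (illustration only, not part of the diff; the device and dtype values are arbitrary): every call form of Tensor.to and Module.to now accepts a trailing non_blocking flag.

    import torch
    import torch.nn as nn

    t = torch.randn(2, 2)
    t.to(torch.float64, non_blocking=True)           # dtype only
    t.to('cpu', torch.float64, non_blocking=True)    # device, optional dtype
    t.to(torch.zeros(1), non_blocking=True)          # adopt another tensor's device/dtype

    net = nn.Linear(3, 3)
    net.to('cpu', torch.float64, True)               # non_blocking also parses positionally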

File tree

14 files changed: +178 -112 lines


test/test_jit.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ def get_lstm_inputs(device):
     input = torch.randn(3, 10, dtype=torch.float, device=device)
     hx = torch.randn(3, 20, dtype=torch.float, device=device)
     cx = torch.randn(3, 20, dtype=torch.float, device=device)
-    module = nn.LSTMCell(10, 20).to(torch.float, device)  # Just to allocate weights with correct sizes
+    module = nn.LSTMCell(10, 20).to(device, torch.float)  # Just to allocate weights with correct sizes
     return (input, hx, cx) + tuple(p.requires_grad_(False) for p in module.parameters())
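
The swap above follows from the new shared parser: device occupies the first positional slot, so a dtype can no longer precede a device. A minimal sketch ('cpu' stands in for the test's device argument):

    import torch
    import torch.nn as nn

    m = nn.LSTMCell(10, 20)
    m.to('cpu', torch.float)         # OK: to(device, dtype)
    try:
        m.to(torch.float, 'cpu')     # a dtype in the device slot no longer parses
    except TypeError as e:
        print(e)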

test/test_nn.py

Lines changed: 26 additions & 4 deletions
@@ -1175,6 +1175,28 @@ def test_add_module(self):
         self.assertEqual(net.l, l3)
         self.assertRaises(TypeError, lambda: net.add_module('x', 'non-module'))
 
+    def test_module_to_argparse(self):
+        net = nn.Sequential(nn.Linear(3, 3))
+        cpu = torch.device('cpu')
+        with self.assertRaises(TypeError):
+            net.to(cpu, True)
+        with self.assertRaises(TypeError):
+            net.to(torch.long)
+        with self.assertRaises(TypeError):
+            net.to(None, True)
+        with self.assertRaises(TypeError):
+            net.to(cpu, torch.long, True)
+        with self.assertRaises(TypeError):
+            net.to(cpu, dtype=torch.long, non_blocking=True)
+        with self.assertRaises(TypeError):
+            net.to([])
+        with self.assertRaises(TypeError):
+            net.to({}, non_blocking=True)
+        with self.assertRaises(TypeError):
+            net.to(torch.tensor(3, dtype=torch.long), non_blocking=True)
+        with self.assertRaises(TypeError):
+            net.to(cpu, torch.tensor(3, dtype=torch.long), non_blocking=True)
+
     def test_type(self):
         l = nn.Linear(10, 20)
         net = nn.Module()

@@ -1203,22 +1225,22 @@ def test_type(self):
         self.assertIsInstance(l.weight.data, torch.FloatTensor)
         self.assertIsInstance(l.bias.data, torch.FloatTensor)
         self.assertIsInstance(net.indices, torch.LongTensor)
-        net.to("cuda", torch.double)
+        net.to("cuda", torch.double, True)
         self.assertIsInstance(l.weight.data, torch.cuda.DoubleTensor)
         self.assertIsInstance(l.bias.data, torch.cuda.DoubleTensor)
         self.assertIsInstance(net.indices, torch.cuda.LongTensor)
-        net.to(device="cuda:0", dtype=torch.half)
+        net.to(torch.empty(1, device="cuda:0", dtype=torch.half))
         self.assertIsInstance(l.weight.data, torch.cuda.HalfTensor)
         self.assertIsInstance(l.bias.data, torch.cuda.HalfTensor)
         self.assertIsInstance(net.indices, torch.cuda.LongTensor)
-        net.to(torch.device("cpu"))
+        net.to(torch.device("cpu"), non_blocking=True)
         self.assertIsInstance(l.weight.data, torch.HalfTensor)
         self.assertIsInstance(l.bias.data, torch.HalfTensor)
         self.assertIsInstance(net.indices, torch.LongTensor)
         net.type(torch.FloatTensor)
         self.assertIsInstance(l.weight.data, torch.FloatTensor)
         self.assertIsInstance(l.bias.data, torch.FloatTensor)
-        net.type(torch.DoubleTensor)
+        net.to(torch.DoubleTensor(1))
         self.assertIsInstance(l.weight.data, torch.DoubleTensor)
         self.assertIsInstance(l.bias.data, torch.DoubleTensor)
         if TEST_CUDA:
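
Distilled from the assertions above, a sketch of which Module.to call forms the parser accepts and which it rejects (modules additionally require a floating-point dtype):

    import torch
    import torch.nn as nn

    net = nn.Sequential(nn.Linear(3, 3))
    cpu = torch.device('cpu')

    net.to(cpu)                    # device only
    net.to(cpu, torch.float64)     # device plus floating-point dtype
    net.to(torch.zeros(1))         # another tensor supplies device and dtype

    # Rejected: a bool in the dtype slot, and non-floating dtypes for modules.
    for bad_call in (lambda: net.to(cpu, True),
                     lambda: net.to(torch.long),
                     lambda: net.to(cpu, dtype=torch.long, non_blocking=True)):
        try:
            bad_call()
        except TypeError:
            pass  # expected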

test/test_torch.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1858,26 +1858,28 @@ def test_to(self):
         self.assertIs(torch.float32, a.to(dtype=torch.float32).dtype)
 
         if torch.cuda.is_available():
-            for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']:
-                b = torch.tensor(5., device=cuda)
-                self.assertEqual(b.device, b.to(cuda).device)
-                self.assertEqual(a.device, b.to('cpu').device)
-                self.assertEqual(b.device, a.to(cuda).device)
-                self.assertIs(torch.int32, b.to('cpu', dtype=torch.int32).dtype)
-                self.assertEqual(a.device, b.to('cpu', dtype=torch.int32).device)
-                self.assertIs(torch.int32, b.to(dtype=torch.int32).dtype)
-                self.assertEqual(b.device, b.to(dtype=torch.int32).device)
+            for non_blocking in [True, False]:
+                for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']:
+                    b = torch.tensor(5., device=cuda)
+                    self.assertEqual(b.device, b.to(cuda, non_blocking=non_blocking).device)
+                    self.assertEqual(a.device, b.to('cpu', non_blocking=non_blocking).device)
+                    self.assertEqual(b.device, a.to(cuda, non_blocking=non_blocking).device)
+                    self.assertIs(torch.int32, b.to('cpu', dtype=torch.int32, non_blocking=non_blocking).dtype)
+                    self.assertEqual(a.device, b.to('cpu', dtype=torch.int32, non_blocking=non_blocking).device)
+                    self.assertIs(torch.int32, b.to(dtype=torch.int32).dtype)
+                    self.assertEqual(b.device, b.to(dtype=torch.int32).device)
 
     def test_to_with_tensor(self):
         a = torch.tensor(5)
         self.assertEqual(a.device, a.to(a).device)
 
         if torch.cuda.is_available():
-            for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']:
-                b = torch.tensor(5., device=cuda)
-                self.assertEqual(b.device, b.to(b).device)
-                self.assertEqual(a.device, b.to(a).device)
-                self.assertEqual(b.device, a.to(b).device)
+            for non_blocking in [True, False]:
+                for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']:
+                    b = torch.tensor(5., device=cuda)
+                    self.assertEqual(b.device, b.to(b, non_blocking=non_blocking).device)
+                    self.assertEqual(a.device, b.to(a, non_blocking=non_blocking).device)
+                    self.assertEqual(b.device, a.to(b, non_blocking=non_blocking).device)
 
     @staticmethod
     def _test_empty_full(self, dtypes, layout, device):
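
What the doubled loop exercises: non_blocking is accepted by every overload and, per the documentation below, simply performs an ordinary blocking copy when an asynchronous one is not possible (e.g., CPU to CPU). A minimal sketch:

    import torch

    a = torch.tensor(5)
    for non_blocking in (True, False):
        assert a.to('cpu', non_blocking=non_blocking).device == a.device
        assert a.to('cpu', dtype=torch.int32, non_blocking=non_blocking).dtype is torch.int32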

tools/autograd/templates/python_nn_functions.cpp

Lines changed: 30 additions & 0 deletions
@@ -2,9 +2,12 @@
 
 // ${generated_comment}
 
+#include "torch/csrc/Device.h"
+#include "torch/csrc/DynamicTypes.h"
 #include "torch/csrc/Exceptions.h"
 #include "torch/csrc/autograd/python_variable.h"
 #include "torch/csrc/autograd/utils/wrap_outputs.h"
+#include "torch/csrc/autograd/utils/python_arg_parsing.h"
 #include "torch/csrc/utils/python_arg_parser.h"
 
 #include "python_nn_functions_dispatch.h"

@@ -15,9 +18,36 @@ using namespace torch::autograd::utils;
 
 namespace torch { namespace autograd {
 
+static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObject* kwargs)
+{
+  HANDLE_TH_ERRORS
+  auto parsed = parse_to_conversion(args, kwargs);
+  auto& device = std::get<0>(parsed);
+  auto& scalarType = std::get<1>(parsed);
+  auto non_blocking = std::get<2>(parsed);
+  auto tuple = THPObjectPtr{PyTuple_New(3)};
+  if (!tuple) throw python_error();
+  if (device) {
+    PyTuple_SET_ITEM(tuple.get(), 0, THPDevice_New(*device));
+  } else {
+    Py_INCREF(Py_None);
+    PyTuple_SET_ITEM(tuple.get(), 0, Py_None);
+  }
+  if (scalarType) {
+    PyTuple_SET_ITEM(tuple.get(), 1, torch::autograd::utils::wrap(torch::getDtype(*scalarType)));
+  } else {
+    Py_INCREF(Py_None);
+    PyTuple_SET_ITEM(tuple.get(), 1, Py_None);
+  }
+  PyTuple_SET_ITEM(tuple.get(), 2, torch::autograd::utils::wrap(non_blocking));
+  return tuple.release();
+  END_HANDLE_TH_ERRORS
+}
+
 ${py_methods}
 
 static PyMethodDef nn_functions[] = {
+  {"_parse_to", (PyCFunction)THPVariable__parse_to, METH_VARARGS | METH_KEYWORDS, nullptr},
   ${py_method_defs}
   {NULL}
 };
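
Because nn_functions backs the torch._C._nn module, _parse_to becomes callable from Python, which is the "common parse function with Tensor.to" the commit message refers to. A hedged sketch of the normalized (device, dtype, non_blocking) triple it returns as of this commit:

    import torch

    _parse_to = torch._C._nn._parse_to

    print(_parse_to('cpu', torch.float64, True))  # (device(type='cpu'), torch.float64, True)
    print(_parse_to(torch.int32))                 # (None, torch.int32, False)
    print(_parse_to(torch.zeros(1)))              # device and dtype taken from the tensor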

tools/autograd/templates/python_variable_methods.cpp

Lines changed: 16 additions & 24 deletions
@@ -8,6 +8,7 @@
 #include "torch/csrc/autograd/python_variable.h"
 #include "torch/csrc/autograd/utils/python_error_messages.h"
 #include "torch/csrc/autograd/utils/wrap_outputs.h"
+#include "torch/csrc/autograd/utils/python_arg_parsing.h"
 #include "torch/csrc/jit/tracer.h"
 #ifdef WITH_CUDA
 #include "torch/csrc/cuda/Stream.h"

@@ -558,31 +559,22 @@ static PyObject * THPVariable_storage_type(PyObject* self, PyObject* arg)
 static PyObject * THPVariable_to(PyObject* self, PyObject* args, PyObject* kwargs)
 {
   HANDLE_TH_ERRORS
-  static PythonArgParser parser({
-    "to(Device device, ScalarType dtype=None)",
-    "to(ScalarType dtype)",
-    "to(Tensor other)",
-  });
-  auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
-  ParsedArgs<2> parsed_args;
-  auto r = parser.parse(args, kwargs, parsed_args);
-  if (r.idx == 0) {
-    auto device = r.device(0);
-    auto deviceAutoGPU = device.deviceInt64();
-    auto scalarType = r.scalartypeWithDefault(1, self_.type().scalarType());
-    auto& layout = *torch::getLayout(self_.type().backend());
-    auto& type = torch::getType(scalarType, layout, device.type);
-    return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, deviceAutoGPU, false));
-  } else if (r.idx == 1) {
-    auto scalarType = r.scalartype(0);
-    auto& type = self_.type().toScalarType(scalarType);
+  auto parsed = parse_to_conversion(args, kwargs);
+  auto& device = std::get<0>(parsed);
+  auto& scalarType = std::get<1>(parsed);
+  auto non_blocking = std::get<2>(parsed);
+  if (!device) {
+    // device not given
+    auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
+    auto& type = self_.type().toScalarType(scalarType.value_or(self_.type().scalarType()));
     return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type));
-  } else if (r.idx == 2) {
-    auto other = r.tensor(0);
-    auto& type = other.type();
-    auto deviceType = torch::getDeviceType(type);
-    auto deviceAutoGPU = (deviceType == DeviceType::CPU) ? -1 : other.get_device();
-    return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, deviceAutoGPU, false));
+  } else {
+    // device and maybe dtype are given
+    auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
+    auto deviceAutoGPU = device->deviceInt64();
+    auto& layout = *torch::getLayout(self_.type().backend());
+    auto& type = torch::getType(scalarType.value_or(self_.type().scalarType()), layout, device->type);
+    return THPVariable_Wrap(torch::utils::dispatch_type_conversion(self_, type, deviceAutoGPU, non_blocking));
   }
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
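
The two branches above are observable from Python: with no device the tensor stays put and only its scalar type changes; with a device the placement and type are resolved together and non_blocking is forwarded to the conversion. A short sketch:

    import torch

    t = torch.randn(2, 2)

    # No device given: dtype changes, device is untouched (non_blocking is parsed but moot).
    assert t.to(torch.float64).device == t.device

    # Device given, dtype optional: both are resolved, non_blocking is honored.
    u = t.to('cpu', torch.float64, non_blocking=True)
    assert u.dtype is torch.float64 and u.device.type == 'cpu'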

torch/_tensor_docs.py

Lines changed: 10 additions & 5 deletions
@@ -2003,15 +2003,20 @@ def callable(a, b) -> number
 
 Returns a Tensor with the specified :attr:`dtype`
 
-.. function:: to(device, dtype=None) -> Tensor
+.. function:: to(device=None, dtype=None, non_blocking=False) -> Tensor
 
 Returns a Tensor with the specified :attr:`device` and (optional)
 :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``.
+When :attr:`non_blocking`, tries to convert asynchronously with respect to
+the host if possible, e.g., converting a CPU Tensor with pinned memory to a
+CUDA Tensor.
 
-.. function:: to(other) -> Tensor
+.. function:: to(other, non_blocking=False) -> Tensor
 
-Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as the Tensor
-:attr:`other`.
+Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as
+the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert
+asynchronously with respect to the host if possible, e.g., converting a CPU
+Tensor with pinned memory to a CUDA Tensor.
 
 Example::

@@ -2030,7 +2035,7 @@ def callable(a, b) -> number
            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
 
     >>> other = torch.randn((), dtype=torch.float64, device=cuda0)
-    >>> tensor.to(other)
+    >>> tensor.to(other, non_blocking=True)
     tensor([[-0.5044,  0.0005],
            [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0')
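
A hedged sketch of the pinned-memory case the documentation singles out: only page-locked host memory can be copied to the GPU asynchronously, and the host should synchronize before timing or inspecting the result.

    import torch

    if torch.cuda.is_available():
        src = torch.randn(1024).pin_memory()     # page-locked host memory
        dst = src.to('cuda', non_blocking=True)  # copy may overlap host-side work
        torch.cuda.synchronize()                 # ensure the copy has completed
        assert dst.device.type == 'cuda'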

torch/csrc/autograd/python_variable.cpp

Lines changed: 7 additions & 15 deletions
@@ -366,32 +366,24 @@ PyObject *THPVariable_is_sparse(THPVariable *self)
   END_HANDLE_TH_ERRORS
 }
 
-PyObject *THPVariable_dtype(THPVariable *self)
+static PyObject *THPVariable_dtype(THPVariable *self)
 {
   HANDLE_TH_ERRORS
   auto& self_ = self->cdata;
   return torch::autograd::utils::wrap(torch::getDtype(self_.type().scalarType()));
   END_HANDLE_TH_ERRORS
 }
 
-static PyObject * THPVariable_layout(THPVariable* self, PyObject* args) {
+static PyObject * THPVariable_layout(THPVariable* self) {
   HANDLE_TH_ERRORS
   auto& self_ = self->cdata;
   return torch::autograd::utils::wrap(torch::getLayout(self_.type().backend()));
   END_HANDLE_TH_ERRORS
 }
 
-static PyObject * THPVariable_device(THPVariable* self, PyObject* args) {
+static PyObject * THPVariable_device(THPVariable* self) {
   HANDLE_TH_ERRORS
-  auto& self_ = self->cdata;
-  if (self_.type().is_cuda()) {
-    torch::Device device(torch::DeviceType::CUDA, self_.get_device(), false);
-    return THPDevice_New(device);
-  }
-  else {
-    torch::Device device(torch::DeviceType::CPU, -1, true);
-    return THPDevice_New(device);
-  }
+  return THPDevice_New(torch::tensor::getDevice(self->cdata));
   END_HANDLE_TH_ERRORS
 }

@@ -413,9 +405,9 @@ static struct PyGetSetDef THPVariable_properties[] = {
   {"shape", (getter)THPVariable_get_shape, nullptr, nullptr, nullptr},
   {"is_cuda", (getter)THPVariable_is_cuda, nullptr, nullptr, nullptr},
   {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr},
-  {"dtype", (getter)THPVariable_dtype, NULL, NULL, NULL},
-  {"layout", (getter)THPVariable_layout, NULL, NULL, NULL},
-  {"device", (getter)THPVariable_device, NULL, NULL, NULL},
+  {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr},
+  {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr},
+  {"device", (getter)THPVariable_device, nullptr, nullptr, nullptr},
   {nullptr}
 };
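
Since each PyGetSetDef entry registers a getter but no setter, the three attributes are read-only from Python; a quick sketch:

    import torch

    t = torch.zeros(2)
    print(t.dtype, t.layout, t.device)   # torch.float32 torch.strided cpu

    try:
        t.dtype = torch.float64          # no setter is registered
    except AttributeError:
        print('dtype is read-only')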

torch/csrc/autograd/utils/python_arg_parsing.h

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+#pragma once
+
+#include "torch/csrc/python_headers.h"
+#include <ATen/ATen.h>
+
+#include "torch/csrc/utils/python_arg_parser.h"
+#include "torch/csrc/utils/device.h"
+
+namespace torch { namespace autograd { namespace utils {
+
+inline std::tuple<at::optional<torch::Device>, at::optional<at::ScalarType>, bool>
+parse_to_conversion(PyObject *args, PyObject *kwargs) {
+  static PythonArgParser parser({
+    "to(Device device=None, ScalarType dtype=None, bool non_blocking=False)",
+    "to(ScalarType dtype, bool non_blocking=False)",
+    "to(Tensor tensor, bool non_blocking=False)",
+  });
+  ParsedArgs<3> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+  if (r.idx == 0) {
+    return std::make_tuple(r.deviceOptional(0), r.scalartypeOptional(1), r.toBool(2));
+  } else if (r.idx == 1) {
+    return std::make_tuple(at::nullopt, r.scalartype(0), r.toBool(1));
+  } else {
+    auto tensor = r.tensor(0);
+    return std::make_tuple(
+      torch::tensor::getDevice(tensor),
+      tensor.type().scalarType(),
+      r.toBool(1)
+    );
+  }
+}
+
+}}} // namespace torch::autograd::utils

torch/csrc/tensor/python_tensor.cpp

Lines changed: 8 additions & 0 deletions
@@ -384,4 +384,12 @@ at::Type& get_default_tensor_type() {
   return *default_tensor_type;
 }
 
+Device getDevice(const at::Tensor& tensor) {
+  if (tensor.type().is_cuda()) {
+    return torch::Device(torch::DeviceType::CUDA, tensor.get_device(), false);
+  } else {
+    return torch::Device(torch::DeviceType::CPU, -1, true);
+  }
+}
+
 }} // namespace torch::tensor
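
A sketch of what getDevice reports, seen through the Python device attribute it now backs: CPU tensors map to the default CPU device (index -1, is_default=true above), while CUDA tensors keep their concrete index.

    import torch

    print(torch.zeros(1).device)  # cpu

    if torch.cuda.is_available():
        print(torch.zeros(1, device='cuda:0').device)  # cuda:0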

torch/csrc/tensor/python_tensor.h

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "torch/csrc/python_headers.h"
+#include "torch/csrc/utils/device.h"
 #include <ATen/ATen.h>
 
 namespace torch { namespace tensor {

@@ -23,4 +24,7 @@ void py_set_default_dtype(PyObject* dtype_obj);
 // returned value will be a VariableType instance.
 at::Type& get_default_tensor_type();
 
+// Gets the torch::Device object of a given at::Tensor
+Device getDevice(const at::Tensor& tensor);
+
 }} // namespace torch::tensor
