
Commit 2bf8b70

Fix broadcast copying device[0] tensor when not using NCCL (#8222)

* Fix broadcast copying device[0] tensor when not using NCCL; avoids a potential extra copy in flatten_dense_tensors
* use toType
* revert dense_flat changes
* address comments

1 parent a60540e commit 2bf8b70
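
In the non-NCCL path, broadcast() previously copied the source tensor to every entry in devices, including devices[0] even when the source already lived on that device. A minimal standalone sketch of the fixed control flow follows; Tensor, copy_to_device, and the plain int device list are hypothetical stand-ins used to show which devices receive a fresh copy, not the ATen API.

// Sketch only: models the patched loop, not torch/csrc/cuda/comm.cpp itself.
#include <cstdio>
#include <vector>

struct Tensor {
  int device;  // -1 models a CPU tensor
  bool is_cuda() const { return device >= 0; }
};

// Stand-in for gpu_type.copy(tensor, /*non_blocking=*/true).
Tensor copy_to_device(const Tensor& src, int device) {
  std::printf("copy from device %d to device %d\n", src.device, device);
  return Tensor{device};
}

std::vector<Tensor> broadcast(const Tensor& tensor, const std::vector<int>& devices) {
  std::vector<Tensor> tensors;
  tensors.reserve(devices.size());
  size_t first = 0;
  if (tensor.is_cuda()) {
    tensors.push_back(tensor);  // reuse the input as the devices[0] result
    first = 1;                  // ...and skip it in the copy loop
  }
  for (size_t i = first; i < devices.size(); ++i) {
    tensors.push_back(copy_to_device(tensor, devices[i]));
  }
  return tensors;
}

int main() {
  Tensor cuda_input{0};
  broadcast(cuda_input, {0, 1, 2});  // copies land only on devices 1 and 2
}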

File tree

2 files changed: +9 −4 lines changed

  aten/src/ATen/DeviceGuard.h
  torch/csrc/cuda/comm.cpp

aten/src/ATen/DeviceGuard.h
1 addition, 1 deletion

@@ -56,7 +56,7 @@ struct DeviceGuard {
     }
   }
 
-  /// Sets the device to the given one if its index is not `nullopt`.
+  /// Sets the device to the given one.
   void set_index(int32_t index) {
     if (index == -1) {
       return;
torch/csrc/cuda/comm.cpp
8 additions, 3 deletions

@@ -35,11 +35,12 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
       "first on devices list");
   std::vector<Tensor> tensors;
   tensors.reserve(devices.size());
+  at::DeviceGuard _device_guard;
 #ifdef USE_NCCL
   if (nccl::is_available({tensor})) {
     tensors.push_back(tensor);
     for (auto device : devices.slice(1)) {
-      at::DeviceGuard _device_guard(device);
+      _device_guard.set_index(device);
       tensors.push_back(type.tensor(tensor.sizes()));
     }
     nccl::broadcast(tensors);
@@ -48,8 +49,12 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
   {
 #endif
     auto & gpu_type = type.toBackend(type.is_sparse() ? at::kSparseCUDA : at::kCUDA);
-    for (auto device : devices) {
-      at::DeviceGuard _device_guard(device);
+    if (type.is_cuda()) {
+      tensors.push_back(tensor);
+    }
+    IntList loop_devices = type.is_cuda() ? devices.slice(1) : devices;
+    for (auto device : loop_devices) {
+      _device_guard.set_index(device);
       tensors.push_back(gpu_type.copy(tensor, true));
     }
   }
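
Design note: hoisting a single at::DeviceGuard above both the NCCL and non-NCCL branches and re-pointing it with set_index means one guard switches devices per iteration and restores the original device once when broadcast returns, instead of constructing and destroying a guard on every loop iteration as before.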
