pytorch · ssnl · Jun 19, 2018 · Jun 6, 2018 · Jun 6, 2018 · Jun 7, 2018
diff --git a/aten/src/ATen/DeviceGuard.h b/aten/src/ATen/DeviceGuard.h
@@ -56,7 +56,7 @@ struct DeviceGuard {
     }
   }
 
-  /// Sets the device to the given one if its index is not `nullopt`.
+  /// Sets the device to the one with the given index; does nothing if `index` is -1.
   void set_index(int32_t index) {
     if (index == -1) {
       return;

diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
@@ -35,11 +35,12 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
                              "first on devices list");
   std::vector<Tensor> tensors;
   tensors.reserve(devices.size());
+  at::DeviceGuard _device_guard;
 #ifdef USE_NCCL
   if (nccl::is_available({tensor})) {
     tensors.push_back(tensor);
     for (auto device : devices.slice(1)) {
-      at::DeviceGuard _device_guard(device);
+      _device_guard.set_index(device);
       tensors.push_back(type.tensor(tensor.sizes()));
     }
     nccl::broadcast(tensors);
@@ -48,8 +49,12 @@ std::vector<Tensor> broadcast(const Tensor& tensor, IntList devices) {
   {
 #endif
     auto & gpu_type = type.toBackend(type.is_sparse() ? at::kSparseCUDA : at::kCUDA);
-    for (auto device : devices) {
-      at::DeviceGuard _device_guard(device);
+    if (type.is_cuda()) {
+      tensors.push_back(tensor);
+    }
+    IntList loop_devices = type.is_cuda() ? devices.slice(1) : devices;
+    for (auto device : loop_devices) {
+      _device_guard.set_index(device);
       tensors.push_back(gpu_type.copy(tensor, true));
     }
   }