small fixes in fusion_compiler (#7776)

ngimel · soumith · commit 1feb1a9b88c1 · 2018-05-23T15:18:58.000-04:00
* small fixes in fusion_compiler

* address review comments
diff --git a/test/test_jit.py b/test/test_jit.py
@@ -369,7 +369,7 @@ def broadcast(a, b):
     @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows")
     @unittest.skipIf(not RUN_CUDA_MULTI_GPU, "needs non-zero device")
     def test_fuse_last_device(self):
-        device = 'cuda:' + str(torch.cuda.device_count() - 1)
+        device = 'cuda:' + str(1)
         x = torch.tensor([0.4], dtype=torch.float, device=device)
         y = torch.tensor([0.7], dtype=torch.float, device=device)
 
diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp
@@ -8,6 +8,7 @@
 
 #include "ATen/ATen.h"
 #ifdef WITH_CUDA
+#include "THC/THC.h"
 #include "torch/csrc/cuda/cuda_check.h"
 #include <nvrtc.h>
 #include <cuda.h>
@@ -550,13 +551,19 @@ struct CUDAFusionFunction : public CompiledFusionFunction {
      // it is possible that this is the first cuda call on this thread
      // so make sure we initialize the Driver API's context
      // cudaFree(0) accomplishes this.
-     cudaFree(0);
-
+     CUcontext pctx = 0;
+     TORCH_CU_CHECK(cuCtxGetCurrent(&pctx));
+     if (!pctx) {
+        std::unique_lock<std::mutex> cudaFreeMutexLock(
+            *(THCCachingAllocator_getCudaFreeMutex()));
+        cudaFree(0);
+     }
+     CUstream stream = at::globalContext().getCurrentCUDAStream();
      TORCH_CU_CHECK(cuLaunchKernel(
        function,
        numBlocks, 1, 1,
        blockSize, 1, 1,
-       0, nullptr,
+       0, stream,
        arguments,
        nullptr));
   }