Skip to content

Commit e755f6b

Browse files
committed
Revert "PR #49173: [Crash fix] Fix cudaMallocAsync crashes."
This reverts commit a4553f8.
1 parent dc74623 commit e755f6b

File tree

5 files changed

+46
-8
lines changed

5 files changed

+46
-8
lines changed

tensorflow/core/common_runtime/gpu/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ tf_cuda_cc_test(
318318
tags = tf_cuda_tests_tags(),
319319
deps = [
320320
":gpu_id",
321+
":gpu_runtime",
321322
"//tensorflow/cc:cc_ops",
322323
"//tensorflow/core:framework",
323324
"//tensorflow/core:framework_internal",

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,13 @@ void GpuCudaMallocAsyncAllocator::PrintAllocatorStatistics() {
9595
#endif
9696
}
9797

98+
std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);
99+
98100
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
99101
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
100102
bool compute_stats)
101103
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
104+
++number_instantiated_;
102105
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
103106
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
104107
platform_device_id)

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
6767
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
6868
size_t pool_size,
6969
bool reserve_memory = false,
70-
bool compute_stats = false);
70+
bool compute_stats = true);
7171
~GpuCudaMallocAsyncAllocator() override;
7272
string Name() override { return name_; }
7373
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -85,7 +85,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
8585

8686
void SetStream(void* stream) override {
8787
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
88-
cuda_stream_ = reinterpret_cast<CUstream>(stream);
88+
cuda_stream_ = *(static_cast<CUstream*>(stream));
8989
#endif
9090
}
9191

@@ -95,6 +95,8 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
9595
// - If CUDA_VERSION >= 11030, print cudaMallocAsync statistics.
9696
void PrintAllocatorStatistics();
9797

98+
static int GetInstantiatedCountTestOnly() { return number_instantiated_; }
99+
98100
private:
99101
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
100102
se::StreamExecutor* stream_exec_; // Not owned.
@@ -112,6 +114,10 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
112114
CUmemoryPool pool_;
113115
#endif // TF_CUDA_MALLOC_ASYNC_SUPPORTED
114116

117+
// Just a counter for the number of times this class is instantiated.
118+
// Only useful for tests.
119+
static std::atomic<int> number_instantiated_;
120+
115121
string name_;
116122

117123
TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);

tensorflow/core/common_runtime/gpu/gpu_device_test.cc

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
1919

2020
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
21+
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
2122
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
2223
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
2324
#include "tensorflow/core/lib/core/errors.h"
@@ -66,14 +67,17 @@ class GPUDeviceTest : public ::testing::Test {
6667
const string& visible_device_list = "",
6768
double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
6869
const std::vector<std::vector<float>>& memory_limit_mb = {},
69-
const std::vector<std::vector<int32>>& priority = {}) {
70+
const std::vector<std::vector<int32>>& priority = {},
71+
const bool use_cuda_malloc_async = false) {
7072
SessionOptions options;
7173
ConfigProto* config = &options.config;
7274
(*config->mutable_device_count())["GPU"] = gpu_device_count;
7375
GPUOptions* gpu_options = config->mutable_gpu_options();
7476
gpu_options->set_visible_device_list(visible_device_list);
7577
gpu_options->set_per_process_gpu_memory_fraction(
7678
per_process_gpu_memory_fraction);
79+
gpu_options->mutable_experimental()->set_use_cuda_malloc_async(
80+
use_cuda_malloc_async);
7781
for (int i = 0; i < memory_limit_mb.size(); ++i) {
7882
auto virtual_devices =
7983
gpu_options->mutable_experimental()->add_virtual_devices();
@@ -109,6 +113,33 @@ class GPUDeviceTest : public ::testing::Test {
109113
}
110114
};
111115

116+
TEST_F(GPUDeviceTest, CudaMallocAsync) {
117+
SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {},
118+
/*use_cuda_malloc_async=*/true);
119+
std::vector<std::unique_ptr<Device>> devices;
120+
Status status;
121+
int number_instantiated =
122+
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
123+
{ // The new scope is to trigger the destruction of the object.
124+
status = DeviceFactory::GetFactory("GPU")->CreateDevices(
125+
opts, kDeviceNamePrefix, &devices);
126+
EXPECT_EQ(devices.size(), 1);
127+
Device* device = devices[0].get();
128+
auto* device_info = device->tensorflow_gpu_device_info();
129+
EXPECT_NE(device_info, nullptr);
130+
131+
AllocatorAttributes allocator_attributes = AllocatorAttributes();
132+
allocator_attributes.set_gpu_compatible(true);
133+
Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
134+
void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
135+
EXPECT_NE(ptr, nullptr);
136+
allocator->DeallocateRaw(ptr);
137+
}
138+
EXPECT_EQ(number_instantiated + 1,
139+
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly());
140+
EXPECT_EQ(status.code(), error::OK);
141+
}
142+
112143
TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
113144
SessionOptions opts = MakeSessionOptions("0,abc");
114145
std::vector<std::unique_ptr<Device>> devices;

tensorflow/core/common_runtime/gpu/gpu_process_state.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -207,21 +207,18 @@ Allocator* GPUProcessState::GetGPUAllocator(
207207
// useful for doing memory debugging with tools like cuda-memcheck
208208
// **WARNING** probably will not work in a multi-gpu scenario
209209
delete gpu_bfc_allocator;
210-
delete sub_allocator;
211210
gpu_bfc_allocator = nullptr;
212-
sub_allocator = nullptr;
213211
gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
214-
} else if (UseCudaMallocAsyncAllocator()) {
212+
} else if (UseCudaMallocAsyncAllocator() ||
213+
options.experimental().use_cuda_malloc_async()) {
215214
LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
216215
<< platform_device_id;
217216
// If true, passes all allocation requests through to cudaMallocAsync
218217
// TODO: useful for doing memory debugging with tools like
219218
// compute-sanitizer.
220219
// TODO: **WARNING** probably will not work in a multi-gpu scenario
221220
delete gpu_bfc_allocator;
222-
delete sub_allocator;
223221
gpu_bfc_allocator = nullptr;
224-
sub_allocator = nullptr;
225222
gpu_allocator =
226223
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
227224
}

0 commit comments

Comments
 (0)