Skip to content

Commit a4553f8

Browse files
zhangyujing authored and
tensorflower-gardener committed
PR #49173: [Crash fix] Fix cudaMallocAsync crashes.
Imported from GitHub PR #49173 The first commit fixes #48869. The second commit fixes another follow-up crash when using TF_GPU_ALLOCATOR=cuda_malloc_async. The 2 fixes are: - The Allocator API has... PiperOrigin-RevId: 380643089 Change-Id: I06f04d8b2d8ed6b08b91f94123a5e9e8a1681793
1 parent 4878ad9 commit a4553f8

File tree

5 files changed

+8
-46
lines changed

5 files changed

+8
-46
lines changed

tensorflow/core/common_runtime/gpu/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,6 @@ tf_cuda_cc_test(
318318
tags = tf_cuda_tests_tags(),
319319
deps = [
320320
":gpu_id",
321-
":gpu_runtime",
322321
"//tensorflow/cc:cc_ops",
323322
"//tensorflow/core:framework",
324323
"//tensorflow/core:framework_internal",

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,10 @@ static std::string GetCudaErrorMessage(CUresult result) {
4141
}
4242
#endif // GOOGLE_CUDA
4343

44-
std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);
45-
4644
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
4745
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
4846
bool compute_stats)
4947
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
50-
++number_instantiated_;
5148
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
5249
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
5350
platform_device_id)

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
6767
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
6868
size_t pool_size,
6969
bool reserve_memory = false,
70-
bool compute_stats = true);
70+
bool compute_stats = false);
7171
~GpuCudaMallocAsyncAllocator() override;
7272
string Name() override { return name_; }
7373
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -85,12 +85,10 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
8585

8686
void SetStream(void* stream) override {
8787
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
88-
cuda_stream_ = *(static_cast<CUstream*>(stream));
88+
cuda_stream_ = reinterpret_cast<CUstream>(stream);
8989
#endif
9090
}
9191

92-
static int GetInstantiatedCountTestOnly() { return number_instantiated_; }
93-
9492
private:
9593
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
9694
se::StreamExecutor* stream_exec_; // Not owned.
@@ -108,10 +106,6 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
108106
CUmemoryPool pool_;
109107
#endif // TF_CUDA_MALLOC_ASYNC_SUPPORTED
110108

111-
// Just a counter for the number of time this class is instantiated.
112-
// Only useful for tests.
113-
static std::atomic<int> number_instantiated_;
114-
115109
string name_;
116110

117111
TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);

tensorflow/core/common_runtime/gpu/gpu_device_test.cc

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ limitations under the License.
1818
#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
1919

2020
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
21-
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
2221
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
2322
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
2423
#include "tensorflow/core/lib/core/errors.h"
@@ -67,17 +66,14 @@ class GPUDeviceTest : public ::testing::Test {
6766
const string& visible_device_list = "",
6867
double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
6968
const std::vector<std::vector<float>>& memory_limit_mb = {},
70-
const std::vector<std::vector<int32>>& priority = {},
71-
const bool use_cuda_malloc_async = false) {
69+
const std::vector<std::vector<int32>>& priority = {}) {
7270
SessionOptions options;
7371
ConfigProto* config = &options.config;
7472
(*config->mutable_device_count())["GPU"] = gpu_device_count;
7573
GPUOptions* gpu_options = config->mutable_gpu_options();
7674
gpu_options->set_visible_device_list(visible_device_list);
7775
gpu_options->set_per_process_gpu_memory_fraction(
7876
per_process_gpu_memory_fraction);
79-
gpu_options->mutable_experimental()->set_use_cuda_malloc_async(
80-
use_cuda_malloc_async);
8177
for (int i = 0; i < memory_limit_mb.size(); ++i) {
8278
auto virtual_devices =
8379
gpu_options->mutable_experimental()->add_virtual_devices();
@@ -113,33 +109,6 @@ class GPUDeviceTest : public ::testing::Test {
113109
}
114110
};
115111

116-
TEST_F(GPUDeviceTest, CudaMallocAsync) {
117-
SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {},
118-
/*use_cuda_malloc_async=*/true);
119-
std::vector<std::unique_ptr<Device>> devices;
120-
Status status;
121-
int number_instantiated =
122-
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
123-
{ // The new scope is to trigger the destruction of the object.
124-
status = DeviceFactory::GetFactory("GPU")->CreateDevices(
125-
opts, kDeviceNamePrefix, &devices);
126-
EXPECT_EQ(devices.size(), 1);
127-
Device* device = devices[0].get();
128-
auto* device_info = device->tensorflow_gpu_device_info();
129-
EXPECT_NE(device_info, nullptr);
130-
131-
AllocatorAttributes allocator_attributes = AllocatorAttributes();
132-
allocator_attributes.set_gpu_compatible(true);
133-
Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
134-
void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
135-
EXPECT_NE(ptr, nullptr);
136-
allocator->DeallocateRaw(ptr);
137-
}
138-
EXPECT_EQ(number_instantiated + 1,
139-
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly());
140-
EXPECT_EQ(status.code(), error::OK);
141-
}
142-
143112
TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
144113
SessionOptions opts = MakeSessionOptions("0,abc");
145114
std::vector<std::unique_ptr<Device>> devices;

tensorflow/core/common_runtime/gpu/gpu_process_state.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,21 @@ Allocator* GPUProcessState::GetGPUAllocator(
207207
// useful for doing memory debugging with tools like cuda-memcheck
208208
// **WARNING** probably will not work in a multi-gpu scenario
209209
delete gpu_bfc_allocator;
210+
delete sub_allocator;
210211
gpu_bfc_allocator = nullptr;
212+
sub_allocator = nullptr;
211213
gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
212-
} else if (UseCudaMallocAsyncAllocator() ||
213-
options.experimental().use_cuda_malloc_async()) {
214+
} else if (UseCudaMallocAsyncAllocator()) {
214215
LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
215216
<< platform_device_id;
216217
// If true, passes all allocation requests through to cudaMallocAsync
217218
// TODO: useful for doing memory debugging with tools like
218219
// compute-sanitizer.
219220
// TODO: **WARNING** probably will not work in a multi-gpu scenario
220221
delete gpu_bfc_allocator;
222+
delete sub_allocator;
221223
gpu_bfc_allocator = nullptr;
224+
sub_allocator = nullptr;
222225
gpu_allocator =
223226
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
224227
}

0 commit comments

Comments
 (0)