Skip to content

Commit 76e6109

Browse files
JXRiver and tensorflower-gardener
authored and committed
Remove unnecessary GPU allocator wrapped in GPUcudaMallocAllocator.
GPUcudaMallocAllocator doesn't use the passed-in GPU allocator, and its GetStats doesn't return the correct value. Removing both to avoid confusion.

PiperOrigin-RevId: 377383677
Change-Id: I09ee5def36588d70fc3c1ffa465ce28a2f2af018
1 parent 4233932 commit 76e6109

File tree

3 files changed

+16
-19
lines changed

3 files changed

+16
-19
lines changed

tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,12 @@ limitations under the License.
2727
namespace tensorflow {
2828

2929
GPUcudaMallocAllocator::GPUcudaMallocAllocator(
30-
Allocator* allocator, PlatformDeviceId platform_device_id)
31-
: base_allocator_(allocator) {
30+
PlatformDeviceId platform_device_id) {
3231
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
3332
platform_device_id)
3433
.ValueOrDie();
3534
}
3635

37-
GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; }
38-
3936
void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
4037
#ifdef GOOGLE_CUDA
4138
// allocate with cudaMalloc
@@ -80,10 +77,6 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
8077
#endif // GOOGLE_CUDA
8178
}
8279

83-
absl::optional<AllocatorStats> GPUcudaMallocAllocator::GetStats() {
84-
return base_allocator_->GetStats();
85-
}
86-
8780
bool GPUcudaMallocAllocator::TracksAllocationSizes() const { return false; }
8881

8982
} // namespace tensorflow

tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,17 @@ limitations under the License.
2626

2727
namespace tensorflow {
2828

29-
// An allocator that wraps a GPU allocator and adds debugging
30-
// functionality that verifies that users do not write outside their
31-
// allocated memory.
29+
// An allocator which directly uses cuMemAlloc and cuMemFree to allocate and
30+
// free memory.
3231
class GPUcudaMallocAllocator : public Allocator {
3332
public:
34-
explicit GPUcudaMallocAllocator(Allocator* allocator,
35-
PlatformDeviceId platform_device_id);
36-
~GPUcudaMallocAllocator() override;
33+
explicit GPUcudaMallocAllocator(PlatformDeviceId platform_device_id);
3734
string Name() override { return "gpu_debug"; }
3835
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
3936
void DeallocateRaw(void* ptr) override;
4037
bool TracksAllocationSizes() const override;
41-
absl::optional<AllocatorStats> GetStats() override;
4238

4339
private:
44-
Allocator* base_allocator_ = nullptr; // owned
45-
4640
se::StreamExecutor* stream_exec_; // Not owned.
4741

4842
TF_DISALLOW_COPY_AND_ASSIGN(GPUcudaMallocAllocator);

tensorflow/core/common_runtime/gpu/gpu_process_state.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,22 @@ Allocator* GPUProcessState::GetGPUAllocator(
204204
// If true, passes all allocation requests through to cudaMalloc
205205
// useful for doing memory debugging with tools like cuda-memcheck
206206
// **WARNING** probably will not work in a multi-gpu scenario
207-
gpu_allocator =
208-
new GPUcudaMallocAllocator(gpu_allocator, platform_device_id);
207+
delete gpu_bfc_allocator;
208+
delete sub_allocator;
209+
gpu_bfc_allocator = nullptr;
210+
sub_allocator = nullptr;
211+
gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
209212
} else if (UseCudaMallocAsyncAllocator()) {
210213
LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
211214
<< platform_device_id;
212215
// If true, passes all allocation requests through to cudaMallocAsync
213216
// TODO: useful for doing memory debugging with tools like
214217
// compute-sanitizer.
215218
// TODO: **WARNING** probably will not work in a multi-gpu scenario
219+
delete gpu_bfc_allocator;
220+
delete sub_allocator;
221+
gpu_bfc_allocator = nullptr;
222+
sub_allocator = nullptr;
216223
gpu_allocator =
217224
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
218225
}
@@ -259,6 +266,9 @@ SharedCounter* GPUProcessState::GPUAllocatorCounter(TfDeviceId tf_device_id) {
259266

260267
AllocatorParts& allocator_parts = gpu_allocators_[tf_device_id.value()];
261268
if (allocator_parts.counter.get() == nullptr) {
269+
if (allocator_parts.bfc_allocator == nullptr) {
270+
return nullptr;
271+
}
262272
SharedCounter* timing_counter = new SharedCounter;
263273
allocator_parts.bfc_allocator->SetTimingCounter(timing_counter);
264274
allocator_parts.counter.reset(timing_counter);

0 commit comments

Comments (0)