Skip to content

Commit e755f6b

Browse files
committed
Revert "PR #49173: [Crash fix] Fix cudaMallocAsync crashes."
This reverts commit a4553f8.
1 parent dc74623 commit e755f6b

File tree

5 files changed

+46
-8
lines changed

5 files changed

+46
-8
lines changed

tensorflow/core/common_runtime/gpu/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ tf_cuda_cc_test(
318318
tags = tf_cuda_tests_tags(),
319319
deps = [
320320
":gpu_id",
321+
":gpu_runtime",
321322
"//tensorflow/cc:cc_ops",
322323
"//tensorflow/core:framework",
323324
"//tensorflow/core:framework_internal",

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,13 @@ void GpuCudaMallocAsyncAllocator::PrintAllocatorStatistics() {
9595
#endif
9696
}
9797

98+
std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);
99+
98100
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
99101
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
100102
bool compute_stats)
101103
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
104+
++number_instantiated_;
102105
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
103106
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
104107
platform_device_id)

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
6767
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
6868
size_t pool_size,
6969
bool reserve_memory = false,
70-
bool compute_stats = false);
70+
bool compute_stats = true);
7171
~GpuCudaMallocAsyncAllocator() override;
7272
string Name() override { return name_; }
7373
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -85,7 +85,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
8585

8686
void SetStream(void* stream) override {
8787
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
88-
cuda_stream_ = reinterpret_cast<CUstream>(stream);
88+
cuda_stream_ = *(static_cast<CUstream*>(stream));
8989
#endif
9090
}
9191

@@ -95,6 +95,8 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
9595
// - If CUDA_VERSION >= 11030, print cudaMallocAsync statistics.
9696
void PrintAllocatorStatistics();
9797

98+
static int GetInstantiatedCountTestOnly() { return number_instantiated_; }
99+
98100
private:
99101
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
100102
se::StreamExecutor* stream_exec_; // Not owned.
@@ -112,6 +114,10 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
112114
CUmemoryPool pool_;
113115
#endif // TF_CUDA_MALLOC_ASYNC_SUPPORTED
114116

117+
// Just a counter for the number of times this class is instantiated.
118+
// Only useful for tests.
119+
static std::atomic<int> number_instantiated_;
120+
115121
string name_;
116122

117123
TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);

tensorflow/core/common_runtime/gpu/gpu_device_test.cc

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
1919

2020
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
21+
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
2122
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
2223
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
2324
#include "tensorflow/core/lib/core/errors.h"
@@ -66,14 +67,17 @@ class GPUDeviceTest : public ::testing::Test {
6667
const string& visible_device_list = "",
6768
double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
6869
const std::vector<std::vector<float>>& memory_limit_mb = {},
69-
const std::vector<std::vector<int32>>& priority = {}) {
70+
const std::vector<std::vector<int32>>& priority = {},
71+
const bool use_cuda_malloc_async = false) {
7072
SessionOptions options;
7173
ConfigProto* config = &options.config;
7274
(*config->mutable_device_count())["GPU"] = gpu_device_count;
7375
GPUOptions* gpu_options = config->mutable_gpu_options();
7476
gpu_options->set_visible_device_list(visible_device_list);
7577
gpu_options->set_per_process_gpu_memory_fraction(
7678
per_process_gpu_memory_fraction);
79+
gpu_options->mutable_experimental()->set_use_cuda_malloc_async(
80+
use_cuda_malloc_async);
7781
for (int i = 0; i < memory_limit_mb.size(); ++i) {
7882
auto virtual_devices =
7983
gpu_options->mutable_experimental()->add_virtual_devices();
@@ -109,6 +113,33 @@ class GPUDeviceTest : public ::testing::Test {
109113
}
110114
};
111115

116+
TEST_F(GPUDeviceTest, CudaMallocAsync) {
117+
SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {},
118+
/*use_cuda_malloc_async=*/true);
119+
std::vector<std::unique_ptr<Device>> devices;
120+
Status status;
121+
int number_instantiated =
122+
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
123+
{ // The new scope is to trigger the destruction of the object.
124+
status = DeviceFactory::GetFactory("GPU")->CreateDevices(
125+
opts, kDeviceNamePrefix, &devices);
126+
EXPECT_EQ(devices.size(), 1);
127+
Device* device = devices[0].get();
128+
auto* device_info = device->tensorflow_gpu_device_info();
129+
EXPECT_NE(device_info, nullptr);
130+
131+
AllocatorAttributes allocator_attributes = AllocatorAttributes();
132+
allocator_attributes.set_gpu_compatible(true);
133+
Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
134+
void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
135+
EXPECT_NE(ptr, nullptr);
136+
allocator->DeallocateRaw(ptr);
137+
}
138+
EXPECT_EQ(number_instantiated + 1,
139+
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly());
140+
EXPECT_EQ(status.code(), error::OK);
141+
}
142+
112143
TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
113144
SessionOptions opts = MakeSessionOptions("0,abc");
114145
std::vector<std::unique_ptr<Device>> devices;

tensorflow/core/common_runtime/gpu/gpu_process_state.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -207,21 +207,18 @@ Allocator* GPUProcessState::GetGPUAllocator(
207207
// useful for doing memory debugging with tools like cuda-memcheck
208208
// **WARNING** probably will not work in a multi-gpu scenario
209209
delete gpu_bfc_allocator;
210-
delete sub_allocator;
211210
gpu_bfc_allocator = nullptr;
212-
sub_allocator = nullptr;
213211
gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
214-
} else if (UseCudaMallocAsyncAllocator()) {
212+
} else if (UseCudaMallocAsyncAllocator() ||
213+
options.experimental().use_cuda_malloc_async()) {
215214
LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
216215
<< platform_device_id;
217216
// If true, passes all allocation requests through to cudaMallocAsync
218217
// TODO: useful for doing memory debugging with tools like
219218
// compute-sanitizer.
220219
// TODO: **WARNING** probably will not work in a multi-gpu scenario
221220
delete gpu_bfc_allocator;
222-
delete sub_allocator;
223221
gpu_bfc_allocator = nullptr;
224-
sub_allocator = nullptr;
225222
gpu_allocator =
226223
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
227224
}

0 commit comments

Comments
 (0)