Skip to content

Commit a4553f8

Browse files
zhangyujing authored and
tensorflower-gardener committed
PR #49173: [Crash fix] Fix cudaMallocAsync crashes.
Imported from GitHub PR #49173 The first commit fixes #48869. The second commit fixes another follow-up crash when using TF_GPU_ALLOCATOR=cuda_malloc_async. The 2 fixes are: - The Allocator API has... PiperOrigin-RevId: 380643089 Change-Id: I06f04d8b2d8ed6b08b91f94123a5e9e8a1681793
1 parent 4878ad9 commit a4553f8

File tree

5 files changed

+8
-46
lines changed

5 files changed

+8
-46
lines changed

tensorflow/core/common_runtime/gpu/BUILD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,6 @@ tf_cuda_cc_test(
318318
tags = tf_cuda_tests_tags(),
319319
deps = [
320320
":gpu_id",
321-
":gpu_runtime",
322321
"//tensorflow/cc:cc_ops",
323322
"//tensorflow/core:framework",
324323
"//tensorflow/core:framework_internal",

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,10 @@ static std::string GetCudaErrorMessage(CUresult result) {
4141
}
4242
#endif // GOOGLE_CUDA
4343

44-
std::atomic<int> GpuCudaMallocAsyncAllocator::number_instantiated_(0);
45-
4644
GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
4745
PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
4846
bool compute_stats)
4947
: name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
50-
++number_instantiated_;
5148
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
5249
stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
5350
platform_device_id)

tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
6767
explicit GpuCudaMallocAsyncAllocator(PlatformDeviceId platform_device_id,
6868
size_t pool_size,
6969
bool reserve_memory = false,
70-
bool compute_stats = true);
70+
bool compute_stats = false);
7171
~GpuCudaMallocAsyncAllocator() override;
7272
string Name() override { return name_; }
7373
void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -85,12 +85,10 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
8585

8686
void SetStream(void* stream) override {
8787
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
88-
cuda_stream_ = *(static_cast<CUstream*>(stream));
88+
cuda_stream_ = reinterpret_cast<CUstream>(stream);
8989
#endif
9090
}
9191

92-
static int GetInstantiatedCountTestOnly() { return number_instantiated_; }
93-
9492
private:
9593
#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
9694
se::StreamExecutor* stream_exec_; // Not owned.
@@ -108,10 +106,6 @@ class GpuCudaMallocAsyncAllocator : public Allocator {
108106
CUmemoryPool pool_;
109107
#endif // TF_CUDA_MALLOC_ASYNC_SUPPORTED
110108

111-
// Just a counter for the number of time this class is instantiated.
112-
// Only useful for tests.
113-
static std::atomic<int> number_instantiated_;
114-
115109
string name_;
116110

117111
TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);

tensorflow/core/common_runtime/gpu/gpu_device_test.cc

Lines changed: 1 addition & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ limitations under the License.
1818
#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
1919

2020
#include "tensorflow/core/common_runtime/device/device_id_utils.h"
21-
#include "tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h"
2221
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
2322
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
2423
#include "tensorflow/core/lib/core/errors.h"
@@ -67,17 +66,14 @@ class GPUDeviceTest : public ::testing::Test {
6766
const string& visible_device_list = "",
6867
double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
6968
const std::vector<std::vector<float>>& memory_limit_mb = {},
70-
const std::vector<std::vector<int32>>& priority = {},
71-
const bool use_cuda_malloc_async = false) {
69+
const std::vector<std::vector<int32>>& priority = {}) {
7270
SessionOptions options;
7371
ConfigProto* config = &options.config;
7472
(*config->mutable_device_count())["GPU"] = gpu_device_count;
7573
GPUOptions* gpu_options = config->mutable_gpu_options();
7674
gpu_options->set_visible_device_list(visible_device_list);
7775
gpu_options->set_per_process_gpu_memory_fraction(
7876
per_process_gpu_memory_fraction);
79-
gpu_options->mutable_experimental()->set_use_cuda_malloc_async(
80-
use_cuda_malloc_async);
8177
for (int i = 0; i < memory_limit_mb.size(); ++i) {
8278
auto virtual_devices =
8379
gpu_options->mutable_experimental()->add_virtual_devices();
@@ -113,33 +109,6 @@ class GPUDeviceTest : public ::testing::Test {
113109
}
114110
};
115111

116-
TEST_F(GPUDeviceTest, CudaMallocAsync) {
117-
SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {},
118-
/*use_cuda_malloc_async=*/true);
119-
std::vector<std::unique_ptr<Device>> devices;
120-
Status status;
121-
int number_instantiated =
122-
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
123-
{ // The new scope is to trigger the destruction of the object.
124-
status = DeviceFactory::GetFactory("GPU")->CreateDevices(
125-
opts, kDeviceNamePrefix, &devices);
126-
EXPECT_EQ(devices.size(), 1);
127-
Device* device = devices[0].get();
128-
auto* device_info = device->tensorflow_gpu_device_info();
129-
EXPECT_NE(device_info, nullptr);
130-
131-
AllocatorAttributes allocator_attributes = AllocatorAttributes();
132-
allocator_attributes.set_gpu_compatible(true);
133-
Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
134-
void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
135-
EXPECT_NE(ptr, nullptr);
136-
allocator->DeallocateRaw(ptr);
137-
}
138-
EXPECT_EQ(number_instantiated + 1,
139-
GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly());
140-
EXPECT_EQ(status.code(), error::OK);
141-
}
142-
143112
TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
144113
SessionOptions opts = MakeSessionOptions("0,abc");
145114
std::vector<std::unique_ptr<Device>> devices;

tensorflow/core/common_runtime/gpu/gpu_process_state.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,18 +207,21 @@ Allocator* GPUProcessState::GetGPUAllocator(
207207
// useful for doing memory debugging with tools like cuda-memcheck
208208
// **WARNING** probably will not work in a multi-gpu scenario
209209
delete gpu_bfc_allocator;
210+
delete sub_allocator;
210211
gpu_bfc_allocator = nullptr;
212+
sub_allocator = nullptr;
211213
gpu_allocator = new GPUcudaMallocAllocator(platform_device_id);
212-
} else if (UseCudaMallocAsyncAllocator() ||
213-
options.experimental().use_cuda_malloc_async()) {
214+
} else if (UseCudaMallocAsyncAllocator()) {
214215
LOG(INFO) << "Using CUDA malloc Async allocator for GPU: "
215216
<< platform_device_id;
216217
// If true, passes all allocation requests through to cudaMallocAsync
217218
// TODO: useful for doing memory debugging with tools like
218219
// compute-sanitizer.
219220
// TODO: **WARNING** probably will not work in a multi-gpu scenario
220221
delete gpu_bfc_allocator;
222+
delete sub_allocator;
221223
gpu_bfc_allocator = nullptr;
224+
sub_allocator = nullptr;
222225
gpu_allocator =
223226
new GpuCudaMallocAsyncAllocator(platform_device_id, total_bytes);
224227
}

0 commit comments

Comments
 (0)