
Commit 2a08566

kimishpatel authored and facebook-github-bot committed
Simple caching allocator for CPU. (#42006)
Summary:
Pull Request resolved: #42006

This PR introduces a simple CPU caching allocator, intended specifically for mobile use cases and for inference. Nothing in the implementation prevents other use cases, but its simplicity may not be suitable everywhere. It simply tracks allocations by size and relies on deterministic, repeatable behavior in which allocations of the same sizes are made on every inference. Thus, after the first allocation, when a pointer is freed it is not returned to the system; instead the allocator caches it for subsequent use. Memory is freed automatically at the end of the process, or it can be explicitly freed. At the moment this is enabled only in DefaultMobileCPUAllocator.

Test Plan: android test: cpu_caching_allocator_test

Imported from OSS

Reviewed By: dreiss

Differential Revision: D22726976

fbshipit-source-id: 9a38b1ce34059d5653040a1c3d035bfc97609e6c
1 parent abe878c commit 2a08566
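For orientation, here is a minimal usage sketch distilled from the test and benchmark changes below. It assumes a build where DefaultMobileCPUAllocator is active (C10_MOBILE); the surrounding function and loop are hypothetical.

// Minimal usage sketch (assumes C10_MOBILE; function and loop are
// hypothetical, distilled from cpu_caching_allocator_test.cpp below).
#include <ATen/ATen.h>
#include <c10/core/CPUCachingAllocator.h>

void run_repeated_inference() {
  c10::CPUCachingAllocator caching_allocator;
  // While the guard is alive on this thread, DefaultMobileCPUAllocator
  // routes allocations and frees through the caching allocator.
  c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
  for (int i = 0; i < 10; ++i) {
    // Same-sized allocations on each iteration are served from the cache
    // after the first round instead of hitting the system allocator.
    at::Tensor t = at::rand({23, 23});
    // ... run the model ...
  }
  // On guard destruction the previous thread-local allocator is restored;
  // cached blocks go back to the OS when caching_allocator is destroyed.
}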

File tree

8 files changed: +297 −6 lines changed

aten/src/ATen/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -420,7 +420,7 @@ set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MOBILE_BENCHMARK_SRCS ${ATen_MOBILE_BENCHMARK_SRCS} PARENT_SCOPE)
-set(ATen_MOBILE_TEST_SRCS ${ATen_VEC256_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_TEST_SRCS ${ATen_QUANTIZED_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)

aten/src/ATen/test/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
@@ -77,12 +77,13 @@ list(APPEND ATen_HIP_TEST_SRCS
 list(APPEND ATen_VULKAN_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)

-list(APPEND ATen_VEC256_TEST_SRCS
-  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp)
+list(APPEND ATen_MOBILE_TEST_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)

 # ---[ Send the lists to the parent scope.
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
-set(ATen_VEC256_TEST_SRCS ${ATen_VEC256_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
aten/src/ATen/test/cpu_caching_allocator_test.cpp

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+#include <gtest/gtest.h>
+
+#include <ATen/cpu/vec256/vec256.h>
+#include <ATen/ATen.h>
+
+#include <c10/core/CPUCachingAllocator.h>
+
+TEST(CPUCachingAllocatorTest, check_alloc_free) {
+  c10::CPUCachingAllocator caching_allocator;
+  c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+      &caching_allocator);
+  at::Tensor a = at::rand({23, 23});
+  float* data_ptr = a.data_ptr<float>();
+  a.reset();
+  a = at::rand({23, 23});
+  // The second allocation of the same size must be served from the cache.
+  ASSERT_TRUE(data_ptr == a.data_ptr<float>());
+}
+
+// This should just free the pointer correctly.
+TEST(CPUCachingAllocatorTest, check_alloc_outside_free_inside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a = at::rand({23, 23});
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    float* data_ptr = a.data_ptr<float>();
+    a.reset();
+    a = at::rand({23, 23});
+  }
+}
+
+TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a;
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    a = at::rand({23, 23});
+  }
+  a.reset();
+}
+
+int main(int argc, char* argv[]) {
+  // At the moment the caching allocator is only exposed to the mobile CPU
+  // allocator.
+#ifdef C10_MOBILE
+  ::testing::InitGoogleTest(&argc, argv);
+  at::manual_seed(42);
+  return RUN_ALL_TESTS();
+#endif /* C10_MOBILE */
+}

binaries/speed_benchmark_torch.cc

Lines changed: 11 additions & 0 deletions
@@ -24,6 +24,8 @@
 #include "torch/csrc/jit/serialization/import.h"
 #include "torch/script.h"

+#include "c10/core/CPUCachingAllocator.h"
+
 #include <chrono>
 using namespace std::chrono;

@@ -45,6 +47,10 @@ C10_DEFINE_bool(
     no_inputs,
     false,
     "Whether the model has any input. Will ignore other input arugments if true");
+C10_DEFINE_bool(
+    use_caching_allocator,
+    false,
+    "Whether to cache allocations between inference iterations");
 C10_DEFINE_int(
     use_bundled_input,
     -1,

@@ -198,6 +204,11 @@ int main(int argc, char** argv) {
     std::cout << module.forward(inputs) << std::endl;
   }

+  c10::CPUCachingAllocator caching_allocator;
+  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
+  if (FLAGS_use_caching_allocator) {
+    caching_allocator_guard.emplace(&caching_allocator);
+  }
   std::cout << "Starting benchmark." << std::endl;
   std::cout << "Running warmup runs." << std::endl;
   CAFFE_ENFORCE(
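A plausible invocation exercising the new flag might look like the following; the other flags already exist in this binary, and the model path and input shape are illustrative:

./speed_benchmark_torch --model=model.pt --input_dims="1,3,224,224" --input_type=float --warmup=10 --iter=50 --use_caching_allocator=true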

c10/core/CPUAllocator.cpp

Lines changed: 17 additions & 2 deletions
@@ -1,4 +1,5 @@
 #include <c10/core/CPUAllocator.h>
+#include <c10/core/CPUCachingAllocator.h>
 #include <c10/core/DeviceType.h>

 // TODO: rename flags to C10

@@ -154,7 +155,15 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
     // TODO: enable with better TLS support on mobile
     // profiledCPUMemoryReporter().Delete(pointer);
-    c10::free_cpu(pointer);
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      allocator_ptr->free(pointer);
+    } else {
+      c10::free_cpu(pointer);
+      // This adds extra cost of freeing memory to the default case when the
+      // caching allocator is not enabled.
+      CPUCachingAllocator::record_free(pointer);
+    }
   }

   virtual DataPtr allocate(const size_t nbytes) const override {

@@ -168,7 +177,13 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }

     auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
-    void* const data = c10::alloc_cpu(alloc_size);
+    void* data;
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      data = allocator_ptr->allocate(alloc_size);
+    } else {
+      data = c10::alloc_cpu(alloc_size);
+    }
     // profiledCPUMemoryReporter().New(data, alloc_size);
     return {
         reinterpret_cast<uint8_t*>(data) + PreGuardBytes,

c10/core/CPUCachingAllocator.cpp

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+#include <c10/core/CPUCachingAllocator.h>
+
+namespace c10 {
+
+namespace {
+thread_local CPUCachingAllocator* caching_allocator_ptr{nullptr};
+} // namespace
+
+std::mutex CPUCachingAllocator::mutex_;
+ska::flat_hash_map<void*, size_t> CPUCachingAllocator::allocation_map_;
+
+inline void* CPUCachingAllocator::allocate_and_cache(const size_t bytes) {
+  void* ptr;
+  try {
+    ptr = c10::alloc_cpu(bytes);
+  } catch (c10::Error& e) {
+    // If allocation fails, try freeing cached available blocks.
+    // For now free all available cached blocks.
+    free_cached();
+    // Furthermore to consider: if we ever get here having run out of
+    // memory, it is perhaps best to disable caching, since this is likely
+    // to happen again.
+    // Try again.
+    ptr = c10::alloc_cpu(bytes);
+  }
+  allocation_map_[ptr] = bytes;
+  return ptr;
+}
+
+void* CPUCachingAllocator::allocate(const size_t bytes) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = available_map_.find(bytes);
+  if (it == available_map_.end() || it->second.empty()) {
+    return allocate_and_cache(bytes);
+  }
+  return it->second.pop_back_val();
+}
+
+void CPUCachingAllocator::free(void* ptr) {
+  // NB: Since we are not really freeing the memory, cases such as
+  // quantization code freeing original weights on mobile will not quite
+  // work, as we will likely hold on to that memory.
+  // NB: We can also enforce a maximum amount of cached memory for better
+  // memory management, such that free actually frees the memory if we
+  // are nearing or above the watermark.
+  std::lock_guard<std::mutex> guard(mutex_);
+  // If this allocation was made before the caching allocator was enabled,
+  // then free it regularly.
+  const auto& it = allocation_map_.find(ptr);
+  if (it == allocation_map_.end()) {
+    c10::free_cpu(ptr);
+    return;
+  }
+  const size_t alloc_size = it->second;
+  available_map_[alloc_size].push_back(ptr);
+}
+
+void CPUCachingAllocator::record_free(void* ptr) {
+  // This function captures the case when the allocated memory is being
+  // freed outside the scope of this allocator. At the moment the only way
+  // to capture this is to have the allocator that uses this
+  // CachingAllocator as the backing allocator call this function
+  // explicitly upon freeing memory while outside the scope of the caching
+  // allocator. If the memory is freed in some other way, then we will
+  // likely have undefined behavior or a page fault. But this can be the
+  // case without the caching allocator as well.
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = allocation_map_.find(ptr);
+  if (it != allocation_map_.end()) {
+    allocation_map_.erase(it);
+  }
+}
+
+void CPUCachingAllocator::free_cached() {
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      c10::free_cpu(ptr);
+      // When cached memory is returned to the OS, it must be removed
+      // from allocation_map_.
+      allocation_map_.erase(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  free_cached();
+}
+
+CPUCachingAllocator* GetThreadLocalCachingAllocator() {
+  return caching_allocator_ptr;
+}
+
+WithCPUCachingAllocatorGuard::WithCPUCachingAllocatorGuard(
+    CPUCachingAllocator* allocator) {
+  prev_caching_allocator_ptr_ = GetThreadLocalCachingAllocator();
+  // Install the new allocator; without this the guard would be a no-op.
+  caching_allocator_ptr = allocator;
+}
+
+WithCPUCachingAllocatorGuard::~WithCPUCachingAllocatorGuard() {
+  caching_allocator_ptr = prev_caching_allocator_ptr_;
+}
+
+} // namespace c10
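To make the size-keyed mechanics concrete, a small hypothetical illustration of direct use of this API follows; in practice the allocator is driven through DefaultMobileCPUAllocator rather than called directly, and pointer equality below follows from the implementation above.

// Hypothetical direct use of the allocator API (assumes a C10_MOBILE build).
c10::CPUCachingAllocator allocator;
void* p1 = allocator.allocate(2048); // cache miss: alloc_cpu, recorded in allocation_map_
allocator.free(p1);                  // not returned to the OS; parked in available_map_[2048]
void* p2 = allocator.allocate(2048); // cache hit: p2 == p1
allocator.free(p2);
// ~CPUCachingAllocator() calls free_cached(), returning blocks to the OS.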

c10/core/CPUCachingAllocator.h

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <memory>
+#include <mutex>
+
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/flat_hash_map.h>
+
+namespace c10 {
+
+class C10_API CPUCachingAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * The cache key is the size of the allocation.
+   * If the requested size is found in the cache, returns the cached pointer.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  // Invariants.
+  // 1. If memory is ever allocated via this allocator, then the pointer
+  //    will exist in allocation_map_, unless the allocator returned the
+  //    memory to the OS via free_cached.
+  //    1.1. Therefore even when said memory is "freed" via this allocator
+  //         (and thus cached), it will continue to stay in allocation_map_.
+  //         Furthermore it will also exist in available_map_. Thus an
+  //         allocated memory pointer can be in both allocation_map_ and
+  //         available_map_ simultaneously.
+  // 2. A memory pointer may be removed from allocation_map_ when it is
+  //    freed outside the scope of this allocator, but was allocated by
+  //    this allocator.
+  // 3. available_map_ only contains memory which was allocated by this
+  //    allocator and subsequently freed by this allocator.
+  // As a result of the above invariants, an allocated memory pointer
+  // cannot be in available_map_ unless it is in allocation_map_ as well.
+  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
+  static ska::flat_hash_map<void*, size_t> allocation_map_;
+  // Since allocation_map_, which is a global instance, is mutated/read via
+  // all public APIs, we need a global mutex.
+  static std::mutex mutex_;
+  inline void* allocate_and_cache(const size_t bytes);
+  void free_cached();
+
+ public:
+  static void record_free(void* ptr);
+  // Checks the cache to see if an allocation of size bytes can be found.
+  // If so, returns the cached memory; else allocates memory, records it
+  // for caching, and returns it.
+  void* allocate(const size_t bytes);
+  // Checks if the memory being freed was marked for allocation by an
+  // earlier call to allocate. If so, caches the allocation.
+  // Otherwise frees it.
+  void free(void* ptr);
+  // Mainly for testing.
+  ~CPUCachingAllocator();
+};
+
+CPUCachingAllocator* GetDefaultCPUCachingAllocator();
+
+bool ThreadLocalCachingAllocatorEnabled();
+CPUCachingAllocator* GetThreadLocalCachingAllocator();
+
+/*
+ * Usage pattern:
+ * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
+ *   std::make_unique<c10::CPUCachingAllocator>();
+ * {
+ *   c10::WithCPUCachingAllocatorGuard guard(caching_allocator.get());
+ *   ...
+ * }
+ */
+class C10_API WithCPUCachingAllocatorGuard {
+ public:
+  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
+  ~WithCPUCachingAllocatorGuard();
+
+ private:
+  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
+};
+
+} // namespace c10

torch/csrc/jit/mobile/observer.h

Lines changed: 24 additions & 0 deletions
@@ -36,6 +36,30 @@ class MobileDebugInfo : public c10::DebugInfoBase {
 private:
  std::string model_name_;
  std::string method_name_;
+  // TODO: Kimish
+  // If we launch a thread, such as for at::launch or interpreter
+  // continuation, and the caching allocator is enabled in the base thread,
+  // then in order to propagate this information (that the caching
+  // allocator is enabled) across thread boundaries, we can use the
+  // mechanism provided by ThreadLocalDebugInfo.
+  // Once the thread local MobileDebugInfo is accessible in the launched
+  // thread, that thread can read it and set its own thread local
+  // CachingAllocatorInfo.
+  // However, we cannot expect every launched thread to extract and set
+  // its own thread local copy of CachingAllocatorInfo.
+  // But this can be done in the lite interpreter, where in the run method
+  // it can do:
+  //   info = c10::ThreadLocalDebugInfo::get(
+  //       c10::DebugInfoKind::MOBILE_RUNTIME_INFO))
+  //       .get_caching_allocator_info();
+  //   GetThreadLocalCachingAllocatorInfo() = info;
+  // Another option is to have MobileDebugInfo itself be the place where
+  // the thread local copy of CachingAllocatorInfo is stored. Then
+  // DefaultMobileCPUAllocator inspects it to decide whether to use the
+  // CachingAllocator. However, the current lite interpreter does not
+  // support FORK, so the run method of the lite interpreter will not
+  // launch another instance of the lite interpreter in a different
+  // thread. So for now we are not worrying about passing
+  // CachingAllocatorInfo across thread boundaries.
+  // c10::CachingAllocatorInfo caching_allocator_info;
  size_t op_idx_ = 0;
 };
