
Commit 2a08566

kimishpatel authored and facebook-github-bot committed
Simple caching allocator for CPU. (#42006)
Summary:
Pull Request resolved: #42006

This PR introduces a simple CPU caching allocator, intended specifically for mobile use cases and for inference. Nothing in the implementation prevents other use cases, but its simplicity may not be suitable everywhere. It simply tracks allocations by size and relies on deterministic, repeatable behavior in which allocations of the same sizes are made on every inference. Thus, after the first allocation, when a pointer is freed it is not returned to the system; instead the allocator caches it for subsequent use. Memory is freed automatically at the end of the process, or it can be explicitly freed. At the moment this is enabled only in DefaultMobileCPUAllocator.

Test Plan: android test: cpu_caching_allocator_test

Imported from OSS

Reviewed By: dreiss

Differential Revision: D22726976

fbshipit-source-id: 9a38b1ce34059d5653040a1c3d035bfc97609e6c
1 parent abe878c commit 2a08566
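For orientation, here is a minimal usage sketch distilled from the test and benchmark changes below. It assumes a build where DefaultMobileCPUAllocator is active (C10_MOBILE); the surrounding function and loop are hypothetical.

// Minimal usage sketch (assumes C10_MOBILE; function and loop are
// hypothetical, distilled from cpu_caching_allocator_test.cpp below).
#include <ATen/ATen.h>
#include <c10/core/CPUCachingAllocator.h>

void run_repeated_inference() {
  c10::CPUCachingAllocator caching_allocator;
  // While the guard is alive on this thread, DefaultMobileCPUAllocator
  // routes allocations and frees through the caching allocator.
  c10::WithCPUCachingAllocatorGuard guard(&caching_allocator);
  for (int i = 0; i < 10; ++i) {
    // Same-sized allocations on each iteration are served from the cache
    // after the first round instead of hitting the system allocator.
    at::Tensor t = at::rand({23, 23});
    // ... run the model ...
  }
  // On guard destruction the previous thread-local allocator is restored;
  // cached blocks go back to the OS when caching_allocator is destroyed.
}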

File tree

8 files changed: +297 −6 lines changed

aten/src/ATen/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -420,7 +420,7 @@ set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_MOBILE_BENCHMARK_SRCS ${ATen_MOBILE_BENCHMARK_SRCS} PARENT_SCOPE)
-set(ATen_MOBILE_TEST_SRCS ${ATen_VEC256_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
 set(ATen_QUANTIZED_TEST_SRCS ${ATen_QUANTIZED_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE)
 set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE)

aten/src/ATen/test/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
@@ -77,12 +77,13 @@ list(APPEND ATen_HIP_TEST_SRCS
 list(APPEND ATen_VULKAN_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)

-list(APPEND ATen_VEC256_TEST_SRCS
-  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp)
+list(APPEND ATen_MOBILE_TEST_SRCS
+  ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)

 # ---[ Send the lists to the parent scope.
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
 set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE)
 set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE)
-set(ATen_VEC256_TEST_SRCS ${ATen_VEC256_TEST_SRCS} PARENT_SCOPE)
+set(ATen_MOBILE_TEST_SRCS ${ATen_MOBILE_TEST_SRCS} PARENT_SCOPE)
aten/src/ATen/test/cpu_caching_allocator_test.cpp

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+#include <gtest/gtest.h>
+
+#include <ATen/cpu/vec256/vec256.h>
+#include <ATen/ATen.h>
+
+#include <c10/core/CPUCachingAllocator.h>
+
+TEST(CPUCachingAllocatorTest, check_alloc_free) {
+  c10::CPUCachingAllocator caching_allocator;
+  c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+      &caching_allocator);
+  at::Tensor a = at::rand({23, 23});
+  float* data_ptr = a.data_ptr<float>();
+  a.reset();
+  a = at::rand({23, 23});
+  // The second allocation of the same size must be served from the cache.
+  ASSERT_TRUE(data_ptr == a.data_ptr<float>());
+}
+
+// This should just free the pointer correctly.
+TEST(CPUCachingAllocatorTest, check_alloc_outside_free_inside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a = at::rand({23, 23});
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    float* data_ptr = a.data_ptr<float>();
+    a.reset();
+    a = at::rand({23, 23});
+  }
+}
+
+TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
+  c10::CPUCachingAllocator caching_allocator;
+  at::Tensor a;
+  {
+    c10::WithCPUCachingAllocatorGuard caching_allocator_guard(
+        &caching_allocator);
+    a = at::rand({23, 23});
+  }
+  a.reset();
+}
+
+int main(int argc, char* argv[]) {
+  // At the moment the caching allocator is only exposed to the mobile CPU
+  // allocator.
+#ifdef C10_MOBILE
+  ::testing::InitGoogleTest(&argc, argv);
+  at::manual_seed(42);
+  return RUN_ALL_TESTS();
+#endif /* C10_MOBILE */
+}

binaries/speed_benchmark_torch.cc

Lines changed: 11 additions & 0 deletions
@@ -24,6 +24,8 @@
 #include "torch/csrc/jit/serialization/import.h"
 #include "torch/script.h"

+#include "c10/core/CPUCachingAllocator.h"
+
 #include <chrono>
 using namespace std::chrono;

@@ -45,6 +47,10 @@ C10_DEFINE_bool(
     no_inputs,
     false,
     "Whether the model has any input. Will ignore other input arugments if true");
+C10_DEFINE_bool(
+    use_caching_allocator,
+    false,
+    "Whether to cache allocations between inference iterations");
 C10_DEFINE_int(
     use_bundled_input,
     -1,

@@ -198,6 +204,11 @@ int main(int argc, char** argv) {
     std::cout << module.forward(inputs) << std::endl;
   }

+  c10::CPUCachingAllocator caching_allocator;
+  c10::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
+  if (FLAGS_use_caching_allocator) {
+    caching_allocator_guard.emplace(&caching_allocator);
+  }
   std::cout << "Starting benchmark." << std::endl;
   std::cout << "Running warmup runs." << std::endl;
   CAFFE_ENFORCE(
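A plausible invocation exercising the new flag might look like the following; the other flags already exist in this binary, and the model path and input shape are illustrative:

./speed_benchmark_torch --model=model.pt --input_dims="1,3,224,224" --input_type=float --warmup=10 --iter=50 --use_caching_allocator=true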

c10/core/CPUAllocator.cpp

Lines changed: 17 additions & 2 deletions
@@ -1,4 +1,5 @@
 #include <c10/core/CPUAllocator.h>
+#include <c10/core/CPUCachingAllocator.h>
 #include <c10/core/DeviceType.h>

 // TODO: rename flags to C10

@@ -154,7 +155,15 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
     // TODO: enable with better TLS support on mobile
     // profiledCPUMemoryReporter().Delete(pointer);
-    c10::free_cpu(pointer);
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      allocator_ptr->free(pointer);
+    } else {
+      c10::free_cpu(pointer);
+      // This adds extra cost of freeing memory to the default case when the
+      // caching allocator is not enabled.
+      CPUCachingAllocator::record_free(pointer);
+    }
   }

   virtual DataPtr allocate(const size_t nbytes) const override {

@@ -168,7 +177,13 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }

     auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
-    void* const data = c10::alloc_cpu(alloc_size);
+    void* data;
+    auto allocator_ptr = GetThreadLocalCachingAllocator();
+    if (allocator_ptr != nullptr) {
+      data = allocator_ptr->allocate(alloc_size);
+    } else {
+      data = c10::alloc_cpu(alloc_size);
+    }
     // profiledCPUMemoryReporter().New(data, alloc_size);
     return {
         reinterpret_cast<uint8_t*>(data) + PreGuardBytes,

c10/core/CPUCachingAllocator.cpp

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+#include <c10/core/CPUCachingAllocator.h>
+
+namespace c10 {
+
+namespace {
+thread_local CPUCachingAllocator* caching_allocator_ptr{nullptr};
+} // namespace
+
+std::mutex CPUCachingAllocator::mutex_;
+ska::flat_hash_map<void*, size_t> CPUCachingAllocator::allocation_map_;
+
+inline void* CPUCachingAllocator::allocate_and_cache(const size_t bytes) {
+  void* ptr;
+  try {
+    ptr = c10::alloc_cpu(bytes);
+  } catch (c10::Error& e) {
+    // If allocation fails, try freeing cached available blocks.
+    // For now free all available cached blocks.
+    free_cached();
+    // Furthermore to consider: if we ever get here having run out of
+    // memory, it is perhaps best to disable caching, since this is likely
+    // to happen again.
+    // Try again.
+    ptr = c10::alloc_cpu(bytes);
+  }
+  allocation_map_[ptr] = bytes;
+  return ptr;
+}
+
+void* CPUCachingAllocator::allocate(const size_t bytes) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = available_map_.find(bytes);
+  if (it == available_map_.end() || it->second.empty()) {
+    return allocate_and_cache(bytes);
+  }
+  return it->second.pop_back_val();
+}
+
+void CPUCachingAllocator::free(void* ptr) {
+  // NB: Since we are not really freeing the memory, cases such as
+  // quantization code freeing original weights on mobile will not quite
+  // work, as we will likely hold on to that memory.
+  // NB: We can also enforce a maximum amount of cached memory for better
+  // memory management, such that free actually frees the memory if we
+  // are nearing or above the watermark.
+  std::lock_guard<std::mutex> guard(mutex_);
+  // If this allocation was made before the caching allocator was enabled,
+  // then free it regularly.
+  const auto& it = allocation_map_.find(ptr);
+  if (it == allocation_map_.end()) {
+    c10::free_cpu(ptr);
+    return;
+  }
+  const size_t alloc_size = it->second;
+  available_map_[alloc_size].push_back(ptr);
+}
+
+void CPUCachingAllocator::record_free(void* ptr) {
+  // This function captures the case when the allocated memory is being
+  // freed outside the scope of this allocator. At the moment the only way
+  // to capture this is to have the allocator that uses this
+  // CachingAllocator as the backing allocator call this function
+  // explicitly upon freeing memory while outside the scope of the caching
+  // allocator. If the memory is freed in some other way, then we will
+  // likely have undefined behavior or a page fault. But this can be the
+  // case without the caching allocator as well.
+  std::lock_guard<std::mutex> guard(mutex_);
+  const auto& it = allocation_map_.find(ptr);
+  if (it != allocation_map_.end()) {
+    allocation_map_.erase(it);
+  }
+}
+
+void CPUCachingAllocator::free_cached() {
+  for (const auto& it : available_map_) {
+    for (const auto ptr : it.second) {
+      c10::free_cpu(ptr);
+      // When cached memory is returned to the OS, it must be removed
+      // from allocation_map_.
+      allocation_map_.erase(ptr);
+    }
+  }
+  available_map_.clear();
+}
+
+CPUCachingAllocator::~CPUCachingAllocator() {
+  free_cached();
+}
+
+CPUCachingAllocator* GetThreadLocalCachingAllocator() {
+  return caching_allocator_ptr;
+}
+
+WithCPUCachingAllocatorGuard::WithCPUCachingAllocatorGuard(
+    CPUCachingAllocator* allocator) {
+  prev_caching_allocator_ptr_ = GetThreadLocalCachingAllocator();
+  // Install the new allocator; without this the guard would be a no-op.
+  caching_allocator_ptr = allocator;
+}
+
+WithCPUCachingAllocatorGuard::~WithCPUCachingAllocatorGuard() {
+  caching_allocator_ptr = prev_caching_allocator_ptr_;
+}
+
+} // namespace c10
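To make the size-keyed mechanics concrete, a small hypothetical illustration of direct use of this API follows; in practice the allocator is driven through DefaultMobileCPUAllocator rather than called directly, and pointer equality below follows from the implementation above.

// Hypothetical direct use of the allocator API (assumes a C10_MOBILE build).
c10::CPUCachingAllocator allocator;
void* p1 = allocator.allocate(2048); // cache miss: alloc_cpu, recorded in allocation_map_
allocator.free(p1);                  // not returned to the OS; parked in available_map_[2048]
void* p2 = allocator.allocate(2048); // cache hit: p2 == p1
allocator.free(p2);
// ~CPUCachingAllocator() calls free_cached(), returning blocks to the OS.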

c10/core/CPUCachingAllocator.h

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <memory>
+#include <mutex>
+
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/flat_hash_map.h>
+
+namespace c10 {
+
+class C10_API CPUCachingAllocator {
+  /*
+   * What it does:
+   * Caches all the allocations carried out by this allocator.
+   * The cache key is the size of the allocation.
+   * If the requested size is found in the cache, returns the cached pointer.
+   * What it does not do:
+   * No speculative allocation for any future allocations.
+   */
+ private:
+  // Invariants.
+  // 1. If memory is ever allocated via this allocator, then the pointer
+  //    will exist in allocation_map_, unless the allocator returned the
+  //    memory to the OS via free_cached.
+  //    1.1. Therefore even when said memory is "freed" via this allocator
+  //         (and thus cached), it will continue to stay in allocation_map_.
+  //         Furthermore it will also exist in available_map_. Thus an
+  //         allocated memory pointer can be in both allocation_map_ and
+  //         available_map_ simultaneously.
+  // 2. A memory pointer may be removed from allocation_map_ when it is
+  //    freed outside the scope of this allocator, but was allocated by
+  //    this allocator.
+  // 3. available_map_ only contains memory which was allocated by this
+  //    allocator and subsequently freed by this allocator.
+  // As a result of the above invariants, an allocated memory pointer
+  // cannot be in available_map_ unless it is in allocation_map_ as well.
+  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
+  static ska::flat_hash_map<void*, size_t> allocation_map_;
+  // Since allocation_map_, which is a global instance, is mutated/read via
+  // all public APIs, we need a global mutex.
+  static std::mutex mutex_;
+  inline void* allocate_and_cache(const size_t bytes);
+  void free_cached();
+
+ public:
+  static void record_free(void* ptr);
+  // Checks the cache to see if an allocation of size bytes can be found.
+  // If so, returns the cached memory; else allocates memory, records it
+  // for caching, and returns it.
+  void* allocate(const size_t bytes);
+  // Checks if the memory being freed was marked for allocation by an
+  // earlier call to allocate. If so, caches the allocation.
+  // Otherwise frees it.
+  void free(void* ptr);
+  // Mainly for testing.
+  ~CPUCachingAllocator();
+};
+
+CPUCachingAllocator* GetDefaultCPUCachingAllocator();
+
+bool ThreadLocalCachingAllocatorEnabled();
+CPUCachingAllocator* GetThreadLocalCachingAllocator();
+
+/*
+ * Usage pattern:
+ * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
+ *   std::make_unique<c10::CPUCachingAllocator>();
+ * {
+ *   c10::WithCPUCachingAllocatorGuard guard(caching_allocator.get());
+ *   ...
+ * }
+ */
+class C10_API WithCPUCachingAllocatorGuard {
+ public:
+  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
+  ~WithCPUCachingAllocatorGuard();
+
+ private:
+  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
+};
+
+} // namespace c10

torch/csrc/jit/mobile/observer.h

Lines changed: 24 additions & 0 deletions
@@ -36,6 +36,30 @@ class MobileDebugInfo : public c10::DebugInfoBase {
 private:
  std::string model_name_;
  std::string method_name_;
+  // TODO: Kimish
+  // If we launch a thread, such as for at::launch or interpreter
+  // continuation, and the caching allocator is enabled in the base thread,
+  // then in order to propagate this information (that the caching
+  // allocator is enabled) across thread boundaries, we can use the
+  // mechanism provided by ThreadLocalDebugInfo.
+  // Once the thread local MobileDebugInfo is accessible in the launched
+  // thread, that thread can read it and set its own thread local
+  // CachingAllocatorInfo.
+  // However, we cannot expect every launched thread to extract and set
+  // its own thread local copy of CachingAllocatorInfo.
+  // But this can be done in the lite interpreter, where in the run method
+  // it can do:
+  //   info = c10::ThreadLocalDebugInfo::get(
+  //       c10::DebugInfoKind::MOBILE_RUNTIME_INFO))
+  //       .get_caching_allocator_info();
+  //   GetThreadLocalCachingAllocatorInfo() = info;
+  // Another option is to have MobileDebugInfo itself be the place where
+  // the thread local copy of CachingAllocatorInfo is stored. Then
+  // DefaultMobileCPUAllocator inspects it to decide whether to use the
+  // CachingAllocator. However, the current lite interpreter does not
+  // support FORK, so the run method of the lite interpreter will not
+  // launch another instance of the lite interpreter in a different
+  // thread. So for now we are not worrying about passing
+  // CachingAllocatorInfo across thread boundaries.
+  // c10::CachingAllocatorInfo caching_allocator_info;
  size_t op_idx_ = 0;
 };
