
Commit 39437f4

skiminki authored and vondele committed
Advise the kernel to use huge pages (Linux)
Align the TT allocation by 2M to make it huge page friendly and advise the kernel to use huge pages.

Benchmarks on my i7-8700K (6C/12T) box (3 runs per bench per config):

                              vanilla (nps)                    hugepages (nps)           avg
==================================================================================
bench             |  3012490  3024364  3036331     3071052  3067544  3071052    +1.5%
bench 16 12 20    | 19237932 19050166 19085315    19266346 19207025 19548758    +1.1%
bench 16384 12 20 | 18182313 18371581 18336838    19381275 19738012 19620225    +7.0%

On my box, huge pages have a significant perf impact when using a big hash size. They also speed up TT initialization big time:

                                     vanilla (s)   huge pages (s)   speed-up
=======================================================================
time stockfish bench 16384 1 1  |       5.37            1.48           3.6x

In practice, huge pages with auto-defrag may always be enabled in the system, in which case this patch has no effect. This depends on the values in /sys/kernel/mm/transparent_hugepage/enabled and /sys/kernel/mm/transparent_hugepage/defrag.

closes #2463

No functional change
1 parent 6d0eabd commit 39437f4
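
The commit message notes that the effect depends on the system's transparent hugepage settings. As a quick check (a hypothetical helper, not part of this commit), a small standalone program can print those two sysfs values, assuming the standard paths listed above; the active policy is the bracketed word in each file.

// Hypothetical helper, not from the commit: print the current THP policy.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    for (const char* path : { "/sys/kernel/mm/transparent_hugepage/enabled",
                              "/sys/kernel/mm/transparent_hugepage/defrag" }) {
        std::ifstream f(path);              // sysfs files are single-line text
        std::string line;
        if (std::getline(f, line))
            std::cout << path << ": " << line << '\n';  // e.g. "always [madvise] never"
        else
            std::cout << path << ": unavailable\n";
    }
    return 0;
}

With "enabled" set to "always" and auto-defrag on, the madvise call added by this patch changes nothing; with "madvise", it is what opts the TT allocation into huge pages.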

File tree: 4 files changed, +45 −14 lines

src/misc.cpp

Lines changed: 35 additions & 1 deletion

@@ -47,6 +47,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sstream>
 #include <vector>

+#ifdef __linux__
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"

@@ -190,7 +195,7 @@ const std::string compiler_info() {
   compiler += "(unknown version)";
 #endif

-#if defined(__APPLE__)
+#if defined(__APPLE__)
   compiler += " on Apple";
 #elif defined(__CYGWIN__)
   compiler += " on Cygwin";

@@ -288,6 +293,35 @@ void prefetch(void* addr) {

 #endif

+
+/// aligned_ttmem_alloc will return suitably aligned memory, and if possible use large pages.
+/// The returned pointer is the aligned one, while the mem argument is the one that needs to be passed to free.
+/// With c++17 some of this functionality can be simplified.
+#ifdef __linux__
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
+  *mem = aligned_alloc(alignment, size);
+  madvise(*mem, allocSize, MADV_HUGEPAGE);
+  return *mem;
+}
+
+#else
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 64; // assumed cache line size
+  size_t size = allocSize + alignment - 1; // allocate some extra space
+  *mem = malloc(size);
+  void* ret = reinterpret_cast<void*>((uintptr_t(*mem) + alignment - 1) & ~uintptr_t(alignment - 1));
+  return ret;
+}
+
+#endif
+
+
 namespace WinProcGroup {

 #ifndef _WIN32
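
For reference, a minimal usage sketch (not part of the diff) of the contract described in the comment above: the caller keeps two pointers, uses the aligned one returned by the function, and passes the one written into mem to free.

// Hypothetical caller, mirroring how tt.cpp uses the new function below.
#include <cstddef>
#include <cstdlib>

void* aligned_ttmem_alloc(std::size_t allocSize, void** mem); // declared in misc.h

void example(std::size_t bytes) {
    void* mem = nullptr;
    void* aligned = aligned_ttmem_alloc(bytes, &mem);
    if (!mem)
        return;                  // allocation failed
    // ... use 'aligned' as the start of the table ...
    std::free(mem);              // free the original pointer, never 'aligned'
}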

src/misc.h

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ const std::string engine_info(bool to_uci = false);
 const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
+void* aligned_ttmem_alloc(size_t size, void** mem);

 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);

src/tt.cpp

Lines changed: 2 additions & 4 deletions

@@ -63,19 +63,17 @@ void TranspositionTable::resize(size_t mbSize) {

   Threads.main()->wait_for_search_finished();

-  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-
   free(mem);
-  mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1);

+  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
+  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem));
   if (!mem)
   {
       std::cerr << "Failed to allocate " << mbSize
                 << "MB for transposition table." << std::endl;
       exit(EXIT_FAILURE);
   }

-  table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1));
   clear();
 }
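
To put numbers on the resize above (a quick arithmetic sketch, not from the commit): with the 32-byte Cluster asserted in tt.h, the hash sizes used in the benchmarks map to cluster counts and whole 2MB pages as follows.

// Arithmetic check, assuming sizeof(Cluster) == 32 as asserted in tt.h.
#include <cstdio>

int main() {
    constexpr unsigned long long clusterSize = 32;
    for (unsigned long long mbSize : { 16ULL, 16384ULL }) {
        unsigned long long clusterCount = mbSize * 1024 * 1024 / clusterSize;
        std::printf("%5llu MB hash -> %10llu clusters -> %5llu x 2MB huge pages\n",
                    mbSize, clusterCount, mbSize / 2);
    }
    return 0;
}
// Prints:    16 MB hash ->     524288 clusters ->     8 x 2MB huge pages
//         16384 MB hash ->  536870912 clusters ->  8192 x 2MB huge pages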

src/tt.h

Lines changed: 7 additions & 9 deletions

@@ -57,24 +57,22 @@ struct TTEntry {
 };


-/// A TranspositionTable consists of a power of 2 number of clusters and each
-/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry
-/// contains information of exactly one position. The size of a cluster should
-/// divide the size of a cache line size, to ensure that clusters never cross
-/// cache lines. This ensures best cache performance, as the cacheline is
-/// prefetched, as soon as possible.
+/// A TranspositionTable is an array of Cluster, of size clusterCount. Each
+/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
+/// contains information on exactly one position. The size of a Cluster should
+/// divide the size of a cache line for best performance,
+/// as the cacheline is prefetched when possible.

 class TranspositionTable {

-  static constexpr int CacheLineSize = 64;
   static constexpr int ClusterSize = 3;

   struct Cluster {
     TTEntry entry[ClusterSize];
-    char padding[2]; // Align to a divisor of the cache line size
+    char padding[2]; // Pad to 32 bytes
   };

-  static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect");
+  static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");

 public:
  ~TranspositionTable() { free(mem); }
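
The new static_assert pins down the layout the comment describes: three 10-byte entries plus two bytes of padding give a 32-byte Cluster, so two clusters share a 64-byte cache line and a 2MB huge page holds 65536 of them. A worked check follows; it is a sketch only, and the entry fields below are a hypothetical stand-in with the same 10-byte footprint, not Stockfish's actual TTEntry definition.

// Layout check (sketch); EntryLayout is a hypothetical 10-byte stand-in.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct EntryLayout {
    std::uint16_t key16;
    std::uint8_t  depth8;
    std::uint8_t  genBound8;
    std::uint16_t move16;
    std::int16_t  value16;
    std::int16_t  eval16;
};                                   // 10 bytes, alignment 2

struct ClusterLayout {
    EntryLayout entry[3];
    char padding[2];                 // pad to 32 bytes, as in the patched tt.h
};

static_assert(sizeof(ClusterLayout) == 32, "Unexpected Cluster size");

int main() {
    std::size_t perLine = 64 / sizeof(ClusterLayout);
    std::size_t perPage = std::size_t(2) * 1024 * 1024 / sizeof(ClusterLayout);
    std::printf("entry size          : %zu bytes\n", sizeof(EntryLayout));    // 10
    std::printf("cluster size        : %zu bytes\n", sizeof(ClusterLayout));  // 32
    std::printf("clusters / 64B line : %zu\n", perLine);                      // 2
    std::printf("clusters / 2MB page : %zu\n", perPage);                      // 65536
    return 0;
}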
