
Commit 39437f4

skiminki authored and vondele committed
Advise the kernel to use huge pages (Linux)
Align the TT allocation by 2M to make it huge page friendly and advise the kernel to use huge pages.

Benchmarks on my i7-8700K (6C/12T) box (3 runs per bench per config):

                              vanilla (nps)                    hugepages (nps)           avg
==================================================================================
bench             |  3012490  3024364  3036331     3071052  3067544  3071052    +1.5%
bench 16 12 20    | 19237932 19050166 19085315    19266346 19207025 19548758    +1.1%
bench 16384 12 20 | 18182313 18371581 18336838    19381275 19738012 19620225    +7.0%

On my box, huge pages have a significant perf impact when using a big hash size. They also speed up TT initialization big time:

                                     vanilla (s)   huge pages (s)   speed-up
=======================================================================
time stockfish bench 16384 1 1  |       5.37            1.48           3.6x

In practice, huge pages with auto-defrag may always be enabled in the system, in which case this patch has no effect. This depends on the values in /sys/kernel/mm/transparent_hugepage/enabled and /sys/kernel/mm/transparent_hugepage/defrag.

closes #2463

No functional change
1 parent 6d0eabd commit 39437f4
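
The commit message notes that the effect depends on the system's transparent hugepage settings. As a quick check (a hypothetical helper, not part of this commit), a small standalone program can print those two sysfs values, assuming the standard paths listed above; the active policy is the bracketed word in each file.

// Hypothetical helper, not from the commit: print the current THP policy.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    for (const char* path : { "/sys/kernel/mm/transparent_hugepage/enabled",
                              "/sys/kernel/mm/transparent_hugepage/defrag" }) {
        std::ifstream f(path);              // sysfs files are single-line text
        std::string line;
        if (std::getline(f, line))
            std::cout << path << ": " << line << '\n';  // e.g. "always [madvise] never"
        else
            std::cout << path << ": unavailable\n";
    }
    return 0;
}

With "enabled" set to "always" and auto-defrag on, the madvise call added by this patch changes nothing; with "madvise", it is what opts the TT allocation into huge pages.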

File tree: 4 files changed, +45 −14 lines

src/misc.cpp

Lines changed: 35 additions & 1 deletion

@@ -47,6 +47,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sstream>
 #include <vector>

+#ifdef __linux__
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"

@@ -190,7 +195,7 @@ const std::string compiler_info() {
   compiler += "(unknown version)";
 #endif

-#if defined(__APPLE__)
+#if defined(__APPLE__)
   compiler += " on Apple";
 #elif defined(__CYGWIN__)
   compiler += " on Cygwin";

@@ -288,6 +293,35 @@ void prefetch(void* addr) {

 #endif

+
+/// aligned_ttmem_alloc will return suitably aligned memory, and if possible use large pages.
+/// The returned pointer is the aligned one, while the mem argument is the one that needs to be passed to free.
+/// With c++17 some of this functionality can be simplified.
+#ifdef __linux__
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
+  *mem = aligned_alloc(alignment, size);
+  madvise(*mem, allocSize, MADV_HUGEPAGE);
+  return *mem;
+}
+
+#else
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 64; // assumed cache line size
+  size_t size = allocSize + alignment - 1; // allocate some extra space
+  *mem = malloc(size);
+  void* ret = reinterpret_cast<void*>((uintptr_t(*mem) + alignment - 1) & ~uintptr_t(alignment - 1));
+  return ret;
+}
+
+#endif
+
+
 namespace WinProcGroup {

 #ifndef _WIN32
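
For reference, a minimal usage sketch (not part of the diff) of the contract described in the comment above: the caller keeps two pointers, uses the aligned one returned by the function, and passes the one written into mem to free.

// Hypothetical caller, mirroring how tt.cpp uses the new function below.
#include <cstddef>
#include <cstdlib>

void* aligned_ttmem_alloc(std::size_t allocSize, void** mem); // declared in misc.h

void example(std::size_t bytes) {
    void* mem = nullptr;
    void* aligned = aligned_ttmem_alloc(bytes, &mem);
    if (!mem)
        return;                  // allocation failed
    // ... use 'aligned' as the start of the table ...
    std::free(mem);              // free the original pointer, never 'aligned'
}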

src/misc.h

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ const std::string engine_info(bool to_uci = false);
 const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
+void* aligned_ttmem_alloc(size_t size, void** mem);

 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);

src/tt.cpp

Lines changed: 2 additions & 4 deletions

@@ -63,19 +63,17 @@ void TranspositionTable::resize(size_t mbSize) {

   Threads.main()->wait_for_search_finished();

-  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-
   free(mem);
-  mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1);

+  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
+  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem));
   if (!mem)
   {
       std::cerr << "Failed to allocate " << mbSize
                 << "MB for transposition table." << std::endl;
       exit(EXIT_FAILURE);
   }

-  table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1));
   clear();
 }
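
To put numbers on the resize above (a quick arithmetic sketch, not from the commit): with the 32-byte Cluster asserted in tt.h, the hash sizes used in the benchmarks map to cluster counts and whole 2MB pages as follows.

// Arithmetic check, assuming sizeof(Cluster) == 32 as asserted in tt.h.
#include <cstdio>

int main() {
    constexpr unsigned long long clusterSize = 32;
    for (unsigned long long mbSize : { 16ULL, 16384ULL }) {
        unsigned long long clusterCount = mbSize * 1024 * 1024 / clusterSize;
        std::printf("%5llu MB hash -> %10llu clusters -> %5llu x 2MB huge pages\n",
                    mbSize, clusterCount, mbSize / 2);
    }
    return 0;
}
// Prints:    16 MB hash ->     524288 clusters ->     8 x 2MB huge pages
//         16384 MB hash ->  536870912 clusters ->  8192 x 2MB huge pages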

src/tt.h

Lines changed: 7 additions & 9 deletions

@@ -57,24 +57,22 @@ struct TTEntry {
 };


-/// A TranspositionTable consists of a power of 2 number of clusters and each
-/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry
-/// contains information of exactly one position. The size of a cluster should
-/// divide the size of a cache line size, to ensure that clusters never cross
-/// cache lines. This ensures best cache performance, as the cacheline is
-/// prefetched, as soon as possible.
+/// A TranspositionTable is an array of Cluster, of size clusterCount. Each
+/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
+/// contains information on exactly one position. The size of a Cluster should
+/// divide the size of a cache line for best performance,
+/// as the cacheline is prefetched when possible.

 class TranspositionTable {

-  static constexpr int CacheLineSize = 64;
   static constexpr int ClusterSize = 3;

   struct Cluster {
     TTEntry entry[ClusterSize];
-    char padding[2]; // Align to a divisor of the cache line size
+    char padding[2]; // Pad to 32 bytes
   };

-  static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect");
+  static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");

 public:
  ~TranspositionTable() { free(mem); }
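
The new static_assert pins down the layout the comment describes: three 10-byte entries plus two bytes of padding give a 32-byte Cluster, so two clusters share a 64-byte cache line and a 2MB huge page holds 65536 of them. A worked check follows; it is a sketch only, and the entry fields below are a hypothetical stand-in with the same 10-byte footprint, not Stockfish's actual TTEntry definition.

// Layout check (sketch); EntryLayout is a hypothetical 10-byte stand-in.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct EntryLayout {
    std::uint16_t key16;
    std::uint8_t  depth8;
    std::uint8_t  genBound8;
    std::uint16_t move16;
    std::int16_t  value16;
    std::int16_t  eval16;
};                                   // 10 bytes, alignment 2

struct ClusterLayout {
    EntryLayout entry[3];
    char padding[2];                 // pad to 32 bytes, as in the patched tt.h
};

static_assert(sizeof(ClusterLayout) == 32, "Unexpected Cluster size");

int main() {
    std::size_t perLine = 64 / sizeof(ClusterLayout);
    std::size_t perPage = std::size_t(2) * 1024 * 1024 / sizeof(ClusterLayout);
    std::printf("entry size          : %zu bytes\n", sizeof(EntryLayout));    // 10
    std::printf("cluster size        : %zu bytes\n", sizeof(ClusterLayout));  // 32
    std::printf("clusters / 64B line : %zu\n", perLine);                      // 2
    std::printf("clusters / 2MB page : %zu\n", perPage);                      // 65536
    return 0;
}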
