Skip to content

Commit b9e26a8

Browse files
skiminki authored and BM123499 committed
Add large page support for NNUE weights and simplify TT mem management
Use TT memory functions to allocate memory for the NNUE weights. This should provide a small speed-up on systems where large pages are not automatically used, including Windows and some Linux distributions.

Further, since we now have a wrapper for std::aligned_alloc(), we can simplify the TT memory management a bit:

- We no longer need to store separate pointers to the hash table and its underlying memory allocation.
- We also get to merge the Linux-specific and default implementations of aligned_ttmem_alloc().

Finally, we'll enable the VirtualAlloc code path with large page support also for Win32.

STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 14896 W: 1854 L: 1686 D: 11356
Ptnml(0-2): 65, 1224, 4742, 1312, 105

closes official-stockfish#3081

No functional change.
1 parent c3ef3f4 commit b9e26a8

File tree

4 files changed

+45
-35
lines changed

4 files changed

+45
-35
lines changed

src/misc.cpp

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -298,23 +298,9 @@ void prefetch(void* addr) {
298298
/// The returned pointer is the aligned one, while the mem argument is the one that needs
299299
/// to be passed to free. With c++17 some of this functionality could be simplified.
300300

301-
#if defined(__linux__) && !defined(__ANDROID__)
302-
303-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
301+
#if defined(_WIN32)
304302

305-
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
306-
size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
307-
if (posix_memalign(&mem, alignment, size))
308-
mem = nullptr;
309-
#if defined(MADV_HUGEPAGE)
310-
madvise(mem, allocSize, MADV_HUGEPAGE);
311-
#endif
312-
return mem;
313-
}
314-
315-
#elif defined(_WIN64)
316-
317-
static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
303+
static void* aligned_large_pages_alloc_win(size_t allocSize) {
318304

319305
HANDLE hProcessToken { };
320306
LUID luid { };
@@ -359,12 +345,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
359345
return mem;
360346
}
361347

362-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
348+
void* aligned_large_pages_alloc(size_t allocSize) {
363349

364350
static bool firstCall = true;
351+
void* mem;
365352

366353
// Try to allocate large pages
367-
mem = aligned_ttmem_alloc_large_pages(allocSize);
354+
mem = aligned_large_pages_alloc_win(allocSize);
368355

369356
// Suppress info strings on the first call. The first call occurs before 'uci'
370357
// is received and in that case this output confuses some GUIs.
@@ -386,23 +373,40 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
386373

387374
#else
388375

389-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
376+
void* aligned_large_pages_alloc(size_t allocSize) {
377+
378+
#if defined(__linux__)
379+
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
380+
#else
381+
constexpr size_t alignment = 4096; // assumed small page size
382+
#endif
390383

391-
constexpr size_t alignment = 64; // assumed cache line size
392-
size_t size = allocSize + alignment - 1; // allocate some extra space
393-
mem = malloc(size);
394-
void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
395-
return ret;
384+
// round up to multiples of alignment
385+
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
386+
void *mem;
387+
388+
#if defined(POSIXALIGNEDALLOC)
389+
mem = posix_memalign(&mem, alignment, size) ? nullptr : mem;
390+
#elif defined(_WIN32)
391+
mem = _mm_malloc(size, alignment);
392+
#else
393+
mem = std::aligned_alloc(alignment, size);
394+
#endif
395+
396+
#if defined(MADV_HUGEPAGE)
397+
madvise(mem, size, MADV_HUGEPAGE);
398+
#endif
399+
return mem;
396400
}
397401

398402
#endif
399403

400404

401-
/// aligned_ttmem_free() will free the previously allocated ttmem
405+
/// aligned_large_pages_free() will free the previously allocated ttmem
402406

403-
#if defined(_WIN64)
407+
#if defined(_WIN32)
404408

405-
void aligned_ttmem_free(void* mem) {
409+
void aligned_large_pages_free(void* mem) {
406410

407411
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
408412
{
@@ -415,8 +419,14 @@ void aligned_ttmem_free(void* mem) {
415419

416420
#else
417421

418-
void aligned_ttmem_free(void *mem) {
422+
void aligned_large_pages_free(void *mem) {
423+
#if defined(POSIXALIGNEDALLOC)
419424
free(mem);
425+
#elif defined(_WIN32)
426+
_mm_free(mem);
427+
#else
428+
free(mem);
429+
#endif
420430
}
421431

422432
#endif

src/misc.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ const std::string engine_info(bool to_uci = false);
3333
const std::string compiler_info();
3434
void prefetch(void* addr);
3535
void start_logger(const std::string& fname);
36-
void* aligned_ttmem_alloc(size_t size, void*& mem);
37-
void aligned_ttmem_free(void* mem); // nop if mem == nullptr
36+
void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
37+
void aligned_large_pages_free(void* mem); // nop if mem == nullptr
3838

3939
void dbg_hit_on(bool b);
4040
void dbg_hit_on(bool c, bool b);

src/tt.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,12 @@ void TranspositionTable::resize(size_t mbSize) {
6464

6565
Threads.main()->wait_for_search_finished();
6666

67-
aligned_ttmem_free(mem);
67+
aligned_large_pages_free(table);
6868

6969
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
70-
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
71-
if (!mem)
70+
71+
table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
72+
if (!table)
7273
{
7374
std::cerr << "Failed to allocate " << mbSize
7475
<< "MB for transposition table." << std::endl;

src/tt.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class TranspositionTable {
7575
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
7676

7777
public:
78-
~TranspositionTable() { aligned_ttmem_free(mem); }
78+
~TranspositionTable() { aligned_large_pages_free(table); }
7979
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
8080
TTEntry* probe(const Key key, bool& found) const;
8181
int hashfull() const;
@@ -91,7 +91,6 @@ class TranspositionTable {
9191

9292
size_t clusterCount;
9393
Cluster* table;
94-
void* mem;
9594
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
9695
};
9796

0 commit comments

Comments
 (0)