Skip to content

Commit 485d517

Browse files
skiminki authored and vondele committed
Add large page support for NNUE weights and simplify TT mem management
Use TT memory functions to allocate memory for the NNUE weights. This should provide a small speed-up on systems where large pages are not automatically used, including Windows and some Linux distributions.

Further, since we now have a wrapper for std::aligned_alloc(), we can simplify the TT memory management a bit:

- We no longer need to store separate pointers to the hash table and its underlying memory allocation.
- We also get to merge the Linux-specific and default implementations of aligned_ttmem_alloc().

Finally, we'll enable the VirtualAlloc code path with large page support also for Win32.

STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 14896 W: 1854 L: 1686 D: 11356
Ptnml(0-2): 65, 1224, 4742, 1312, 105

closes #3081

No functional change.
1 parent 16b4578 commit 485d517

File tree

7 files changed

+57
-45
lines changed

7 files changed

+57
-45
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,7 @@ to find the best move. The classical evaluation computes this value as a functio
152152
of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
153153
The NNUE evaluation computes this value with a neural network based on basic
154154
inputs (e.g. piece positions only). The network is optimized and trained
155-
on the evalutions of millions of positions at moderate search depth.
155+
on the evaluations of millions of positions at moderate search depth.
156156

157157
The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
158158
It can be evaluated efficiently on CPUs, and exploits the fact that only parts

src/misc.cpp

Lines changed: 25 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -357,27 +357,11 @@ void std_aligned_free(void* ptr) {
357357
#endif
358358
}
359359

360-
/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
361-
/// The returned pointer is the aligned one, while the mem argument is the one that needs
362-
/// to be passed to free. With c++17 some of this functionality could be simplified.
360+
/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.
363361

364-
#if defined(__linux__) && !defined(__ANDROID__)
365-
366-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
367-
368-
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
369-
size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
370-
if (posix_memalign(&mem, alignment, size))
371-
mem = nullptr;
372-
#if defined(MADV_HUGEPAGE)
373-
madvise(mem, allocSize, MADV_HUGEPAGE);
374-
#endif
375-
return mem;
376-
}
362+
#if defined(_WIN32)
377363

378-
#elif defined(_WIN64)
379-
380-
static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
364+
static void* aligned_large_pages_alloc_win(size_t allocSize) {
381365

382366
HANDLE hProcessToken { };
383367
LUID luid { };
@@ -422,12 +406,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
422406
return mem;
423407
}
424408

425-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
409+
void* aligned_large_pages_alloc(size_t allocSize) {
426410

427411
static bool firstCall = true;
412+
void* mem;
428413

429414
// Try to allocate large pages
430-
mem = aligned_ttmem_alloc_large_pages(allocSize);
415+
mem = aligned_large_pages_alloc_win(allocSize);
431416

432417
// Suppress info strings on the first call. The first call occurs before 'uci'
433418
// is received and in that case this output confuses some GUIs.
@@ -449,23 +434,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
449434

450435
#else
451436

452-
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
437+
void* aligned_large_pages_alloc(size_t allocSize) {
438+
439+
#if defined(__linux__)
440+
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
441+
#else
442+
constexpr size_t alignment = 4096; // assumed small page size
443+
#endif
453444

454-
constexpr size_t alignment = 64; // assumed cache line size
455-
size_t size = allocSize + alignment - 1; // allocate some extra space
456-
mem = malloc(size);
457-
void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
458-
return ret;
445+
// round up to multiples of alignment
446+
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
447+
void *mem = std_aligned_alloc(alignment, size);
448+
#if defined(MADV_HUGEPAGE)
449+
madvise(mem, size, MADV_HUGEPAGE);
450+
#endif
451+
return mem;
459452
}
460453

461454
#endif
462455

463456

464-
/// aligned_ttmem_free() will free the previously allocated ttmem
457+
/// aligned_large_pages_free() will free the previously allocated ttmem
465458

466-
#if defined(_WIN64)
459+
#if defined(_WIN32)
467460

468-
void aligned_ttmem_free(void* mem) {
461+
void aligned_large_pages_free(void* mem) {
469462

470463
if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
471464
{
@@ -478,8 +471,8 @@ void aligned_ttmem_free(void* mem) {
478471

479472
#else
480473

481-
void aligned_ttmem_free(void *mem) {
482-
free(mem);
474+
void aligned_large_pages_free(void *mem) {
475+
std_aligned_free(mem);
483476
}
484477

485478
#endif

src/misc.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ void prefetch(void* addr);
3333
void start_logger(const std::string& fname);
3434
void* std_aligned_alloc(size_t alignment, size_t size);
3535
void std_aligned_free(void* ptr);
36-
void* aligned_ttmem_alloc(size_t size, void*& mem);
37-
void aligned_ttmem_free(void* mem); // nop if mem == nullptr
36+
void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
37+
void aligned_large_pages_free(void* mem); // nop if mem == nullptr
3838

3939
void dbg_hit_on(bool b);
4040
void dbg_hit_on(bool c, bool b);

src/nnue/evaluate_nnue.cpp

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ namespace Eval::NNUE {
5252
};
5353

5454
// Input feature converter
55-
AlignedPtr<FeatureTransformer> feature_transformer;
55+
LargePagePtr<FeatureTransformer> feature_transformer;
5656

5757
// Evaluation function
5858
AlignedPtr<Network> network;
@@ -70,14 +70,22 @@ namespace Eval::NNUE {
7070
std::memset(pointer.get(), 0, sizeof(T));
7171
}
7272

73+
template <typename T>
74+
void Initialize(LargePagePtr<T>& pointer) {
75+
76+
static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
77+
pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
78+
std::memset(pointer.get(), 0, sizeof(T));
79+
}
80+
7381
// Read evaluation function parameters
7482
template <typename T>
75-
bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
83+
bool ReadParameters(std::istream& stream, T& reference) {
7684

7785
std::uint32_t header;
7886
header = read_little_endian<std::uint32_t>(stream);
7987
if (!stream || header != T::GetHashValue()) return false;
80-
return pointer->ReadParameters(stream);
88+
return reference.ReadParameters(stream);
8189
}
8290

8391
} // namespace Detail
@@ -110,8 +118,8 @@ namespace Eval::NNUE {
110118
std::string architecture;
111119
if (!ReadHeader(stream, &hash_value, &architecture)) return false;
112120
if (hash_value != kHashValue) return false;
113-
if (!Detail::ReadParameters(stream, feature_transformer)) return false;
114-
if (!Detail::ReadParameters(stream, network)) return false;
121+
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
122+
if (!Detail::ReadParameters(stream, *network)) return false;
115123
return stream && stream.peek() == std::ios::traits_type::eof();
116124
}
117125

src/nnue/evaluate_nnue.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,9 +40,20 @@ namespace Eval::NNUE {
4040
}
4141
};
4242

43+
template <typename T>
44+
struct TtmemDeleter {
45+
void operator()(T* ptr) const {
46+
ptr->~T();
47+
aligned_large_pages_free(ptr);
48+
}
49+
};
50+
4351
template <typename T>
4452
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
4553

54+
template <typename T>
55+
using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;
56+
4657
} // namespace Eval::NNUE
4758

4859
#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED

src/tt.cpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -62,11 +62,12 @@ void TranspositionTable::resize(size_t mbSize) {
6262

6363
Threads.main()->wait_for_search_finished();
6464

65-
aligned_ttmem_free(mem);
65+
aligned_large_pages_free(table);
6666

6767
clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
68-
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
69-
if (!mem)
68+
69+
table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
70+
if (!table)
7071
{
7172
std::cerr << "Failed to allocate " << mbSize
7273
<< "MB for transposition table." << std::endl;

src/tt.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ class TranspositionTable {
7373
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
7474

7575
public:
76-
~TranspositionTable() { aligned_ttmem_free(mem); }
76+
~TranspositionTable() { aligned_large_pages_free(table); }
7777
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
7878
TTEntry* probe(const Key key, bool& found) const;
7979
int hashfull() const;
@@ -89,7 +89,6 @@ class TranspositionTable {
8989

9090
size_t clusterCount;
9191
Cluster* table;
92-
void* mem;
9392
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
9493
};
9594

0 commit comments

Comments
 (0)