
Commit 7218ec4

noobpwnftw authored and vondele committed
Revert and fix earlier windows NUMA patch
Revert 9048ac0 due to a core-spread problem, and restore compatibility with newer versions of Windows by another method. This code assumes that if one NUMA node has more than one processor group, the groups are created equal (an equal number of cores is assigned to each group), and that the total number of available cores contained in such groups equals the number of available cores within one NUMA node, because of how the best_node() function works. For example, a node with 96 logical processors that the OS splits into two equal groups of 48 satisfies this, since a single processor group can hold at most 64 logical processors.

closes #3798
fixes #3787

No functional change.
1 parent a943b1d · commit 7218ec4

File tree

1 file changed (+34, -20 lines)


src/misc.cpp

Lines changed: 34 additions & 20 deletions
@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
                        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
 typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
 typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
 }
 #endif
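
These fun*_t typedefs let the matching Kernel32 entry points be resolved at runtime with GetProcAddress rather than linked at build time, so the binary still loads on Windows versions that predate the newer NUMA APIs. A minimal, self-contained sketch of that pattern (illustrative, not part of the commit; it only probes for the export the new fun4_t corresponds to):

#include <windows.h>
#include <iostream>

// Same shape as the fun4_t typedef above (GetNumaNodeProcessorMask2).
typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);

int main() {
    HMODULE k32 = GetModuleHandle("Kernel32.dll");
    // GetProcAddress returns nullptr when the export is missing, so the
    // caller can detect an older OS and fall back instead of failing to load.
    auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
    std::cout << (fun4 ? "GetNumaNodeProcessorMask2 available\n"
                       : "not available, falling back to GetNumaNodeProcessorMaskEx\n");
    return 0;
}

The double cast through (void(*)()) mirrors the code below: it silences compiler warnings about converting the FARPROC returned by GetProcAddress to a function pointer of a different type.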
@@ -495,14 +496,14 @@ void bindThisThread(size_t) {}
 
 #else
 
-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
 /// code from Texel by Peter Österlund.
 
-int best_group(size_t idx) {
+int best_node(size_t idx) {
 
   int threads = 0;
-  int groups = 0;
+  int nodes = 0;
   int cores = 0;
   DWORD returnLength = 0;
   DWORD byteOffset = 0;
@@ -530,8 +531,8 @@ int best_group(size_t idx) {
 
   while (byteOffset < returnLength)
   {
-      if (ptr->Relationship == RelationGroup)
-          groups += ptr->Group.MaximumGroupCount;
+      if (ptr->Relationship == RelationNumaNode)
+          nodes++;
 
       else if (ptr->Relationship == RelationProcessorCore)
      {
@@ -546,23 +547,23 @@ int best_group(size_t idx) {
 
   free(buffer);
 
-  std::vector<int> core_groups;
+  std::vector<int> groups;
 
-  // Run as many threads as possible on the same group until core limit is
-  // reached, then move on filling the next group.
-  for (int n = 0; n < groups; n++)
-      for (int i = 0; i < cores / groups; i++)
-          core_groups.push_back(n);
+  // Run as many threads as possible on the same node until core limit is
+  // reached, then move on filling the next node.
+  for (int n = 0; n < nodes; n++)
+      for (int i = 0; i < cores / nodes; i++)
+          groups.push_back(n);
 
   // In case a core has more than one logical processor (we assume 2) and we
   // have still threads to allocate, then spread them evenly across available
-  // groups.
+  // nodes.
   for (int t = 0; t < threads - cores; t++)
-      core_groups.push_back(t % groups);
+      groups.push_back(t % nodes);
 
   // If we still have more threads than the total number of logical processors
   // then return -1 and let the OS to decide what to do.
-  return idx < core_groups.size() ? core_groups[idx] : -1;
+  return idx < groups.size() ? groups[idx] : -1;
 }
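
The mapping best_node() builds can be read as: fill each node with cores / nodes threads first, then spread any remaining threads (the second logical processor of each core) round-robin across nodes. A standalone sketch of that logic with illustrative machine figures, not taken from the commit:

#include <cstdio>
#include <vector>

// Mirrors the table-building logic of best_node() for a given machine shape.
int node_for_thread(size_t idx, int threads, int cores, int nodes) {
    std::vector<int> groups;
    for (int n = 0; n < nodes; n++)            // cores / nodes slots per node
        for (int i = 0; i < cores / nodes; i++)
            groups.push_back(n);
    for (int t = 0; t < threads - cores; t++)  // leftover logical processors
        groups.push_back(t % nodes);
    return idx < groups.size() ? groups[idx] : -1;  // -1: let the OS decide
}

int main() {
    // Hypothetical box: 2 NUMA nodes, 16 physical cores, 32 logical processors.
    for (size_t idx : {0, 7, 8, 15, 16, 17, 31, 32})
        std::printf("thread %zu -> node %d\n",
                    idx, node_for_thread(idx, 32, 16, 2));
    return 0;
}

With these numbers, threads 0-7 land on node 0 and 8-15 on node 1 (one per physical core), threads 16-31 alternate between the nodes as hyperthreads, and thread 32 gets -1 because it exceeds the 32 logical processors.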
@@ -571,22 +572,35 @@ int best_group(size_t idx) {
 
 void bindThisThread(size_t idx) {
 
   // Use only local variables to be thread-safe
-  int group = best_group(idx);
+  int node = best_node(idx);
 
-  if (group == -1)
+  if (node == -1)
       return;
 
   // Early exit if the needed API are not available at runtime
   HMODULE k32 = GetModuleHandle("Kernel32.dll");
   auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
   auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+  auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
 
   if (!fun2 || !fun3)
       return;
 
-  GROUP_AFFINITY affinity;
-  if (fun2(group, &affinity))
-      fun3(GetCurrentThread(), &affinity, nullptr);
+  if (!fun4) {
+      GROUP_AFFINITY affinity;
+      if (fun2(node, &affinity))
+          fun3(GetCurrentThread(), &affinity, nullptr);
+  } else {
+      // If a numa node has more than one processor group, we assume they are
+      // sized equal and we spread threads evenly across the groups.
+      USHORT elements, returnedElements;
+      elements = GetMaximumProcessorGroupCount();
+      GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+          elements * sizeof(GROUP_AFFINITY));
+      if (fun4(node, affinity, elements, &returnedElements))
+          fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+      free(affinity);
+  }
 }
 
 #endif
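
For context on the caller side: each search thread invokes bindThisThread() with its own sequential index when it starts up, and on the new code path threads whose node spans several processor groups alternate between those groups via idx % returnedElements. A hedged usage sketch (the WinProcGroup namespace matches the declaration in misc.h; the worker body is a placeholder, not Stockfish code):

#include <cstddef>
#include <thread>
#include <vector>

namespace WinProcGroup { void bindThisThread(size_t idx); }  // from misc.h

void start_workers(size_t n) {
    std::vector<std::thread> pool;
    for (size_t i = 0; i < n; ++i)
        pool.emplace_back([i] {
            // Pin before doing any real work so allocations and the search
            // itself stay on the thread's NUMA node.
            WinProcGroup::bindThisThread(i);
            // ... search loop would run here ...
        });
    for (std::thread& t : pool)
        t.join();
}

On non-Windows builds the no-op definition shown at the top of the second hunk (void bindThisThread(size_t) {}) is compiled instead, so the call is safe on every platform; Stockfish additionally skips the call for small thread counts, where binding brings no benefit.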
