@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
3636 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
// Function-pointer types used to cast the addresses returned by
// GetProcAddress() in bindThisThread():
//   fun2_t - used for "GetNumaNodeProcessorMaskEx"
//   fun3_t - used for "SetThreadGroupAffinity"
//   fun4_t - used for "GetNumaNodeProcessorMask2" (added by this change)
3737typedef bool (*fun2_t )(USHORT, PGROUP_AFFINITY);
3838typedef bool (*fun3_t )(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
39+ typedef bool (*fun4_t )(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
3940}
4041#endif
4142
@@ -495,14 +496,14 @@ void bindThisThread(size_t) {}
495496
496497#else
497498
498- // / best_group () retrieves logical processor information using Windows specific
499- // / API and returns the best group id for the thread with index idx. Original
499+ // / best_node () retrieves logical processor information using Windows specific
500+ // / API and returns the best node id for the thread with index idx. Original
500501// / code from Texel by Peter Österlund.
501502
502- int best_group (size_t idx) {
503+ int best_node (size_t idx) {
503504
504505 int threads = 0 ;
505- int groups = 0 ;
506+ int nodes = 0 ;
506507 int cores = 0 ;
507508 DWORD returnLength = 0 ;
508509 DWORD byteOffset = 0 ;
@@ -530,8 +531,8 @@ int best_group(size_t idx) {
530531
531532 while (byteOffset < returnLength)
532533 {
533- if (ptr->Relationship == RelationGroup )
534- groups += ptr-> Group . MaximumGroupCount ;
534+ if (ptr->Relationship == RelationNumaNode )
535+ nodes++ ;
535536
536537 else if (ptr->Relationship == RelationProcessorCore)
537538 {
@@ -546,23 +547,23 @@ int best_group(size_t idx) {
546547
547548 free (buffer);
548549
549- std::vector<int > core_groups ;
550+ std::vector<int > groups ;
550551
551- // Run as many threads as possible on the same group until core limit is
552- // reached, then move on filling the next group .
553- for (int n = 0 ; n < groups ; n++)
554- for (int i = 0 ; i < cores / groups ; i++)
555- core_groups .push_back (n);
552+ // Run as many threads as possible on the same node until core limit is
553+ // reached, then move on filling the next node .
554+ for (int n = 0 ; n < nodes ; n++)
555+ for (int i = 0 ; i < cores / nodes ; i++)
556+ groups .push_back (n);
556557
557558 // In case a core has more than one logical processor (we assume 2) and we
558559 // still have threads to allocate, then spread them evenly across available
559- // groups .
560+ // nodes .
560561 for (int t = 0 ; t < threads - cores; t++)
561- core_groups .push_back (t % groups );
562+ groups .push_back (t % nodes );
562563
563564 // If we still have more threads than the total number of logical processors
564565 // then return -1 and let the OS decide what to do.
565- return idx < core_groups .size () ? core_groups [idx] : -1 ;
566+ return idx < groups .size () ? groups [idx] : -1 ;
566567}
567568
568569
@@ -571,22 +572,35 @@ int best_group(size_t idx) {
// Diff hunk for bindThisThread(size_t idx). Lines marked '-' are the removed
// group-based code; lines marked '+' are the added NUMA-node-based code.
// After the change the function asks best_node(idx) for a node id, returns
// without doing anything when that id is -1, and otherwise calls
// SetThreadGroupAffinity (fun3) on the current thread with a GROUP_AFFINITY
// obtained for that node.
571572void bindThisThread (size_t idx) {
572573
573574 // Use only local variables to be thread-safe
574- int group = best_group (idx);
575+ int node = best_node (idx);
575576
576- if (group == -1 )
577+ if (node == -1 )
577578 return ;
578579
// The three entry points below are looked up by name in Kernel32.dll with
// GetProcAddress(); the lookup of GetNumaNodeProcessorMask2 (fun4) is new.
579580 // Early exit if the needed APIs are not available at runtime
580581 HMODULE k32 = GetModuleHandle (" Kernel32.dll" );
581582 auto fun2 = (fun2_t )(void (*)())GetProcAddress (k32, " GetNumaNodeProcessorMaskEx" );
582583 auto fun3 = (fun3_t )(void (*)())GetProcAddress (k32, " SetThreadGroupAffinity" );
584+ auto fun4 = (fun4_t )(void (*)())GetProcAddress (k32, " GetNumaNodeProcessorMask2" );
583585
// fun2 and fun3 are mandatory. fun4 is optional and only selects which of the
// two paths below is taken.
584586 if (!fun2 || !fun3)
585587 return ;
586588
587- GROUP_AFFINITY affinity;
588- if (fun2 (group, &affinity))
589- fun3 (GetCurrentThread (), &affinity, nullptr );
// fun4 not found: same sequence of calls as the removed code, but keyed by the
// node id - a single GROUP_AFFINITY from fun2 is applied with fun3.
589+ if (!fun4) {
590+ GROUP_AFFINITY affinity;
591+ if (fun2 (node, &affinity))
592+ fun3 (GetCurrentThread (), &affinity, nullptr );
593+ } else {
594+ // If a numa node has more than one processor group, we assume they are
595+ // equally sized and we spread threads evenly across the groups.
// NOTE(review): GetMaximumProcessorGroupCount is called directly here, unlike
// the other APIs above that are resolved through GetProcAddress - confirm it
// is available on every Windows version this code must load on.
// NOTE(review): the malloc result is handed to fun4 without a null check, and
// idx % returnedElements would divide by zero if fun4 reported success with
// returnedElements == 0 - confirm neither case can occur.
596+ USHORT elements, returnedElements;
597+ elements = GetMaximumProcessorGroupCount ();
598+ GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc (
599+ elements * sizeof (GROUP_AFFINITY));
// The entry applied to this thread is picked with idx % returnedElements, so
// successive thread indices rotate through the entries fun4 reported.
600+ if (fun4 (node, affinity, elements, &returnedElements))
601+ fun3 (GetCurrentThread (), &affinity[idx % returnedElements], nullptr );
// The buffer is released whether or not fun4 succeeded.
602+ free (affinity);
603+ }
604}
591605
592606#endif
0 commit comments