6868#include "fmgr.h"
6969#include "funcapi.h"
7070#include "miscadmin.h"
71+ #include "port/pg_numa.h"
7172#include "storage/lwlock.h"
7273#include "storage/pg_shmem.h"
7374#include "storage/shmem.h"
@@ -89,6 +90,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
8990
9091static HTAB * ShmemIndex = NULL ; /* primary index hashtable for shmem */
9192
93+ /*
 * To get reliable results for NUMA inquiry we need to "touch pages" once.
 *
 * The flag starts out true; pg_get_shmem_allocations_numa() touches every
 * page it inspects while the flag is set, and clears it after completing a
 * full pass over the shared memory index, so subsequent calls in the same
 * process skip the touching step.
 */
94+ static bool firstNumaTouch = true;
9295
9396/*
9497 * InitShmemAccess() --- set up basic pointers to shared memory.
@@ -568,3 +571,159 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
568571
569572 return (Datum ) 0 ;
570573}
574+
575+ /*
576+ * SQL SRF showing NUMA memory nodes for allocated shared memory
577+ *
578+ * Compared to pg_get_shmem_allocations(), this function does not return
579+ * information about shared anonymous allocations and unused shared memory.
580+ */
581+ Datum
582+ pg_get_shmem_allocations_numa (PG_FUNCTION_ARGS )
583+ {
584+ #define PG_GET_SHMEM_NUMA_SIZES_COLS 3
585+ ReturnSetInfo * rsinfo = (ReturnSetInfo * ) fcinfo -> resultinfo ;
586+ HASH_SEQ_STATUS hstat ;
587+ ShmemIndexEnt * ent ;
588+ Datum values [PG_GET_SHMEM_NUMA_SIZES_COLS ];
589+ bool nulls [PG_GET_SHMEM_NUMA_SIZES_COLS ];
590+ Size os_page_size ;
591+ void * * page_ptrs ;
592+ int * pages_status ;
593+ uint64 shm_total_page_count ,
594+ shm_ent_page_count ,
595+ max_nodes ;
596+ Size * nodes ;
597+
598+ if (pg_numa_init () == -1 )
599+ elog (ERROR , "libnuma initialization failed or NUMA is not supported on this platform" );
600+
601+ InitMaterializedSRF (fcinfo , 0 );
602+
603+ max_nodes = pg_numa_get_max_node ();
604+ nodes = palloc (sizeof (Size ) * (max_nodes + 1 ));
605+
606+ /*
607+ * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
608+ * the OS may have different memory page sizes.
609+ *
610+ * To correctly map between them, we need to: 1. Determine the OS memory
611+ * page size 2. Calculate how many OS pages are used by all buffer blocks
612+ * 3. Calculate how many OS pages are contained within each database
613+ * block.
614+ *
615+ * This information is needed before calling move_pages() for NUMA memory
616+ * node inquiry.
617+ */
618+ os_page_size = pg_numa_get_pagesize ();
619+
620+ /*
621+ * Allocate memory for page pointers and status based on total shared
622+ * memory size. This simplified approach allocates enough space for all
623+ * pages in shared memory rather than calculating the exact requirements
624+ * for each segment.
625+ *
626+ * Add 1, because we don't know how exactly the segments align to OS
627+ * pages, so the allocation might use one more memory page. In practice
628+ * this is not very likely, and moreover we have more entries, each of
629+ * them using only fraction of the total pages.
630+ */
631+ shm_total_page_count = (ShmemSegHdr -> totalsize / os_page_size ) + 1 ;
632+ page_ptrs = palloc0 (sizeof (void * ) * shm_total_page_count );
633+ pages_status = palloc (sizeof (int ) * shm_total_page_count );
634+
635+ if (firstNumaTouch )
636+ elog (DEBUG1 , "NUMA: page-faulting shared memory segments for proper NUMA readouts" );
637+
638+ LWLockAcquire (ShmemIndexLock , LW_SHARED );
639+
640+ hash_seq_init (& hstat , ShmemIndex );
641+
642+ /* output all allocated entries */
643+ memset (nulls , 0 , sizeof (nulls ));
644+ while ((ent = (ShmemIndexEnt * ) hash_seq_search (& hstat )) != NULL )
645+ {
646+ int i ;
647+ char * startptr ,
648+ * endptr ;
649+ Size total_len ;
650+
651+ /*
652+ * Calculate the range of OS pages used by this segment. The segment
653+ * may start / end half-way through a page, we want to count these
654+ * pages too. So we align the start/end pointers down/up, and then
655+ * calculate the number of pages from that.
656+ */
657+ startptr = (char * ) TYPEALIGN_DOWN (os_page_size , ent -> location );
658+ endptr = (char * ) TYPEALIGN (os_page_size ,
659+ (char * ) ent -> location + ent -> allocated_size );
660+ total_len = (endptr - startptr );
661+
662+ shm_ent_page_count = total_len / os_page_size ;
663+
664+ /*
665+ * If we ever get 0xff (-1) back from kernel inquiry, then we probably
666+ * have a bug in mapping buffers to OS pages.
667+ */
668+ memset (pages_status , 0xff , sizeof (int ) * shm_ent_page_count );
669+
670+ /*
671+ * Setup page_ptrs[] with pointers to all OS pages for this segment,
672+ * and get the NUMA status using pg_numa_query_pages.
673+ *
674+ * In order to get reliable results we also need to touch memory
675+ * pages, so that inquiry about NUMA memory node doesn't return -2
676+ * (ENOENT, which indicates unmapped/unallocated pages).
677+ */
678+ for (i = 0 ; i < shm_ent_page_count ; i ++ )
679+ {
680+ volatile uint64 touch pg_attribute_unused ();
681+
682+ page_ptrs [i ] = startptr + (i * os_page_size );
683+
684+ if (firstNumaTouch )
685+ pg_numa_touch_mem_if_required (touch , page_ptrs [i ]);
686+
687+ CHECK_FOR_INTERRUPTS ();
688+ }
689+
690+ if (pg_numa_query_pages (0 , shm_ent_page_count , page_ptrs , pages_status ) == -1 )
691+ elog (ERROR , "failed NUMA pages inquiry status: %m" );
692+
693+ /* Count number of NUMA nodes used for this shared memory entry */
694+ memset (nodes , 0 , sizeof (Size ) * (max_nodes + 1 ));
695+
696+ for (i = 0 ; i < shm_ent_page_count ; i ++ )
697+ {
698+ int s = pages_status [i ];
699+
700+ /* Ensure we are adding only valid index to the array */
701+ if (s < 0 || s > max_nodes )
702+ {
703+ elog (ERROR , "invalid NUMA node id outside of allowed range "
704+ "[0, " UINT64_FORMAT "]: %d" , max_nodes , s );
705+ }
706+
707+ nodes [s ]++ ;
708+ }
709+
710+ /*
711+ * Add one entry for each NUMA node, including those without allocated
712+ * memory for this segment.
713+ */
714+ for (i = 0 ; i <= max_nodes ; i ++ )
715+ {
716+ values [0 ] = CStringGetTextDatum (ent -> key );
717+ values [1 ] = i ;
718+ values [2 ] = Int64GetDatum (nodes [i ] * os_page_size );
719+
720+ tuplestore_putvalues (rsinfo -> setResult , rsinfo -> setDesc ,
721+ values , nulls );
722+ }
723+ }
724+
725+ LWLockRelease (ShmemIndexLock );
726+ firstNumaTouch = false;
727+
728+ return (Datum ) 0 ;
729+ }
0 commit comments