From: Tomas Vondra
Date: Mon, 26 Jan 2026 21:20:18 +0000 (+0100)
Subject: Handle ENOENT status when querying NUMA node
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9796c4f5607be5807f2d2ba9bca1bc87af198db3;p=thirdparty%2Fpostgresql.git

Handle ENOENT status when querying NUMA node

We've assumed that touching the memory is sufficient for a page to be
located on one of the NUMA nodes. But a page may be moved to swap after
we touch it, due to memory pressure. We touch the memory before querying
the status, but there is no guarantee it won't be moved to swap in the
meantime. The touching happens only on the first call, so later calls
are more likely to be affected. And the batching increases the window
too. It's up to the kernel if/when pages get moved to swap.

We have to accept ENOENT (-2) as a valid result, and handle it without
failing. This patch simply treats it as an unknown node, and returns
NULL in the two affected views (pg_shmem_allocations_numa and
pg_buffercache_numa).

Hugepages cannot be swapped out, so this affects only regular pages.

Reported by Christoph Berg, investigation and fix by me. Backpatch to
18, where the two views were introduced.

Reported-by: Christoph Berg
Discussion: https://postgr.es/m/aTq5Gt_n-oS_QSpL@msg.df7cb.de
Backpatch-through: 18
---
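Not part of the commit, but a minimal standalone sketch for reviewers who
want to reproduce the behavior described above (my illustration, not
PostgreSQL code). It queries the placement of a single page the same way
the views do on Linux, using move_pages(2) with a NULL "nodes" argument,
and handles the -ENOENT (-2) status. The madvise(MADV_PAGEOUT) call
(Linux 5.4+) is a best-effort way to provoke that status; whether the
page actually gets paged out remains up to the kernel. The file name
"demo.c" is arbitrary; build with: cc demo.c -lnuma

/*
 * demo.c: query the NUMA node of one page via move_pages(2) and
 * handle the -ENOENT status the commit message describes.
 */
#include <errno.h>
#include <numaif.h>		/* move_pages(), link with -lnuma */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>	/* madvise(), MADV_PAGEOUT (Linux >= 5.4) */
#include <unistd.h>

int
main(void)
{
	long	page_size = sysconf(_SC_PAGESIZE);
	char   *buf = aligned_alloc(page_size, page_size);
	void   *pages[1] = {buf};
	int		status[1] = {0};

	if (buf == NULL)
		return 1;

	memset(buf, 1, page_size);	/* touch the page so it becomes resident */

	/* best-effort: ask the kernel to page it out, to provoke ENOENT */
	madvise(buf, page_size, MADV_PAGEOUT);

	/* nodes = NULL means "only query placement, do not move pages" */
	if (move_pages(0, 1, pages, NULL, status, 0) != 0)
	{
		perror("move_pages");
		return 1;
	}

	if (status[0] >= 0)
		printf("page is on NUMA node %d\n", status[0]);
	else if (status[0] == -ENOENT)	/* -2, the case handled by this fix */
		printf("page not resident (e.g. moved to swap), node unknown\n");
	else
		printf("unexpected page status: %d\n", status[0]);

	free(buf);
	return 0;
}

With the fix applied, such pages show up as NULL instead of failing the
whole query: a NULL numa_node in pg_buffercache_numa, and one extra row
per segment with a NULL node in pg_shmem_allocations_numa.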
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 5a3d78d03d5..7d26dbeb7e1 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -526,8 +526,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 		values[1] = Int64GetDatum(fctx->record[i].page_num);
 		nulls[1] = false;
 
-		values[2] = Int32GetDatum(fctx->record[i].numa_node);
-		nulls[2] = false;
+		/* status is a valid node number */
+		if (fctx->record[i].numa_node >= 0)
+		{
+			values[2] = Int32GetDatum(fctx->record[i].numa_node);
+			nulls[2] = false;
+		}
+		else
+		{
+			/* some kind of error (e.g. pages moved to swap) */
+			values[2] = (Datum) 0;
+			nulls[2] = true;
+		}
 
 		/* Build and return the tuple. */
 		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index ff3a738c19c..21e12e2f527 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -603,7 +603,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	InitMaterializedSRF(fcinfo, 0);
 
 	max_nodes = pg_numa_get_max_node();
-	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+	nodes = palloc(sizeof(Size) * (max_nodes + 2));
 
 	/*
 	 * Shared memory allocations can vary in size and may not align with OS
@@ -639,7 +639,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 	hash_seq_init(&hstat, ShmemIndex);
 
 	/* output all allocated entries */
-	memset(nulls, 0, sizeof(nulls));
 	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
 	{
 		int			i;
@@ -688,22 +687,33 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			elog(ERROR, "failed NUMA pages inquiry status: %m");
 
 		/* Count number of NUMA nodes used for this shared memory entry */
-		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
 
 		for (i = 0; i < shm_ent_page_count; i++)
 		{
 			int			s = pages_status[i];
 
 			/* Ensure we are adding only valid index to the array */
-			if (s < 0 || s > max_nodes)
+			if (s >= 0 && s <= max_nodes)
+			{
+				/* valid NUMA node */
+				nodes[s]++;
+				continue;
+			}
+			else if (s == -2)
 			{
-				elog(ERROR, "invalid NUMA node id outside of allowed range "
-					 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
+				/* -2 means ENOENT (e.g. page was moved to swap) */
+				nodes[max_nodes + 1]++;
+				continue;
 			}
-			nodes[s]++;
+
+			elog(ERROR, "invalid NUMA node id outside of allowed range "
+				 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
 		}
 
+		/* no NULLs for regular nodes */
+		memset(nulls, 0, sizeof(nulls));
+
 		/*
 		 * Add one entry for each NUMA node, including those without allocated
 		 * memory for this segment.
@@ -717,6 +727,14 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
 								 values, nulls);
 		}
+
+		/* The last entry is used for pages without a NUMA node. */
+		nulls[1] = true;
+		values[0] = CStringGetTextDatum(ent->key);
+		values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
 	}
 
 	LWLockRelease(ShmemIndexLock);