From 7fe2f67c7c9f27955df584eb79edd6ec2be7f9e4 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Tue, 1 Jul 2025 12:02:31 +0200
Subject: [PATCH] Limit the size of numa_move_pages requests

There's a kernel bug in do_pages_stat(), affecting systems combining a
64-bit kernel and 32-bit user space. The function splits the request
into chunks of 16 pointers, but forgets the pointers are 32-bit when
advancing to the next chunk. Some of the pointers get skipped, and
memory after the array is interpreted as pointers. The result is that
the produced status of memory pages is mostly bogus.

Systems combining 64-bit and 32-bit environments like this might seem
rare, but that's not the case - all 32-bit Debian packages are built in
a 32-bit chroot on a system with a 64-bit kernel.

This is a long-standing kernel bug (since 2010), affecting pretty much
all kernels, so it'll take time until all systems get a fixed kernel.
Luckily, we can work around the issue by chunking the requests the same
way do_pages_stat() does, at least on affected systems. We don't know
what kernel a 32-bit build will run on, so all 32-bit builds use chunks
of 16 elements (the largest chunk before hitting the issue).

64-bit builds are not affected by this issue, and so could work without
the chunking. But chunking has other advantages, so we apply chunking
even for 64-bit builds, with chunks of 1024 elements.

Reported-by: Christoph Berg
Author: Christoph Berg
Author: Bertrand Drouvot
Discussion: https://postgr.es/m/aEtDozLmtZddARdB@msg.df7cb.de
Context: https://marc.info/?l=linux-mm&m=175077821909222&w=2
Backpatch-through: 18
---
 src/port/pg_numa.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 4b487a2a4e8..d5935207d0a 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -29,6 +29,19 @@
 #include <numa.h>
 #include <numaif.h>
 
+/*
+ * numa_move_pages() chunk size, has to be <= 16 to work around a kernel bug
+ * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same
+ * chunk size, we make it work even on unfixed kernels.
+ *
+ * 64-bit systems are not affected by the bug, and so use much larger chunks.
+ */
+#if SIZEOF_SIZE_T == 4
+#define NUMA_QUERY_CHUNK_SIZE 16
+#else
+#define NUMA_QUERY_CHUNK_SIZE 1024
+#endif
+
 /* libnuma requires initialization as per numa(3) on Linux */
 int
 pg_numa_init(void)
@@ -42,11 +55,46 @@ pg_numa_init(void)
  * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the
  * first one allows us to batch and query about many memory pages in one single
  * giant system call that is way faster.
+ *
+ * We call numa_move_pages() for smaller chunks of the whole array. The first
+ * reason is to work around a kernel bug, but also to allow interrupting the
+ * query between the calls (for many pointers processing the whole array can
+ * take a lot of time).
  */
 int
 pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
 {
-	return numa_move_pages(pid, count, pages, NULL, status, 0);
+	unsigned long next = 0;
+	int			ret = 0;
+
+	/*
+	 * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE
+	 * items, to work around a kernel bug in do_pages_stat().
+	 */
+	while (next < count)
+	{
+		unsigned long count_chunk = Min(count - next,
+										NUMA_QUERY_CHUNK_SIZE);
+
+		/*
+		 * Bail out if any of the chunks errors out (ret<0). We ignore
+		 * (ret>0) which is used to return the number of nonmigrated pages,
+		 * but we're not migrating any pages here.
+		 */
+		ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0);
+		if (ret < 0)
+		{
+			/* plain error, return as is */
+			return ret;
+		}
+
+		next += count_chunk;
+	}
+
+	/* should have consumed the input array exactly */
+	Assert(next == count);
+
+	return 0;
 }
 
 int
-- 
2.39.5
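For readers who want to see the arithmetic behind the bug the commit message
describes, the standalone program below simulates it. This is an illustration
only, not the actual do_pages_stat() source: the DO_PAGES_STAT_CHUNK_NR name
is taken from the comment in the patch, and the 64-page request size is an
arbitrary example. For each 16-entry chunk of a 32-bit caller's pointer array
it prints the byte offset a 64-bit kernel would read from if it advances its
cursor in units of native 8-byte pointers, next to the offset it should use.

/* sketch of the chunk-advance bug, assuming a 64-bit kernel and 32-bit caller */
#include <stdio.h>
#include <stdint.h>

#define DO_PAGES_STAT_CHUNK_NR 16	/* kernel chunk size, per the patch comment */

int
main(void)
{
	unsigned long nr_pages = 64;	/* arbitrary request size for illustration */
	unsigned long done = 0;			/* entries processed so far */
	unsigned long byte_offset = 0;	/* where the buggy kernel reads the next chunk */

	while (done < nr_pages)
	{
		unsigned long chunk = nr_pages - done;

		if (chunk > DO_PAGES_STAT_CHUNK_NR)
			chunk = DO_PAGES_STAT_CHUNK_NR;

		printf("chunk of %lu: correct offset %lu bytes, buggy offset %lu bytes\n",
			   chunk, done * sizeof(uint32_t), byte_offset);

		done += chunk;

		/*
		 * The caller's array holds 4-byte pointers, so the cursor should
		 * advance by chunk * sizeof(uint32_t); advancing in units of native
		 * 8-byte pointers instead overshoots on every subsequent chunk.
		 */
		byte_offset += chunk * sizeof(uint64_t);
	}

	return 0;
}

The drift grows by 4 bytes per entry already processed, so only the first
16-entry chunk is read correctly - which is exactly why the workaround caps
32-bit builds at NUMA_QUERY_CHUNK_SIZE = 16.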