Limit the size of numa_move_pages requests

author Tomas Vondra <tomas.vondra@postgresql.org>

Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)

committer Tomas Vondra <tomas.vondra@postgresql.org>

Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)
author Tomas Vondra <tomas.vondra@postgresql.org>
Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c

index 4b487a2a4e814ca31f7d192f40a506ee05a87736..d5935207d0a13be0d59e2cf89dc560715830e504 100644 (file)
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -29,6 +29,19 @@
  #include <numa.h>
  #include <numaif.h>
  
+/*
+ * numa_move_pages() chunk size, has to be <= 16 to work around a kernel bug
+ * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same
+ * chunk size, we make it work even on unfixed kernels.
+ *
+ * 64-bit systems are not affected by the bug, and so use much larger chunks.
+ */
+#if SIZEOF_SIZE_T == 4
+#define NUMA_QUERY_CHUNK_SIZE 16
+#else
+#define NUMA_QUERY_CHUNK_SIZE 1024
+#endif
+
  /* libnuma requires initialization as per numa(3) on Linux */
  int
  pg_numa_init(void)
@@ -42,11 +55,46 @@ pg_numa_init(void)
   * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the
   * first one allows us to batch and query about many memory pages in one single
   * giant system call that is way faster.
+ *
+ * We call numa_move_pages() on smaller chunks of the whole array. The main
+ * reason is to work around a kernel bug; a secondary one is to allow
+ * interrupting the query between the calls (with many pointers, processing
+ * the whole array can take a lot of time).
   */
  int
  pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
  {
-       return numa_move_pages(pid, count, pages, NULL, status, 0);
+       unsigned long   next = 0;
+       int                             ret = 0;
+
+       /*
+        * Pass at most NUMA_QUERY_CHUNK_SIZE pointers to each numa_move_pages()
+        * call, to work around a kernel bug in do_pages_stat().
+        */
+       while (next < count)
+       {
+               unsigned long count_chunk = Min(count - next,
+                                                                               NUMA_QUERY_CHUNK_SIZE);
+
+               /*
+                * Bail out if any chunk fails (ret < 0). A positive result is
+                * the number of nonmigrated pages, which we ignore because
+                * we're not migrating any pages here.
+                */
+               ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0);
+               if (ret < 0)
+               {
+                       /* plain error, return it as is; later chunks are not queried */
+                       return ret;
+               }
+
+               next += count_chunk;
+       }
+
+       /* should have consumed the input array exactly */
+       Assert(next == count);
+
+       return 0;
  }
  
  int
author	Tomas Vondra <tomas.vondra@postgresql.org>
	Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)
committer	Tomas Vondra <tomas.vondra@postgresql.org>
	Tue, 1 Jul 2025 10:02:31 +0000 (12:02 +0200)