#include <numa.h>
#include <numaif.h>
+/*
+ * numa_move_pages() chunk size; it has to be <= 16 to work around a kernel
+ * bug in do_pages_stat() (which processes pages in chunks of
+ * DO_PAGES_STAT_CHUNK_NR). By using the same chunk size, we make this work
+ * even on unfixed kernels.
+ *
+ * 64-bit systems are not affected by the bug, and so use much larger chunks.
+ */
+#if SIZEOF_SIZE_T == 4
+#define NUMA_QUERY_CHUNK_SIZE 16
+#else
+#define NUMA_QUERY_CHUNK_SIZE 1024
+#endif
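+
+/*
+ * For example, with a count of 100 pages on a 32-bit build, the query below
+ * is split into seven numa_move_pages() calls: six chunks of 16 pages and
+ * one final chunk of 4 pages.
+ */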
+
/* libnuma requires initialization as per numa(3) on Linux */
int
pg_numa_init(void)
 * We use the move_pages(2) syscall here - instead of get_mempolicy(2) -
 * because it allows us to batch and query many memory pages in a single
 * system call, which is much faster.
+ *
+ * We call numa_move_pages() on smaller chunks of the whole array, partly to
+ * work around a kernel bug, and partly to allow interrupting the query
+ * between the calls (with many pointers, processing the whole array can
+ * take a long time).
*/
int
pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
{
- return numa_move_pages(pid, count, pages, NULL, status, 0);
+ unsigned long next = 0;
+ int ret = 0;
+
+ /*
+	 * Chunk the pointers passed to numa_move_pages() into groups of at most
+	 * NUMA_QUERY_CHUNK_SIZE items, to work around a kernel bug in
+	 * do_pages_stat().
+ */
+ while (next < count)
+ {
+ unsigned long count_chunk = Min(count - next,
+ NUMA_QUERY_CHUNK_SIZE);
+
+ /*
+		 * Bail out if any of the chunks errors out (ret < 0). We ignore
+		 * (ret > 0), which is used to return the number of non-migrated
+		 * pages, but we're not migrating any pages here.
+ */
+		ret = numa_move_pages(pid, count_chunk,
+							  &pages[next],
+							  NULL,
+							  &status[next],
+							  0);
+ if (ret < 0)
+ {
+ /* plain error, return as is */
+ return ret;
+ }
+
+ next += count_chunk;
+ }
+
+ /* should have consumed the input array exactly */
+ Assert(next == count);
+
+ return 0;
}
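+
+/*
+ * A minimal, hypothetical usage sketch (not part of this change): build the
+ * page-pointer array for a buffer and hand it to pg_numa_query_pages(). The
+ * helper name and the caller-allocated pages/status arrays are assumptions
+ * for illustration. It assumes "buf" is page-aligned and spans "npages" OS
+ * pages; passing 0 as pid queries the calling process. On success each
+ * status[i] holds a NUMA node number, or a negative errno (e.g. -ENOENT for
+ * a page that is not present), per move_pages(2).
+ */
+#ifdef NOT_USED
+static int
+query_buffer_placement(void *buf, unsigned long npages, size_t pagesize,
+					   void **pages, int *status)
+{
+	unsigned long i;
+
+	/* one pointer per OS page backing the buffer */
+	for (i = 0; i < npages; i++)
+		pages[i] = (char *) buf + i * pagesize;
+
+	/* pid 0 means "the calling process" for move_pages(2) */
+	return pg_numa_query_pages(0, npages, pages, status);
+}
+#endif
+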
int