From: Pádraig Brady Date: Mon, 18 Aug 2025 14:34:59 +0000 (+0100) Subject: nproc: honor cgroupv2 CPU quotas X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9b07115f4a344effef1dde8bd0e6e356d4b0e744;p=thirdparty%2Fgnulib.git nproc: honor cgroupv2 CPU quotas cgroupv1 CPU quotas are not considered, as those are now legacy (RHEL7 era), and are more complex/inefficient to parse. Tested in coreutils on Fedora 42 as detailed in tests/test-nproc.c * lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point, first at the common location for efficiency, resorting to searching mount points otherwise. (get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process, returning the lowest integer number of CPUs configured. (cpu_quota): On Linux return the cgroupv2 CPU quota if the current scheduler honors it. Otherwise return ULONG_MAX. (num_processors): Clamp the return to <= quota. * m4/nproc.m4: Add a check for mntent.h. * tests/test-nproc.c: Document how cgroup CPU quotas were tested. --- diff --git a/ChangeLog b/ChangeLog index 34735d518c..b7d23e44c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +2025-08-19 Pádraig Brady + + nproc: honor cgroupv2 CPU quotas + * lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point, + first at the common location for efficiency, + resorting to searching mount points otherwise. + (get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process, + returning the lowest integer number of CPUs configured. + (cpu_quota): On Linux return the cgroupv2 CPU quota if the + current scheduler honors it. Otherwise return ULONG_MAX. + (num_processors): Clamp the return to <= quota. + * m4/nproc.m4: Add a check for mntent.h. + * tests/test-nproc.c: Document how cgroup CPU quotas were tested. + 2025-08-19 Bruno Haible doc: Mention some musl libc stubs. 
diff --git a/lib/nproc.c b/lib/nproc.c index cecf60bc6e..7c5ae3acf9 100644 --- a/lib/nproc.c +++ b/lib/nproc.c @@ -22,7 +22,12 @@ #include #include +#if HAVE_MNTENT_H +# include +#endif #include +#include +#include #include #if HAVE_PTHREAD_GETAFFINITY_NP && 0 @@ -62,6 +67,8 @@ #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) +#define NPROC_MINIMUM 1 + /* Return the number of processors available to the current process, based on a modern system call that returns the "affinity" between the current process and each CPU. Return 0 if unknown or if such a system call does @@ -244,7 +251,7 @@ num_processors_via_affinity_mask (void) /* Return the total number of processors. Here QUERY must be one of NPROC_ALL, NPROC_CURRENT. The result is guaranteed to be at least 1. */ static unsigned long int -num_processors_ignoring_omp (enum nproc_query query) +num_processors_available (enum nproc_query query) { /* On systems with a modern affinity mask system call, we have sysconf (_SC_NPROCESSORS_CONF) @@ -377,7 +384,159 @@ num_processors_ignoring_omp (enum nproc_query query) } #endif - return 1; + return NPROC_MINIMUM; +} + +#if defined __linux__ || defined __ANDROID__ +/* Identify the cgroup2 mount point, + initially at the usual location for efficiency, + resorting to searching mount points otherwise. + Return NULL if the mount point is not found. + The returned string can be freed. */ +static char * +cgroup2_mount (void) +{ + FILE *fp; + char *ret = NULL; + + /* Check the usual location first. */ + if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0) + return strdup ("/sys/fs/cgroup"); + +#if HAVE_MNTENT_H + /* Otherwise look for the mount point. */ + struct mntent *mnt; + if (! 
(fp = setmntent ("/proc/mounts", "r"))) + return NULL; + while ((mnt = getmntent (fp)) != NULL) + { + if (strcmp (mnt->mnt_type, "cgroup2") == 0) + { + ret = strdup (mnt->mnt_dir); + break; + } + } + endmntent (fp); +#endif + + return ret; +} + +/* Return the minimum configured cgroupv2 CPU quota for the current process. + Return ULONG_MAX if quota can't be read. + Returned value will be >= 1. */ +static unsigned long int +get_cgroup2_cpu_quota (void) +{ + unsigned long int cpu_quota = ULONG_MAX; + FILE *fp; + + fp = fopen ("/proc/self/cgroup", "r"); + if (! fp) + return cpu_quota; + + /* Get our cgroupv2 (unified) hierarchy. */ + char *cgroup = NULL; + char *cgroup_str = NULL; + size_t cgroup_size = 0; + ssize_t read; + while ((read = getline (&cgroup_str, &cgroup_size, fp)) != -1) + { + if (strncmp (cgroup_str, "0::/", 4) == 0) + { + char *end = cgroup_str + read - 1; + if (*end == '\n') + *end = '\0'; + cgroup = cgroup_str + 3; + break; + } + } + fclose (fp); + + char *mount = NULL; + if (cgroup && ! (mount = cgroup2_mount ())) + cgroup = NULL; + + /* Find the lowest quota in the hierarchy. */ + char *quota_str = NULL; + size_t quota_size = 0; + while (cgroup && *cgroup) + { + /* Walk back up the nested cgroup hierarchy + to find the lowest cpu quota as defined in a cpu.max file. + Note this file may not be present if the cpu controller + is not enabled for that part of the hierarchy. 
*/ + + char cpu_max_file[PATH_MAX]; + snprintf (cpu_max_file, sizeof (cpu_max_file), + "%s%s/cpu.max", mount, cgroup); + + if ((fp = fopen (cpu_max_file, "r")) + && getline (&quota_str, &quota_size, fp) != -1 + && strncmp (quota_str, "max", 3) != 0) + { + long quota, period; + if (sscanf (quota_str, "%ld %ld", &quota, &period) == 2 && period) + { + double ncpus = (double)quota / period; + if (cpu_quota == ULONG_MAX || ncpus < cpu_quota) + { + cpu_quota = MAX (1, (long)(ncpus + 0.5)); + /* nproc will return 1 minimum, so no point going lower */ + if (cpu_quota == 1) + *cgroup = '\0'; + } + } + } + + if (fp) + fclose (fp); + + char *last_sep = strrchr (cgroup, '/'); + if (! last_sep) + break; + if (last_sep == cgroup && *(cgroup + 1)) + *(cgroup + 1) = '\0'; /* Iterate on "/" also. */ + else + *last_sep = '\0'; + } + + free (quota_str); + free (mount); + free (cgroup_str); + + return cpu_quota; +} +#endif + + +/* Return the cgroupv2 CPU quota if the current scheduler honors it. + Otherwise return ULONG_MAX. + Returned value will be >= 1. */ +static unsigned long int +cpu_quota (void) +{ + unsigned long int quota = ULONG_MAX; + +#if defined __linux__ || defined __ANDROID__ +# if HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE + /* We've a new enough sched.h */ + switch (sched_getscheduler (0)) + { + case -1: + case SCHED_FIFO: + case SCHED_RR: + case SCHED_DEADLINE: + quota = ULONG_MAX; + break; + default: + quota = get_cgroup2_cpu_quota (); + break; + } +# endif +#endif + + return quota; } /* Parse OMP environment variables without dependence on OMP. @@ -416,13 +575,13 @@ parse_omp_threads (char const* threads) unsigned long int num_processors (enum nproc_query query) { - unsigned long int omp_env_limit = ULONG_MAX; + unsigned long int nproc_limit = ULONG_MAX; + /* Honor the OpenMP environment variables, recognized also by all + programs that are based on OpenMP. 
*/ if (query == NPROC_CURRENT_OVERRIDABLE) { - unsigned long int omp_env_threads; - /* Honor the OpenMP environment variables, recognized also by all - programs that are based on OpenMP. */ + unsigned long int omp_env_threads, omp_env_limit; omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS")); omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT")); if (! omp_env_limit) @@ -431,14 +590,22 @@ num_processors (enum nproc_query query) if (omp_env_threads) return MIN (omp_env_threads, omp_env_limit); + nproc_limit = omp_env_limit; query = NPROC_CURRENT; } - /* Here query is one of NPROC_ALL, NPROC_CURRENT. */ - if (omp_env_limit == 1) - /* No need to even call num_processors_ignoring_omp (query). */ - return 1; - { - unsigned long nprocs = num_processors_ignoring_omp (query); - return MIN (nprocs, omp_env_limit); - } + + /* Honor any CPU quotas. */ + if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM) + { + unsigned long int quota = cpu_quota (); + nproc_limit = MIN (quota, nproc_limit); + } + + if (nproc_limit > NPROC_MINIMUM) + { + unsigned long nprocs = num_processors_available (query); + nproc_limit = MIN (nprocs, nproc_limit); + } + + return nproc_limit; } diff --git a/m4/nproc.m4 b/m4/nproc.m4 index 48c239be06..5e96afc93a 100644 --- a/m4/nproc.m4 +++ b/m4/nproc.m4 @@ -17,7 +17,7 @@ AC_DEFUN([gl_PREREQ_NPROC], dnl Persuade glibc to declare CPU_SETSIZE, CPU_ISSET etc. AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS]) - AC_CHECK_HEADERS([sys/pstat.h sys/sysmp.h sys/param.h],,, + AC_CHECK_HEADERS([mntent.h sys/pstat.h sys/sysmp.h sys/param.h],,, [AC_INCLUDES_DEFAULT]) dnl requires on OpenBSD 4.0. 
AC_CHECK_HEADERS([sys/sysctl.h],,, diff --git a/tests/test-nproc.c b/tests/test-nproc.c new file mode 100644 index 0000000000..2c7406f95d --- /dev/null +++ b/tests/test-nproc.c @@ -0,0 +1,65 @@ +/* +nproc honors cgroup v2 CPU quotas +and was tested in coreutils on a Fedora 42 system as follows: + +# Note we honor a limit anywhere in /proc/self/cgroup hierarchy +# so apply settings below in the parent cgroup of the current process +$ nested_cgroup=/sys/fs/cgroup/$(dirname $(cut -d/ -f2- /proc/self/cgroup)) +$ echo $nested_cgroup +/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/app.slice + +# This test system has 4 CPUs +$ src/nproc +4 + +# Behave like MAX (1, (int)round(quota/period)) +$ echo "100000 100000" > $nested_cgroup/cpu.max +$ src/nproc +1 +$ echo "90000 100000" > $nested_cgroup/cpu.max +$ src/nproc +1 +$ echo "140000 100000" > $nested_cgroup/cpu.max +$ src/nproc +1 +$ echo "150000 100000" > $nested_cgroup/cpu.max +$ src/nproc +2 + +# Ensure NPROC_ALL takes precedence +$ echo "100000 100000" > $nested_cgroup/cpu.max +$ src/nproc --all +4 + +# Ensure OMP env vars have appropriate precedence +$ echo "200000 100000" > $nested_cgroup/cpu.max +$ OMP_NUM_THREADS=10 src/nproc +10 +$ OMP_THREAD_LIMIT=10 src/nproc +2 + +# Ensure quota only reduces +$ echo "500000 100000" > $nested_cgroup/cpu.max +$ src/nproc +4 + +# Restore system to unlimited +$ echo "max 100000" > $nested_cgroup/cpu.max + +# Test quota in root hierarchy +$ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc +2 +$ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc +2 +$ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc +1 +$ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc +4 + +# Docker is similar to podman, but explicitly limits max allowable +$ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc +1 +$ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc +docker: Error response from daemon: +range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs +*/