#include <errno.h>
#include <limits.h>
+#if HAVE_MNTENT_H
+# include <mntent.h>
+#endif
#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#if HAVE_PTHREAD_GETAFFINITY_NP && 0
#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+#define NPROC_MINIMUM 1
+
/* Return the number of processors available to the current process, based
on a modern system call that returns the "affinity" between the current
process and each CPU. Return 0 if unknown or if such a system call does
/* Return the total number of processors. Here QUERY must be one of
NPROC_ALL, NPROC_CURRENT. The result is guaranteed to be at least 1. */
static unsigned long int
-num_processors_ignoring_omp (enum nproc_query query)
+num_processors_available (enum nproc_query query)
{
/* On systems with a modern affinity mask system call, we have
sysconf (_SC_NPROCESSORS_CONF)
}
#endif
- return 1;
+ return NPROC_MINIMUM;
+}
+
+#if defined __linux__ || defined __ANDROID__
+/* Identify the cgroup2 mount point,
+ initially at the usual location for efficiency,
+ resorting to searching mount points otherwise.
+ Return NULL if the mount point is not found.
+ The returned string can be freed. */
+static char *
+cgroup2_mount (void)
+{
+ FILE *fp;
+ char *ret = NULL;
+
+ /* Check the usual location first. */
+ if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0)
+ return strdup ("/sys/fs/cgroup");
+
+#if HAVE_MNTENT_H
+ /* Otherwise look for the mount point. */
+ struct mntent *mnt;
+ if (! (fp = setmntent ("/proc/mounts", "r")))
+ return NULL;
+ while ((mnt = getmntent (fp)) != NULL)
+ {
+ if (strcmp (mnt->mnt_type, "cgroup2") == 0)
+ {
+ ret = strdup (mnt->mnt_dir);
+ break;
+ }
+ }
+ endmntent (fp);
+#endif
+
+ return ret;
+}
+
+/* Return the minimum configured cgroupv2 CPU quota for the current process.
+ Return ULONG_MAX if quota can't be read.
+ Returned value will be >= 1. */
+static unsigned long int
+get_cgroup2_cpu_quota (void)
+{
+ unsigned long int cpu_quota = ULONG_MAX;
+ FILE *fp;
+
+ fp = fopen ("/proc/self/cgroup", "r");
+ if (! fp)
+ return cpu_quota;
+
+ /* Get our cgroupv2 (unififed) hierarchy. */
+ char *cgroup = NULL;
+ char *cgroup_str = NULL;
+ size_t cgroup_size = 0;
+ ssize_t read;
+ while ((read = getline (&cgroup_str, &cgroup_size, fp)) != -1)
+ {
+ if (strncmp (cgroup_str, "0::/", 4) == 0)
+ {
+ char *end = cgroup_str + read - 1;
+ if (*end == '\n')
+ *end = '\0';
+ cgroup = cgroup_str + 3;
+ break;
+ }
+ }
+ fclose (fp);
+
+ char *mount = NULL;
+ if (cgroup && ! (mount = cgroup2_mount ()))
+ cgroup = NULL;
+
+ /* Find the lowest quota in the hierarchy. */
+ char *quota_str = NULL;
+ size_t quota_size = 0;
+ while (cgroup && *cgroup)
+ {
+ /* Walk back up the nested cgroup hierarchy
+ to find the lowest cpu quota as defined in a cpu.max file.
+ Note this file may not be present if the cpu controller
+ is not enabled for that part of the hierarchy. */
+
+ char cpu_max_file[PATH_MAX];
+ snprintf (cpu_max_file, sizeof (cpu_max_file),
+ "%s%s/cpu.max", mount, cgroup);
+
+ if ((fp = fopen (cpu_max_file, "r"))
+ && getline ("a_str, "a_size, fp) != -1
+ && strncmp (quota_str, "max", 3) != 0)
+ {
+ long quota, period;
+ if (sscanf (quota_str, "%ld %ld", "a, &period) == 2 && period)
+ {
+ double ncpus = (double)quota / period;
+ if (cpu_quota == ULONG_MAX || ncpus < cpu_quota)
+ {
+ cpu_quota = MAX (1, (long)(ncpus + 0.5));
+ /* nproc will return 1 minimum, so no point going lower */
+ if (cpu_quota == 1)
+ *cgroup = '\0';
+ }
+ }
+ }
+
+ if (fp)
+ fclose (fp);
+
+ char *last_sep = strrchr (cgroup, '/');
+ if (! last_sep)
+ break;
+ if (last_sep == cgroup && *(cgroup + 1))
+ *(cgroup + 1) = '\0'; /* Iterate on "/" also. */
+ else
+ *last_sep = '\0';
+ }
+
+ free (quota_str);
+ free (mount);
+ free (cgroup_str);
+
+ return cpu_quota;
+}
+#endif
+
+
/* Return the cgroup v2 CPU quota when the current scheduling policy
   honors it.  Otherwise return ULONG_MAX.
   Returned value will be >= 1.  */
static unsigned long int
cpu_quota (void)
{
#if (defined __linux__ || defined __ANDROID__) \
    && HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE
  /* We've a new enough sched.h.
     Real-time policies (and a failed query) ignore CPU quotas.  */
  int policy = sched_getscheduler (0);
  if (policy != -1
      && policy != SCHED_FIFO
      && policy != SCHED_RR
      && policy != SCHED_DEADLINE)
    return get_cgroup2_cpu_quota ();
#endif

  return ULONG_MAX;
}
/* Parse OMP environment variables without dependence on OMP.
unsigned long int
num_processors (enum nproc_query query)
{
- unsigned long int omp_env_limit = ULONG_MAX;
+ unsigned long int nproc_limit = ULONG_MAX;
+ /* Honor the OpenMP environment variables, recognized also by all
+ programs that are based on OpenMP. */
if (query == NPROC_CURRENT_OVERRIDABLE)
{
- unsigned long int omp_env_threads;
- /* Honor the OpenMP environment variables, recognized also by all
- programs that are based on OpenMP. */
+ unsigned long int omp_env_threads, omp_env_limit;
omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS"));
omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT"));
if (! omp_env_limit)
if (omp_env_threads)
return MIN (omp_env_threads, omp_env_limit);
+ nproc_limit = omp_env_limit;
query = NPROC_CURRENT;
}
- /* Here query is one of NPROC_ALL, NPROC_CURRENT. */
- if (omp_env_limit == 1)
- /* No need to even call num_processors_ignoring_omp (query). */
- return 1;
- {
- unsigned long nprocs = num_processors_ignoring_omp (query);
- return MIN (nprocs, omp_env_limit);
- }
+
+ /* Honor any CPU quotas. */
+ if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM)
+ {
+ unsigned long int quota = cpu_quota ();
+ nproc_limit = MIN (quota, nproc_limit);
+ }
+
+ if (nproc_limit > NPROC_MINIMUM)
+ {
+ unsigned long nprocs = num_processors_available (query);
+ nproc_limit = MIN (nprocs, nproc_limit);
+ }
+
+ return nproc_limit;
}
--- /dev/null
+/*
+nproc honors cgroup v2 CPU quotas
+and was tested in coreutils on a Fedora 42 system as follows:
+
+# Note we honor a limit anywhere in /proc/self/cgroup hierarchy
+# so apply settings below in the parent cgroup of the current process
+$ nested_cgroup=/sys/fs/cgroup/$(dirname $(cut -d/ -f2- /proc/self/cgroup))
+$ echo $nested_cgroup
+/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/app.slice
+
+# This test system has 4 CPUs
+$ src/nproc
+4
+
+# Behave like MAX (1, (long) round (quota / period))
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "90000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "140000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "150000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+2
+
+# Ensure NPROC_ALL takes precedence
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc --all
+4
+
+# Ensure OMP env vars have appropriate precedence
+$ echo "200000 100000" > $nested_cgroup/cpu.max
+$ OMP_NUM_THREADS=10 src/nproc
+10
+$ OMP_THREAD_LIMIT=10 src/nproc
+2
+
+# Ensure quota only reduces
+$ echo "500000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+4
+
+# Restore system to unlimited
+$ echo "max 100000" > $nested_cgroup/cpu.max
+
+# Test quota in root hierarchy
+$ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc
+4
+
+# Docker is similar to podman, but explicitly limits max allowable
+$ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc
+docker: Error response from daemon:
+range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs
+*/