From cfba9b9eab911879a3f30132d9f17a8fd3edb943 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Sat, 5 Jul 2025 16:42:41 +0900 Subject: [PATCH] tree-wide: several cleanups for reading/writing /proc/sys/fs/nr_open - use unsigned for the return value of read_nr_open(), as it does not fail, and the kernel internally uses unsigned for the value, - when bumping the value by PID1, let's start from the kernel's maximum value defined in fs/file.c. The maximum value should be mostly an API of the kernel, but may changed in a future, hence still try several times if we fail to bump the value. Co-authored-by: Jared Baur Co-authored-by: John Rinehart --- src/basic/fd-util.c | 8 +++--- src/basic/fd-util.h | 9 ++++++- src/core/execute-serialize.c | 15 +++++------ src/core/main.c | 48 +++++++++--------------------------- src/test/test-fd-util.c | 2 +- 5 files changed, 31 insertions(+), 51 deletions(-) diff --git a/src/basic/fd-util.c b/src/basic/fd-util.c index 6493c83c48e..9cfaf8a8495 100644 --- a/src/basic/fd-util.c +++ b/src/basic/fd-util.c @@ -990,7 +990,7 @@ int fd_verify_safe_flags_full(int fd, int extra_flags) { return flags & (O_ACCMODE_STRICT | extra_flags); /* return the flags variable, but remove the noise */ } -int read_nr_open(void) { +unsigned read_nr_open(void) { _cleanup_free_ char *nr_open = NULL; int r; @@ -1001,9 +1001,9 @@ int read_nr_open(void) { if (r < 0) log_debug_errno(r, "Failed to read /proc/sys/fs/nr_open, ignoring: %m"); else { - int v; + unsigned v; - r = safe_atoi(nr_open, &v); + r = safe_atou(nr_open, &v); if (r < 0) log_debug_errno(r, "Failed to parse /proc/sys/fs/nr_open value '%s', ignoring: %m", nr_open); else @@ -1011,7 +1011,7 @@ int read_nr_open(void) { } /* If we fail, fall back to the hard-coded kernel limit of 1024 * 1024. */ - return 1024 * 1024; + return NR_OPEN_DEFAULT; } int fd_get_diskseq(int fd, uint64_t *ret) { diff --git a/src/basic/fd-util.h b/src/basic/fd-util.h index 25ad3e9361e..d759300e4ff 100644 --- a/src/basic/fd-util.h +++ b/src/basic/fd-util.h @@ -50,6 +50,13 @@ * around the problems with musl's definition. */ #define O_ACCMODE_STRICT (O_RDONLY|O_WRONLY|O_RDWR) +/* The default, minimum, and maximum values of /proc/sys/fs/nr_open. See kernel's fs/file.c. + * These values have been unchanged since kernel-2.6.26: + * https://github.com/torvalds/linux/commit/eceea0b3df05ed262ae32e0c6340cc7a3626632d */ +#define NR_OPEN_DEFAULT ((unsigned) (1024 * 1024)) +#define NR_OPEN_MINIMUM ((unsigned) (sizeof(long) * 8)) +#define NR_OPEN_MAXIMUM ((unsigned) (CONST_MIN((size_t) INT_MAX, SIZE_MAX / __SIZEOF_POINTER__) & ~(sizeof(long) * 8 - 1))) + int close_nointr(int fd); int safe_close(int fd); void safe_close_pair(int p[static 2]); @@ -148,7 +155,7 @@ static inline int fd_verify_safe_flags(int fd) { return fd_verify_safe_flags_full(fd, 0); } -int read_nr_open(void); +unsigned read_nr_open(void); int fd_get_diskseq(int fd, uint64_t *ret); int path_is_root_at(int dir_fd, const char *path); diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 8fc38902144..c425003b0bc 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1253,16 +1253,13 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext } static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { - int r, nr_open; + int r; assert(p); assert(f); assert(fds); - nr_open = read_nr_open(); - if (nr_open < 3) - nr_open = HIGH_RLIMIT_NOFILE; - assert(nr_open > 0); /* For compilers/static analyzers */ + unsigned nr_open = MAX(read_nr_open(), NR_OPEN_MINIMUM); for (;;) { _cleanup_free_ char *l = NULL; @@ -1290,7 +1287,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (r < 0) return r; - if (p->n_socket_fds > (size_t) nr_open) + if (p->n_socket_fds > nr_open) return -EINVAL; /* too many, someone is playing games with us */ } else if ((val = startswith(l, "exec-parameters-n-storage-fds="))) { if (p->fds) @@ -1300,7 +1297,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (r < 0) return r; - if (p->n_storage_fds > (size_t) nr_open) + if (p->n_storage_fds > nr_open) return -EINVAL; /* too many, someone is playing games with us */ } else if ((val = startswith(l, "exec-parameters-n-extra-fds="))) { if (p->fds) @@ -1310,7 +1307,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { if (r < 0) return r; - if (p->n_extra_fds > (size_t) nr_open) + if (p->n_extra_fds > nr_open) return -EINVAL; /* too many, someone is playing games with us */ } else if ((val = startswith(l, "exec-parameters-fds="))) { if (p->n_socket_fds + p->n_storage_fds + p->n_extra_fds == 0) @@ -1318,7 +1315,7 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { SYNTHETIC_ERRNO(EINVAL), "Got exec-parameters-fds= without " "prior exec-parameters-n-socket-fds= or exec-parameters-n-storage-fds= or exec-parameters-n-extra-fds="); - if (p->n_socket_fds + p->n_storage_fds + p->n_extra_fds > (size_t) nr_open) + if (p->n_socket_fds + p->n_storage_fds + p->n_extra_fds > nr_open) return -EINVAL; /* too many, someone is playing games with us */ if (p->fds) diff --git a/src/core/main.c b/src/core/main.c index ee980ed189c..c32a971455c 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1275,38 +1275,20 @@ static void bump_file_max_and_nr_open(void) { #endif #if BUMP_PROC_SYS_FS_NR_OPEN - int v = INT_MAX; + /* The kernel enforces maximum and minimum values on the fs.nr_open, but they are not directly + * exposed, but hardcoded in fs/file.c. Hopefully, these values will not be changed, but not sure. + * Let's first try the hardcoded maximum value, and if it does not work, try the half of it. */ - /* Argh! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know - * what they are. The expression by which the maximum is determined is dependent on the architecture, - * and is something we don't really want to copy to userspace, as it is dependent on implementation - * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only try - * and hope. Hence, let's start with INT_MAX, and then keep halving the value until we find one that - * works. Ugly? Yes, absolutely, but kernel APIs are kernel APIs, so what do can we do... 🤯 */ - - for (;;) { - int k; - - v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */ - if (v < 1024) { - log_warning("Can't bump fs.nr_open, value too small."); - break; - } - - k = read_nr_open(); - if (k < 0) { - log_error_errno(k, "Failed to read fs.nr_open: %m"); - break; - } + for (unsigned v = NR_OPEN_MAXIMUM; v >= NR_OPEN_MINIMUM; v /= 2) { + unsigned k = read_nr_open(); if (k >= v) { /* Already larger */ log_debug("Skipping bump, value is already larger."); break; } - r = sysctl_writef("fs/nr_open", "%i", v); + r = sysctl_writef("fs/nr_open", "%u", v); if (r == -EINVAL) { - log_debug("Couldn't write fs.nr_open as %i, halving it.", v); - v /= 2; + log_debug("Couldn't write fs.nr_open as %u, halving it.", v); continue; } if (r < 0) { @@ -1314,7 +1296,7 @@ static void bump_file_max_and_nr_open(void) { break; } - log_debug("Successfully bumped fs.nr_open to %i", v); + log_debug("Successfully bumped fs.nr_open to %u", v); break; } #endif @@ -1322,10 +1304,10 @@ static void bump_file_max_and_nr_open(void) { static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) { struct rlimit new_rlimit; - int r, nr; + int r; /* Get the underlying absolute limit the kernel enforces */ - nr = read_nr_open(); + unsigned nr = read_nr_open(); /* Calculate the new limits to use for us. Never lower from what we inherited. */ new_rlimit = (struct rlimit) { @@ -2690,14 +2672,8 @@ static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) { * (and thus use poll()/epoll instead of select(), the way everybody should) can * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit * we pass. */ - if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) { - int nr; - - /* Get the underlying absolute limit the kernel enforces */ - nr = read_nr_open(); - - rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); - } + if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) + rl->rlim_max = MIN((rlim_t) read_nr_open(), MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE)); /* If for some reason we were invoked with a soft limit above 1024 (which should never * happen!, but who knows what we get passed in from pam_limit when invoked as --user diff --git a/src/test/test-fd-util.c b/src/test/test-fd-util.c index 768a02f5961..621e611de3c 100644 --- a/src/test/test-fd-util.c +++ b/src/test/test-fd-util.c @@ -228,7 +228,7 @@ TEST(rearrange_stdio) { } TEST(read_nr_open) { - log_info("nr-open: %i", read_nr_open()); + log_info("nr-open: %u", read_nr_open()); } static size_t validate_fds( -- 2.47.3