From: Daan De Meyer Date: Mon, 11 May 2026 17:42:45 +0000 (+0200) Subject: chase: Use openat2() if available X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b81e3b751b60bb63fec28f65306e62ffa48cee7;p=thirdparty%2Fsystemd.git chase: Use openat2() if available Let's make use of openat2() if we can in chaseat(). --- diff --git a/meson.build b/meson.build index d6fbd7c2b7e..d5405995db2 100644 --- a/meson.build +++ b/meson.build @@ -593,6 +593,7 @@ foreach ident : [ ['mount_setattr', '''#include '''], # since glibc-2.36 ['move_mount', '''#include '''], # since glibc-2.36 ['open_tree', '''#include '''], # since glibc-2.36 + ['openat2', '''#include '''], # since glibc-2.42 ['pidfd_open', '''#include '''], # since glibc-2.36 ['pidfd_send_signal', '''#include '''], # since glibc-2.36 ['pidfd_spawn', '''#include '''], # since glibc-2.39 diff --git a/src/basic/chase.c b/src/basic/chase.c index b8fcbcb4e5f..66c78fc8ddb 100644 --- a/src/basic/chase.c +++ b/src/basic/chase.c @@ -50,6 +50,93 @@ static int chase_statx(int fd, struct statx *ret) { ret); } +static int chase_openat2(int root_fd, int dir_fd, const char *path, ChaseFlags chase_flags) { + /* Open the target of a chase operation via openat2(), translating the relevant ChaseFlags into + * RESOLVE_* and O_* flags and verifying MUST_BE_REGULAR/SOCKET via fstat after the open. Returns + * -EOPNOTSUPP when openat2() is unavailable (older kernels) or blocked by a seccomp filter + * (notably systemd's own filter, which returns ENOSYS to force programs onto the openat() + * fallback path) — the verdict is cached so subsequent calls in the same process skip the syscall + * entirely. */ + + static bool can_openat2 = true; + int r; + + assert(path); + assert(dir_fd >= 0 || IN_SET(dir_fd, AT_FDCWD, XAT_FDROOT)); + + if (!can_openat2) + return -EOPNOTSUPP; + + /* openat2() can handle everything the regular shortcut handles, plus a real root boundary (via + * RESOLVE_IN_ROOT) and CHASE_PROHIBIT_SYMLINKS (via RESOLVE_NO_SYMLINKS). It cannot model the other + * CHASE_NO_SHORTCUT flags, cannot trigger automounts on O_PATH fds, and RESOLVE_IN_ROOT requires + * the dirfd to be the root. Bail out so the caller falls back to the regular chase loop. */ + if ((chase_flags & (CHASE_NO_SHORTCUT_MASK & ~CHASE_PROHIBIT_SYMLINKS)) != 0) + return -EOPNOTSUPP; + if (FLAGS_SET(chase_flags, CHASE_TRIGGER_AUTOFS)) + return -EOPNOTSUPP; + if (root_fd != XAT_FDROOT && root_fd != dir_fd) + return -EOPNOTSUPP; + + _cleanup_close_ int dir_fd_local = -EBADF; + if (dir_fd == XAT_FDROOT) { + if (path_is_absolute(path)) + dir_fd = AT_FDCWD; + else { + dir_fd_local = open("/", O_CLOEXEC|O_DIRECTORY|O_PATH); + if (dir_fd_local < 0) + return -errno; + dir_fd = dir_fd_local; + } + } + + struct open_how how = { + .flags = O_PATH|O_CLOEXEC, + }; + if (FLAGS_SET(chase_flags, CHASE_NOFOLLOW)) + how.flags |= O_NOFOLLOW; + if (FLAGS_SET(chase_flags, CHASE_MUST_BE_DIRECTORY)) + how.flags |= O_DIRECTORY; + if (root_fd != XAT_FDROOT) + how.resolve |= RESOLVE_IN_ROOT; + if (FLAGS_SET(chase_flags, CHASE_PROHIBIT_SYMLINKS)) + how.resolve |= RESOLVE_NO_SYMLINKS; + + _cleanup_close_ int fd = openat2(dir_fd, path, &how, sizeof(how)); + if (fd < 0) { + /* ENOSYS: kernel too old or seccomp filter (systemd's filter returns ENOSYS). + * EPERM: Some seccomp profiles of container runtimes use EPERM rather than ENOSYS. + * But EPERM might also be returned because we can't access some component of the path. So + * we can't cache the result and skip using openat2() if it is blocked with EPERM. Instead + * we fall back to userspace chase() if we get EPERM. + * EAGAIN: with RESOLVE_IN_ROOT the kernel returns this when a ".." component + * (typically from following a symlink like /etc/os-release → ../usr/lib/os-release) + * is processed and the global mount_lock or rename_lock seqcount changed during + * the walk. Any mount activity anywhere in the system bumps mount_lock, so this + * fires reliably while we're still setting up a mount tree. Fall back to the + * regular chase loop, which handles root boundaries without openat2(). Don't + * cache this — the condition is per-call, not a kernel/sandbox capability. */ + if (errno == ENOSYS) + can_openat2 = false; + if (IN_SET(errno, ENOSYS, EPERM, EAGAIN)) + return -EOPNOTSUPP; + return -errno; + } + + if (FLAGS_SET(chase_flags, CHASE_MUST_BE_REGULAR)) { + r = fd_verify_regular(fd); + if (r < 0) + return r; + } + if (FLAGS_SET(chase_flags, CHASE_MUST_BE_SOCKET)) { + r = fd_verify_socket(fd); + if (r < 0) + return r; + } + + return TAKE_FD(fd); +} + static int chase_xopenat(int dir_fd, const char *path, ChaseFlags chase_flags, int open_flags, XOpenFlags xopen_flags) { /* Wrapper around xopenat_full() that translates CHASE_NOFOLLOW, CHASE_MUST_BE_* and * CHASE_TRIGGER_AUTOFS into their xopenat_full() counterparts. Used by shortcuts that want to open @@ -262,6 +349,20 @@ int chaseat(int root_fd, int dir_fd, const char *path, ChaseFlags flags, char ** if (FLAGS_SET(flags, CHASE_MUST_BE_DIRECTORY) + FLAGS_SET(flags, CHASE_MUST_BE_REGULAR) + FLAGS_SET(flags, CHASE_MUST_BE_SOCKET) > 1) return -EBADSLT; + if (!ret_path) { + r = chase_openat2(root_fd, dir_fd, path, flags); + if (r >= 0) { + if (ret_fd) + *ret_fd = r; + else + safe_close(r); + + return 1; + } + if (r != -EOPNOTSUPP) + return r; + } + if (root_fd == XAT_FDROOT && !ret_path && (flags & CHASE_NO_SHORTCUT_MASK) == 0) { /* Shortcut the common case where we don't have a real root boundary and no fancy features * are requested: open the target directly via xopenat_full() which applies any MUST_BE_* diff --git a/src/include/override/fcntl.h b/src/include/override/fcntl.h index b41f3645341..a6f2afe53b8 100644 --- a/src/include/override/fcntl.h +++ b/src/include/override/fcntl.h @@ -2,6 +2,8 @@ #pragma once #include_next /* IWYU pragma: export */ +#include /* IWYU pragma: export */ +#include /* This is defined since glibc-2.41. */ #ifndef F_DUPFD_QUERY @@ -22,3 +24,10 @@ #ifndef AT_HANDLE_MNT_ID_UNIQUE #define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */ #endif + +/* Defined since glibc-2.42. + * Supported since kernel v5.6 (fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179). */ +#if !HAVE_OPENAT2 +int missing_openat2(int dfd, const char *filename, const struct open_how *how, size_t usize); +# define openat2 missing_openat2 +#endif diff --git a/src/libc/fcntl.c b/src/libc/fcntl.c new file mode 100644 index 00000000000..ca5e6c7977c --- /dev/null +++ b/src/libc/fcntl.c @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include + +#if !HAVE_OPENAT2 +int missing_openat2(int dfd, const char *filename, const struct open_how *how, size_t usize) { + return syscall(__NR_openat2, dfd, filename, how, usize); +} +#endif diff --git a/src/libc/meson.build b/src/libc/meson.build index 3b7b96d07f2..e100a6f4ea0 100644 --- a/src/libc/meson.build +++ b/src/libc/meson.build @@ -2,6 +2,7 @@ libc_wrapper_sources = files( 'bpf.c', + 'fcntl.c', 'ioprio.c', 'kcmp.c', 'kexec.c',