git.ipfire.org Git - thirdparty/systemd.git/commitdiff
chase: Use openat2() if available
author Daan De Meyer <daan@amutable.com>
Mon, 11 May 2026 17:42:45 +0000 (19:42 +0200)
committer Daan De Meyer <daan.j.demeyer@gmail.com>
Tue, 12 May 2026 12:43:39 +0000 (14:43 +0200)
Let's make use of openat2() if we can in chaseat().

meson.build
src/basic/chase.c
src/include/override/fcntl.h
src/libc/fcntl.c [new file with mode: 0644]
src/libc/meson.build

index d6fbd7c2b7ea68e23833811c4d2326f0ddbe7697..d5405995db22cbcf99da32d607be21cfe15a701f 100644 (file)
@@ -593,6 +593,7 @@ foreach ident : [
         ['mount_setattr',     '''#include <sys/mount.h>'''],    # since glibc-2.36
         ['move_mount',        '''#include <sys/mount.h>'''],    # since glibc-2.36
         ['open_tree',         '''#include <sys/mount.h>'''],    # since glibc-2.36
+        ['openat2',           '''#include <fcntl.h>'''],        # since glibc-2.42
         ['pidfd_open',        '''#include <sys/pidfd.h>'''],    # since glibc-2.36
         ['pidfd_send_signal', '''#include <sys/pidfd.h>'''],    # since glibc-2.36
         ['pidfd_spawn',       '''#include <spawn.h>'''],        # since glibc-2.39
index b8fcbcb4e5fa8a02e7938579fc43e82a255ae23f..66c78fc8ddb5609dc7289d9f80b73d2707b9b41a 100644 (file)
@@ -50,6 +50,93 @@ static int chase_statx(int fd, struct statx *ret) {
                         ret);
 }
 
+static int chase_openat2(int root_fd, int dir_fd, const char *path, ChaseFlags chase_flags) {
+        /* Fast path: resolve and open the chase target with a single openat2() call. The ChaseFlags
+         * that have a direct equivalent are mapped onto O_* and RESOLVE_* flags; CHASE_MUST_BE_REGULAR
+         * and CHASE_MUST_BE_SOCKET are checked on the resulting fd afterwards.
+         *
+         * Returns the new O_PATH fd on success. Returns -EOPNOTSUPP whenever the caller should use the
+         * regular chase loop instead: the request cannot be expressed via openat2(), the syscall is
+         * missing or filtered (systemd's own seccomp filter answers with ENOSYS for exactly this
+         * purpose), or the kernel hit a transient condition. An ENOSYS verdict is remembered in a
+         * static variable so that later calls in the same process skip the syscall. Any other failure
+         * is passed up as a negative errno. */
+
+        static bool can_openat2 = true;
+
+        assert(path);
+        assert(dir_fd >= 0 || IN_SET(dir_fd, AT_FDCWD, XAT_FDROOT));
+
+        if (!can_openat2)
+                return -EOPNOTSUPP;
+
+        const bool in_root = root_fd != XAT_FDROOT;
+
+        /* Compared to the plain shortcut, openat2() additionally gives us a real root boundary
+         * (RESOLVE_IN_ROOT) and CHASE_PROHIBIT_SYMLINKS (RESOLVE_NO_SYMLINKS). Everything else in
+         * CHASE_NO_SHORTCUT_MASK is out of reach, automounts cannot be triggered through O_PATH fds,
+         * and RESOLVE_IN_ROOT only works if the directory fd is the root itself. In all those cases
+         * leave the job to the regular chase loop. */
+        if ((chase_flags & (CHASE_NO_SHORTCUT_MASK & ~CHASE_PROHIBIT_SYMLINKS)) != 0 ||
+            FLAGS_SET(chase_flags, CHASE_TRIGGER_AUTOFS) ||
+            (in_root && root_fd != dir_fd))
+                return -EOPNOTSUPP;
+
+        /* XAT_FDROOT is our own placeholder and must not reach the kernel: for absolute paths any
+         * directory fd will do, for relative ones open "/" ourselves and start from there. */
+        _cleanup_close_ int slash_fd = -EBADF;
+        if (dir_fd == XAT_FDROOT && path_is_absolute(path))
+                dir_fd = AT_FDCWD;
+        else if (dir_fd == XAT_FDROOT) {
+                slash_fd = open("/", O_CLOEXEC|O_DIRECTORY|O_PATH);
+                if (slash_fd < 0)
+                        return -errno;
+
+                dir_fd = slash_fd;
+        }
+
+        struct open_how how = {
+                .flags = O_PATH|O_CLOEXEC|
+                         (FLAGS_SET(chase_flags, CHASE_NOFOLLOW) ? O_NOFOLLOW : 0)|
+                         (FLAGS_SET(chase_flags, CHASE_MUST_BE_DIRECTORY) ? O_DIRECTORY : 0),
+                .resolve = (in_root ? RESOLVE_IN_ROOT : 0)|
+                           (FLAGS_SET(chase_flags, CHASE_PROHIBIT_SYMLINKS) ? RESOLVE_NO_SYMLINKS : 0),
+        };
+
+        _cleanup_close_ int fd = openat2(dir_fd, path, &how, sizeof(how));
+        if (fd < 0) {
+                int saved_errno = errno;
+
+                /* ENOSYS: the kernel is too old or a seccomp filter (such as systemd's) blocks the
+                 * call. This is a property of the environment, hence cache it. */
+                if (saved_errno == ENOSYS) {
+                        can_openat2 = false;
+                        return -EOPNOTSUPP;
+                }
+
+                /* EPERM: some container runtimes' seccomp profiles answer with EPERM instead of
+                 * ENOSYS. It may just as well mean that a path component is inaccessible though,
+                 * so it cannot be cached; simply let the userspace chase logic have a go.
+                 *
+                 * EAGAIN: with RESOLVE_IN_ROOT the kernel returns this if the global mount_lock or
+                 * rename_lock seqcount changed while a ".." component was being processed
+                 * (typically reached via a symlink like /etc/os-release → ../usr/lib/os-release).
+                 * Mount activity anywhere on the system bumps mount_lock, so this fires reliably
+                 * while a mount tree is still being set up. This is a per-call condition rather
+                 * than a kernel/sandbox capability, hence it is not cached either; the regular
+                 * chase loop handles root boundaries without openat2(). */
+                if (saved_errno == EPERM || saved_errno == EAGAIN)
+                        return -EOPNOTSUPP;
+
+                return -saved_errno;
+        }
+
+        /* The inode type checks below are done on the opened fd, after the fact. */
+        if (FLAGS_SET(chase_flags, CHASE_MUST_BE_REGULAR)) {
+                int r = fd_verify_regular(fd);
+                if (r < 0)
+                        return r;
+        }
+
+        if (FLAGS_SET(chase_flags, CHASE_MUST_BE_SOCKET)) {
+                int r = fd_verify_socket(fd);
+                if (r < 0)
+                        return r;
+        }
+
+        return TAKE_FD(fd);
+}
+
 static int chase_xopenat(int dir_fd, const char *path, ChaseFlags chase_flags, int open_flags, XOpenFlags xopen_flags) {
         /* Wrapper around xopenat_full() that translates CHASE_NOFOLLOW, CHASE_MUST_BE_* and
          * CHASE_TRIGGER_AUTOFS into their xopenat_full() counterparts. Used by shortcuts that want to open
@@ -262,6 +349,20 @@ int chaseat(int root_fd, int dir_fd, const char *path, ChaseFlags flags, char **
         if (FLAGS_SET(flags, CHASE_MUST_BE_DIRECTORY) + FLAGS_SET(flags, CHASE_MUST_BE_REGULAR) + FLAGS_SET(flags, CHASE_MUST_BE_SOCKET) > 1)
                 return -EBADSLT;
 
+        if (!ret_path) {
+                r = chase_openat2(root_fd, dir_fd, path, flags);
+                if (r >= 0) {
+                        if (ret_fd)
+                                *ret_fd = r;
+                        else
+                                safe_close(r);
+
+                        return 1;
+                }
+                if (r != -EOPNOTSUPP)
+                        return r;
+        }
+
         if (root_fd == XAT_FDROOT && !ret_path && (flags & CHASE_NO_SHORTCUT_MASK) == 0) {
                 /* Shortcut the common case where we don't have a real root boundary and no fancy features
                  * are requested: open the target directly via xopenat_full() which applies any MUST_BE_*
index b41f364534174dc5abe51d2c57c7e21eb1c44568..a6f2afe53b8108e6913f24a45eb587d56a296adb 100644 (file)
@@ -2,6 +2,8 @@
 #pragma once
 
 #include_next <fcntl.h>         /* IWYU pragma: export */
+#include <linux/openat2.h>      /* IWYU pragma: export */
+#include <stddef.h>
 
 /* This is defined since glibc-2.41. */
 #ifndef F_DUPFD_QUERY
 #ifndef AT_HANDLE_MNT_ID_UNIQUE
 #define AT_HANDLE_MNT_ID_UNIQUE 0x001  /* Return the u64 unique mount ID. */
 #endif
+
+/* openat2() is defined since glibc-2.42.
+ * Supported since kernel v5.6 (fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179).
+ *
+ * If the build did not detect openat2() in libc (HAVE_OPENAT2 is false), declare our own
+ * missing_openat2() and redirect the openat2 name to it, so that code including this header can
+ * call openat2() either way. */
+#if !HAVE_OPENAT2
+int missing_openat2(int dfd, const char *filename, const struct open_how *how, size_t usize);
+#  define openat2 missing_openat2
+#endif
diff --git a/src/libc/fcntl.c b/src/libc/fcntl.c
new file mode 100644 (file)
index 0000000..ca5e6c7
--- /dev/null
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#if !HAVE_OPENAT2
+/* Stand-in for openat2() when libc does not provide it: issues the raw system call with the arguments
+ * passed through unchanged and returns whatever syscall() reports. */
+int missing_openat2(int dfd, const char *filename, const struct open_how *how, size_t usize) {
+        long ret = syscall(__NR_openat2, dfd, filename, how, usize);
+
+        return (int) ret;
+}
+#endif
index 3b7b96d07f219e06e58c62f3a1268be87bed87ec..e100a6f4ea06dce3f6b2243904826bcd9d433109 100644 (file)
@@ -2,6 +2,7 @@
 
 libc_wrapper_sources = files(
         'bpf.c',
+        'fcntl.c',
         'ioprio.c',
         'kcmp.c',
         'kexec.c',