]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
nsresourced: add new daemon for granting clients user namespaces and assigning resour...
authorLennart Poettering <lennart@poettering.net>
Thu, 20 Apr 2023 17:07:33 +0000 (19:07 +0200)
committerLennart Poettering <lennart@poettering.net>
Sat, 6 Apr 2024 14:08:24 +0000 (16:08 +0200)
This adds a small, socket-activated Varlink daemon that can delegate UID
ranges for user namespaces to clients asking for it.

The primary call is AllocateUserRange() where the user passes in an
uninitialized userns fd, which is then set up.

There are other calls that allow assigning a mount fd to a userns
allocated that way, to set up permissions for a cgroup subtree, and to
allocate a veth for such a user namespace.

Since the UID assignments are supposed to be transient, i.e. not
permanent, care is taken to ensure that users cannot create inodes owned
by these UIDs, so that persistency cannot be acquired. This is
implemented via a BPF-LSM module that ensures that any member of a
userns allocated that way cannot create files unless the mount it
operates on is owned by the userns itself, or is explicitly
allowlisted.

BPF LSM program with contributions from Alexei Starovoitov.

27 files changed:
man/rules/meson.build
man/systemd-nsresourced.service.xml [new file with mode: 0644]
meson.build
meson_options.txt
network/80-namespace-ns.network [new file with mode: 0644]
network/meson.build
presets/90-systemd.preset
src/nsresourced/bpf/userns_restrict/meson.build [new file with mode: 0644]
src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h [new file with mode: 0644]
src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c [new file with mode: 0644]
src/nsresourced/meson.build [new file with mode: 0644]
src/nsresourced/nsresourced-manager.c [new file with mode: 0644]
src/nsresourced/nsresourced-manager.h [new file with mode: 0644]
src/nsresourced/nsresourced.c [new file with mode: 0644]
src/nsresourced/nsresourcework.c [new file with mode: 0644]
src/nsresourced/test-userns-restrict.c [new file with mode: 0644]
src/nsresourced/userns-registry.c [new file with mode: 0644]
src/nsresourced/userns-registry.h [new file with mode: 0644]
src/nsresourced/userns-restrict.c [new file with mode: 0644]
src/nsresourced/userns-restrict.h [new file with mode: 0644]
src/shared/meson.build
src/shared/varlink-io.systemd.NamespaceResource.c [new file with mode: 0644]
src/shared/varlink-io.systemd.NamespaceResource.h [new file with mode: 0644]
src/test/test-varlink-idl.c
units/meson.build
units/systemd-nsresourced.service.in [new file with mode: 0644]
units/systemd-nsresourced.socket [new file with mode: 0644]

index 1ca5b105b33d0f940f6210b1f24b4b468ea044e1..e6c0ac9b52cda0ba1d31c7010114466dbb17eb4c 100644 (file)
@@ -1007,6 +1007,10 @@ manpages = [
  ['systemd-networkd.service', '8', ['systemd-networkd'], 'ENABLE_NETWORKD'],
  ['systemd-notify', '1', [], ''],
  ['systemd-nspawn', '1', [], ''],
+ ['systemd-nsresourced.service',
+  '8',
+  ['systemd-nsresourced'],
+  'ENABLE_NSRESOURCED'],
  ['systemd-oomd.service', '8', ['systemd-oomd'], 'ENABLE_OOMD'],
  ['systemd-path', '1', [], ''],
  ['systemd-pcrlock',
diff --git a/man/systemd-nsresourced.service.xml b/man/systemd-nsresourced.service.xml
new file mode 100644 (file)
index 0000000..d0a561e
--- /dev/null
@@ -0,0 +1,81 @@
+<?xml version='1.0'?> <!--*-nxml-*-->
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+  "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
+
+<refentry id="systemd-nsresourced.service" conditional='ENABLE_NSRESOURCED'>
+
+  <refentryinfo>
+    <title>systemd-nsresourced.service</title>
+    <productname>systemd</productname>
+  </refentryinfo>
+
+  <refmeta>
+    <refentrytitle>systemd-nsresourced.service</refentrytitle>
+    <manvolnum>8</manvolnum>
+  </refmeta>
+
+  <refnamediv>
+    <refname>systemd-nsresourced.service</refname>
+    <refname>systemd-nsresourced</refname>
+    <refpurpose>User Namespace Resource Delegation Service</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <para><filename>systemd-nsresourced.service</filename></para>
+    <para><filename>/usr/lib/systemd/systemd-nsresourced</filename></para>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>Description</title>
+
+    <para><command>systemd-nsresourced</command> is a system service that permits transient delegation of a
+    UID/GID range to a user namespace (see <citerefentry
+    project='man-pages'><refentrytitle>user_namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>)
+    allocated by a client, via a Varlink IPC API.</para>
+
+    <para>Unprivileged clients may allocate a user namespace, and then request a UID/GID range to be assigned
+    to it via this service. The user namespace may then be used to run containers and other sandboxes, and/or
+    apply it to an id-mapped mount.</para>
+
+    <para>Allocations of UIDs/GIDs this way are transient: when a user namespace goes away, its UID/GID range
+    is returned to the pool of available ranges. In order to ensure that clients cannot gain persistency in
+    their transient UID/GID range a BPF-LSM based policy is enforced that ensures that user namespaces set up
+    this way can only write to file systems they allocate themselves or that are explicitly allowlisted via
+    <command>systemd-nsresourced</command>.</para>
+
+    <para><command>systemd-nsresourced</command> automatically ensures that any registered UID ranges show up
+    in the system's NSS database via the <ulink url="https://systemd.io/USER_GROUP_API">User/Group Record
+    Lookup API via Varlink</ulink>.</para>
+
+    <para>Currently, only UID/GID ranges consisting of either exactly 1 or exactly 65536 UIDs/GIDs can be
+    registered with this service. Moreover, UIDs and GIDs are always allocated together, and
+    symmetrically.</para>
+
+    <para>The service provides API calls to allowlist mounts (referenced via their mount file descriptors as
+    per Linux <function>fsmount()</function> API), to pass ownership of a cgroup subtree to the user
+    namespace and to delegate a virtual Ethernet device pair to the user namespace. When used in combination
+    this is sufficient to implement fully unprivileged container environments, as implemented by
+    <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>, fully
+    unprivileged <varname>RootImage=</varname> (see
+    <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>) or
+    fully unprivileged disk image tools such as
+    <citerefentry><refentrytitle>systemd-dissect</refentrytitle><manvolnum>1</manvolnum></citerefentry>.</para>
+
+    <para>This service provides one <ulink url="https://varlink.org/">Varlink</ulink> service:
+    <constant>io.systemd.NamespaceResource</constant> allows registering user namespaces, and assigning mounts,
+    cgroups and network interfaces to them.</para>
+  </refsect1>
+
+  <refsect1>
+    <title>See Also</title>
+    <para>
+      <citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
+      <citerefentry><refentrytitle>systemd-mountfsd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>,
+      <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
+      <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>,
+      <citerefentry><refentrytitle>systemd-dissect</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
+      <citerefentry project='man-pages'><refentrytitle>user_namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>
+    </para>
+  </refsect1>
+</refentry>
index f48ba6bcd7b37f085b7c1aa0a83e1643c6128f2a..8fd6774799c1bd1082ed7b8ccf4cfb6d54fec675 100644 (file)
@@ -272,6 +272,7 @@ conf.set_quoted('SYSTEMD_TEST_DATA',                          testdata_dir)
 conf.set_quoted('SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH', bindir / 'systemd-tty-ask-password-agent')
 conf.set_quoted('SYSTEMD_UPDATE_HELPER_PATH',                 libexecdir / 'systemd-update-helper')
 conf.set_quoted('SYSTEMD_USERWORK_PATH',                      libexecdir / 'systemd-userwork')
+conf.set_quoted('SYSTEMD_NSRESOURCEWORK_PATH',                libexecdir / 'systemd-nsresourcework')
 conf.set_quoted('SYSTEMD_VERITYSETUP_PATH',                   libexecdir / 'systemd-veritysetup')
 conf.set_quoted('SYSTEM_CONFIG_UNIT_DIR',                     pkgsysconfdir / 'system')
 conf.set_quoted('SYSTEM_DATA_UNIT_DIR',                       systemunitdir)
@@ -1619,6 +1620,7 @@ foreach term : ['analyze',
                 'machined',
                 'networkd',
                 'nscd',
+                'nsresourced',
                 'nss-myhostname',
                 'nss-systemd',
                 'oomd',
@@ -2279,6 +2281,7 @@ subdir('src/mount')
 subdir('src/network')
 subdir('src/notify')
 subdir('src/nspawn')
+subdir('src/nsresourced')
 subdir('src/nss-myhostname')
 subdir('src/nss-mymachines')
 subdir('src/nss-resolve')
index e3a33244fdf8f8768574f9d07649030f81929fde..b75f0746006d9dda50b4969e478b6511b2f6f979 100644 (file)
@@ -140,6 +140,8 @@ option('remote', type : 'feature', deprecated : { 'true' : 'enabled', 'false' :
        description : 'support for "journal over the network"')
 option('create-log-dirs', type : 'boolean',
        description : 'create /var/log/journal{,/remote}')
+option('nsresourced', type : 'boolean',
+       description : 'install the systemd-nsresourced stack')
 option('nss-myhostname', type : 'boolean',
        description : 'install nss-myhostname module')
 option('nss-mymachines', type : 'feature', deprecated : { 'true' : 'enabled', 'false' : 'disabled' },
diff --git a/network/80-namespace-ns.network b/network/80-namespace-ns.network
new file mode 100644 (file)
index 0000000..8a84de9
--- /dev/null
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: MIT-0
+#
+# This config file is installed as part of systemd.
+# It may be freely copied and edited (following the MIT No Attribution license).
+#
+# To make local modifications, one of the following methods may be used:
+# 1. add a drop-in file that extends this file by creating the
+#    /etc/systemd/network/80-namespace-ns.network.d/ directory and creating a
+#    new .conf file there.
+# 2. copy this file into /etc/systemd/network or one of the other paths checked
+#    by systemd-networkd and edit it there.
+# This file should not be edited in place, because it'll be overwritten on upgrades.
+
+# This network file matches the host-side of the virtual Ethernet link
+# created by systemd-nsresourced's network support. See
+# systemd-nsresourced.service(8) for details.
+
+[Match]
+Kind=veth
+Name=ns-*
+
+[Network]
+# Default to using a /28 prefix, giving up to 13 addresses per namespace
+Address=0.0.0.0/28
+LinkLocalAddressing=yes
+DHCPServer=yes
+IPMasquerade=both
+LLDP=yes
+EmitLLDP=customer-bridge
+IPv6AcceptRA=no
+IPv6SendRA=yes
index 9df0bea76096d79971edd0bcba1b5a2e47657176..a1a87272f7ed8fc6874fcefea55dd2067a462658 100644 (file)
@@ -11,6 +11,7 @@ if conf.get('ENABLE_NETWORKD') == 1
                 '80-container-ve.link',
                 '80-container-vz.network',
                 '80-container-vz.link',
+                '80-namespace-ns.network',
                 '80-vm-vt.network',
                 '80-vm-vt.link',
                 '80-wifi-adhoc.network',
index 5f1c5b072fabd66d374a54e1ccb0d044bc5c075d..f896ade2a4208c65005707a9e5d626e0caa02497 100644 (file)
@@ -24,8 +24,9 @@ enable systemd-homed-activate.service
 enable systemd-homed-firstboot.service
 enable systemd-journald-audit.socket
 enable systemd-network-generator.service
-enable systemd-networkd.service
 enable systemd-networkd-wait-online.service
+enable systemd-networkd.service
+enable systemd-nsresourced.socket
 enable systemd-pstore.service
 enable systemd-resolved.service
 enable systemd-sysext.service
diff --git a/src/nsresourced/bpf/userns_restrict/meson.build b/src/nsresourced/bpf/userns_restrict/meson.build
new file mode 100644 (file)
index 0000000..d773c75
--- /dev/null
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+if conf.get('HAVE_VMLINUX_H') != 1
+        subdir_done()
+endif
+
+userns_restrict_bpf_o_unstripped = custom_target(
+        'userns-restrict.bpf.unstripped.o',
+        input : 'userns-restrict.bpf.c',
+        output : 'userns-restrict.bpf.unstripped.o',
+        command : bpf_o_unstripped_cmd,
+        depends : vmlinux_h_dependency)
+
+userns_restrict_bpf_o = custom_target(
+        'userns-restrict.bpf.o',
+        input : userns_restrict_bpf_o_unstripped,
+        output : 'userns-restrict.bpf.o',
+        command : bpf_o_cmd)
+
+userns_restrict_skel_h = custom_target(
+        'userns-restrict.skel.h',
+        input : userns_restrict_bpf_o,
+        output : 'userns-restrict.skel.h',
+        command : skel_h_cmd,
+        capture : true)
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h b/src/nsresourced/bpf/userns_restrict/userns-restrict-skel.h
new file mode 100644 (file)
index 0000000..271caf4
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+#include "bpf-dlopen.h"
+
+/* libbpf is used via dlopen(), so rename symbols */
+#define bpf_object__attach_skeleton sym_bpf_object__attach_skeleton
+#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton
+#define bpf_object__load_skeleton sym_bpf_object__load_skeleton
+#define bpf_object__open_skeleton sym_bpf_object__open_skeleton
+
+#include "bpf/userns_restrict/userns-restrict.skel.h"
diff --git a/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns_restrict/userns-restrict.bpf.c
new file mode 100644 (file)
index 0000000..956f917
--- /dev/null
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* If offsetof() is implemented via __builtin_offsetof() then it doesn't work on current compilers, since the
+ * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to define
+ * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0
+ * it does not.) */
+#undef offsetof
+#undef container_of
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <errno.h>
+
+void *bpf_rdonly_cast(void *, __u32) __ksym;
+
+/* BPF module that implements an allowlist of mounts (identified by mount ID) for user namespaces (identified
+ * by their inode number in nsfs) that restricts creation of inodes (which would inherit the callers UID/GID)
+ * or changing of ownership (similar).
+ *
+ * This hooks into the various path-based LSM entrypoints that control inode creation as well as chown(), and
+ * then looks up the calling process' user namespace in a global map of namespaces, which points us to
+ * another map that is simply a list of allowed mnt_ids. */
+
+// FIXME: ACL adjustments are currently not blocked. There's no path-based LSM hook available in the kernel
+// for setting xattrs or ACLs, hence we cannot easily block them, even though we want that. We can get away
+// with ignoring this for now, as ACLs never define ownership, but purely access: i.e. ACLs never allow
+// taking possession of an object, but only control access to it. Thus, things like suid access modes should
+// not be reachable through it. It still sucks though that a user can persistently add an ACL entry to a file
+// with their transient UIDs/GIDs.
+
+/* kernel currently enforces a maximum usernamespace nesting depth of 32, see create_user_ns() in the kernel sources */
+#define USER_NAMESPACE_DEPTH_MAX 32U
+
+struct mnt_id_map {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 1);        /* placeholder, configured otherwise by nsresourced */
+        __type(key, int);
+        __type(value, int);
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+        __uint(max_entries, 1);        /* placeholder, configured otherwise by nsresourced */
+        __type(key, unsigned);         /* userns inode */
+        __array(values, struct mnt_id_map);
+} userns_mnt_id_hash SEC(".maps");
+
+struct {
+        __uint(type, BPF_MAP_TYPE_RINGBUF);
+        __uint(max_entries, 4096);
+} userns_ringbuf SEC(".maps");
+
+static inline struct mount *real_mount(struct vfsmount *mnt) {
+        return container_of(mnt, struct mount, mnt);
+}
+
+static int validate_inode_on_mount(struct inode *inode, struct vfsmount *v) {
+        struct user_namespace *mount_userns, *task_userns, *p;
+        unsigned task_userns_inode;
+        struct task_struct *task;
+        void *mnt_id_map;
+        struct mount *m;
+        int mnt_id;
+
+        /* Get user namespace from vfsmount */
+        m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
+        mount_userns = m->mnt_ns->user_ns;
+
+        /* Get user namespace from task */
+        task = (struct task_struct*) bpf_get_current_task_btf();
+        task_userns = task->cred->user_ns;
+
+        /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+         * yes immediately. */
+        p = mount_userns;
+        for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
+                if (p == task_userns)
+                        return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */
+
+                p = p->parent;
+                if (!p)
+                        break;
+        }
+
+        /* Hmm, something is fishy if there's more than 32 levels of namespaces involved. Let's better be
+         * safe than sorry, and refuse. */
+        if (p)
+                return -EPERM;
+
+        /* This is a mount foreign to our task's user namespace, let's consult our allow list */
+        task_userns_inode = task_userns->ns.inum;
+
+        mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode);
+        if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
+                return 0;
+
+        mnt_id = m->mnt_id;
+
+        /* Otherwise, say yes if the mount ID is allowlisted */
+        if (bpf_map_lookup_elem(mnt_id_map, &mnt_id))
+                return 0;
+
+        return -EPERM;
+}
+
+static int validate_path(const struct path *path, int ret) {
+        struct inode *inode;
+        struct vfsmount *v;
+
+        if (ret != 0) /* propagate earlier error */
+                return ret;
+
+        inode = path->dentry->d_inode;
+        v = path->mnt;
+
+        return validate_inode_on_mount(inode, v);
+}
+
+SEC("lsm/path_chown")
+int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) {
+        return validate_path(path, ret);
+}
+
+SEC("lsm/path_mkdir")
+int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) {
+        return validate_path(dir, ret);
+}
+
+SEC("lsm/path_mknod")
+int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev, int ret) {
+        return validate_path(dir, ret);
+}
+
+SEC("lsm/path_symlink")
+int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) {
+        return validate_path(dir, ret);
+}
+
+SEC("lsm/path_link")
+int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) {
+        return validate_path(new_dir, ret);
+}
+
+SEC("kprobe/free_user_ns")
+void BPF_KPROBE(userns_restrict_free_user_ns, struct work_struct *work) {
+        struct user_namespace *userns;
+        unsigned inode;
+        void *mnt_id_map;
+
+        /* Inform userspace that a user namespace just went away. I wish there was a nicer way to hook into
+         * user namespaces being deleted than using kprobes, but couldn't find any. */
+
+        userns = bpf_rdonly_cast(container_of(work, struct user_namespace, work),
+                                 bpf_core_type_id_kernel(struct user_namespace));
+
+        inode = userns->ns.inum;
+
+        mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &inode);
+        if (!mnt_id_map) /* No rules installed for this userns? Then send no notification. */
+                return;
+
+        bpf_ringbuf_output(&userns_ringbuf, &inode, sizeof(inode), 0);
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/nsresourced/meson.build b/src/nsresourced/meson.build
new file mode 100644 (file)
index 0000000..cb131f0
--- /dev/null
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+subdir('bpf/userns_restrict')
+
+systemd_nsresourcework_sources = files(
+        'nsresourcework.c',
+        'userns-restrict.c',
+        'userns-registry.c',
+)
+
+systemd_nsresourced_sources = files(
+        'nsresourced-manager.c',
+        'nsresourced.c',
+        'userns-restrict.c',
+        'userns-registry.c',
+)
+
+userns_restrict_include = include_directories('.')
+
+if conf.get('HAVE_VMLINUX_H') == 1
+        systemd_nsresourcework_sources += userns_restrict_skel_h
+        systemd_nsresourced_sources += userns_restrict_skel_h
+
+        executables += [
+                test_template + {
+                        'sources' : files('test-userns-restrict.c', 'userns-restrict.c') + userns_restrict_skel_h,
+                        'conditions' : ['ENABLE_NSRESOURCED', 'HAVE_VMLINUX_H'],
+                        'include_directories' : [ includes, userns_restrict_include ],
+                },
+        ]
+endif
+
+executables += [
+        libexec_template + {
+                'name' : 'systemd-nsresourcework',
+                'conditions' : ['ENABLE_NSRESOURCED'],
+                'sources' : systemd_nsresourcework_sources,
+                'dependencies' : threads,
+                'include_directories' : [ includes, userns_restrict_include ],
+        },
+        libexec_template + {
+                'name' : 'systemd-nsresourced',
+                'conditions' : ['ENABLE_NSRESOURCED'],
+                'sources' : systemd_nsresourced_sources,
+                'dependencies' : threads,
+                'include_directories' : [ includes, userns_restrict_include ],
+        },
+]
diff --git a/src/nsresourced/nsresourced-manager.c b/src/nsresourced/nsresourced-manager.c
new file mode 100644 (file)
index 0000000..d87da58
--- /dev/null
@@ -0,0 +1,647 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+
+#include "bpf-dlopen.h"
+#include "build-path.h"
+#include "common-signal.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "mkdir.h"
+#include "nsresourced-manager.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "set.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "umask-util.h"
+#include "unaligned.h"
+#include "user-util.h"
+#include "userns-registry.h"
+#include "userns-restrict.h"
+
+#define LISTEN_TIMEOUT_USEC (25 * USEC_PER_SEC)
+
+static int start_workers(Manager *m, bool explicit_request);
+
+static int on_worker_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        assert_se(!set_remove(m->workers_dynamic, s) != !set_remove(m->workers_fixed, s));
+        sd_event_source_disable_unref(s);
+
+        if (si->si_code == CLD_EXITED) {
+                if (si->si_status == EXIT_SUCCESS)
+                        log_debug("Worker " PID_FMT " exited successfully.", si->si_pid);
+                else
+                        log_warning("Worker " PID_FMT " died with a failure exit status %i, ignoring.", si->si_pid, si->si_status);
+        } else if (si->si_code == CLD_KILLED)
+                log_warning("Worker " PID_FMT " was killed by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+        else if (si->si_code == CLD_DUMPED)
+                log_warning("Worker " PID_FMT " dumped core by signal %s, ignoring.", si->si_pid, signal_to_string(si->si_status));
+        else
+                log_warning("Got unexpected exit code via SIGCHLD, ignoring.");
+
+        (void) start_workers(m, /* explicit_request= */ false); /* Fill up workers again if we fell below the low watermark */
+        return 0;
+}
+
+static int on_sigusr2(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        (void) start_workers(m, /* explicit_request=*/ true); /* Workers told us there's more work, let's add one more worker as long as we are below the high watermark */
+        return 0;
+}
+
+static int on_deferred_start_worker(sd_event_source *s, uint64_t usec, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source);
+
+        (void) start_workers(m, /* explicit_request=*/ false);
+        return 0;
+}
+
+DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+                event_source_hash_ops,
+                sd_event_source,
+                (void (*)(const sd_event_source*, struct siphash*)) trivial_hash_func,
+                (int (*)(const sd_event_source*, const sd_event_source*)) trivial_compare_func,
+                sd_event_source_disable_unref);
+
+int manager_new(Manager **ret) {
+        _cleanup_(manager_freep) Manager *m = NULL;
+        int r;
+
+        m = new(Manager, 1);
+        if (!m)
+                return -ENOMEM;
+
+        *m = (Manager) {
+                .listen_fd = -EBADF,
+                .worker_ratelimit = {
+                        .interval = 2 * USEC_PER_SEC,
+                        .burst = 250,
+                },
+                .registry_fd = -EBADF,
+        };
+
+        r = sd_event_new(&m->event);
+        if (r < 0)
+                return r;
+
+        r = sd_event_set_signal_exit(m->event, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_signal(m->event, NULL, (SIGRTMIN+18)|SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL);
+        if (r < 0)
+                return r;
+
+        r = sd_event_add_memory_pressure(m->event, NULL, NULL, NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed allocate memory pressure event source, ignoring: %m");
+
+        r = sd_event_set_watchdog(m->event, true);
+        if (r < 0)
+                log_debug_errno(r, "Failed to enable watchdog handling, ignoring: %m");
+
+        r = sd_event_add_signal(m->event, NULL, SIGUSR2|SD_EVENT_SIGNAL_PROCMASK, on_sigusr2, m);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(m);
+        return 0;
+}
+
+Manager* manager_free(Manager *m) {
+        if (!m)
+                return NULL;
+
+        set_free(m->workers_fixed);
+        set_free(m->workers_dynamic);
+
+        m->deferred_start_worker_event_source = sd_event_source_unref(m->deferred_start_worker_event_source);
+
+        safe_close(m->listen_fd);
+
+#if HAVE_VMLINUX_H
+        sd_event_source_disable_unref(m->userns_restrict_bpf_ring_buffer_event_source);
+        if (m->userns_restrict_bpf_ring_buffer)
+                sym_ring_buffer__free(m->userns_restrict_bpf_ring_buffer);
+        userns_restrict_bpf_free(m->userns_restrict_bpf);
+#endif
+
+        safe_close(m->registry_fd);
+
+        sd_event_unref(m->event);
+
+        return mfree(m);
+}
+
+static size_t manager_current_workers(Manager *m) {
+        assert(m);
+
+        return set_size(m->workers_fixed) + set_size(m->workers_dynamic);
+}
+
+static int start_one_worker(Manager *m) {
+        _cleanup_(sd_event_source_disable_unrefp) sd_event_source *source = NULL;
+        bool fixed;
+        pid_t pid;
+        int r;
+
+        assert(m);
+
+        fixed = set_size(m->workers_fixed) < NSRESOURCE_WORKERS_MIN;
+
+        r = safe_fork_full(
+                        "(sd-worker)",
+                        /* stdio_fds= */ NULL,
+                        &m->listen_fd, 1,
+                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_CLOSE_ALL_FDS,
+                        &pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork new worker child: %m");
+        if (r == 0) {
+                char pids[DECIMAL_STR_MAX(pid_t)];
+                /* Child */
+
+                if (m->listen_fd == 3) {
+                        r = fd_cloexec(3, false);
+                        if (r < 0) {
+                                log_error_errno(r, "Failed to turn off O_CLOEXEC for fd 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+                } else {
+                        if (dup2(m->listen_fd, 3) < 0) { /* dup2() creates with O_CLOEXEC off */
+                                log_error_errno(errno, "Failed to move listen fd to 3: %m");
+                                _exit(EXIT_FAILURE);
+                        }
+
+                        safe_close(m->listen_fd);
+                }
+
+                xsprintf(pids, PID_FMT, pid);
+                if (setenv("LISTEN_PID", pids, 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_PID: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("LISTEN_FDS", "1", 1) < 0) {
+                        log_error_errno(errno, "Failed to set $LISTEN_FDS: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                if (setenv("NSRESOURCE_FIXED_WORKER", one_zero(fixed), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $NSRESOURCE_FIXED_WORKER: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+#if HAVE_VMLINUX_H
+                bool supported = m->userns_restrict_bpf;
+#else
+                bool supported = false;
+#endif
+
+                /* Tell the workers whether to enable the userns API */
+                if (setenv("NSRESOURCE_API", one_zero(supported), 1) < 0) {
+                        log_error_errno(errno, "Failed to set $NSRESOURCE_API: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = setenv_systemd_log_level();
+                if (r < 0) {
+                        log_error_errno(r, "Failed to set $SYSTEMD_LOG_LEVEL: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                r = invoke_callout_binary(SYSTEMD_NSRESOURCEWORK_PATH, STRV_MAKE("systemd-nsresourcework", "xxxxxxxxxxxxxxxx")); /* With some extra space rename_process() can make use of */
+                log_error_errno(r, "Failed start worker process: %m");
+                _exit(EXIT_FAILURE);
+        }
+
+        r = sd_event_add_child(m->event, &source, pid, WEXITED, on_worker_exit, m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to watch child " PID_FMT ": %m", pid);
+
+        r = set_ensure_put(
+                        fixed ? &m->workers_fixed : &m->workers_dynamic,
+                        &event_source_hash_ops,
+                        source);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add child process to set: %m");
+
+        TAKE_PTR(source);
+
+        return 0;
+}
+
+/* Spawns worker processes until the pool is large enough, or until the spawn rate limit tells us to back
+ * off. If 'explicit_request' is true one additional worker beyond NSRESOURCE_WORKERS_MIN may be started, as
+ * long as NSRESOURCE_WORKERS_MAX is not exceeded. Returns 0 on success, negative errno-style error on
+ * failure. */
+static int start_workers(Manager *m, bool explicit_request) {
+        int r;
+
+        assert(m);
+
+        for (;;) {
+                size_t n_workers = manager_current_workers(m);
+
+                /* We want another worker while we are below the minimum, or, if one was asked for
+                 * explicitly, while we are below the maximum. */
+                bool want_more =
+                        n_workers < NSRESOURCE_WORKERS_MIN ||
+                        (explicit_request && n_workers < NSRESOURCE_WORKERS_MAX);
+                if (!want_more)
+                        return 0;
+
+                if (ratelimit_below(&m->worker_ratelimit)) {
+                        r = start_one_worker(m);
+                        if (r < 0)
+                                return r;
+
+                        /* The explicit request has been honoured by the worker we just started */
+                        explicit_request = false;
+                        continue;
+                }
+
+                /* We hit the rate limit. If not a single worker is around even though we keep starting
+                 * them, something is seriously wrong, hence take down the whole daemon. */
+                if (n_workers == 0) {
+                        sd_event_exit(m->event, EXIT_FAILURE);
+                        return log_error_errno(SYNTHETIC_ERRNO(EUCLEAN), "Worker threads requested too frequently, but worker count is zero, something is wrong.");
+                }
+
+                /* Otherwise pause spawning, and retry via a timer once the rate limit interval is over. */
+                log_warning("Worker threads requested too frequently, not starting new ones for a while.");
+
+                if (m->deferred_start_worker_event_source)
+                        return 0; /* Timer is already armed */
+
+                r = sd_event_add_time(
+                                m->event,
+                                &m->deferred_start_worker_event_source,
+                                CLOCK_MONOTONIC,
+                                ratelimit_end(&m->worker_ratelimit),
+                                /* accuracy_usec= */ 0,
+                                on_deferred_start_worker,
+                                m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate deferred start worker event source: %m");
+
+                return 0;
+        }
+}
+
+/* Removes the entry for the user namespace with the specified inode number from the BPF map. Does nothing
+ * if the inode number is zero, or if BPF support was disabled at compile time. Failures are logged and
+ * otherwise ignored. */
+static void manager_release_userns_bpf(Manager *m, uint64_t inode) {
+#if HAVE_VMLINUX_H
+        int k;
+
+        assert(m);
+
+        if (inode != 0) {
+                assert(m->userns_restrict_bpf);
+
+                k = userns_restrict_reset_by_inode(m->userns_restrict_bpf, inode);
+                if (k < 0)
+                        log_warning_errno(k, "Failed to remove namespace inode from BPF map, ignoring: %m");
+        }
+#endif
+}
+
+/* Sends an FDSTOREREMOVE=1 notification for the fd store entry named "userns-<inode>". Failures are logged
+ * and otherwise ignored. */
+static void manager_release_userns_fds(Manager *m, uint64_t inode) {
+        int k;
+
+        assert(m);
+        assert(inode != 0);
+
+        k = sd_notifyf(
+                        /* unset_environment= */ false,
+                        "FDSTOREREMOVE=1\n"
+                        "FDNAME=userns-%" PRIu64 "\n",
+                        inode);
+        if (k >= 0)
+                return;
+
+        log_warning_errno(k, "Failed to send fd store removal message, ignoring: %m");
+}
+
+/* Releases everything we track for the user namespace with the specified inode number: the BPF map entry,
+ * the fd store entry, and, if a registry record exists, its cgroups and the registry record itself. The
+ * registry is locked for the duration of the call. All failures are logged only, nothing is propagated to
+ * the caller. */
+static void manager_release_userns_by_inode(Manager *m, uint64_t inode) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_close_ int lock_fd = -EBADF;
+        int r;
+
+        assert(m);
+        assert(inode != 0);
+
+        lock_fd = userns_registry_lock(m->registry_fd);
+        if (lock_fd < 0)
+                return (void) log_error_errno(lock_fd, "Failed to lock registry: %m");
+
+        /* A missing registry record is not fatal: we still want to drop the BPF and fd store entries below,
+         * hence only log here and continue with userns_info == NULL. */
+        r = userns_registry_load_by_userns_inode(m->registry_fd, inode, &userns_info);
+        if (r < 0)
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
+                               "Failed to find userns for inode %" PRIu64 ", ignoring: %m", inode);
+
+        if (userns_info && uid_is_valid(userns_info->start))
+                log_debug("Removing user namespace mapping %" PRIu64 " for UID " UID_FMT ".", inode, userns_info->start);
+        else
+                log_debug("Removing user namespace mapping %" PRIu64 ".", inode);
+
+        /* Remove the BPF rules */
+        manager_release_userns_bpf(m, inode);
+
+        /* Remove the resources from the fdstore */
+        manager_release_userns_fds(m, inode);
+
+        /* And finally remove the resources file from disk */
+        if (userns_info) {
+                /* Remove the cgroups of this userns */
+                r = userns_info_remove_cgroups(userns_info);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to remove cgroups of user namespace: %m");
+
+                /* Include the error reason (%m), otherwise the errno passed in is silently dropped */
+                r = userns_registry_remove(m->registry_fd, userns_info);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to remove user namespace '%s', ignoring: %m", userns_info->name);
+        }
+}
+
+/* Enumerates the registry directory and collects the user namespace inode numbers encoded in entries named
+ * "i<inode>.userns" into the set *registry_inodes (allocated on demand). Entries with other names are
+ * skipped silently, entries with unparsable or out-of-range inode numbers are skipped with a warning.
+ * Returns 0 on success, negative errno-style error on failure. */
+static int manager_scan_registry(Manager *m, Set **registry_inodes) {
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        int r;
+
+        assert(m);
+        assert(registry_inodes);
+        assert(m->registry_fd >= 0);
+
+        r = readdir_all(m->registry_fd, RECURSE_DIR_IGNORE_DOT, &de);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enumerate registry: %m");
+
+        for (size_t i = 0; i < de->n_entries; i++) {
+                struct dirent *dentry = de->entries[i];
+                _cleanup_free_ char *u = NULL;
+                const char *e, *p;
+                uint64_t inode;
+
+                p = startswith(dentry->d_name, "i");
+                if (!p)
+                        continue;
+
+                e = endswith(p, ".userns");
+                if (!e)
+                        continue;
+
+                /* Extract the part between the "i" prefix and the ".userns" suffix */
+                u = strndup(p, e - p);
+                if (!u)
+                        return log_oom();
+
+                r = safe_atou64(u, &inode);
+                if (r < 0) {
+                        log_warning_errno(r, "Failed to parse userns inode number from '%s', skipping: %m", dentry->d_name);
+                        continue;
+                }
+
+                if (inode > UINT32_MAX) { /* namespace inode numbers are 32bit only right now */
+                        log_warning("userns inode number outside of 32bit range, skipping.");
+                        continue;
+                }
+
+                /* The inode number itself is stored as the set's pointer value */
+                if (set_ensure_put(registry_inodes, NULL, UINT32_TO_PTR(inode)) < 0)
+                        return log_oom();
+
+                log_debug("Found user namespace %" PRIu64 " in registry directory", inode);
+        }
+
+        return 0;
+}
+
+/* Creates the AF_UNIX/SOCK_STREAM listening socket at /run/systemd/io.systemd.NamespaceResource, unless
+ * m->listen_fd is already set, and symlinks it into /run/systemd/userdb/. Returns 1 if a new socket was
+ * set up, 0 if a socket was already there, negative errno-style error on failure. */
+static int manager_make_listen_socket(Manager *m) {
+        static const union sockaddr_union sockaddr = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "/run/systemd/io.systemd.NamespaceResource",
+        };
+        int r;
+
+        assert(m);
+
+        if (m->listen_fd >= 0)
+                return 0;
+
+        m->listen_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
+        if (m->listen_fd < 0)
+                return log_error_errno(errno, "Failed to allocate socket: %m");
+
+        /* Remove any stale socket inode, so that bind() below doesn't fail */
+        (void) sockaddr_un_unlink(&sockaddr.un);
+
+        /* Bind with a zero umask, so that the socket inode's access mode is not restricted by it */
+        WITH_UMASK(0000)
+                if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
+                        return log_error_errno(errno, "Failed to bind socket: %m");
+
+        r = mkdir_p("/run/systemd/userdb", 0755);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create /run/systemd/userdb: %m");
+
+        r = symlink_idempotent("../io.systemd.NamespaceResource", "/run/systemd/userdb/io.systemd.NamespaceResource", /* make_relative= */ false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to symlink userdb socket: %m");
+
+        if (listen(m->listen_fd, SOMAXCONN) < 0)
+                return log_error_errno(errno, "Failed to listen on socket: %m");
+
+        return 1;
+}
+
+/* Goes through the file descriptors passed in by the service manager. Fds named "userns-<inode>" are closed
+ * again, but their inode numbers are collected in *fdstore_inodes (allocated on demand). The first
+ * AF_UNIX/SOCK_STREAM listening socket found is taken over as m->listen_fd, a second one is refused. All
+ * other fds are closed with a warning. Returns 0 on success, negative errno-style error on failure. */
+static int manager_scan_listen_fds(Manager *m, Set **fdstore_inodes) {
+        _cleanup_strv_free_ char **names = NULL;
+        int n, r;
+
+        assert(m);
+        assert(fdstore_inodes);
+
+        n = sd_listen_fds_with_names(/* unset_environment= */ true, &names);
+        if (n < 0)
+                return log_error_errno(n, "Failed to determine number of passed file descriptors: %m");
+
+        for (int i = 0; i < n; i++) {
+                _cleanup_close_ int fd = SD_LISTEN_FDS_START + i; /* Take possession */
+                const char *e;
+
+                /* If this is a BPF allowlist related fd, just close it, but remember which userns inode it
+                 * belongs to */
+                e = startswith(names[i], "userns-");
+                if (e) {
+                        uint64_t inode;
+
+                        r = safe_atou64(e, &inode);
+                        if (r < 0) {
+                                log_warning_errno(r, "Failed to parse inode number from fd name '%s', ignoring: %m", names[i]);
+                                continue;
+                        }
+
+                        if (inode > UINT32_MAX) {
+                                log_warning("Inode number outside of 32bit range, ignoring");
+                                continue;
+                        }
+
+                        /* The inode number itself is stored as the set's pointer value */
+                        if (set_ensure_put(fdstore_inodes, NULL, UINT32_TO_PTR(inode)) < 0)
+                                return log_oom();
+
+                        continue;
+                }
+
+                /* We don't check the name for the stream socket, for compatibility with older versions */
+                r = sd_is_socket(fd, AF_UNIX, SOCK_STREAM, 1);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to detect if passed file descriptor is a socket: %m");
+                if (r > 0) {
+                        if (m->listen_fd >= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Passed more than one AF_UNIX/SOCK_STREAM socket, refusing.");
+
+                        m->listen_fd = TAKE_FD(fd);
+                        continue;
+                }
+
+                log_warning("Closing passed file descriptor %i (%s) we don't recognize.", fd, names[i]);
+        }
+
+        return 0;
+}
+
+#if HAVE_VMLINUX_H
+/* Ring buffer callback: the payload is treated as an array of native-endian "unsigned int" user namespace
+ * inode numbers, each of which is released. Returns -EIO if the payload size is not a multiple of
+ * sizeof(unsigned int), 0 otherwise. */
+static int ringbuf_event(void *userdata, void *data, size_t size) {
+        Manager *m = ASSERT_PTR(userdata);
+        const uint8_t *base = data;
+        size_t n_entries;
+
+        if (size % sizeof(unsigned int) != 0) /* Trailing partial entry? */
+                return -EIO;
+
+        n_entries = size / sizeof(unsigned int);
+
+        for (size_t idx = 0; idx < n_entries; idx++) {
+                /* The buffer might not be suitably aligned, hence read via the unaligned accessor */
+                uint64_t inode = unaligned_read_ne32(base + idx * sizeof(unsigned int));
+
+                log_debug("Got BPF ring buffer notification that user namespace %" PRIu64 " is now dead.", inode);
+                manager_release_userns_by_inode(m, inode);
+        }
+
+        return 0;
+}
+
+/* IO event handler for the ring buffer's poll fd: processes whatever is queued in the BPF ring buffer
+ * without blocking (timeout of zero). */
+static int on_ringbuf_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Manager *m = ASSERT_PTR(userdata);
+        int k;
+
+        k = sym_ring_buffer__poll(m->userns_restrict_bpf_ring_buffer, /* timeout_ms= */ 0);
+        if (k >= 0)
+                return 0;
+
+        return log_error_errno(k, "Got failure reading from BPF ring buffer: %m");
+}
+
+/* Installs the userns-restrict BPF program and hooks its ring buffer into the event loop. If the program
+ * cannot be installed we log a notice and return 0 with m->userns_restrict_bpf left unset. Failures while
+ * setting up the ring buffer are returned as negative errno-style errors. */
+static int manager_setup_bpf(Manager *m) {
+        int r;
+
+        assert(m);
+        assert(!m->userns_restrict_bpf);
+        assert(!m->userns_restrict_bpf_ring_buffer);
+        assert(!m->userns_restrict_bpf_ring_buffer_event_source);
+
+        r = userns_restrict_install(/* pin= */ true, &m->userns_restrict_bpf);
+        if (r < 0) {
+                log_notice_errno(r, "Proceeding with user namespace interfaces disabled.");
+                return 0;
+        }
+
+        /* Wrap the ring buffer map into a ring buffer object that calls ringbuf_event() for each record */
+        int map_fd = sym_bpf_map__fd(m->userns_restrict_bpf->maps.userns_ringbuf);
+        if (map_fd < 0)
+                return log_error_errno(map_fd, "Failed to get fd of ring buffer: %m");
+
+        m->userns_restrict_bpf_ring_buffer = sym_ring_buffer__new(map_fd, ringbuf_event, m, NULL);
+        if (!m->userns_restrict_bpf_ring_buffer)
+                return log_error_errno(errno, "Failed to allocate BPF ring buffer object: %m");
+
+        /* Watch the ring buffer's epoll fd from our event loop */
+        int epoll_fd = sym_ring_buffer__epoll_fd(m->userns_restrict_bpf_ring_buffer);
+        if (epoll_fd < 0)
+                return log_error_errno(epoll_fd, "Failed to get poll fd of ring buffer: %m");
+
+        r = sd_event_add_io(
+                        m->event,
+                        &m->userns_restrict_bpf_ring_buffer_event_source,
+                        epoll_fd,
+                        EPOLLIN,
+                        on_ringbuf_io,
+                        m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate event source for BPF ring buffer: %m");
+
+        return 0;
+}
+#else
+/* Variant used when HAVE_VMLINUX_H is not set: only logs that BPF is unavailable and reports success. */
+static int manager_setup_bpf(Manager *m) {
+        log_notice("Not setting up BPF subsystem, as functionality has been disabled at compile time.");
+        return 0;
+}
+#endif
+
+/* Brings the manager up: opens the registry, sets up BPF, takes over the fds passed in by the service
+ * manager, releases user namespaces that are in the fd store but no longer in the registry, makes sure a
+ * listening socket exists, and finally starts the initial set of workers. Must be called at most once on a
+ * freshly allocated Manager. Returns 0 on success, negative errno-style error on failure. */
+int manager_startup(Manager *m) {
+        _cleanup_(set_freep) Set *fdstore_inodes = NULL, *registry_inodes = NULL;
+        void *p;
+        int r;
+
+        assert(m);
+        assert(m->registry_fd < 0);
+        assert(m->listen_fd < 0);
+
+        m->registry_fd = userns_registry_open_fd();
+        if (m->registry_fd < 0)
+                return log_error_errno(m->registry_fd, "Failed to open registry directory: %m");
+
+        r = manager_setup_bpf(m);
+        if (r < 0)
+                return r;
+
+        r = manager_scan_listen_fds(m, &fdstore_inodes);
+        if (r < 0)
+                return r;
+
+        r = manager_scan_registry(m, &registry_inodes);
+        if (r < 0)
+                return r;
+
+        /* If there are fd store entries for user namespace inodes not found in the registry, then release
+         * them */
+        SET_FOREACH(p, fdstore_inodes)  {
+                uint64_t inode;
+
+                if (set_contains(registry_inodes, p))
+                        continue;
+
+                inode = PTR_TO_UINT32(p);
+
+                log_debug("Found stale fd store entry for user namespace %" PRIu64 ", removing.", inode);
+                manager_release_userns_by_inode(m, inode);
+        }
+
+        /* Only creates a socket if none was passed in via socket activation */
+        r = manager_make_listen_socket(m);
+        if (r < 0)
+                return r;
+
+        /* Let's make sure every accept() call on this socket times out after LISTEN_TIMEOUT_USEC. This
+         * allows workers to be GC'ed on idle */
+        if (setsockopt(m->listen_fd, SOL_SOCKET, SO_RCVTIMEO, TIMEVAL_STORE(LISTEN_TIMEOUT_USEC), sizeof(struct timeval)) < 0)
+                return log_error_errno(errno, "Failed to set SO_RCVTIMEO: %m");
+
+        r = start_workers(m, /* explicit_request= */ false);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
diff --git a/src/nsresourced/nsresourced-manager.h b/src/nsresourced/nsresourced-manager.h
new file mode 100644 (file)
index 0000000..5ecf378
--- /dev/null
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+#include "sd-event.h"
+
+typedef struct Manager Manager;
+
+#include "hashmap.h"
+#include "ratelimit.h"
+
+#define NSRESOURCE_WORKERS_MIN 5
+#define NSRESOURCE_WORKERS_MAX 4096
+
+/* State of the nsresourced manager process, which supervises the worker processes. */
+struct Manager {
+        sd_event *event;
+
+        /* Both sets contain the child event sources of the running worker processes */
+        Set *workers_fixed;    /* Workers 0…NSRESOURCE_WORKERS_MIN */
+        Set *workers_dynamic;  /* Workers NSRESOURCE_WORKERS_MIN+1…NSRESOURCE_WORKERS_MAX */
+
+        /* The AF_UNIX/SOCK_STREAM listening socket, either passed in or created by us; < 0 if unset */
+        int listen_fd;
+
+        /* Limits how frequently new workers are started */
+        RateLimit worker_ratelimit;
+
+        /* Timer installed when the rate limit is hit, to retry starting workers later */
+        sd_event_source *deferred_start_worker_event_source;
+
+#if HAVE_VMLINUX_H
+        /* The installed BPF program, its ring buffer object, and the IO event source watching it */
+        struct userns_restrict_bpf *userns_restrict_bpf;
+        struct ring_buffer *userns_restrict_bpf_ring_buffer;
+        sd_event_source *userns_restrict_bpf_ring_buffer_event_source;
+#endif
+
+        /* Directory fd of the userns registry; < 0 until manager_startup() opened it */
+        int registry_fd;
+};
+
+int manager_new(Manager **ret);
+Manager* manager_free(Manager *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
+
+int manager_startup(Manager *m);
diff --git a/src/nsresourced/nsresourced.c b/src/nsresourced/nsresourced.c
new file mode 100644 (file)
index 0000000..7056897
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "daemon-util.h"
+#include "nsresourced-manager.h"
+#include "log.h"
+#include "main-func.h"
+#include "signal-util.h"
+
+/* Entry point of the manager daemon: sets up logging and the process environment, allocates and starts the
+ * manager, and then runs the event loop until it exits. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(manager_freep) Manager *manager = NULL;
+        int r;
+
+        log_setup();
+
+        umask(0022);
+
+        if (argc != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "This program takes no arguments.");
+
+        /* NOTE(review): presumably this keeps userdb lookups done by us and our children from being routed
+         * back to our own service; confirm against the userdb client code. */
+        r = setenv("SYSTEMD_BYPASS_USERDB", "io.systemd.NamespaceResource", 1);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to set $SYSTEMD_BYPASS_USERDB: %m");
+
+        /* Block SIGCHLD before the manager and its event loop are set up */
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD) >= 0);
+
+        r = manager_new(&manager);
+        if (r < 0)
+                return log_error_errno(r, "Could not create manager: %m");
+
+        r = manager_startup(manager);
+        if (r < 0)
+                return log_error_errno(r, "Failed to start up daemon: %m");
+
+        /* Announce readiness now; the cleanup handler sends the stopping notification when we leave */
+        _unused_ _cleanup_(notify_on_cleanup) const char *notify_stop = notify_start(NOTIFY_READY, NOTIFY_STOPPING);
+
+        r = sd_event_loop(manager->event);
+        if (r < 0)
+                return log_error_errno(r, "Event loop failed: %m");
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/nsresourcework.c b/src/nsresourced/nsresourcework.c
new file mode 100644 (file)
index 0000000..9e68249
--- /dev/null
@@ -0,0 +1,1782 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <linux/nsfs.h>
+#include <linux/veth.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include "sd-daemon.h"
+#include "sd-netlink.h"
+
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "group-record.h"
+#include "io-util.h"
+#include "lock-util.h"
+#include "main-func.h"
+#include "missing_magic.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "netlink-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "time-util.h"
+#include "uid-classification.h"
+#include "uid-range.h"
+#include "user-record-nss.h"
+#include "user-record.h"
+#include "user-util.h"
+#include "userdb.h"
+#include "userns-registry.h"
+#include "userns-restrict.h"
+#include "varlink-io.systemd.NamespaceResource.h"
+#include "varlink-io.systemd.UserDatabase.h"
+#include "varlink.h"
+
+#define ITERATIONS_MAX 64U
+#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE)
+#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC)
+#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC)
+#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC)
+#define USERNS_PER_UID 256
+
+/* Parameters of the io.systemd.UserDatabase lookup methods, filled in via the JSON dispatch tables of the
+ * method handlers below. */
+typedef struct LookupParameters {
+        const char *user_name;
+        const char *group_name;
+        /* uid and gid share storage: each handler dispatches into and initializes only one of them */
+        union {
+                uid_t uid;
+                gid_t gid;
+        };
+        const char *service;
+} LookupParameters;
+
+/* Builds the JSON user record for the UID at 'offset' within the UID range of the specified user
+ * namespace. The user is named "ns-<name>-<offset>". Namespaces covering more than one UID get the
+ * "container" disposition, single-UID ones "dynamic". Returns the json_build() result, or -ENOMEM. */
+static int build_user_json(UserNamespaceInfo *userns_info, uid_t offset, JsonVariant **ret) {
+        _cleanup_free_ char *user_name = NULL, *gecos = NULL;
+        UserDisposition disposition;
+        bool is_range;
+        int k;
+
+        assert(userns_info);
+        assert(offset < userns_info->size);
+
+        if (asprintf(&user_name, "ns-%s-" UID_FMT, userns_info->name, offset) < 0)
+                return -ENOMEM;
+
+        is_range = userns_info->size > 1;
+
+        k = is_range ?
+                asprintf(&gecos, "User " UID_FMT " of Allocated Namespace %s", offset, userns_info->name) :
+                asprintf(&gecos, "Allocated Namespace %s", userns_info->name);
+        if (k < 0)
+                return -ENOMEM;
+
+        disposition = is_range ? USER_CONTAINER : USER_DYNAMIC;
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(user_name)),
+                                          JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(userns_info->start + offset)),
+                                          JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)),
+                                          JSON_BUILD_PAIR("realName", JSON_BUILD_STRING(gecos)),
+                                          JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/")),
+                                          JSON_BUILD_PAIR("shell", JSON_BUILD_STRING(NOLOGIN)),
+                                          JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+                                          JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")),
+                                          JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition)))));
+}
+
+/* Varlink handler for the user record lookup: resolves a user either by name ("ns-<name>-<offset>") or by
+ * UID to a registered user namespace, and replies with a record built by build_user_json(). Replies with
+ * NoRecordFound if nothing matches, BadService if the request is not addressed to our service,
+ * ConflictingRecordFound if both name and UID are specified but do not match each other, and
+ * EnumerationNotSupported if neither is specified. */
+static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "uid",      JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, uid),       0 },
+                { "userName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, user_name), 0 },
+                { "service",  JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),   0 },
+                {}
+        };
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .uid = UID_INVALID,
+        };
+        uid_t offset;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.NamespaceResource"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (p.user_name) {
+                _cleanup_free_ char *n = NULL;
+                const char *e, *f;
+
+                /* Lookup by name: split "ns-<name>-<offset>" at the *last* dash, since the namespace name
+                 * is whatever lies between the prefix and that dash */
+                e = startswith(p.user_name, "ns-");
+                if (!e)
+                        goto not_found;
+
+                f = strrchr(e, '-');
+                if (!f)
+                        goto not_found;
+
+                if (parse_uid(f+1, &offset) < 0)
+                        goto not_found;
+
+                n = strndup(e, f - e);
+                if (!n)
+                        return log_oom();
+
+                r = userns_registry_load_by_name(
+                                /* registry_fd= */ -EBADF,
+                                n,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+
+                /* If a UID was specified too, it has to match the one derived from the name */
+                if (uid_is_valid(p.uid) && p.uid != userns_info->start + offset)
+                        return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        } else if (uid_is_valid(p.uid)) {
+                uid_t start, uidmask;
+
+                /* Lookup by UID: container UIDs are looked up by the start of their 64K-aligned block,
+                 * dynamic UIDs are looked up as they are */
+                if (uid_is_container(p.uid))
+                        uidmask = (uid_t) UINT32_C(0xFFFF0000);
+                else if (uid_is_dynamic(p.uid))
+                        uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
+                else
+                        goto not_found;
+
+                start = p.uid & uidmask;
+                offset = p.uid - start;
+
+                r = userns_registry_load_by_start_uid(
+                                /* registry_fd= */ -EBADF,
+                                start,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+        } else
+                return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL);
+
+        r = build_user_json(userns_info, offset, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v))));
+
+not_found:
+        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+/* Builds the JSON group record for the GID at 'offset' within the ID range of the specified user
+ * namespace. The group is named "ns-<name>-<offset>". Namespaces covering more than one ID get the
+ * "container" disposition, single-ID ones "dynamic". Returns the json_build() result, or -ENOMEM. */
+static int build_group_json(UserNamespaceInfo *userns_info, gid_t offset, JsonVariant **ret) {
+        _cleanup_free_ char *group_name = NULL, *text = NULL;
+        UserDisposition disposition;
+        bool is_range;
+        int k;
+
+        assert(userns_info);
+        assert(offset < userns_info->size);
+
+        if (asprintf(&group_name, "ns-%s-" GID_FMT, userns_info->name, offset) < 0)
+                return -ENOMEM;
+
+        is_range = userns_info->size > 1;
+
+        k = is_range ?
+                asprintf(&text, "Group " GID_FMT " of Allocated Namespace %s", offset, userns_info->name) :
+                asprintf(&text, "Allocated Namespace %s", userns_info->name);
+        if (k < 0)
+                return -ENOMEM;
+
+        disposition = is_range ? USER_CONTAINER : USER_DYNAMIC;
+
+        return json_build(ret, JSON_BUILD_OBJECT(
+                                          JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(group_name)),
+                                          JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(userns_info->start + offset)),
+                                          JSON_BUILD_PAIR("description", JSON_BUILD_STRING(text)),
+                                          JSON_BUILD_PAIR("service", JSON_BUILD_CONST_STRING("io.systemd.NamespaceResource")),
+                                          JSON_BUILD_PAIR("disposition", JSON_BUILD_STRING(user_disposition_to_string(disposition)))));
+}
+
+/* Varlink handler for the group record lookup: resolves a group either by name ("ns-<name>-<offset>") or
+ * by GID to a registered user namespace, and replies with a record built by build_group_json(). Replies
+ * with NoRecordFound if nothing matches, BadService if the request is not addressed to our service,
+ * ConflictingRecordFound if both name and GID are specified but do not match each other, and
+ * EnumerationNotSupported if neither is specified. */
+static int vl_method_get_group_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "gid",       JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,      offsetof(LookupParameters, gid),        0 },
+                { "groupName", JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING,   json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        LookupParameters p = {
+                .gid = GID_INVALID,
+        };
+        gid_t offset;
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.NamespaceResource"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        if (p.group_name) {
+                _cleanup_free_ char *n = NULL;
+                const char *e, *f;
+
+                /* Lookup by name: split "ns-<name>-<offset>" at the *last* dash, since the namespace name
+                 * is whatever lies between the prefix and that dash */
+                e = startswith(p.group_name, "ns-");
+                if (!e)
+                        goto not_found;
+
+                f = strrchr(e, '-');
+                if (!f)
+                        goto not_found;
+
+                if (parse_gid(f+1, &offset) < 0)
+                        goto not_found;
+
+                n = strndup(e, f - e);
+                if (!n)
+                        return log_oom();
+
+                r = userns_registry_load_by_name(
+                                /* registry_fd= */ -EBADF,
+                                n,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+
+                /* If a GID was specified too, it has to match the one derived from the name. Compare the
+                 * .gid member we actually dispatched into, not its union sibling .uid. */
+                if (gid_is_valid(p.gid) && p.gid != userns_info->start + offset)
+                        return varlink_error(link, "io.systemd.UserDatabase.ConflictingRecordFound", NULL);
+
+        } else if (gid_is_valid(p.gid)) {
+                gid_t start, gidmask;
+
+                /* Lookup by GID: container GIDs are looked up by the start of their 64K-aligned block,
+                 * dynamic GIDs are looked up as they are */
+                if (gid_is_container(p.gid))
+                        gidmask = (gid_t) UINT32_C(0xFFFF0000);
+                else if (gid_is_dynamic(p.gid))
+                        gidmask = (gid_t) UINT32_C(0xFFFFFFFF);
+                else
+                        goto not_found;
+
+                start = p.gid & gidmask;
+                offset = p.gid - start;
+
+                /* The registry is indexed by start UID, which is also used as the start GID */
+                r = userns_registry_load_by_start_uid(
+                                /* registry_fd= */ -EBADF,
+                                (uid_t) start,
+                                &userns_info);
+                if (r == -ENOENT)
+                        goto not_found;
+                if (r < 0)
+                        return r;
+
+                if (offset >= userns_info->size) /* Outside of range? */
+                        goto not_found;
+        } else
+                return varlink_error(link, "io.systemd.UserDatabase.EnumerationNotSupported", NULL);
+
+        r = build_group_json(userns_info, offset, &v);
+        if (r < 0)
+                return r;
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("record", JSON_BUILD_VARIANT(v))));
+
+not_found:
+        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+static int vl_method_get_memberships(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch dispatch_table[] = {
+                { "userName",  JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, user_name),  0 },
+                { "groupName", JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, group_name), 0 },
+                { "service",   JSON_VARIANT_STRING, json_dispatch_const_string, offsetof(LookupParameters, service),    0 },
+                {}
+        };
+
+        LookupParameters p = {};
+        int r;
+
+        assert(parameters);
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.service, "io.systemd.NamespaceResource"))
+                return varlink_error(link, "io.systemd.UserDatabase.BadService", NULL);
+
+        /* We don't support auxiliary groups for namespace allocations */
+        return varlink_error(link, "io.systemd.UserDatabase.NoRecordFound", NULL);
+}
+
+/* Checks whether the candidate ID is unused: it must neither be in the userns registry, nor resolve as a
+ * UID or as a GID via userdb. Returns > 0 if available, 0 if taken, negative errno-style error on
+ * failure. */
+static int uid_is_available(
+                int registry_dir_fd,
+                uid_t candidate) {
+
+        int k;
+
+        assert(registry_dir_fd >= 0);
+
+        log_debug("Checking if UID " UID_FMT " is available.", candidate);
+
+        /* Known to our own registry? */
+        k = userns_registry_uid_exists(registry_dir_fd, candidate);
+        if (k != 0)
+                return k < 0 ? k : false;
+
+        /* Known as user? Only -ESRCH counts as "not found", any other error is propagated. */
+        k = userdb_by_uid(candidate, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        /* Known as group? Same logic as above. */
+        k = groupdb_by_gid(candidate, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (k != -ESRCH)
+                return k < 0 ? k : false;
+
+        log_debug("UID " UID_FMT " is available.", candidate);
+
+        return true;
+}
+
+/* Checks whether the namespace name 'name' is unused: it must not be recorded in the userns registry,
+ * and the derived name "ns-<name>-0" must not resolve via the user or group database. Returns true if
+ * it is free, false if it is taken, and a negative errno on failure. */
+static int name_is_available(
+                int registry_dir_fd,
+                const char *name) {
+
+        _cleanup_free_ char *derived = NULL;
+        int r;
+
+        assert(registry_dir_fd >= 0);
+        assert(name);
+
+        r = userns_registry_name_exists(registry_dir_fd, name);
+        if (r != 0)
+                return r < 0 ? r : false;
+
+        derived = strjoin("ns-", name, "-0");
+        if (!derived)
+                return -ENOMEM;
+
+        /* For the database lookups only -ESRCH means "unused": success means the name is taken, and any
+         * other error is passed up. */
+        r = userdb_by_name(derived, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (r != -ESRCH)
+                return r < 0 ? r : false;
+
+        r = groupdb_by_name(derived, USERDB_AVOID_MULTIPLEXER, NULL);
+        if (r != -ESRCH)
+                return r < 0 ? r : false;
+
+        log_debug("Namespace name '%s' is available.", name);
+
+        return true;
+}
+
+/* Picks a UID range for the user namespace described by 'info' and stores its base in info->start.
+ * The registry lock is taken while doing so; on success the lock fd is handed to the caller via
+ * 'ret_lock_fd' if that is non-NULL, otherwise it is closed on return. Returns 0 on success and a
+ * negative errno on failure. */
+static int allocate_now(
+                int registry_dir_fd,
+                UserNamespaceInfo *info,
+                int *ret_lock_fd) {
+
+        static const uint8_t hash_key[16] = {
+                0xd4, 0xd7, 0x33, 0xa7, 0x4d, 0xd3, 0x42, 0xcd,
+                0xaa, 0xe9, 0x45, 0xd0, 0xfb, 0xec, 0x79, 0xee,
+        };
+
+        _cleanup_(uid_range_freep) UIDRange *valid_range = NULL;
+        uid_t candidate, uidmin, uidmax, uidmask;
+        unsigned n_tries = 100;
+        int r;
+
+        /* Returns the following error codes:
+         *
+         * EBUSY     → all UID candidates we checked are already taken
+         * EEXIST    → the name for the userns already exists
+         * EDEADLK   → the userns is already registered in the registry
+         * EHOSTDOWN → the relevant UID range is not covered by our own UID range
+         * EUSERS    → the owner already has USERNS_PER_UID user namespaces registered
+         */
+
+        assert(registry_dir_fd >= 0);
+        assert(info);
+
+        /* Select the window to allocate from, and the alignment mask applied to candidates, by range size */
+        switch (info->size) {
+
+        case 0x10000U:
+                uidmin = CONTAINER_UID_BASE_MIN;
+                uidmax = CONTAINER_UID_BASE_MAX;
+                uidmask = (uid_t) UINT32_C(0xFFFF0000);
+                break;
+
+        case 1U:
+                uidmin = DYNAMIC_UID_MIN;
+                uidmax = DYNAMIC_UID_MAX;
+                uidmask = (uid_t) UINT32_C(0xFFFFFFFF);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        r = uid_range_load_userns(&valid_range, /* path= */ NULL, UID_RANGE_USERNS_INSIDE);
+        if (r < 0)
+                return r;
+
+        /* Check early whether we have any chance at all given our own uid range */
+        if (!uid_range_overlaps(valid_range, uidmin, uidmax))
+                return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), "Relevant UID range not delegated, can't allocate.");
+
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+        /* Enforce limit on user namespaces per UID */
+        r = userns_registry_per_uid(registry_dir_fd, info->owner);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine number of currently registered user namespaces per UID " UID_FMT ": %m", info->owner);
+        if (r >= USERNS_PER_UID)
+                return log_debug_errno(SYNTHETIC_ERRNO(EUSERS), "User already registered %i user namespaces, refusing.", r);
+
+        r = userns_registry_inode_exists(registry_dir_fd, info->userns_inode);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return -EDEADLK;
+
+        r = name_is_available(registry_dir_fd, info->name);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EEXIST;
+
+        for (candidate = siphash24_string(info->name, hash_key) & UINT32_MAX;; /* Start from a hash of the input name */
+             candidate = random_u32()) {                                 /* Use random values afterwards */
+
+                /* The counter is decremented before the check, hence at most n_tries - 1 candidates are examined */
+                if (--n_tries <= 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Try limit hit, no UIDs available.");
+
+                /* Fold the value into the [uidmin, uidmax) window, then clear the bits the mask for this
+                 * range size excludes */
+                candidate = (candidate % (uidmax - uidmin)) + uidmin;
+                candidate &= uidmask;
+
+                /* Skip candidates whose whole range is not covered by our own UID range */
+                if (!uid_range_covers(valid_range, candidate, info->size))
+                        continue;
+
+                /* We only check the base UID for each range (!) */
+                r = uid_is_available(registry_dir_fd, candidate);
+                if (r < 0)
+                        return log_debug_errno(r, "Can't determine if UID range " UID_FMT " is available: %m", candidate);
+                if (r > 0) {
+                        info->start = candidate;
+
+                        log_debug("Allocating UID range " UID_FMT "…" UID_FMT, candidate, candidate + info->size - 1);
+
+                        /* Pass the still-held lock to the caller, if requested */
+                        if (ret_lock_fd)
+                                *ret_lock_fd = TAKE_FD(lock_fd);
+
+                        return 0;
+                }
+
+                log_debug("UID range " UID_FMT " already taken.", candidate);
+        }
+}
+
+/* Writes the UID and GID mapping described by 'userns_info' (target, start, size) into the user
+ * namespace referenced by 'usernsfd'. To do so a helper child is forked which joins the namespace; once
+ * it signals via an eventfd that it has done so, the parent writes the child's /proc/<pid>/uid_map and
+ * /proc/<pid>/gid_map files. The child is killed and reaped when this function returns.
+ *
+ * Returns 0 on success, negative errno on failure. */
+static int write_userns(int usernsfd, const UserNamespaceInfo *userns_info) {
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        _cleanup_close_ int efd = -EBADF;
+        uint64_t u;
+        int r;
+
+        assert(usernsfd >= 0);
+        assert(userns_info);
+        assert(uid_is_valid(userns_info->target));
+        assert(uid_is_valid(userns_info->start));
+        assert(userns_info->size > 0);
+        assert(userns_info->size <= UINT32_MAX - userns_info->start);
+
+        efd = eventfd(0, EFD_CLOEXEC);
+        if (efd < 0)
+                return log_error_errno(errno, "Failed to allocate eventfd(): %m");
+
+        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG, &pid);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* child */
+
+                if (setns(usernsfd, CLONE_NEWUSER) < 0) {
+                        log_error_errno(errno, "Failed to join user namespace: %m");
+                        goto child_fail;
+                }
+
+                /* The value 1 tells the parent that we joined the namespace successfully */
+                if (eventfd_write(efd, 1) < 0) {
+                        log_error_errno(errno, "Failed to ping event fd: %m");
+                        goto child_fail;
+                }
+
+                freeze();
+
+        child_fail:
+                /* Wake up the parent with a value other than 1, so that it does not block forever in
+                 * eventfd_read() waiting for a success notification that will never arrive. This is
+                 * best-effort, we are about to exit anyway. */
+                (void) eventfd_write(efd, 2);
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Wait until child joined the user namespace */
+        if (eventfd_read(efd, &u) < 0)
+                return log_error_errno(errno, "Failed to wait for event fd: %m");
+        if (u != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTO), "User namespace helper process failed.");
+
+        /* Now write mapping */
+
+        _cleanup_free_ char *pmap = NULL;
+
+        if (asprintf(&pmap, "/proc/" PID_FMT "/uid_map", pid) < 0)
+                return log_oom();
+
+        r = write_string_filef(pmap, 0, UID_FMT " " UID_FMT " " UID_FMT "\n", userns_info->target, userns_info->start, userns_info->size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'uid_map' file of user namespace: %m");
+
+        pmap = mfree(pmap);
+        if (asprintf(&pmap, "/proc/" PID_FMT "/gid_map", pid) < 0)
+                return log_oom();
+
+        /* The GID mapping uses the very same numbers as the UID mapping */
+        r = write_string_filef(pmap, 0, GID_FMT " " GID_FMT " " GID_FMT "\n", (gid_t) userns_info->target, (gid_t) userns_info->start, (gid_t) userns_info->size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write 'gid_map' file of user namespace: %m");
+
+        /* We are done! */
+
+        log_debug("Successfully configured user namespace.");
+        return 0;
+}
+
+/* Checks whether the user namespace API shall be exposed, as indicated by the boolean environment
+ * variable $NSRESOURCE_API. Returns 0 if it is enabled. If it is disabled a Varlink error is sent to
+ * the client and the result of that is returned; if the variable cannot be parsed a negative errno is
+ * returned. */
+static int test_userns_api_support(Varlink *link) {
+        int enabled;
+
+        assert(link);
+
+        /* The userns API is only exposed if our manager daemon told us that this is OK to do. It sets
+         * this boolean only if it managed to set up BPF correctly for itself (i.e. watches for userns
+         * going away via BPF APIs). This makes very sure we don't accidentally let any of the userns
+         * stuff through without the BPF LSM in effect. */
+
+        enabled = getenv_bool("NSRESOURCE_API");
+        if (enabled > 0)
+                return 0;
+        if (enabled == 0)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceInterfaceNotSupported", NULL);
+
+        return log_error_errno(enabled, "Failed to parse $NSRESOURCE_API: %m");
+}
+
+/* Validates the client supplied namespace name and returns a newly allocated copy of the effective name
+ * in *ret. For root clients the name is used as is; for all other clients it is prefixed with the
+ * client's UID. Returns 0 on success; otherwise either the result of sending an invalid-parameter error
+ * to the client, or a negative errno. */
+static int validate_name(Varlink *link, const char *name, char **ret) {
+        _cleanup_free_ char *effective = NULL;
+        uid_t client_uid;
+        int r;
+
+        assert(link);
+        assert(name);
+        assert(ret);
+
+        r = varlink_get_peer_uid(link, &client_uid);
+        if (r < 0)
+                return r;
+
+        if (client_uid != 0) {
+                /* If the client is not root then prefix the name with the UID of the peer, so that
+                 * clients live in separate name spaces and cannot steal each other's names. */
+
+                if (asprintf(&effective, UID_FMT "-%s", client_uid, name) < 0)
+                        return -ENOMEM;
+
+                if (!userns_name_is_valid(effective))
+                        return varlink_error_invalid_parameter_name(link, "name");
+        } else {
+                if (!userns_name_is_valid(name))
+                        return varlink_error_invalid_parameter_name(link, "name");
+
+                effective = strdup(name);
+                if (!effective)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(effective);
+        return 0;
+}
+
+/* Validates the 'size' and 'target' parameters of an allocation request. Returns 0 if both are
+ * acceptable, otherwise the result of sending an invalid-parameter error naming the offending field. */
+static int validate_target_and_size(Varlink *link, unsigned target, unsigned size) {
+        assert(link);
+
+        /* Only ranges of exactly 1 or exactly 64K IDs may be requested */
+        if (size != 1U && size != 0x10000U)
+                return varlink_error_invalid_parameter_name(link, "size");
+
+        /* The target must be a valid UID ... */
+        if (!uid_is_valid(target))
+                return varlink_error_invalid_parameter_name(link, "target");
+
+        /* ... and leave enough room for the whole range below UINT32_MAX */
+        if (target > UINT32_MAX - size)
+                return varlink_error_invalid_parameter_name(link, "target");
+
+        return 0;
+}
+
+/* Validates a user namespace fd received from a client: it must have safe flags, refer to a user
+ * namespace other than our own, and - unless the client is root - be owned by the client's UID.
+ * Returns 0 if the fd is acceptable; otherwise either the result of sending an invalid-parameter error
+ * to the client, or a negative errno. */
+static int validate_userns(Varlink *link, int userns_fd) {
+        uid_t client_uid, owner_uid;
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+
+        r = fd_verify_safe_flags(userns_fd);
+        if (r < 0)
+                return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");
+
+        /* The fd must actually refer to a user namespace */
+        r = fd_is_ns(userns_fd, CLONE_NEWUSER);
+        if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if user namespace fd is actually a user namespace: %m");
+
+        /* Our own user namespace is off limits */
+        r = is_our_namespace(userns_fd, NAMESPACE_USER);
+        if (r > 0)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if user namespace fd refers to our own user namespace: %m");
+
+        r = varlink_get_peer_uid(link, &client_uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to acquire peer UID: %m");
+
+        /* Root may operate on any user namespace, no ownership check needed */
+        if (client_uid == 0)
+                return 0;
+
+        /* Everybody else may only operate on user namespaces they own themselves */
+        if (ioctl(userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0)
+                return log_debug_errno(errno, "Failed to get owner UID of user namespace: %m");
+
+        if (owner_uid != client_uid)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        return 0;
+}
+
+/* Verifies that the user namespace referenced by 'userns_fd' has neither a UID nor a GID mapping yet.
+ * Returns 0 if both are empty; otherwise either the result of sending an invalid-parameter error to the
+ * client, or a negative errno if the mappings could not be read. */
+static int validate_userns_is_empty(Varlink *link, int userns_fd) {
+        _cleanup_(uid_range_freep) UIDRange *uid_map = NULL, *gid_map = NULL;
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+
+        /* First the UID side ... */
+        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &uid_map);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+
+        if (!uid_range_is_empty(uid_map))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* ... then the GID side */
+        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &gid_map);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns GID range: %m");
+
+        if (!uid_range_is_empty(gid_map))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        return 0;
+}
+
+/* Parsed parameters of the user range allocation method; filled in from the request's JSON fields via
+ * the dispatch table in vl_method_allocate_user_range(). */
+typedef struct AllocateParameters {
+        const char *name;          /* JSON field "name" */
+        unsigned size;             /* JSON field "size" */
+        unsigned target;           /* JSON field "target" */
+        unsigned userns_fd_idx;    /* JSON field "userNamespaceFileDescriptor"; passed to varlink_take_fd() to obtain the fd */
+} AllocateParameters;
+
+/* Varlink method handler that assigns a UID/GID range to a client supplied user namespace. It validates
+ * the request parameters and the namespace fd, picks a range via allocate_now(), stores the result in
+ * the registry, registers the namespace in the BPF map, writes the ID mapping into the namespace and
+ * finally sends the namespace fd to the manager process. On success an empty object is sent as reply.
+ *
+ * 'userdata' must point to the (possibly still NULL) pointer to the userns_restrict_bpf object. */
+static int vl_method_allocate_user_range(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "name",                        JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(AllocateParameters, name),          JSON_MANDATORY },
+                { "size",                        _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AllocateParameters, size),          JSON_MANDATORY },
+                { "target",                      _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AllocateParameters, target),        0              },
+                { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AllocateParameters, userns_fd_idx), JSON_MANDATORY },
+                {}
+        };
+
+        struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+        _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF, lock_fd = -EBADF;
+        _cleanup_free_ char *userns_name = NULL;
+        uid_t peer_uid;
+        struct stat userns_st;
+        /* Both explicitly initialized fields are marked JSON_MANDATORY in the dispatch table above;
+         * 'target' is optional and starts out as 0. */
+        AllocateParameters p = {
+                .size = UINT_MAX,
+                .userns_fd_idx = UINT_MAX,
+        };
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = validate_name(link, p.name, &userns_name);
+        if (r != 0)
+                return r;
+
+        r = validate_target_and_size(link, p.target, p.size);
+        if (r != 0)
+                return r;
+
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* Only user namespaces without any UID/GID mapping so far may get a range assigned */
+        r = validate_userns_is_empty(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* The inode number of the namespace is used to identify it in the registry */
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* Install the BPF program if that has not happened yet */
+        if (!*bpf) {
+                r = userns_restrict_install(/* pin= */ true, bpf);
+                if (r < 0)
+                        return r;
+        }
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new();
+        if (!userns_info)
+                return -ENOMEM;
+
+        userns_info->name = TAKE_PTR(userns_name);
+        if (!userns_info->name)
+                return -ENOMEM;
+
+        userns_info->owner = peer_uid;
+        userns_info->userns_inode = userns_st.st_ino;
+        userns_info->size = p.size;
+        userns_info->target = p.target;
+
+        /* On success this fills in userns_info->start and hands us the registry lock in lock_fd */
+        r = allocate_now(registry_dir_fd, userns_info, &lock_fd);
+        if (r == -EHOSTDOWN) /* The needed UID range is not delegated to us */
+                return varlink_error(link, "io.systemd.NamespaceResource.DynamicRangeUnavailable", NULL);
+        if (r == -EBUSY)     /* All used up */
+                return varlink_error(link, "io.systemd.NamespaceResource.NoDynamicRange", NULL);
+        if (r == -EDEADLK)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL);
+        if (r == -EEXIST)
+                return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL);
+        if (r < 0)
+                return r;
+
+        r = userns_registry_store(registry_dir_fd, userns_info);
+        if (r < 0)
+                return r;
+
+        /* From here on failures must undo the registry entry, hence the jumps to the 'fail' label */
+
+        /* Register the userns in the BPF map with an empty allowlist */
+        r = userns_restrict_put_by_fd(
+                        *bpf,
+                        userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0);
+        if (r < 0)
+                goto fail;
+
+        r = write_userns(userns_fd, userns_info);
+        if (r < 0)
+                goto fail;
+
+        /* Release the registry lock */
+        lock_fd = safe_close(lock_fd);
+
+        /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */
+        r = sd_pid_notifyf_with_fds(
+                        /* pid= */ 0,
+                        /* unset_environment= */ false,
+                        &userns_fd, 1,
+                        "FDSTORE=1\n"
+                        "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode);
+        if (r < 0)
+                goto fail;
+
+        /* Note, we'll not return UID values from the host, since the child might not run in the same
+         * user namespace as us. If they want to know the ranges they should read them off the userns fd, so
+         * that they are translated into their PoV */
+        return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+
+fail:
+        /* Note: we don't have to clean-up the BPF maps in the error path: the bpf map type used will
+         * automatically do that once the userns inode goes away */
+        userns_registry_remove(registry_dir_fd, userns_info);
+        return r;
+}
+
+/* Verifies that an already configured user namespace has a UID/GID mapping we consider acceptable for
+ * registration:
+ *
+ *   - the outside UID range is not empty and equals the outside GID range
+ *   - the inside UID range has the same size as the outside one and equals the inside GID range
+ *   - the first outside UID/GID is the client's own UID/GID
+ *   - if more than one UID is mapped, the first inside UID is 0
+ *
+ * Returns 0 if all checks pass; otherwise either the result of sending an invalid-parameter error to
+ * the client, or a negative errno. */
+static int validate_userns_is_safe(Varlink *link, int userns_fd) {
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+
+        /* Read the outside UID range and verify it isn't empty */
+        _cleanup_(uid_range_freep) UIDRange *outside_range = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &outside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns UID range: %m");
+        if (uid_range_is_empty(outside_range))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* Read the outside GID range and check it is the same as the UID range */
+        _cleanup_(uid_range_freep) UIDRange *outside_range_gid = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &outside_range_gid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns GID range: %m");
+        if (!uid_range_equal(outside_range, outside_range_gid))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* Read the inside UID range, and verify it matches the size of the outside UID range */
+        _cleanup_(uid_range_freep) UIDRange *inside_range = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &inside_range);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns contents: %m");
+        if (uid_range_size(outside_range) != uid_range_size(inside_range))
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Uh, inside and outside UID range sizes don't match.");
+
+        /* Read the inside GID range, and verify it matches the inside UID range */
+        _cleanup_(uid_range_freep) UIDRange *inside_range_gid = NULL;
+        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &inside_range_gid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read userns contents: %m");
+        if (!uid_range_equal(inside_range, inside_range_gid))
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        uid_t peer_uid;
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* This is a group ID, hence use the matching type */
+        gid_t peer_gid;
+        r = varlink_get_peer_gid(link, &peer_gid);
+        if (r < 0)
+                return r;
+
+        /* Insist that the first UID/GID in the range matches the client's UID/GID. (Both outside ranges
+         * were verified to be non-empty above, hence accessing the first entry is safe.) */
+        if (outside_range->entries[0].start != peer_uid ||
+            outside_range_gid->entries[0].start != peer_gid)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        /* If there are more than one UID in the range, then also insist that the first UID maps to root inside the userns */
+        if (uid_range_size(outside_range) > 1 && inside_range->entries[0].start != 0)
+                return varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
+
+        return 0;
+}
+
+/* Parsed parameters of the user namespace registration method; filled in from the request's JSON fields
+ * via the dispatch table in vl_method_register_user_namespace(). */
+typedef struct RegisterParameters {
+        const char *name;          /* JSON field "name" */
+        unsigned userns_fd_idx;    /* JSON field "userNamespaceFileDescriptor"; passed to varlink_take_fd() to obtain the fd */
+} RegisterParameters;
+
+/* Varlink method handler that registers an already configured, client supplied user namespace. It
+ * validates the name and the namespace fd (including its existing UID/GID mapping), stores a registry
+ * entry consisting of name, owner and namespace inode, registers the namespace in the BPF map and sends
+ * the namespace fd to the manager process. On success an empty object is sent as reply.
+ *
+ * 'userdata' must point to the (possibly still NULL) pointer to the userns_restrict_bpf object. */
+static int vl_method_register_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch dispatch_table[] = {
+                { "name",                        JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(RegisterParameters, name),          JSON_MANDATORY },
+                { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(RegisterParameters, userns_fd_idx), JSON_MANDATORY },
+                {}
+        };
+
+        struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+        _cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF;
+        _cleanup_free_ char *userns_name = NULL;
+        uid_t peer_uid;
+        struct stat userns_st;
+        RegisterParameters p = {
+                .userns_fd_idx = UINT_MAX,
+        };
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        r = validate_name(link, p.name, &userns_name);
+        if (r != 0)
+                return r;
+
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return userns_fd;
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* Unlike for allocation, the namespace must already carry a mapping here, and it must be one we
+         * consider acceptable */
+        r = validate_userns_is_safe(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* The inode number of the namespace is used to identify it in the registry */
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* Install the BPF program if that has not happened yet */
+        if (!*bpf) {
+                r = userns_restrict_install(/* pin= */ true, bpf);
+                if (r < 0)
+                        return r;
+        }
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        /* The lock is held until this function returns */
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+        r = userns_registry_inode_exists(registry_dir_fd, userns_st.st_ino);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceExists", NULL);
+
+        r = name_is_available(registry_dir_fd, userns_name);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return varlink_error(link, "io.systemd.NamespaceResource.NameExists", NULL);
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = userns_info_new();
+        if (!userns_info)
+                return -ENOMEM;
+
+        userns_info->name = TAKE_PTR(userns_name);
+        if (!userns_info->name)
+                return -ENOMEM;
+
+        /* Only name, owner and inode are set here; no UID range fields are filled in */
+        userns_info->owner = peer_uid;
+        userns_info->userns_inode = userns_st.st_ino;
+
+        r = userns_registry_store(registry_dir_fd, userns_info);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to update userns registry: %m");
+
+        /* From here on failures must undo the registry entry, hence the jumps to the 'fail' label */
+
+        /* Register the userns in the BPF map with an empty allowlist */
+        r = userns_restrict_put_by_fd(
+                        *bpf,
+                        userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0);
+        if (r < 0)
+                goto fail;
+
+        /* Send user namespace and process fd to our manager process, which will watch the process and user namespace */
+        r = sd_pid_notifyf_with_fds(
+                        /* pid= */ 0,
+                        /* unset_environment= */ false,
+                        &userns_fd, 1,
+                        "FDSTORE=1\n"
+                        "FDNAME=userns-" INO_FMT "\n", userns_info->userns_inode);
+        if (r < 0)
+                goto fail;
+
+        return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+
+fail:
+        userns_registry_remove(registry_dir_fd, userns_info);
+        return r;
+}
+
+/* Parsed parameters of the method that adds a mount to a user namespace; filled in from the request's
+ * JSON fields via the dispatch table in vl_method_add_mount_to_user_namespace(). */
+typedef struct AddMountParameters {
+        unsigned userns_fd_idx;    /* JSON field "userNamespaceFileDescriptor"; passed to varlink_take_fd() to obtain the fd */
+        unsigned mount_fd_idx;     /* JSON field "mountFileDescriptor"; passed to varlink_take_fd() to obtain the fd */
+} AddMountParameters;
+
+/* Varlink method handler that adds a mount to the BPF allowlist of a registered user namespace. Only
+ * clients with UID 0 may call this. It validates the namespace fd and the mount fd, looks up the
+ * namespace in the registry by its inode, sends the mount fd to the manager process and then adds the
+ * mount to the namespace's BPF map entry. On success an empty object is sent as reply.
+ *
+ * 'userdata' must point to the (possibly still NULL) pointer to the userns_restrict_bpf object. */
+static int vl_method_add_mount_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+
+        static const JsonDispatch parameter_dispatch_table[] = {
+                { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, userns_fd_idx), JSON_MANDATORY },
+                { "mountFileDescriptor",         _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddMountParameters, mount_fd_idx),  JSON_MANDATORY },
+                {}
+        };
+
+        _cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF, registry_dir_fd = -EBADF;
+        struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+        AddMountParameters p = {
+                .userns_fd_idx = UINT_MAX,
+                .mount_fd_idx = UINT_MAX,
+        };
+        int r, mnt_id = 0;
+        struct stat userns_st;
+        uid_t peer_uid;
+
+        assert(link);
+        assert(parameters);
+
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        /* Allowlisting arbitrary mounts is a privileged operation */
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+        if (peer_uid != 0)
+                return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+        r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return userns_fd;
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        /* The inode number of the namespace is used to look it up in the registry below */
+        if (fstat(userns_fd, &userns_st) < 0)
+                return -errno;
+
+        mount_fd = varlink_take_fd(link, p.mount_fd_idx);
+        if (mount_fd < 0)
+                return mount_fd;
+
+        r = fd_verify_safe_flags_full(mount_fd, O_PATH|O_DIRECTORY);
+        if (r < 0)
+                return log_debug_errno(r, "Mount file descriptor has unsafe flags set: %m");
+
+        r = fd_verify_directory(mount_fd);
+        if (r < 0)
+                return r;
+
+        /* The mount ID is only used for the debug log messages at the end */
+        r = path_get_mnt_id_at(mount_fd, NULL, &mnt_id);
+        if (r < 0)
+                return r;
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        /* The lock is held until this function returns */
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+        /* Only user namespaces that are known to the registry may get mounts allowlisted */
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        r = userns_registry_load_by_userns_inode(
+                        registry_dir_fd,
+                        userns_st.st_ino,
+                        &userns_info);
+        if (r == -ENOENT)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+        if (r < 0)
+                return r;
+
+        /* Install the BPF program if that has not happened yet */
+        if (!*bpf) {
+                r = userns_restrict_install(/* pin= */ true, bpf);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Pin the mount fd */
+        /* Note that the fd is sent under a name derived from the user namespace's inode, not from the mount */
+        r = sd_pid_notifyf_with_fds(
+                        /* pid= */ 0,
+                        /* unset_environment= */ false,
+                        &mount_fd, 1,
+                        "FDSTORE=1\n"
+                        "FDNAME=userns-" INO_FMT "\n", userns_st.st_ino);
+        if (r < 0)
+                return r;
+
+        /* Add this mount to the user namespace's BPF map allowlist entry. */
+        r = userns_restrict_put_by_fd(
+                        *bpf,
+                        userns_fd,
+                        /* replace= */ false,
+                        &mount_fd,
+                        1);
+        if (r < 0)
+                return r;
+
+        /* Only mention the base UID in the log message if the registry entry has a non-zero size */
+        if (userns_info->size > 0)
+                log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s' @ UID " UID_FMT ")",
+                          mnt_id, userns_st.st_ino, userns_info->name, userns_info->start);
+        else
+                log_debug("Granting access to mount %i to user namespace " INO_FMT " ('%s')",
+                          mnt_id, userns_st.st_ino, userns_info->name);
+
+        return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+}
+
+/* Validates a control group fd received from a client: it must have safe flags, refer to a directory
+ * and be located on a cgroup2 file system. On success the cgroup ID is stored in *ret_cgroup_id and 0
+ * is returned; otherwise either the result of sending an invalid-parameter error to the client, or a
+ * negative errno. */
+static int validate_cgroup(Varlink *link, int fd, uint64_t *ret_cgroup_id) {
+        int q;
+
+        assert(link);
+        assert(fd >= 0);
+        assert(ret_cgroup_id);
+
+        q = fd_verify_safe_flags_full(fd, O_DIRECTORY);
+        if (q < 0)
+                return log_debug_errno(q, "Control group file descriptor has unsafe flags set: %m");
+
+        q = fd_verify_directory(fd);
+        if (q < 0)
+                return log_debug_errno(q, "Verification that cgroup fd refers to directory failed: %m");
+
+        /* Anything that is not on cgroupfs is a bad parameter rather than an internal failure */
+        q = fd_is_fs_type(fd, CGROUP2_SUPER_MAGIC);
+        if (q == 0)
+                return varlink_error_invalid_parameter_name(link, "controlGroupFileDescriptor");
+        if (q < 0)
+                return log_debug_errno(q, "Failed to check if cgroup fd actually refers to cgroupfs: %m");
+
+        q = cg_fd_get_cgroupid(fd, ret_cgroup_id);
+        if (q < 0)
+                return log_debug_errno(q, "Failed to read cgroup ID from cgroupfs: %m");
+
+        return 0;
+}
+
+typedef struct AddCGroupParameters { /* parsed parameters of the AddControlGroupToUserNamespace method */
+        unsigned userns_fd_idx; /* "userNamespaceFileDescriptor": index handed to varlink_take_fd() to pick up the userns fd */
+        unsigned cgroup_fd_idx; /* "controlGroupFileDescriptor": index handed to varlink_take_fd() to pick up the cgroup fd */
+} AddCGroupParameters;
+
+/* Implements io.systemd.NamespaceResource.AddControlGroupToUserNamespace.
+ *
+ * Takes two fds from the Varlink connection (a user namespace and a cgroup directory), validates both,
+ * records the cgroup ID in the registry entry of the user namespace, and then chowns the cgroup directory
+ * plus its cgroup.procs, cgroup.subtree_control and cgroup.threads attributes to userns_info->start, i.e.
+ * the first UID/GID of the range assigned to the namespace.
+ *
+ * Only root or the registered owner of the user namespace may request this; a non-root caller must in
+ * addition own the cgroup directory. At most USER_NAMESPACE_CGROUPS_DELEGATE_MAX cgroups can be registered
+ * per user namespace.
+ *
+ * Returns the result of sending the reply or error to the client, or a negative errno on internal
+ * failure. */
+static int vl_method_add_cgroup_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch parameter_dispatch_table[] = {
+                { "userNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, userns_fd_idx), JSON_MANDATORY },
+                { "controlGroupFileDescriptor",  _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint, offsetof(AddCGroupParameters, cgroup_fd_idx), JSON_MANDATORY },
+                {}
+        };
+
+        _cleanup_close_ int userns_fd = -EBADF, cgroup_fd = -EBADF, registry_dir_fd = -EBADF;
+        AddCGroupParameters p = {
+                .userns_fd_idx = UINT_MAX,
+                .cgroup_fd_idx = UINT_MAX,
+        };
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        struct stat userns_st, cgroup_st;
+        uid_t peer_uid;
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        /* First fd: the user namespace. We need its inode number as key into the registry. */
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
+
+        /* Second fd: the cgroup directory. We need both its cgroup ID and its current owner. */
+        cgroup_fd = varlink_take_fd(link, p.cgroup_fd_idx);
+        if (cgroup_fd < 0)
+                return log_debug_errno(cgroup_fd, "Failed to take cgroup fd from Varlink connection: %m");
+
+        uint64_t cgroup_id;
+        r = validate_cgroup(link, cgroup_fd, &cgroup_id);
+        if (r != 0)
+                return r;
+
+        if (fstat(cgroup_fd, &cgroup_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() cgroup fd: %m");
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        /* Hold the registry lock from the load until the function returns, so that the load/modify/store
+         * sequence below is not interleaved with another one. */
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return lock_fd;
+
+        r = userns_registry_load_by_userns_inode(
+                        registry_dir_fd,
+                        userns_st.st_ino,
+                        &userns_info);
+        if (r == -ENOENT)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+        if (r < 0)
+                return r;
+
+        /* The user namespace must have a user assigned */
+        if (userns_info->size == 0)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceWithoutUserRange", NULL);
+        if (userns_info_has_cgroup(userns_info, cgroup_id))
+                return varlink_error(link, "io.systemd.NamespaceResource.ControlGroupAlreadyAdded", NULL);
+        /* We are about to add one more entry, hence refuse once the limit has been reached already (and
+         * not only after it has been exceeded). */
+        if (userns_info->n_cgroups >= USER_NAMESPACE_CGROUPS_DELEGATE_MAX)
+                return varlink_error(link, "io.systemd.NamespaceResource.TooManyControlGroups", NULL);
+
+        /* Registering a cgroup for this client is only allowed for the root or the owner of a userns */
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get connection peer: %m");
+        if (peer_uid != 0) {
+                if (peer_uid != userns_info->owner)
+                        return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+                /* The cgroup must be owned by the owner of the userns */
+                if (cgroup_st.st_uid != userns_info->owner)
+                        return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+        }
+
+        r = userns_info_add_cgroup(userns_info, cgroup_id);
+        if (r < 0)
+                return r;
+
+        r = userns_registry_store(registry_dir_fd, userns_info);
+        if (r < 0)
+                return r;
+
+        /* Now hand over the cgroup directory itself to the first UID/GID of the namespace's range. */
+        if (fchown(cgroup_fd, userns_info->start, userns_info->start) < 0)
+                return log_debug_errno(errno, "Failed to change ownership of cgroup: %m");
+
+        if (fchmod(cgroup_fd, 0755) < 0)
+                return log_debug_errno(errno, "Failed to change access mode of cgroup: %m");
+
+        /* Same for these attribute files. This is best-effort: failures are deliberately ignored. */
+        FOREACH_STRING(attr, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") {
+                (void) fchmodat(cgroup_fd, attr, 0644, AT_SYMLINK_NOFOLLOW);
+                (void) fchownat(cgroup_fd, attr, userns_info->start, userns_info->start, AT_SYMLINK_NOFOLLOW);
+        }
+
+        log_debug("Granting ownership to cgroup %" PRIu64 " to userns " INO_FMT " ('%s' @ UID " UID_FMT ")",
+                  cgroup_id, userns_st.st_ino, userns_info->name, userns_info->start);
+
+        return varlink_replyb(link, JSON_BUILD_EMPTY_OBJECT);
+}
+
+/* Derives a 64bit identifier from the name of the user namespace and the (optional) interface name, using
+ * siphash24 with a fixed key. A NULL ifname is hashed like an empty string. */
+static uint64_t hash_ifname_id(UserNamespaceInfo *userns_info, const char *ifname) {
+        static const uint8_t hash_key[] = {
+                0xc4, 0x6c, 0x96, 0xe8, 0xad, 0x37, 0x4d, 0x5f,
+                0xa1, 0xae, 0xfe, 0x70, 0x40, 0xed, 0x41, 0x5f,
+        };
+        struct siphash ctx;
+
+        assert(userns_info);
+
+        siphash24_init(&ctx, hash_key);
+
+        /* Input is: <userns name> NUL <interface name>. The NUL byte keeps the two fields apart. */
+        siphash24_compress_string(userns_info->name, &ctx);
+        siphash24_compress_byte(0, &ctx);
+        siphash24_compress_string(strempty(ifname), &ctx);
+
+        return siphash24_finalize(&ctx);
+}
+
+/* Generates a stable MAC address from the user namespace name, the (optional) interface name and an index
+ * 'n', and writes it to *ret. The address is taken from the first bytes of a siphash24 digest (fixed key)
+ * and then passed through ether_addr_mark_random(). */
+static void hash_ether_addr(UserNamespaceInfo *userns_info, const char *ifname, uint64_t n, struct ether_addr *ret) {
+        static const uint8_t hash_key[] = {
+                0x36, 0xaa, 0xd1, 0x69, 0xc7, 0xe5, 0x4c, 0xaa,
+                0x1e, 0xb2, 0x9e, 0xb3, 0x3a, 0x6b, 0xd4, 0x71,
+        };
+        struct siphash ctx;
+        uint64_t index_le, digest;
+
+        assert(userns_info);
+        assert(ret);
+
+        /* Feed the index in little-endian byte order, so that the result does not depend on host endianness */
+        index_le = htole64(n);
+
+        siphash24_init(&ctx, hash_key);
+
+        /* Input is: <userns name> NUL <interface name> NUL <index> */
+        siphash24_compress_string(userns_info->name, &ctx);
+        siphash24_compress_byte(0, &ctx);
+        siphash24_compress_string(strempty(ifname), &ctx);
+        siphash24_compress_byte(0, &ctx);
+        siphash24_compress(&index_le, sizeof(index_le), &ctx);
+
+        /* Same for the digest: fix the byte order before we slice bytes off it */
+        digest = htole64(siphash24_finalize(&ctx));
+
+        assert(sizeof(digest) >= sizeof_field(struct ether_addr, ether_addr_octet));
+
+        memcpy(ret->ether_addr_octet, &digest, sizeof_field(struct ether_addr, ether_addr_octet));
+        ether_addr_mark_random(ret);
+}
+
+/* Creates a veth pair via rtnetlink: one end is created under the name 'ifname_host' with MAC address
+ * 'mac_host', the peer end is created as 'ifname_namespace' with 'mac_namespace' and carries 'netns_fd' in
+ * its IFLA_NET_NS_FD attribute. If 'altifname_host' is non-NULL it is afterwards set as alternative name of
+ * the host side interface; failure to do so is only warned about.
+ *
+ * Returns 0 on success, a negative errno otherwise. */
+static int create_veth(
+                int netns_fd,
+                const char *ifname_host,
+                const char *altifname_host,
+                struct ether_addr *mac_host,
+                const char *ifname_namespace,
+                struct ether_addr *mac_namespace) {
+
+        int r;
+
+        assert(netns_fd >= 0);
+        assert(ifname_host);
+        assert(mac_host);
+        assert(ifname_namespace);
+        assert(mac_namespace);
+
+        log_debug("Creating veth link on host %s (%s) with address %s to container as %s with address %s",
+                  ifname_host, strna(altifname_host), ETHER_ADDR_TO_STR(mac_host),
+                  ifname_namespace, ETHER_ADDR_TO_STR(mac_namespace));
+
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        r = sd_netlink_open(&rtnl);
+        if (r < 0)
+                return log_error_errno(r, "Failed to connect to netlink: %m");
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate netlink message: %m");
+
+        /* Attributes of the host side of the pair */
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_host);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_host);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink MAC address: %m");
+
+        /* Three nested containers: IFLA_LINKINFO → IFLA_INFO_DATA ("veth") → VETH_INFO_PEER. The peer's
+         * attributes go into the innermost one. */
+        r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open netlink container: %m");
+
+        r = sd_netlink_message_append_string(m, IFLA_IFNAME, ifname_namespace);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink interface name: %m");
+
+        r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, mac_namespace);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink MAC address: %m");
+
+        r = sd_netlink_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add netlink namespace field: %m");
+
+        /* Close the three containers opened above again, innermost first */
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_message_close_container(m);
+        if (r < 0)
+                return log_error_errno(r, "Failed to close netlink container: %m");
+
+        r = sd_netlink_call(rtnl, m, 0, NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add new veth interfaces (%s:%s): %m", ifname_host, ifname_namespace);
+
+        /* The alternative name is optional and purely informational, hence never fail because of it */
+        if (altifname_host) {
+                r = rtnl_set_link_alternative_names_by_ifname(&rtnl, ifname_host, STRV_MAKE(altifname_host));
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set alternative interface name to '%s', ignoring: %m", altifname_host);
+        }
+
+        return 0;
+}
+
+/* Verifies a network namespace fd received from a Varlink client: it must carry safe flags, must really be
+ * a network namespace, must not be our own one, must be owned by the user namespace 'userns_fd' refers to,
+ * and (unless the client is root) that user namespace must be owned by the client's UID.
+ *
+ * Returns 0 if all is good, a negative errno on failure, or the result of
+ * varlink_error_invalid_parameter_name() if one of the checks refused the fd. */
+static int validate_netns(Varlink *link, int userns_fd, int netns_fd) {
+        _cleanup_close_ int owner_userns_fd = -EBADF;
+        uid_t peer_uid, owner_uid;
+        int r;
+
+        assert(link);
+        assert(userns_fd >= 0);
+        assert(netns_fd >= 0);
+
+        r = fd_verify_safe_flags(netns_fd);
+        if (r < 0)
+                return log_debug_errno(r, "Network namespace file descriptor has unsafe flags set: %m");
+
+        /* Is this a network namespace fd at all? */
+        r = fd_is_ns(netns_fd, CLONE_NEWNET);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        /* Our own network namespace is off limits */
+        r = is_our_namespace(netns_fd, NAMESPACE_NET);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        /* Look up the user namespace owning the netns, and compare it with the one we have been passed */
+        owner_userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+        if (owner_userns_fd < 0)
+                return -errno;
+
+        r = inode_same_at(owner_userns_fd, /* path_a= */ NULL, userns_fd, /* path_b= */ NULL, AT_EMPTY_PATH);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+
+        /* No ownership check for root clients */
+        if (peer_uid == 0)
+                return 0;
+
+        /* Everybody else must be the owner of the user namespace that owns the netns */
+        if (ioctl(owner_userns_fd, NS_GET_OWNER_UID, &owner_uid) < 0)
+                return -errno;
+
+        if (owner_uid != peer_uid)
+                return varlink_error_invalid_parameter_name(link, "networkNamespaceFileDescriptor");
+
+        return 0;
+}
+
+typedef struct AddNetworkParameters { /* parsed parameters of the AddNetworkToUserNamespace method */
+        unsigned userns_fd_idx; /* "userNamespaceFileDescriptor": index handed to varlink_take_fd() to pick up the userns fd */
+        unsigned netns_fd_idx; /* "networkNamespaceFileDescriptor": index handed to varlink_take_fd() to pick up the netns fd */
+        const char *ifname; /* "namespaceInterfaceName": optional, NULL if the client did not specify it */
+        const char *mode; /* "mode": mandatory; the method only accepts "veth" */
+} AddNetworkParameters;
+
+/* Implements io.systemd.NamespaceResource.AddNetworkToUserNamespace.
+ *
+ * Takes a user namespace fd and a network namespace fd from the Varlink connection, validates both (the
+ * netns must belong to the userns, see validate_netns()), and then creates a veth pair via create_veth().
+ * The host side name is "ns-" followed by a hash of the userns name and the requested interface name,
+ * truncated to fit IFNAMSIZ. The namespace side is named as requested by the client, or "host0" if no name
+ * was specified. MAC addresses of both sides are derived by hashing, too.
+ *
+ * Only "veth" is accepted as mode. Only root or the registered owner of the user namespace may call this.
+ *
+ * On success the reply carries "hostInterfaceName" and "namespaceInterfaceName". Returns the result of
+ * sending the reply or error to the client, or a negative errno on internal failure. */
+static int vl_method_add_netif_to_user_namespace(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
+        static const JsonDispatch parameter_dispatch_table[] = {
+                { "userNamespaceFileDescriptor",    _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AddNetworkParameters, userns_fd_idx), JSON_MANDATORY },
+                { "networkNamespaceFileDescriptor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint,         offsetof(AddNetworkParameters, netns_fd_idx),  JSON_MANDATORY },
+                { "namespaceInterfaceName",         JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(AddNetworkParameters, ifname),        0              },
+                { "mode",                           JSON_VARIANT_STRING,        json_dispatch_const_string, offsetof(AddNetworkParameters, mode),          JSON_MANDATORY },
+                {}
+        };
+
+        _cleanup_close_ int userns_fd = -EBADF, netns_fd = -EBADF, registry_dir_fd = -EBADF;
+        AddNetworkParameters p = {
+                .userns_fd_idx = UINT_MAX,
+                .netns_fd_idx = UINT_MAX,
+        };
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        struct stat userns_st;
+        uid_t peer_uid;
+        int r;
+
+        assert(link);
+        assert(parameters);
+
+        r = test_userns_api_support(link);
+        if (r != 0)
+                return r;
+
+        r = varlink_dispatch(link, parameters, parameter_dispatch_table, &p);
+        if (r != 0)
+                return r;
+
+        /* First fd: the user namespace. We need its inode number as key into the registry. */
+        userns_fd = varlink_take_fd(link, p.userns_fd_idx);
+        if (userns_fd < 0)
+                return userns_fd;
+
+        r = validate_userns(link, userns_fd);
+        if (r != 0)
+                return r;
+
+        if (fstat(userns_fd, &userns_st) < 0)
+                return -errno;
+
+        /* Second fd: the network namespace the peer side of the veth shall be placed in. */
+        netns_fd = varlink_take_fd(link, p.netns_fd_idx);
+        if (netns_fd < 0)
+                return netns_fd;
+
+        r = validate_netns(link, userns_fd, netns_fd);
+        if (r != 0)
+                return r;
+
+        if (!streq_ptr(p.mode, "veth"))
+                return varlink_error_invalid_parameter_name(link, "mode");
+
+        /* Report the parameter under the name the client actually used for it, see dispatch table above */
+        if (p.ifname && !ifname_valid(p.ifname))
+                return varlink_error_invalid_parameter_name(link, "namespaceInterfaceName");
+
+        registry_dir_fd = userns_registry_open_fd();
+        if (registry_dir_fd < 0)
+                return registry_dir_fd;
+
+        _cleanup_close_ int lock_fd = -EBADF;
+        lock_fd = userns_registry_lock(registry_dir_fd);
+        if (lock_fd < 0)
+                return log_debug_errno(lock_fd, "Failed to open nsresource registry lock file: %m");
+
+        r = userns_registry_load_by_userns_inode(
+                        registry_dir_fd,
+                        userns_st.st_ino,
+                        &userns_info);
+        if (r == -ENOENT)
+                return varlink_error(link, "io.systemd.NamespaceResource.UserNamespaceNotRegistered", NULL);
+        if (r < 0)
+                return r;
+
+        /* Registering a network interface for this client is only allowed for the root or the owner of a userns */
+        r = varlink_get_peer_uid(link, &peer_uid);
+        if (r < 0)
+                return r;
+        if (peer_uid != 0 && peer_uid != userns_info->owner)
+                return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, NULL);
+
+        _cleanup_free_ char *ifname_host = NULL, *altifname_host = NULL;
+        const char *ifname_namespace = p.ifname ?: "host0";
+
+        /* The short ifname is just too short to generate readable and unique names where unprivileged users
+         * can't take each others names. Hence just hash it. The alternative name however contains more useful
+         * information. */
+        if (asprintf(&ifname_host, "ns-%08" PRIx64, hash_ifname_id(userns_info, p.ifname)) < 0)
+                return -ENOMEM;
+        strshorten(ifname_host, IFNAMSIZ-1);
+
+        if (p.ifname)
+                r = asprintf(&altifname_host, "ns-" UID_FMT "-%s-%s", userns_info->owner, userns_info->name, p.ifname);
+        else
+                r = asprintf(&altifname_host, "ns-" UID_FMT "-%s", userns_info->owner, userns_info->name);
+        if (r < 0)
+                return -ENOMEM;
+
+        struct ether_addr ether_addr_host, ether_addr_namespace;
+
+        /* Index 0 is the host side, index 1 the namespace side, so that the two ends get distinct addresses */
+        hash_ether_addr(userns_info, p.ifname, 0, &ether_addr_host);
+        hash_ether_addr(userns_info, p.ifname, 1, &ether_addr_namespace);
+
+        r = create_veth(netns_fd,
+                        ifname_host, altifname_host, &ether_addr_host,
+                        ifname_namespace, &ether_addr_namespace);
+        if (r < 0)
+                return r;
+
+        log_debug("Adding veth tunnel %s from host to userns " INO_FMT " ('%s' @ UID " UID_FMT ", interface %s).",
+                  ifname_host, userns_st.st_ino, userns_info->name, userns_info->start, ifname_namespace);
+
+        return varlink_replyb(link, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hostInterfaceName", JSON_BUILD_STRING(ifname_host)),
+                                                      JSON_BUILD_PAIR("namespaceInterfaceName", JSON_BUILD_STRING(ifname_namespace))));
+}
+
+/* Serves a single client connection: wraps the accepted socket '_fd' into a Varlink connection object on
+ * 'server', enables fd passing in both directions, and then processes requests until the peer disconnects
+ * or nothing happened for CONNECTION_IDLE_USEC. The fd is consumed in all cases. Returns 0 on orderly
+ * termination, a negative errno otherwise. */
+static int process_connection(VarlinkServer *server, int _fd) {
+        _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
+        _cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
+        int r;
+
+        r = varlink_server_add_connection(server, fd, &vl);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add connection: %m");
+
+        /* From here on the fd is owned by the connection object, and we hold a reference of our own on the
+         * latter which the cleanup handler drops again. */
+        TAKE_FD(fd);
+        vl = varlink_ref(vl);
+
+        r = varlink_set_allow_fd_passing_input(vl, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable fd passing for read: %m");
+
+        r = varlink_set_allow_fd_passing_output(vl, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enable fd passing for write: %m");
+
+        while (true) {
+                r = varlink_process(vl);
+                if (r == -ENOTCONN) {
+                        log_debug("Connection terminated.");
+                        return 0;
+                }
+                if (r < 0)
+                        return log_error_errno(r, "Failed to process connection: %m");
+
+                /* Only go to sleep if the last processing step did not do anything */
+                if (r == 0) {
+                        r = varlink_wait(vl, CONNECTION_IDLE_USEC);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to wait for connection events: %m");
+                        if (r == 0) /* idle timeout hit */
+                                return 0;
+                }
+        }
+}
+
+/* Main routine of a worker process. Picks up exactly one listening socket via sd_listen_fds(), sets up a
+ * Varlink server implementing the NamespaceResource and UserDatabase interfaces, and then accepts and
+ * serves connections one at a time. The loop ends once ITERATIONS_MAX or RUNTIME_MAX_USEC is exceeded, or
+ * when the worker has been idle for LISTEN_IDLE_USEC (the latter can be turned off by setting
+ * $NSRESOURCE_FIXED_WORKER). If connections keep queuing up, the parent process is sent SIGUSR2. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *bpf = NULL;
+        _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+        _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
+        usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
+        unsigned n_iterations = 0;
+        int n_fds, listen_fd, r;
+
+        log_setup();
+
+        n_fds = sd_listen_fds(false);
+        if (n_fds < 0)
+                return log_error_errno(n_fds, "Failed to determine number of listening fds: %m");
+        if (n_fds == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
+        if (n_fds > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
+
+        listen_fd = SD_LISTEN_FDS_START;
+
+        /* We want accept4() below to block */
+        r = fd_nonblock(listen_fd, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
+
+        r = varlink_server_new(&server, VARLINK_SERVER_INHERIT_USERDATA);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate server: %m");
+
+        r = varlink_server_add_interface_many(
+                        server,
+                        &vl_interface_io_systemd_NamespaceResource,
+                        &vl_interface_io_systemd_UserDatabase);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add UserDatabase and NamespaceResource interface to varlink server: %m");
+
+        r = varlink_server_bind_method_many(
+                        server,
+                        "io.systemd.NamespaceResource.AllocateUserRange",              vl_method_allocate_user_range,
+                        "io.systemd.NamespaceResource.RegisterUserNamespace",          vl_method_register_user_namespace,
+                        "io.systemd.NamespaceResource.AddMountToUserNamespace",        vl_method_add_mount_to_user_namespace,
+                        "io.systemd.NamespaceResource.AddControlGroupToUserNamespace", vl_method_add_cgroup_to_user_namespace,
+                        "io.systemd.NamespaceResource.AddNetworkToUserNamespace",      vl_method_add_netif_to_user_namespace,
+                        "io.systemd.UserDatabase.GetUserRecord",                       vl_method_get_user_record,
+                        "io.systemd.UserDatabase.GetGroupRecord",                      vl_method_get_group_record,
+                        "io.systemd.UserDatabase.GetMemberships",                      vl_method_get_memberships);
+        if (r < 0)
+                return log_error_errno(r, "Failed to bind methods: %m");
+
+        /* Method handlers get a pointer to our (initially unset) BPF object pointer as userdata */
+        varlink_server_set_userdata(server, &bpf);
+
+        r = getenv_bool("NSRESOURCE_FIXED_WORKER");
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse NSRESOURCE_FIXED_WORKER: %m");
+        if (r)
+                listen_idle_usec = USEC_INFINITY; /* fixed workers never exit due to idleness */
+        else
+                listen_idle_usec = LISTEN_IDLE_USEC;
+
+        r = pidref_set_parent(&parent);
+        if (r < 0)
+                return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
+
+        start_time = now(CLOCK_MONOTONIC);
+
+        for (;;) {
+                _cleanup_close_ int fd = -EBADF;
+                usec_t t;
+
+                /* Limit 1: number of loop iterations. Exiting in regular intervals flushes out all memory use. */
+                if (n_iterations++ > ITERATIONS_MAX) {
+                        log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
+                        break;
+                }
+
+                /* Limit 2: total runtime */
+                t = now(CLOCK_MONOTONIC);
+                if (t >= usec_add(start_time, RUNTIME_MAX_USEC)) {
+                        log_debug("Exiting worker, ran for %s, that's enough.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(t, start_time), 0));
+                        break;
+                }
+
+                /* Limit 3: idle time. The idle clock starts ticking on the first iteration after we have
+                 * been busy, which is marked by last_busy_usec being unset. */
+                if (last_busy_usec == USEC_INFINITY)
+                        last_busy_usec = t;
+                else if (listen_idle_usec != USEC_INFINITY && t >= usec_add(last_busy_usec, listen_idle_usec)) {
+                        log_debug("Exiting worker, been idle for %s.",
+                                  FORMAT_TIMESPAN(usec_sub_unsigned(t, last_busy_usec), 0));
+                        break;
+                }
+
+                (void) rename_process("systemd-nsresourcework: waiting...");
+                fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
+                (void) rename_process("systemd-nsresourcework: processing...");
+
+                /* -EAGAIN: the listening socket has SO_RECVTIMEO set, hence a timeout is expected after a
+                 * while, let's check if it's time to exit though. -EINTR: might be that somebody attached
+                 * via strace, let's just continue in that case. */
+                if (fd == -EAGAIN || fd == -EINTR)
+                        continue;
+                if (fd < 0)
+                        return log_error_errno(fd, "Failed to accept() from listening socket: %m");
+
+                if (now(CLOCK_MONOTONIC) <= usec_add(t, PRESSURE_SLEEP_TIME_USEC)) {
+                        /* We got a connection almost right-away. If there's even more queued on the
+                         * socket already, tell our parent about it via SIGUSR2, so that it can spawn
+                         * more workers. */
+
+                        r = fd_wait_for_event(listen_fd, POLLIN, 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
+
+                        if (FLAGS_SET(r, POLLIN)) {
+                                r = pidref_kill(&parent, SIGUSR2);
+                                if (r == -ESRCH)
+                                        return log_error_errno(r, "Parent already died?");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to send SIGUSR2 signal to parent. %m");
+                        }
+                }
+
+                /* Failures of individual connections are logged by the callee, they don't stop the worker */
+                (void) process_connection(server, TAKE_FD(fd));
+                last_busy_usec = USEC_INFINITY;
+        }
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/test-userns-restrict.c b/src/nsresourced/test-userns-restrict.c
new file mode 100644 (file)
index 0000000..7ef1d7b
--- /dev/null
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/eventfd.h>
+
+#include "fd-util.h"
+#include "main-func.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "namespace-util.h"
+#include "process-util.h"
+#include "rm-rf.h"
+#include "tmpfile-util.h"
+#include "userns-restrict.h"
+
+/* Creates a fresh tmpfs instance via fsopen()/fsconfig()/fsmount() and returns the resulting detached
+ * mount fd, which the caller owns. Aborts via assert_se() on any failure. */
+static int make_tmpfs_fsmount(void) {
+        _cleanup_close_ int fs_context_fd = -EBADF, mount_fd = -EBADF;
+
+        /* Step 1: new file system context for tmpfs, then instantiate the superblock */
+        fs_context_fd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+        assert_se(fs_context_fd >= 0);
+        assert_se(fsconfig(fs_context_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) >= 0);
+
+        /* Step 2: turn it into a mount fd */
+        mount_fd = fsmount(fs_context_fd, FSMOUNT_CLOEXEC, 0);
+        assert_se(mount_fd >= 0);
+
+        return TAKE_FD(mount_fd);
+}
+
+/* Asserts that creating (or opening) the regular file 'fname' below 'parent_fd' succeeds. */
+static void test_works_reg(int parent_fd, const char *fname) {
+        _cleanup_close_ int file_fd = openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666);
+
+        assert_se(file_fd >= 0);
+}
+
+/* Asserts that creating the regular file 'fname' below 'parent_fd' is refused with EPERM. */
+static void test_fails_reg(int parent_fd, const char *fname) {
+        int file_fd;
+
+        errno = 0;
+        file_fd = openat(parent_fd, fname, O_RDWR|O_CREAT|O_CLOEXEC, 0666);
+
+        assert_se(file_fd < 0);
+        assert_se(errno == EPERM);
+}
+
+/* Asserts that creating the directory 'fname' below 'parent_fd' succeeds. */
+static void test_works_dir(int parent_fd, const char *fname) {
+        int k = mkdirat(parent_fd, fname, 0666);
+
+        assert_se(k >= 0);
+}
+
+/* Asserts that creating the directory 'fname' below 'parent_fd' is refused with EPERM. */
+static void test_fails_dir(int parent_fd, const char *fname) {
+        int k;
+
+        errno = 0;
+        k = mkdirat(parent_fd, fname, 0666);
+
+        assert_se(k < 0);
+        assert_se(errno == EPERM);
+}
+
+/* Test for the userns-restrict BPF-LSM logic.
+ *
+ * Installs the BPF program (unpinned), allocates a user namespace and registers it with an empty mount
+ * allowlist. A child process then joins that user namespace and checks in three phases which mounts it may
+ * create files and directories on, while the parent extends the allowlist in between:
+ *
+ *   1. nothing allowlisted:      only the tmpfs the child mounted itself is writable
+ *   2. host tmpfs allowlisted:   host tmpfs and private tmpfs are writable, the host temp dir is not
+ *   3. host temp dir added too:  everything is writable
+ *
+ * Parent and child synchronize through two eventfds: the child signals 'afd' when it finished a phase, the
+ * parent signals 'bfd' once it has updated the allowlist.
+ *
+ * Returns EXIT_TEST_SKIP if BPF-LSM is unsupported or privileges are lacking. */
+static int run(int argc, char *argv[]) {
+        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+        _cleanup_close_ int userns_fd = -EBADF, host_fd1 = -EBADF, host_tmpfs = -EBADF, afd = -EBADF, bfd = -EBADF;
+        _cleanup_(rm_rf_physical_and_freep) char *t = NULL;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        int r;
+
+        log_set_max_level(LOG_DEBUG);
+        log_open();
+
+        r = userns_restrict_install(/* pin= */ false, &obj);
+        if (ERRNO_IS_NOT_SUPPORTED(r)) {
+                log_notice("Skipping test, LSM-BPF logic not supported.");
+                return EXIT_TEST_SKIP;
+        }
+        if (ERRNO_IS_PRIVILEGE(r)) {
+                log_notice("Skipping test, lacking privileges.");
+                return EXIT_TEST_SKIP;
+        }
+        if (r < 0)
+                return r;
+
+        /* Two host-side locations to test against: a plain temporary directory, and a separate tmpfs mount */
+        assert_se(mkdtemp_malloc(NULL, &t) >= 0);
+
+        host_fd1 = open(t, O_DIRECTORY|O_CLOEXEC);
+        assert_se(host_fd1 >= 0);
+
+        host_tmpfs = make_tmpfs_fsmount();
+        assert_se(host_tmpfs >= 0);
+
+        userns_fd = userns_acquire("0 0 1", "0 0 1");
+        if (userns_fd < 0)
+                return log_error_errno(userns_fd, "Failed to make user namespace: %m");
+
+        /* Register the user namespace with an empty allowlist */
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ true,
+                        /* mount_fds= */ NULL,
+                        /* n_mount_fds= */ 0);
+        if (r < 0)
+                return log_error_errno(r, "Failed to restrict user namespace: %m");
+
+        afd = eventfd(0, EFD_CLOEXEC);
+        bfd = eventfd(0, EFD_CLOEXEC);
+
+        assert_se(afd >= 0 && bfd >= 0);
+
+        r = safe_fork("(test)", FORK_DEATHSIG_SIGKILL, &pid);
+        assert_se(r >= 0);
+        if (r == 0) {
+                _cleanup_close_ int private_tmpfs = -EBADF;
+
+                assert_se(setns(userns_fd, CLONE_NEWUSER) >= 0);
+                assert_se(unshare(CLONE_NEWNS) >= 0);
+
+                /* Allocate tmpfs locally */
+                private_tmpfs = make_tmpfs_fsmount();
+
+                /* These two host mounts should be inaccessible */
+                test_fails_reg(host_fd1, "test");
+                test_fails_reg(host_tmpfs, "xxx");
+                test_fails_dir(host_fd1, "test2");
+                test_fails_dir(host_tmpfs, "xxx2");
+
+                /* But this mount created locally should be fine */
+                test_works_reg(private_tmpfs, "yyy");
+                test_works_dir(private_tmpfs, "yyy2");
+
+                /* Let's sync with the parent, so that it allowlists more stuff for us */
+                assert_se(eventfd_write(afd, 1) >= 0);
+                uint64_t x;
+                assert_se(eventfd_read(bfd, &x) >= 0);
+
+                /* And now we should also have access to the host tmpfs */
+                test_works_reg(host_tmpfs, "zzz");
+                test_works_reg(private_tmpfs, "aaa");
+                test_works_dir(host_tmpfs, "zzz2");
+                test_works_dir(private_tmpfs, "aaa2");
+
+                /* But this one should still fail */
+                test_fails_reg(host_fd1, "bbb");
+                test_fails_dir(host_fd1, "bbb2");
+
+                /* Sync again, to get more stuff allowlisted */
+                assert_se(eventfd_write(afd, 1) >= 0);
+                assert_se(eventfd_read(bfd, &x) >= 0);
+
+                /* Everything should now be allowed, for regular files and directories alike */
+                test_works_reg(host_tmpfs, "ccc");
+                test_works_reg(host_fd1, "ddd");
+                test_works_reg(private_tmpfs, "eee");
+                test_works_dir(host_tmpfs, "ccc2");
+                test_works_dir(host_fd1, "ddd2");
+                test_works_dir(private_tmpfs, "eee2");
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        /* Wait until the child finished phase 1, then allowlist the host tmpfs */
+        uint64_t x;
+        assert_se(eventfd_read(afd, &x) >= 0);
+
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ false,
+                        &host_tmpfs,
+                        1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to loosen user namespace: %m");
+
+        assert_se(eventfd_write(bfd, 1) >= 0);
+
+        /* Wait until the child finished phase 2, then allowlist the mount of the host temp dir as well */
+        assert_se(eventfd_read(afd, &x) >= 0);
+
+        r = userns_restrict_put_by_fd(
+                        obj,
+                        userns_fd,
+                        /* replace= */ false,
+                        &host_fd1,
+                        1);
+        if (r < 0)
+                return log_error_errno(r, "Failed to loosen user namespace: %m");
+
+        assert_se(eventfd_write(bfd, 1) >= 0);
+
+        assert_se(wait_for_terminate_and_check("(test)", pid, WAIT_LOG) >= 0);
+
+        return 0;
+}
+
+DEFINE_MAIN_FUNCTION(run);
diff --git a/src/nsresourced/userns-registry.c b/src/nsresourced/userns-registry.c
new file mode 100644 (file)
index 0000000..2cc1b1f
--- /dev/null
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "chase.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "json.h"
+#include "missing_magic.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "user-util.h"
+#include "userns-registry.h"
+
+/* Opens the nsresource registry directory (passing CHASE_MKDIR_0755 and O_CREAT to chase_and_open()).
+ * Returns a directory fd on success, or the negative error from chase_and_open() after logging it at
+ * debug level. */
+int userns_registry_open_fd(void) {
+        int registry_fd = chase_and_open(
+                        "/run/systemd/nsresource/registry",
+                        /* root= */ NULL,
+                        CHASE_MKDIR_0755,
+                        O_CLOEXEC|O_DIRECTORY|O_CREAT,
+                        /* ret_path= */ NULL);
+
+        if (registry_fd >= 0)
+                return registry_fd;
+
+        return log_debug_errno(registry_fd, "Failed to open registry dir: %m");
+}
+
+/* Takes an exclusive BSD lock on the "lock" file inside the registry directory. If dir_fd is negative
+ * the registry directory is opened temporarily for this call. Returns the lock fd (the caller closes
+ * it to release the lock), or a negative errno-style error. */
+int userns_registry_lock(int dir_fd) {
+        _cleanup_close_ int opened_fd = -EBADF;
+        int fd;
+
+        if (dir_fd < 0) {
+                /* Caller has no registry fd at hand, open one that lives until we return */
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        fd = xopenat_lock_full(dir_fd, "lock", O_CREAT|O_RDWR|O_CLOEXEC, /* xopen_flags= */ 0, 0600, LOCK_BSD, LOCK_EX);
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to open nsresource registry lock file: %m");
+
+        return fd;
+}
+
+/* Allocates a UserNamespaceInfo with all three UID fields set to UID_INVALID and everything else
+ * zeroed. Returns NULL if the allocation fails. */
+UserNamespaceInfo* userns_info_new(void) {
+        UserNamespaceInfo *userns;
+
+        userns = new(UserNamespaceInfo, 1);
+        if (userns)
+                *userns = (UserNamespaceInfo) {
+                        .owner = UID_INVALID,
+                        .start = UID_INVALID,
+                        .target = UID_INVALID,
+                };
+
+        return userns;
+}
+
+/* Releases a UserNamespaceInfo together with its name string and cgroup array. Accepts NULL.
+ * Always returns NULL, so callers can write "p = userns_info_free(p);". */
+UserNamespaceInfo *userns_info_free(UserNamespaceInfo *userns) {
+        if (userns) {
+                free(userns->name);
+                free(userns->cgroups);
+                free(userns);
+        }
+
+        return NULL;
+}
+
+/* JSON dispatch callback for the "cgroups" field: converts a JSON array of unsigned numbers into the
+ * cgroups[]/n_cgroups members of the UserNamespaceInfo passed as userdata, dropping duplicates. A JSON
+ * null resets the list. Returns 0 on success, negative errno-style error otherwise. */
+static int dispatch_cgroups_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        UserNamespaceInfo *info = ASSERT_PTR(userdata);
+        _cleanup_free_ uint64_t *ids = NULL;
+        size_t n_ids = 0;
+        JsonVariant *item;
+
+        if (json_variant_is_null(variant)) {
+                info->cgroups = mfree(info->cgroups);
+                info->n_cgroups = 0;
+                return 0;
+        }
+
+        if (!json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        /* Size for the worst case, i.e. no duplicates at all */
+        ids = new(uint64_t, json_variant_elements(variant));
+        if (!ids)
+                return json_log_oom(variant, flags);
+
+        JSON_VARIANT_ARRAY_FOREACH(item, variant) {
+                size_t k;
+
+                if (!json_variant_is_unsigned(item))
+                        return json_log(item, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a number.");
+
+                /* Linear search through what we collected so far */
+                for (k = 0; k < n_ids; k++)
+                        if (ids[k] == json_variant_unsigned(item))
+                                break;
+
+                if (k < n_ids) /* already known, skip */
+                        continue;
+
+                ids[n_ids++] = json_variant_unsigned(item);
+        }
+
+        assert(n_ids <= json_variant_elements(variant));
+
+        free_and_replace(info->cgroups, ids);
+        info->n_cgroups = n_ids;
+
+        return 0;
+}
+
+/* Loads and validates a single registry record. Parses the JSON file 'fn' relative to dir_fd (the
+ * registry directory is opened temporarily if dir_fd is negative), dispatches its fields into a
+ * freshly allocated UserNamespaceInfo and sanity-checks the result.
+ *
+ * Returns 0 on success; if 'ret' is non-NULL the object is handed to the caller, otherwise it is
+ * freed again. Returns a negative errno-style error on failure, -EBADMSG in particular if the parsed
+ * record is internally inconsistent. */
+static int userns_registry_load(int dir_fd, const char *fn, UserNamespaceInfo **ret) {
+
+        /* "owner", "name" and "userns" are mandatory, the remaining fields are optional */
+        static const JsonDispatch dispatch_table[] = {
+                { "owner",   JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, owner),        JSON_MANDATORY },
+                { "name",    JSON_VARIANT_STRING,   json_dispatch_string,   offsetof(UserNamespaceInfo, name),         JSON_MANDATORY },
+                { "userns",  JSON_VARIANT_UNSIGNED, json_dispatch_uint64,   offsetof(UserNamespaceInfo, userns_inode), JSON_MANDATORY },
+                { "start",   JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, start),        0              },
+                { "size",    JSON_VARIANT_UNSIGNED, json_dispatch_uint32,   offsetof(UserNamespaceInfo, size),         0              },
+                { "target",  JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid,  offsetof(UserNamespaceInfo, target),       0              },
+                { "cgroups", JSON_VARIANT_ARRAY,    dispatch_cgroups_array, 0,                                         0              },
+                {}
+        };
+
+        _cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_close_ int registry_fd = -EBADF;
+        int r;
+
+        /* No directory fd supplied? Then open the registry ourselves, closed again on return. */
+        if (dir_fd < 0) {
+                registry_fd = userns_registry_open_fd();
+                if (registry_fd < 0)
+                        return registry_fd;
+
+                dir_fd = registry_fd;
+        }
+
+        r = json_parse_file_at(NULL, dir_fd, fn, 0, &v, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        /* Starts out with owner/start/target set to UID_INVALID, so absent optional fields stay invalid */
+        userns_info = userns_info_new();
+        if (!userns_info)
+                return -ENOMEM;
+
+        r = json_dispatch(v, dispatch_table, 0, userns_info);
+        if (r < 0)
+                return r;
+
+        /* An inode number of 0 is never accepted */
+        if (userns_info->userns_inode == 0)
+                return -EBADMSG;
+        /* A range starting at UID 0 is refused */
+        if (userns_info->start == 0)
+                return -EBADMSG;
+        if (userns_info->size == 0) {
+                /* No range assigned: then neither start nor target may be set */
+                if (uid_is_valid(userns_info->start) || uid_is_valid(userns_info->target))
+                        return -EBADMSG;
+        } else {
+                /* A range is assigned: both ends of the mapping must be set... */
+                if (!uid_is_valid(userns_info->start) || !uid_is_valid(userns_info->target))
+                        return -EBADMSG;
+
+                /* ...and start+size / target+size must not exceed the 32-bit UID space */
+                if (userns_info->size > UINT32_MAX - userns_info->start ||
+                    userns_info->size > UINT32_MAX - userns_info->target)
+                        return -EBADMSG;
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(userns_info);
+        return 0;
+}
+
+/* Checks whether a "u<UID>.userns" entry exists in the registry directory dir_fd. Returns true or
+ * false, -ENOENT for an invalid UID, or another negative errno on failure. UID 0 is always reported
+ * as existing without looking at the registry. */
+int userns_registry_uid_exists(int dir_fd, uid_t start) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        if (start == 0)
+                return true;
+
+        if (asprintf(&entry, "u" UID_FMT ".userns", start) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        /* Only "not found" counts as a clean negative answer */
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Checks whether an "n<name>.userns" entry exists in the registry directory dir_fd. Returns true or
+ * false, -EINVAL if the name does not pass userns_name_is_valid(), or another negative errno on
+ * failure. */
+int userns_registry_name_exists(int dir_fd, const char *name) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (!userns_name_is_valid(name))
+                return -EINVAL;
+
+        entry = strjoin("n", name, ".userns");
+        if (!entry)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        /* Only "not found" counts as a clean negative answer */
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Checks whether an "i<inode>.userns" entry exists in the registry directory dir_fd. Returns true or
+ * false, -EINVAL for inode 0, or another negative errno on failure. */
+int userns_registry_inode_exists(int dir_fd, uint64_t inode) {
+        _cleanup_free_ char *entry = NULL;
+
+        assert(dir_fd >= 0);
+
+        if (inode == 0) /* the type is unsigned, so zero is the only value to reject */
+                return -EINVAL;
+
+        if (asprintf(&entry, "i%" PRIu64 ".userns", inode) < 0)
+                return -ENOMEM;
+
+        if (faccessat(dir_fd, entry, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+                return true;
+
+        /* Only "not found" counts as a clean negative answer */
+        if (errno == ENOENT)
+                return false;
+
+        return -errno;
+}
+
+/* Looks up the registry record filed under the start UID of its range ("u<UID>.userns"). If dir_fd
+ * is negative the registry directory is opened temporarily. Returns 0 on success (handing the record
+ * out via 'ret' if that is non-NULL), -ENOENT for an invalid UID, -EBADMSG if the loaded record
+ * carries a different start UID, or another negative errno-style error. */
+int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        if (!uid_is_valid(start))
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        r = asprintf(&entry, "u" UID_FMT ".userns", start);
+        if (r < 0)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key we found it under */
+        if (loaded->start != start)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Looks up the registry record filed under the user namespace inode number ("i<inode>.userns"). If
+ * dir_fd is negative the registry directory is opened temporarily. Returns 0 on success (handing the
+ * record out via 'ret' if that is non-NULL), -ENOENT for inode 0, -EBADMSG if the loaded record
+ * carries a different inode, or another negative errno-style error. */
+int userns_registry_load_by_userns_inode(int dir_fd, uint64_t inode, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        if (inode == 0)
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        r = asprintf(&entry, "i%" PRIu64 ".userns", inode);
+        if (r < 0)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key we found it under */
+        if (loaded->userns_inode != inode)
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Looks up the registry record filed under the user namespace name ("n<name>.userns"). If dir_fd is
+ * negative the registry directory is opened temporarily. Returns 0 on success (handing the record
+ * out via 'ret' if that is non-NULL), -ENOENT for a name that fails userns_name_is_valid(), -EBADMSG
+ * if the loaded record carries a different name, or another negative errno-style error. */
+int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret) {
+        _cleanup_(userns_info_freep) UserNamespaceInfo *loaded = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *entry = NULL;
+        int r;
+
+        assert(name);
+
+        /* A name we would never accept cannot have been registered */
+        if (!userns_name_is_valid(name))
+                return -ENOENT;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        entry = strjoin("n", name, ".userns");
+        if (!entry)
+                return -ENOMEM;
+
+        r = userns_registry_load(dir_fd, entry, &loaded);
+        if (r < 0)
+                return r;
+
+        /* The record has to agree with the key we found it under */
+        if (!streq_ptr(loaded->name, name))
+                return -EBADMSG;
+
+        if (ret)
+                *ret = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Writes the record 'info' into the registry directory dir_fd (opened temporarily if negative).
+ *
+ * The JSON serialization is written to "i<inode>.userns" and then linked via linkat_replace() under
+ * up to three further names:
+ *
+ *   n<name>.userns                  → lookup by name
+ *   u<start>.userns                 → lookup by start UID (only if a start UID is set)
+ *   o<owner>.owns/i<inode>.userns   → per-owner subdirectory
+ *
+ * Returns 0 on success. On failure a negative errno-style error is returned, after attempting to
+ * remove the entries this call may have created. Returns -EINVAL if owner, name or inode are unset. */
+int userns_registry_store(int dir_fd, UserNamespaceInfo *info) {
+        _cleanup_close_ int registry_fd = -EBADF;
+        int r;
+
+        assert(info);
+
+        /* These three fields are required, refuse incomplete records */
+        if (!uid_is_valid(info->owner) ||
+            !info->name ||
+            info->userns_inode == 0)
+                return -EINVAL;
+
+        if (dir_fd < 0) {
+                registry_fd = userns_registry_open_fd();
+                if (registry_fd < 0)
+                        return registry_fd;
+
+                dir_fd = registry_fd;
+        }
+
+        /* Build the JSON array of cgroup IDs; it stays NULL if there are none */
+        _cleanup_(json_variant_unrefp) JsonVariant *cgroup_array = NULL;
+        FOREACH_ARRAY(cg, info->cgroups, info->n_cgroups) {
+                r = json_variant_append_arrayb(
+                                &cgroup_array,
+                                JSON_BUILD_UNSIGNED(*cg));
+                if (r < 0)
+                        return r;
+        }
+
+        /* "start", "size" and "target" are all keyed on the validity of info->start; "cgroups" is only
+         * emitted if the array built above is non-empty */
+        _cleanup_(json_variant_unrefp) JsonVariant *def = NULL;
+        r = json_build(&def, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("owner", JSON_BUILD_UNSIGNED(info->owner)),
+                                       JSON_BUILD_PAIR("name", JSON_BUILD_STRING(info->name)),
+                                       JSON_BUILD_PAIR("userns", JSON_BUILD_UNSIGNED(info->userns_inode)),
+                                       JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "start", JSON_BUILD_UNSIGNED(info->start)),
+                                       JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "size", JSON_BUILD_UNSIGNED(info->size)),
+                                       JSON_BUILD_PAIR_CONDITION(uid_is_valid(info->start), "target", JSON_BUILD_UNSIGNED(info->target)),
+                                       JSON_BUILD_PAIR_CONDITION(cgroup_array, "cgroups", JSON_BUILD_VARIANT(cgroup_array))));
+        if (r < 0)
+                return r;
+
+        _cleanup_free_ char *def_buf = NULL;
+        r = json_variant_format(def, 0, &def_buf);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to format userns JSON object: %m");
+
+        /* All file names are declared here so that the rollback code at 'fail' can see which of them
+         * have been generated so far (the others are still NULL). */
+        _cleanup_free_ char *reg_fn = NULL, *link1_fn = NULL, *link2_fn = NULL, *owner_fn = NULL, *uid_fn = NULL;
+        if (asprintf(&reg_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
+                return log_oom_debug();
+
+        /* Primary record, keyed by userns inode */
+        r = write_string_file_at(dir_fd, reg_fn, def_buf, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write userns data to '%s' in registry: %m", reg_fn);
+
+        /* Alias keyed by name */
+        link1_fn = strjoin("n", info->name, ".userns");
+        if (!link1_fn) {
+                r = log_oom_debug();
+                goto fail;
+        }
+
+        r = linkat_replace(dir_fd, reg_fn, dir_fd, link1_fn);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link1_fn);
+                goto fail;
+        }
+
+        /* Alias keyed by start UID, only if a range is assigned */
+        if (uid_is_valid(info->start)) {
+                if (asprintf(&link2_fn, "u" UID_FMT ".userns", info->start) < 0) {
+                        r = log_oom_debug();
+                        goto fail;
+                }
+
+                r = linkat_replace(dir_fd, reg_fn, dir_fd, link2_fn);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", link2_fn);
+                        goto fail;
+                }
+        }
+
+        /* Per-owner subdirectory, which may already exist from an earlier registration */
+        if (asprintf(&uid_fn, "o" UID_FMT ".owns", info->owner) < 0) {
+                r = log_oom_debug();
+                goto fail;
+        }
+
+        if (mkdirat(dir_fd, uid_fn, 0755) < 0 && errno != EEXIST) {
+                r = log_debug_errno(errno, "Failed to create per-UID subdir '%s' of registry: %m", uid_fn);
+                goto fail;
+        }
+
+        if (asprintf(&owner_fn, "%s/i%" PRIu64 ".userns", uid_fn, info->userns_inode) < 0) {
+                r = log_oom_debug();
+                goto fail;
+        }
+
+        r = linkat_replace(dir_fd, reg_fn, dir_fd, owner_fn);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to link userns data to '%s' in registry: %m", owner_fn);
+                goto fail;
+        }
+
+        return 0;
+
+fail:
+        /* Best-effort rollback, errors are deliberately ignored. The per-owner directory is removed
+         * with AT_REMOVEDIR, which only succeeds if it is empty. */
+        if (reg_fn)
+                (void) unlinkat(dir_fd, reg_fn, /* flags= */ 0);
+        if (link1_fn)
+                (void) unlinkat(dir_fd, link1_fn, /* flags= */ 0);
+        if (link2_fn)
+                (void) unlinkat(dir_fd, link2_fn, /* flags= */ 0);
+        if (owner_fn)
+                (void) unlinkat(dir_fd, owner_fn, /* flags= */ 0);
+        if (uid_fn)
+                (void) unlinkat(dir_fd, uid_fn, AT_REMOVEDIR);
+
+        return r;
+}
+
+/* Removes all registry entries belonging to 'info' from the registry directory dir_fd (opened
+ * temporarily if negative): the inode-keyed record, the name alias, the start-UID alias (if a start
+ * UID is set), the per-owner link and finally the per-owner directory. All removals are attempted
+ * even if an earlier one fails; the first error is the one returned. A non-empty per-owner directory
+ * is not treated as an error. Returns 0 on success or a negative errno-style error. */
+int userns_registry_remove(int dir_fd, UserNamespaceInfo *info) {
+        _cleanup_free_ char *inode_fn = NULL, *name_fn = NULL, *owner_dir = NULL, *owner_link = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        int ret, r;
+
+        assert(info);
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        /* Primary record, keyed by userns inode */
+        if (asprintf(&inode_fn, "i%" PRIu64 ".userns", info->userns_inode) < 0)
+                return log_oom_debug();
+
+        ret = RET_NERRNO(unlinkat(dir_fd, inode_fn, 0));
+
+        /* Alias keyed by name */
+        name_fn = strjoin("n", info->name, ".userns");
+        if (!name_fn)
+                return log_oom_debug();
+
+        RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, name_fn, 0)));
+
+        /* Alias keyed by start UID, which only exists if a range was assigned */
+        if (uid_is_valid(info->start)) {
+                _cleanup_free_ char *uid_link = NULL;
+
+                if (asprintf(&uid_link, "u" UID_FMT ".userns", info->start) < 0)
+                        return log_oom_debug();
+
+                RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, uid_link, 0)));
+        }
+
+        /* Link inside the per-owner directory */
+        if (asprintf(&owner_dir, "o" UID_FMT ".owns", info->owner) < 0)
+                return log_oom_debug();
+
+        if (asprintf(&owner_link, "%s/i%" PRIu64 ".userns", owner_dir, info->userns_inode) < 0)
+                return log_oom_debug();
+
+        RET_GATHER(ret, RET_NERRNO(unlinkat(dir_fd, owner_link, 0)));
+
+        /* Drop the per-owner directory too; if other entries remain in it that is fine */
+        r = RET_NERRNO(unlinkat(dir_fd, owner_dir, AT_REMOVEDIR));
+        if (r != -ENOTEMPTY)
+                RET_GATHER(ret, r);
+
+        return ret;
+}
+
+/* Returns true if cgroup_id is listed in the cgroups[] array of 'userns', false otherwise. */
+bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) {
+        assert(userns);
+
+        for (size_t k = 0; k < userns->n_cgroups; k++)
+                if (userns->cgroups[k] == cgroup_id)
+                        return true;
+
+        return false;
+}
+
+/* Appends cgroup_id to the cgroups[] array of 'userns' unless it is already listed. Returns 1 if the
+ * ID was added, 0 if it was already present, -ENOMEM if growing the array failed. */
+int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id) {
+
+        if (userns_info_has_cgroup(userns, cgroup_id))
+                return 0; /* nothing to do */
+
+        if (!GREEDY_REALLOC(userns->cgroups, userns->n_cgroups+1))
+                return -ENOMEM;
+
+        userns->cgroups[userns->n_cgroups] = cgroup_id;
+        userns->n_cgroups++;
+
+        return 1;
+}
+
+/* Removes the control group identified by cgroup_id.
+ *
+ * The cgroup is opened by ID, its path is resolved and checked to be a non-root path below
+ * /sys/fs/cgroup/, its parent directory is opened and verified to be on cgroupfs, and then the cgroup
+ * directory is removed via rm_rf_child() from the parent.
+ *
+ * Returns 0 if the cgroup is already gone (-ESTALE on open) and also if only the final removal
+ * failed (that failure is just logged). Returns a negative errno-style error if any earlier step
+ * fails, -EPERM in particular if one of the safety checks does not hold. */
+static int userns_destroy_cgroup(uint64_t cgroup_id) {
+        _cleanup_close_ int cgroup_fd = -EBADF, parent_fd = -EBADF;
+        int r;
+
+        cgroup_fd = cg_cgroupid_open(/* cgroupfsfd= */ -EBADF, cgroup_id);
+        if (cgroup_fd == -ESTALE) {
+                log_debug_errno(cgroup_fd, "Control group %" PRIu64 " already gone, ignoring: %m", cgroup_id);
+                return 0;
+        }
+        if (cgroup_fd < 0)
+                /* The error is carried in the return value (see the -ESTALE check above), not in errno,
+                 * which may hold a stale or zero value at this point. */
+                return log_debug_errno(cgroup_fd, "Failed to open cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        _cleanup_free_ char *path = NULL;
+        r = fd_get_path(cgroup_fd, &path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get path of cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        /* 'e' points to the part of the path after the prefix; if it is empty we got the root cgroup */
+        const char *e = path_startswith(path, "/sys/fs/cgroup/");
+        if (!e)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got cgroup path that doesn't start with /sys/fs/cgroup/, refusing: %s", path);
+        if (isempty(e))
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Got root cgroup path, which can't be right, refusing.");
+
+        log_debug("Path of cgroup %" PRIu64 " is: %s", cgroup_id, path);
+
+        _cleanup_free_ char *fname = NULL;
+        r = path_extract_filename(path, &fname);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to extract name of cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        /* Open the parent relative to the cgroup fd, rather than by path */
+        parent_fd = openat(cgroup_fd, "..", O_CLOEXEC|O_DIRECTORY);
+        if (parent_fd < 0)
+                return log_debug_errno(errno, "Failed to open parent cgroup of %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        /* Safety check, never leave cgroupfs */
+        r = fd_is_fs_type(parent_fd, CGROUP2_SUPER_MAGIC);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine if parent directory of cgroup %" PRIu64 " is still a cgroup, ignoring: %m", cgroup_id);
+        if (!r)
+                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Parent directory of cgroup %" PRIu64 " is not a cgroup, refusing.", cgroup_id);
+
+        /* Close our own reference to the cgroup before removing it */
+        cgroup_fd = safe_close(cgroup_fd);
+
+        r = rm_rf_child(parent_fd, fname, REMOVE_ONLY_DIRECTORIES|REMOVE_PHYSICAL|REMOVE_CHMOD);
+        if (r < 0)
+                log_debug_errno(r, "Failed to remove delegated cgroup %" PRIu64 ", ignoring: %m", cgroup_id);
+
+        return 0;
+}
+
+/* Calls userns_destroy_cgroup() for every cgroup ID listed in 'userns' and then empties the list,
+ * regardless of whether the individual removals succeeded. Returns 0 or the first error seen. */
+int userns_info_remove_cgroups(UserNamespaceInfo *userns) {
+        int ret = 0;
+
+        assert(userns);
+
+        for (size_t k = 0; k < userns->n_cgroups; k++)
+                RET_GATHER(ret, userns_destroy_cgroup(userns->cgroups[k]));
+
+        free(userns->cgroups);
+        userns->cgroups = NULL;
+        userns->n_cgroups = 0;
+
+        return ret;
+}
+
+/* Returns true if 'name' is acceptable as a user namespace name: "n<name>.userns" must be a valid
+ * file name (it is used as registry entry) and "ns-<name>-65535" must be a valid user/group name. */
+bool userns_name_is_valid(const char *name) {
+        const char *entry_name, *user_name;
+
+        /* strjoina() allocates on the stack, hence bound the length first */
+        if (strlen(name) > NAME_MAX)
+                return false;
+
+        entry_name = strjoina("n", name, ".userns");
+        user_name = strjoina("ns-", name, "-65535");
+
+        return filename_is_valid(entry_name) &&
+                valid_user_group_name(user_name, 0);
+}
+
+/* Counts the regular files named "i*.userns" in the per-owner directory "o<owner>.owns" of the
+ * registry directory dir_fd (opened temporarily if negative). Returns the count (capped at INT_MAX),
+ * 0 if the per-owner directory does not exist, or a negative errno-style error. */
+int userns_registry_per_uid(int dir_fd, uid_t owner) {
+        _cleanup_free_ DirectoryEntries *listing = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        _cleanup_free_ char *owner_dir = NULL;
+        int count = 0, r;
+
+        if (dir_fd < 0) {
+                opened_fd = userns_registry_open_fd();
+                if (opened_fd < 0)
+                        return opened_fd;
+
+                dir_fd = opened_fd;
+        }
+
+        if (asprintf(&owner_dir, "o" UID_FMT ".owns", owner) < 0)
+                return log_oom_debug();
+
+        r = readdir_all_at(dir_fd, owner_dir, RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE, &listing);
+        if (r == -ENOENT) /* no directory means this owner has nothing registered */
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to enumerate contents of '%s' sub-directory: %m", owner_dir);
+
+        for (size_t k = 0; k < listing->n_entries; k++) {
+                struct dirent *entry = listing->entries[k];
+
+                if (entry->d_type != DT_REG)
+                        continue;
+
+                if (!startswith(entry->d_name, "i"))
+                        continue;
+
+                if (!endswith(entry->d_name, ".userns"))
+                        continue;
+
+                if (++count == INT_MAX) /* overflow safety check, just in case */
+                        break;
+        }
+
+        return count;
+}
diff --git a/src/nsresourced/userns-registry.h b/src/nsresourced/userns-registry.h
new file mode 100644 (file)
index 0000000..9e66a6f
--- /dev/null
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#define USER_NAMESPACE_CGROUPS_DELEGATE_MAX 16
+
+/* One record of the user namespace registry, as stored in and loaded from its JSON files. */
+typedef struct UserNamespaceInfo {
+        uid_t owner;             /* JSON "owner"; mandatory, UID_INVALID until set */
+        char *name;              /* JSON "name"; mandatory, heap-allocated, released by userns_info_free() */
+        uint64_t userns_inode;   /* JSON "userns"; mandatory, 0 means unset */
+        uid_t start;             /* JSON "start"; UID_INVALID if no range is assigned */
+        uint32_t size;           /* JSON "size"; 0 if no range is assigned */
+        uid_t target;            /* JSON "target"; UID_INVALID if no range is assigned */
+        uint64_t *cgroups;       /* JSON "cgroups"; heap-allocated array of cgroup IDs without duplicates */
+        size_t n_cgroups;        /* number of entries in cgroups[] */
+} UserNamespaceInfo;
+
+UserNamespaceInfo* userns_info_new(void);
+UserNamespaceInfo* userns_info_free(UserNamespaceInfo *userns);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(UserNamespaceInfo*, userns_info_free);
+
+bool userns_info_has_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id);
+int userns_info_add_cgroup(UserNamespaceInfo *userns, uint64_t cgroup_id);
+int userns_info_remove_cgroups(UserNamespaceInfo *userns);
+
+bool userns_name_is_valid(const char *name);
+
+int userns_registry_open_fd(void);
+int userns_registry_lock(int dir_fd);
+
+int userns_registry_load_by_start_uid(int dir_fd, uid_t start, UserNamespaceInfo **ret);
+int userns_registry_load_by_userns_inode(int dir_fd, uint64_t userns, UserNamespaceInfo **ret);
+int userns_registry_load_by_name(int dir_fd, const char *name, UserNamespaceInfo **ret);
+
+int userns_registry_store(int dir_fd, UserNamespaceInfo *info);
+int userns_registry_remove(int dir_fd, UserNamespaceInfo *info);
+
+int userns_registry_inode_exists(int dir_fd, uint64_t inode);
+int userns_registry_name_exists(int dir_fd, const char *name);
+int userns_registry_uid_exists(int dir_fd, uid_t start);
+
+int userns_registry_per_uid(int dir_fd, uid_t owner);
diff --git a/src/nsresourced/userns-restrict.c b/src/nsresourced/userns-restrict.c
new file mode 100644 (file)
index 0000000..4e917fd
--- /dev/null
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "userns-restrict.h"
+
+#if HAVE_VMLINUX_H
+
+#include <sched.h>
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "lsm-util.h"
+#include "missing_mount.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+
+#define USERNS_MAX (16U*1024U)
+#define MOUNTS_MAX 4096U
+
+#define PROGRAM_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/programs"
+#define MAP_LINK_PREFIX "/sys/fs/bpf/systemd/userns-restrict/maps"
+
+/* Destroys the BPF skeleton object and returns NULL, so callers can reset their pointer in one go. */
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+        /* No NULL check needed, userns_restrict_bpf__destroy() copes with NULL itself */
+        userns_restrict_bpf__destroy(obj);
+
+        return NULL;
+}
+
+/* Creates an anonymous BPF hash map with int keys, uint32_t values and room for MOUNTS_MAX entries,
+ * for use as an inner map. Returns the map fd, or a negative errno-style error. */
+static int make_inner_hash_map(void) {
+        int map_fd = compat_bpf_map_create(
+                        BPF_MAP_TYPE_HASH,
+                        NULL,
+                        sizeof(int),
+                        sizeof(uint32_t),
+                        MOUNTS_MAX,
+                        NULL);
+
+        if (map_fd >= 0)
+                return map_fd;
+
+        return log_debug_errno(errno, "Failed allocate inner BPF map: %m");
+}
+
+/* Loads the userns-restrict BPF object and attaches its programs.
+ *
+ * If 'pin' is true, pin paths below MAP_LINK_PREFIX are set on all maps before loading, pinned
+ * program links below PROGRAM_LINK_PREFIX are reused where they already exist, and newly created
+ * links as well as the maps are pinned.
+ *
+ * Returns 0 on success, handing the loaded object to the caller via 'ret' if that is non-NULL
+ * (otherwise it is freed again on return). Returns a negative errno-style error on failure,
+ * -EOPNOTSUPP in particular if the "bpf" LSM is not supported or libbpf lacks
+ * bpf_object__next_map(). */
+int userns_restrict_install(
+                bool pin,
+                struct userns_restrict_bpf **ret) {
+
+        _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *obj = NULL;
+        _cleanup_close_ int dummy_mnt_id_hash_fd = -EBADF;
+        int r;
+
+        r = lsm_supported("bpf");
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm not supported, can't lock down user namespace.");
+
+        r = dlopen_bpf();
+        if (r < 0)
+                return r;
+
+        /* bpf_object__next_map() is not available in libbpf pre-0.7.0, and we want to use it. */
+        if (!sym_bpf_object__next_map)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "libbpf too old for locking down user namespace.");
+
+        obj = userns_restrict_bpf__open();
+        if (!obj)
+                return log_error_errno(errno, "Failed to open userns_restrict BPF object: %m");
+
+        if (pin) {
+                struct bpf_map *map;
+
+                /* libbpf will only create one level of dirs. Let's create the rest */
+                (void) mkdir_p(MAP_LINK_PREFIX, 0755);
+                (void) mkdir_p(PROGRAM_LINK_PREFIX, 0755);
+
+                /* Assign each map of the object a pin path named after the map */
+                map = sym_bpf_object__next_map(obj->obj, NULL);
+                while (map) {
+                        _cleanup_free_ char *fn = NULL;
+
+                        fn = path_join(MAP_LINK_PREFIX, sym_bpf_map__name(map));
+                        if (!fn)
+                                return log_oom();
+
+                        r = sym_bpf_map__set_pin_path(map, fn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to set pin path to '%s': %m", fn);
+
+                        map = sym_bpf_object__next_map(obj->obj, map);
+                }
+        }
+
+        /* Size the outer hash table and the ring buffer before the object is loaded */
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_mnt_id_hash, USERNS_MAX);
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns/mnt_id hash table: %m");
+
+        r = sym_bpf_map__set_max_entries(obj->maps.userns_ringbuf, USERNS_MAX * sizeof(unsigned int));
+        if (r < 0)
+                return log_error_errno(r, "Failed to size userns ring buffer: %m");
+
+        /* Dummy map to satisfy the verifier */
+        dummy_mnt_id_hash_fd = make_inner_hash_map();
+        if (dummy_mnt_id_hash_fd < 0)
+                return dummy_mnt_id_hash_fd;
+
+        r = sym_bpf_map__set_inner_map_fd(obj->maps.userns_mnt_id_hash, dummy_mnt_id_hash_fd);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set inner BPF map: %m");
+
+        r = userns_restrict_bpf__load(obj);
+        if (r < 0)
+                return log_error_errno(r, "Failed to load BPF object: %m");
+
+        /* For every program of the skeleton: reuse an already pinned link if there is one, otherwise
+         * attach the program now (and pin the new link if requested). */
+        for (int i = 0; i < obj->skeleton->prog_cnt; i++) {
+                _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+                struct bpf_prog_skeleton *ps = obj->skeleton->progs + i;
+                _cleanup_free_ char *fn = NULL;
+                bool linked = false;
+                const char *e;
+
+                /* The pin file is named after the program name with the common prefix stripped */
+                e = startswith(ps->name, "userns_restrict_");
+                assert(e);
+
+                if (pin) {
+                        fn = path_join(PROGRAM_LINK_PREFIX, e);
+                        if (!fn)
+                                return log_oom();
+
+                        link = sym_bpf_link__open(fn);
+                        r = sym_libbpf_get_error(link);
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return log_error_errno(r, "Unable to open pinned program link: %m");
+                                /* Not pinned yet: reset the pointer so that we attach below */
+                                link = NULL;
+                        } else {
+                                linked = true;
+                                log_info("userns-restrict BPF-LSM program %s already attached.", ps->name);
+                        }
+                }
+
+                if (!link) {
+                        link = sym_bpf_program__attach(*ps->prog);
+                        r = sym_libbpf_get_error(link);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to attach LSM BPF program: %m");
+
+                        log_info("userns-restrict BPF-LSM program %s now attached.", ps->name);
+                }
+
+                /* Only links we created ourselves need pinning */
+                if (pin && !linked) {
+                        assert(fn);
+
+                        r = sym_bpf_link__pin(link, fn);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to pin LSM attachment: %m");
+                }
+
+                /* Store the link in the skeleton's slot; our local pointer is reset so the cleanup
+                 * handler does not free it */
+                *ps->link = TAKE_PTR(link);
+        }
+
+        if (pin) {
+                r = sym_bpf_object__pin_maps(obj->obj, NULL);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to pin BPF maps: %m");
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(obj);
+
+        return 0;
+}
+
+/* Allowlists the mounts referenced by 'mount_fds' for the user namespace identified by the inode number
+ * 'userns_inode'. If 'replace' is true any mount ID set already registered for the namespace is replaced by
+ * a fresh one, otherwise the existing set is extended (and created if missing).
+ *
+ * Returns 0 on success, a negative errno-style error otherwise: -EINVAL if the inode number does not fit
+ * into 32bit, -ENOMEM on allocation failure, -EEXIST if the entry could not be set up after 10 attempts,
+ * or whatever the mount ID lookup/BPF map operations fail with. */
+int userns_restrict_put_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t userns_inode,
+                bool replace,
+                const int mount_fds[],
+                size_t n_mount_fds) {
+
+        _cleanup_close_ int inner_map_fd = -EBADF;
+        _cleanup_free_ int *mnt_ids = NULL;
+        uint32_t key;
+        int r, outer_map_fd;
+
+        assert(obj);
+        assert(userns_inode != 0);
+        assert(n_mount_fds == 0 || mount_fds);
+
+        /* The BPF map type BPF_MAP_TYPE_HASH_OF_MAPS only supports 32bit keys, and user namespace inode
+         * numbers are 32bit too, even though ino_t is 64bit these days. Should we ever run into a 64bit
+         * inode let's refuse early, we can't support this with the current BPF code for now. */
+        if (userns_inode > UINT32_MAX)
+                return -EINVAL;
+
+        /* Since the key is 32bit wide we must hand the kernel a pointer to an actual 32bit variable.
+         * Passing a pointer to the 64bit inode number would make it pick up the (all zero) upper half on
+         * big-endian architectures. */
+        key = (uint32_t) userns_inode;
+
+        /* Resolve all mount IDs first, so that we don't touch the maps if any of the fds is bad. */
+        mnt_ids = new(int, n_mount_fds);
+        if (!mnt_ids)
+                return -ENOMEM;
+
+        for (size_t i = 0; i < n_mount_fds; i++) {
+                r = path_get_mnt_id_at(mount_fds[i], "", mnt_ids + i);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to get mount ID: %m");
+        }
+
+        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+        if (outer_map_fd < 0)
+                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+        if (replace) {
+                /* Add if missing, replace if already exists */
+                inner_map_fd = make_inner_hash_map();
+                if (inner_map_fd < 0)
+                        return inner_map_fd;
+
+                r = sym_bpf_map_update_elem(outer_map_fd, &key, &inner_map_fd, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(errno, "Failed to replace map in inode hash: %m");
+        } else {
+                /* Let's add an entry for this userns inode if missing. If it exists just extend the existing map. We
+                 * might race against each other, hence we try a couple of times */
+                for (size_t n_try = 10;; n_try--) {
+                        uint32_t innermap_id;
+
+                        if (n_try == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
+                                                       "Still can't create inode entry in BPF map after 10 tries.");
+
+                        r = sym_bpf_map_lookup_elem(outer_map_fd, &key, &innermap_id);
+                        if (r >= 0) {
+                                inner_map_fd = sym_bpf_map_get_fd_by_id(innermap_id);
+                                if (inner_map_fd < 0)
+                                        return log_debug_errno(inner_map_fd, "Failed to get file descriptor for inner map: %m");
+
+                                break;
+                        }
+                        if (errno != ENOENT)
+                                return log_debug_errno(errno, "Failed to look up inode hash entry: %m");
+
+                        /* No entry for this user namespace yet. Let's create one */
+                        inner_map_fd = make_inner_hash_map();
+                        if (inner_map_fd < 0)
+                                return inner_map_fd;
+
+                        r = sym_bpf_map_update_elem(outer_map_fd, &key, &inner_map_fd, BPF_NOEXIST);
+                        if (r >= 0)
+                                break;
+                        if (errno != EEXIST)
+                                return log_debug_errno(errno, "Failed to add mount ID list to inode hash: %m");
+
+                        /* Somebody else added an entry in the meantime. Release the map we just created
+                         * before trying again, so that its fd isn't leaked when it is overwritten in the
+                         * next iteration. */
+                        inner_map_fd = safe_close(inner_map_fd);
+                }
+        }
+
+        /* The inner map is used as a set: only the presence of the mount ID key matters, the value is a
+         * dummy. */
+        FOREACH_ARRAY(mntid, mnt_ids, n_mount_fds) {
+                uint32_t dummy_value = 1;
+
+                r = sym_bpf_map_update_elem(inner_map_fd, mntid, &dummy_value, BPF_ANY);
+                if (r < 0)
+                        return log_debug_errno(errno, "Failed to add mount ID to map: %m");
+
+                log_debug("Allowing mount %i on userns inode %" PRIu64, *mntid, userns_inode);
+        }
+
+        return 0;
+}
+
+/* Same as userns_restrict_put_by_inode(), but takes a user namespace file descriptor instead of an inode
+ * number. Returns -EBADF if the fd does not refer to a user namespace. */
+int userns_restrict_put_by_fd(
+                struct userns_restrict_bpf *obj,
+                int userns_fd,
+                bool replace,
+                const int mount_fds[],
+                size_t n_mount_fds) {
+
+        struct stat userns_st;
+        int is_userns;
+
+        assert(obj);
+        assert(userns_fd >= 0);
+        assert(n_mount_fds == 0 || mount_fds);
+
+        /* Validate the fd type first, before we look at its inode number. */
+        is_userns = fd_is_ns(userns_fd, CLONE_NEWUSER);
+        if (is_userns < 0)
+                return log_debug_errno(is_userns, "Failed to determine if file descriptor is user namespace: %m");
+        if (is_userns == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADF), "User namespace fd is not actually a user namespace fd.");
+
+        if (fstat(userns_fd, &userns_st) < 0)
+                return log_debug_errno(errno, "Failed to fstat() user namespace: %m");
+
+        /* The inode number of the namespace fd is the key the by-inode variant operates on. */
+        return userns_restrict_put_by_inode(obj, userns_st.st_ino, replace, mount_fds, n_mount_fds);
+}
+
+/* Removes the entry of the user namespace identified by inode number 'ino' from the outer BPF map.
+ *
+ * Returns 0 on success (or if the inode number cannot possibly be in the map because it does not fit into
+ * 32bit), a negative errno-style error otherwise. */
+int userns_restrict_reset_by_inode(
+                struct userns_restrict_bpf *obj,
+                uint64_t ino) {
+
+        int r, outer_map_fd;
+        uint32_t u;
+
+        assert(obj);
+        assert(ino != 0);
+
+        if (ino > UINT32_MAX) /* inodes larger than 32bit are definitely not included in our map, exit early */
+                return 0;
+
+        outer_map_fd = sym_bpf_map__fd(obj->maps.userns_mnt_id_hash);
+        if (outer_map_fd < 0)
+                return log_debug_errno(outer_map_fd, "Failed to get outer BPF map fd: %m");
+
+        /* The map key is 32bit wide, hence pass a pointer to a 32bit variable */
+        u = (uint32_t) ino;
+
+        r = sym_bpf_map_delete_elem(outer_map_fd, &u);
+        if (r < 0)
+                /* Report the actual failure reason. The map fd is a valid (i.e. non-negative) file
+                 * descriptor here and must not be used as error code. */
+                return log_debug_errno(errno, "Failed to remove entry for inode %" PRIu64 " from outer map: %m", ino);
+
+        return 0;
+}
+
+#else
+/* Fallback implementations for builds without the BPF code (presumably when HAVE_VMLINUX_H is unset, as in
+ * userns-restrict.h -- the matching #if is not visible here, confirm). All operations fail with -EOPNOTSUPP,
+ * except for the destructor, which has nothing to release and simply returns NULL. */
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj) {
+        return NULL;
+}
+
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode) {
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "User Namespace Restriction BPF support disabled.");
+}
+#endif
diff --git a/src/nsresourced/userns-restrict.h b/src/nsresourced/userns-restrict.h
new file mode 100644 (file)
index 0000000..37aed7b
--- /dev/null
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+#if HAVE_VMLINUX_H
+#include "bpf/userns_restrict/userns-restrict-skel.h"
+#else
+struct userns_restrict_bpf;
+#endif
+
+int userns_restrict_install(bool pin, struct userns_restrict_bpf **ret);
+struct userns_restrict_bpf *userns_restrict_bpf_free(struct userns_restrict_bpf *obj);
+
+int userns_restrict_put_by_fd(struct userns_restrict_bpf *obj, int userns_fd, bool replace, const int mount_fds[], size_t n_mount_fds);
+int userns_restrict_put_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode, bool replace, const int mount_fds[], size_t n_mount_fds);
+
+int userns_restrict_reset_by_inode(struct userns_restrict_bpf *obj, uint64_t userns_inode);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct userns_restrict_bpf*, userns_restrict_bpf_free);
index 71db6d8694a299a40e6eb7cfef3426ac7cf8e579..3c705c4c721b791333c35c78dbd2fc9f4f5b4fa4 100644 (file)
@@ -180,6 +180,7 @@ shared_sources = files(
         'varlink-io.systemd.Hostname.c',
         'varlink-io.systemd.Journal.c',
         'varlink-io.systemd.ManagedOOM.c',
+        'varlink-io.systemd.NamespaceResource.c',
         'varlink-io.systemd.Network.c',
         'varlink-io.systemd.PCRExtend.c',
         'varlink-io.systemd.PCRLock.c',
diff --git a/src/shared/varlink-io.systemd.NamespaceResource.c b/src/shared/varlink-io.systemd.NamespaceResource.c
new file mode 100644 (file)
index 0000000..e98c6c6
--- /dev/null
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.NamespaceResource.h"
+
+static VARLINK_DEFINE_METHOD(
+                AllocateUserRange,
+                VARLINK_DEFINE_INPUT(name, VARLINK_STRING, 0),
+                VARLINK_DEFINE_INPUT(size, VARLINK_INT, 0),
+                VARLINK_DEFINE_INPUT(target, VARLINK_INT, VARLINK_NULLABLE),
+                VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_METHOD(
+                RegisterUserNamespace,
+                VARLINK_DEFINE_INPUT(name, VARLINK_STRING, 0),
+                VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_METHOD(
+                AddMountToUserNamespace,
+                VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, VARLINK_INT, 0),
+                VARLINK_DEFINE_INPUT(mountFileDescriptor, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_METHOD(
+                AddControlGroupToUserNamespace,
+                VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, VARLINK_INT, 0),
+                VARLINK_DEFINE_INPUT(controlGroupFileDescriptor, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_METHOD(
+                AddNetworkToUserNamespace,
+                VARLINK_DEFINE_INPUT(userNamespaceFileDescriptor, VARLINK_INT, 0),
+                VARLINK_DEFINE_INPUT(networkNamespaceFileDescriptor, VARLINK_INT, 0),
+                VARLINK_DEFINE_INPUT(namespaceInterfaceName, VARLINK_STRING, VARLINK_NULLABLE),
+                VARLINK_DEFINE_INPUT(mode, VARLINK_STRING, 0),
+                VARLINK_DEFINE_OUTPUT(hostInterfaceName, VARLINK_STRING, 0),
+                VARLINK_DEFINE_OUTPUT(namespaceInterfaceName, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(UserNamespaceInterfaceNotSupported);
+static VARLINK_DEFINE_ERROR(NameExists);
+static VARLINK_DEFINE_ERROR(UserNamespaceExists);
+static VARLINK_DEFINE_ERROR(DynamicRangeUnavailable);
+static VARLINK_DEFINE_ERROR(NoDynamicRange);
+static VARLINK_DEFINE_ERROR(UserNamespaceNotRegistered);
+static VARLINK_DEFINE_ERROR(UserNamespaceWithoutUserRange);
+static VARLINK_DEFINE_ERROR(TooManyControlGroups);
+static VARLINK_DEFINE_ERROR(ControlGroupAlreadyAdded);
+
+VARLINK_DEFINE_INTERFACE(
+                io_systemd_NamespaceResource,
+                "io.systemd.NamespaceResource",
+                &vl_method_AllocateUserRange,
+                &vl_method_RegisterUserNamespace,
+                &vl_method_AddMountToUserNamespace,
+                &vl_method_AddControlGroupToUserNamespace,
+                &vl_method_AddNetworkToUserNamespace,
+                &vl_error_UserNamespaceInterfaceNotSupported,
+                &vl_error_NameExists,
+                &vl_error_UserNamespaceExists,
+                &vl_error_DynamicRangeUnavailable,
+                &vl_error_NoDynamicRange,
+                &vl_error_UserNamespaceNotRegistered,
+                &vl_error_UserNamespaceWithoutUserRange,
+                &vl_error_TooManyControlGroups,
+                &vl_error_ControlGroupAlreadyAdded);
diff --git a/src/shared/varlink-io.systemd.NamespaceResource.h b/src/shared/varlink-io.systemd.NamespaceResource.h
new file mode 100644 (file)
index 0000000..443cb97
--- /dev/null
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_NamespaceResource;
index 3ca36a07a4966e9e06f0246cff5ccb5bcbd1d3ca..d64c6e9d3e7b25ecb7fe967be966448291a98a0c 100644 (file)
 #include "varlink-io.systemd.Credentials.h"
 #include "varlink-io.systemd.Journal.h"
 #include "varlink-io.systemd.ManagedOOM.h"
+#include "varlink-io.systemd.NamespaceResource.h"
 #include "varlink-io.systemd.Network.h"
 #include "varlink-io.systemd.PCRExtend.h"
 #include "varlink-io.systemd.PCRLock.h"
-#include "varlink-io.systemd.Resolve.Monitor.h"
 #include "varlink-io.systemd.Resolve.h"
+#include "varlink-io.systemd.Resolve.Monitor.h"
 #include "varlink-io.systemd.UserDatabase.h"
 #include "varlink-io.systemd.oom.h"
 #include "varlink-io.systemd.service.h"
@@ -129,6 +130,8 @@ TEST(parse_format) {
         print_separator();
         test_parse_format_one(&vl_interface_io_systemd_UserDatabase);
         print_separator();
+        test_parse_format_one(&vl_interface_io_systemd_NamespaceResource);
+        print_separator();
         test_parse_format_one(&vl_interface_io_systemd_Journal);
         print_separator();
         test_parse_format_one(&vl_interface_io_systemd_Resolve);
index 16a5564086d64b410789b059e2ad639987bf9b7c..76510a08f98b8c8c09f1eaa8e387f46d8da94229 100644 (file)
@@ -732,6 +732,14 @@ units = [
           'file' : 'systemd-userdbd.socket',
           'conditions' : ['ENABLE_USERDB'],
         },
+        {
+          'file' : 'systemd-nsresourced.service.in',
+          'conditions' : ['ENABLE_NSRESOURCED'],
+        },
+        {
+          'file' : 'systemd-nsresourced.socket',
+          'conditions' : ['ENABLE_NSRESOURCED'],
+        },
         {
           'file' : 'systemd-vconsole-setup.service.in',
           'conditions' : ['ENABLE_VCONSOLE'],
diff --git a/units/systemd-nsresourced.service.in b/units/systemd-nsresourced.service.in
new file mode 100644 (file)
index 0000000..3c92705
--- /dev/null
@@ -0,0 +1,47 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Namespace Resource Manager
+Documentation=man:systemd-nsresourced.service(8)
+Requires=systemd-nsresourced.socket
+After=systemd-nsresourced.socket
+Conflicts=shutdown.target
+Before=sysinit.target shutdown.target
+DefaultDependencies=no
+
+[Service]
+CapabilityBoundingSet=CAP_DAC_READ_SEARCH CAP_SYS_RESOURCE CAP_BPF CAP_PERFMON CAP_SETGID CAP_SETUID CAP_SYS_ADMIN CAP_CHOWN CAP_FOWNER
+ExecStart={{LIBEXECDIR}}/systemd-nsresourced
+IPAddressDeny=any
+LimitNOFILE={{HIGH_RLIMIT_NOFILE}}
+LockPersonality=yes
+MemoryDenyWriteExecute=yes
+NoNewPrivileges=yes
+PrivateDevices=yes
+ProtectProc=invisible
+ProtectControlGroups=yes
+ProtectHome=yes
+ProtectHostname=yes
+ProtectKernelLogs=yes
+ProtectKernelModules=yes
+ProtectSystem=strict
+RestrictAddressFamilies=AF_UNIX AF_NETLINK
+RestrictRealtime=yes
+RestrictSUIDSGID=yes
+SystemCallArchitectures=native
+SystemCallErrorNumber=EPERM
+SystemCallFilter=@system-service bpf perf_event_open open_by_handle_at
+Type=notify
+NotifyAccess=all
+FileDescriptorStoreMax=4096
+{{SERVICE_WATCHDOG}}
+
+[Install]
+Also=systemd-nsresourced.socket
diff --git a/units/systemd-nsresourced.socket b/units/systemd-nsresourced.socket
new file mode 100644 (file)
index 0000000..2e3c8e9
--- /dev/null
@@ -0,0 +1,23 @@
+#  SPDX-License-Identifier: LGPL-2.1-or-later
+#
+#  This file is part of systemd.
+#
+#  systemd is free software; you can redistribute it and/or modify it
+#  under the terms of the GNU Lesser General Public License as published by
+#  the Free Software Foundation; either version 2.1 of the License, or
+#  (at your option) any later version.
+
+[Unit]
+Description=Namespace Resource Manager Socket
+Documentation=man:systemd-nsresourced.service(8)
+DefaultDependencies=no
+Conflicts=shutdown.target
+Before=sockets.target shutdown.target
+
+[Socket]
+ListenStream=/run/systemd/io.systemd.NamespaceResource
+Symlinks=/run/systemd/userdb/io.systemd.NamespaceResource
+SocketMode=0666
+
+[Install]
+WantedBy=sockets.target