--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?> <!--*-nxml-*-->
+<!DOCTYPE policyconfig PUBLIC "-//freedesktop//DTD PolicyKit Policy Configuration 1.0//EN"
+ "https://www.freedesktop.org/standards/PolicyKit/1/policyconfig.dtd">
+
+<!--
+ SPDX-License-Identifier: LGPL-2.1-or-later
+
+ This file is part of systemd.
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+-->
+
+<policyconfig>
+
+ <vendor>The systemd Project</vendor>
+ <vendor_url>https://systemd.io</vendor_url>
+
+ <!-- Allow allocation of a user namespace with an automatically assigned transient UID range. All three defaults (any, inactive, active) are "yes". The "imply" annotation names register-user-namespace; $(name) in the message is filled from the "name" polkit detail. -->
+ <action id="io.systemd.namespace-resource.allocate-user-namespace">
+ <description gettext-domain="systemd">Allow user namespace allocation</description>
+ <message gettext-domain="systemd">Authentication is required for an application to allocate a user namespace '$(name)' with an automatically assigned transient UID range.</message>
+ <defaults>
+ <allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.imply">io.systemd.namespace-resource.register-user-namespace</annotate>
+ </action>
+
+ <!-- Allow registration of a user namespace with a UID range allocated elsewhere. All three defaults are "yes". The "imply" annotation names allocate-user-namespace, whose own annotation names this action, i.e. the two actions imply each other. NOTE(review): confirm the mutual implication is intended. -->
+ <action id="io.systemd.namespace-resource.register-user-namespace">
+ <description gettext-domain="systemd">Allow user namespace registration</description>
+ <message gettext-domain="systemd">Authentication is required for an application to register a user namespace '$(name)'.</message>
+ <defaults>
+ <allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
+ </defaults>
+ <annotate key="org.freedesktop.policykit.imply">io.systemd.namespace-resource.allocate-user-namespace</annotate>
+ </action>
+
+ <!-- Allow adding a mount to a registered user namespace. All three defaults are "yes"; the message uses no polkit details and there is no "imply" annotation. -->
+ <action id="io.systemd.namespace-resource.delegate-mount">
+ <description gettext-domain="systemd">Allow adding a mount to a user namespace</description>
+ <message gettext-domain="systemd">Authentication is required for an application to add a mount to a user namespace.</message>
+ <defaults>
+ <allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
+ </defaults>
+ </action>
+
+ <!-- Allow adding a control group to a registered user namespace. All three defaults are "yes"; the message uses no polkit details and there is no "imply" annotation. -->
+ <action id="io.systemd.namespace-resource.delegate-cgroup">
+ <description gettext-domain="systemd">Allow adding a control group to a user namespace</description>
+ <message gettext-domain="systemd">Authentication is required for an application to add a control group to a user namespace.</message>
+ <defaults>
+ <allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
+ </defaults>
+ </action>
+
+ <!-- Allow adding a network interface to a registered user namespace. All three defaults are "yes"; $(type) in the message is filled from the "type" polkit detail, and there is no "imply" annotation. -->
+ <action id="io.systemd.namespace-resource.delegate-network-interface">
+ <description gettext-domain="systemd">Allow adding a network interface to a user namespace</description>
+ <message gettext-domain="systemd">Authentication is required for an application to add a network interface of type $(type) to a user namespace.</message>
+ <defaults>
+ <allow_any>yes</allow_any>
+ <allow_inactive>yes</allow_inactive>
+ <allow_active>yes</allow_active>
+ </defaults>
+ </action>
+
+</policyconfig>
#include "sd-netlink.h"
#include "sd-varlink.h"
+#include "bus-polkit.h"
#include "env-util.h"
#include "fd-util.h"
#include "fileio.h"
#define ITERATIONS_MAX 64U
#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE)
#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC)
-#define CONNECTION_IDLE_USEC (15 * USEC_PER_SEC)
#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC)
#define USERNS_PER_UID 256
+typedef struct Context { /* State handed to the Varlink server as userdata and read back by the method handlers */
+ Hashmap *polkit_registry; /* passed by address to varlink_verify_polkit_async_full() by every handler */
+ struct userns_restrict_bpf *bpf; /* starts out NULL; handlers call userns_restrict_install() on first use */
+} Context;
+
typedef struct LookupParameters {
const char *user_name;
const char *group_name;
{}
};
- struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
_cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF, lock_fd = -EBADF;
_cleanup_free_ char *userns_name = NULL;
+ Context *c = ASSERT_PTR(userdata);
uid_t peer_uid;
struct stat userns_st;
AllocateParameters p = {
if (r != 0)
return r;
- userns_fd = sd_varlink_take_fd(link, p.userns_fd_idx);
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
if (r < 0)
return r;
- if (!*bpf) {
- r = userns_restrict_install(/* pin= */ true, bpf);
+ const char *polkit_details[] = {
+ "name", userns_name,
+ NULL,
+ };
+
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ "io.systemd.namespace-resource.allocate-user-namespace",
+ polkit_details,
+ /* good_user= */ UID_INVALID,
+ POLKIT_DEFAULT_ALLOW, /* If no polkit is installed, allow unpriv userns namespace allocation */
+ &c->polkit_registry);
+ if (r <= 0)
+ return r;
+
+ if (!c->bpf) {
+ r = userns_restrict_install(/* pin= */ true, &c->bpf);
if (r < 0)
return r;
}
/* Register the userns in the BPF map with an empty allowlist */
r = userns_restrict_put_by_fd(
- *bpf,
+ c->bpf,
userns_fd,
/* replace= */ true,
/* mount_fds= */ NULL,
{}
};
- struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
_cleanup_close_ int userns_fd = -EBADF, registry_dir_fd = -EBADF;
_cleanup_free_ char *userns_name = NULL;
+ Context *c = ASSERT_PTR(userdata);
uid_t peer_uid;
struct stat userns_st;
RegisterParameters p = {
if (r != 0)
return r;
- userns_fd = sd_varlink_take_fd(link, p.userns_fd_idx);
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return userns_fd;
if (r < 0)
return r;
- if (!*bpf) {
- r = userns_restrict_install(/* pin= */ true, bpf);
+ const char *polkit_details[] = {
+ "name", userns_name,
+ NULL,
+ };
+
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ "io.systemd.namespace-resource.register-user-namespace",
+ polkit_details,
+ /* good_user= */ UID_INVALID,
+ POLKIT_DEFAULT_ALLOW, /* If no polkit is installed, allow unpriv userns namespace registration */
+ &c->polkit_registry);
+ if (r <= 0)
+ return r;
+
+ if (!c->bpf) {
+ r = userns_restrict_install(/* pin= */ true, &c->bpf);
if (r < 0)
return r;
}
/* Register the userns in the BPF map with an empty allowlist */
r = userns_restrict_put_by_fd(
- *bpf,
+ c->bpf,
userns_fd,
/* replace= */ true,
/* mount_fds= */ NULL,
};
_cleanup_close_ int userns_fd = -EBADF, mount_fd = -EBADF, registry_dir_fd = -EBADF;
- struct userns_restrict_bpf **bpf = ASSERT_PTR(userdata);
+ Context *c = ASSERT_PTR(userdata);
AddMountParameters p = {
.userns_fd_idx = UINT_MAX,
.mount_fd_idx = UINT_MAX,
if (r != 0)
return r;
- userns_fd = sd_varlink_take_fd(link, p.userns_fd_idx);
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return userns_fd;
if (fstat(userns_fd, &userns_st) < 0)
return -errno;
- mount_fd = sd_varlink_take_fd(link, p.mount_fd_idx);
+ mount_fd = sd_varlink_peek_dup_fd(link, p.mount_fd_idx);
if (mount_fd < 0)
return mount_fd;
if (r < 0)
return r;
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ "io.systemd.namespace-resource.delegate-mount",
+ /* polkit_details= */ NULL,
+ /* good_user= */ UID_INVALID,
+ POLKIT_DEFAULT_ALLOW, /* If no polkit is installed, allow delegation of mounts to registered userns */
+ &c->polkit_registry);
+ if (r <= 0)
+ return r;
+
registry_dir_fd = userns_registry_open_fd();
if (registry_dir_fd < 0)
return registry_dir_fd;
if (r < 0)
return r;
- if (!*bpf) {
- r = userns_restrict_install(/* pin= */ true, bpf);
+ if (!c->bpf) {
+ r = userns_restrict_install(/* pin= */ true, &c->bpf);
if (r < 0)
return r;
}
/* Add this mount to the user namespace's BPF map allowlist entry. */
r = userns_restrict_put_by_fd(
- *bpf,
+ c->bpf,
userns_fd,
/* replace= */ false,
&mount_fd,
.cgroup_fd_idx = UINT_MAX,
};
_cleanup_(userns_info_freep) UserNamespaceInfo *userns_info = NULL;
+ Context *c = ASSERT_PTR(userdata);
struct stat userns_st, cgroup_st;
uid_t peer_uid;
int r;
if (r != 0)
return r;
- userns_fd = sd_varlink_take_fd(link, p.userns_fd_idx);
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return log_debug_errno(userns_fd, "Failed to take user namespace fd from Varlink connection: %m");
if (fstat(userns_fd, &userns_st) < 0)
return log_debug_errno(errno, "Failed to fstat() user namespace fd: %m");
- cgroup_fd = sd_varlink_take_fd(link, p.cgroup_fd_idx);
+ cgroup_fd = sd_varlink_peek_dup_fd(link, p.cgroup_fd_idx);
if (cgroup_fd < 0)
return log_debug_errno(cgroup_fd, "Failed to take cgroup fd from Varlink connection: %m");
if (fstat(cgroup_fd, &cgroup_st) < 0)
return log_debug_errno(errno, "Failed to fstat() cgroup fd: %m");
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ "io.systemd.namespace-resource.delegate-cgroup",
+ /* polkit_details= */ NULL,
+ /* good_user= */ UID_INVALID,
+ POLKIT_DEFAULT_ALLOW, /* If no polkit is installed, allow delegation of cgroups to registered userns */
+ &c->polkit_registry);
+ if (r <= 0)
+ return r;
+
registry_dir_fd = userns_registry_open_fd();
if (registry_dir_fd < 0)
return registry_dir_fd;
};
_cleanup_close_ int userns_fd = -EBADF, netns_fd = -EBADF, registry_dir_fd = -EBADF;
+ Context *c = ASSERT_PTR(userdata);
AddNetworkParameters p = {
.userns_fd_idx = UINT_MAX,
.netns_fd_idx = UINT_MAX,
if (r != 0)
return r;
- userns_fd = sd_varlink_take_fd(link, p.userns_fd_idx);
+ userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
if (userns_fd < 0)
return userns_fd;
return -errno;
if (p.netns_fd_idx != UINT_MAX) {
- netns_fd = sd_varlink_take_fd(link, p.netns_fd_idx);
+ netns_fd = sd_varlink_peek_dup_fd(link, p.netns_fd_idx);
if (netns_fd < 0)
return netns_fd;
} else
return sd_varlink_error_invalid_parameter_name(link, "mode");
+ const char *polkit_details[] = {
+ "type", p.mode,
+ NULL,
+ };
+
+ r = varlink_verify_polkit_async_full(
+ link,
+ /* bus= */ NULL,
+ "io.systemd.namespace-resource.delegate-network-interface",
+ polkit_details,
+ /* good_user= */ UID_INVALID,
+ POLKIT_DEFAULT_ALLOW, /* If no polkit is installed, allow delegation of network interfaces to registered userns */
+ &c->polkit_registry);
+ if (r <= 0)
+ return r;
+
registry_dir_fd = userns_registry_open_fd();
if (registry_dir_fd < 0)
return registry_dir_fd;
static int process_connection(sd_varlink_server *server, int _fd) {
 _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
 _cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL;
+ _cleanup_(sd_event_unrefp) sd_event *event = NULL; /* event loop created below for this one call; unref'ed automatically on return */
 int r;
 assert(server);
 assert(fd >= 0);
+ r = sd_event_new(&event);
+ if (r < 0)
+ return r; /* NOTE(review): the only failure path in this function that does not log */
+
+ r = sd_varlink_server_attach_event(server, event, /* priority= */ 0); /* replaces the hand-rolled sd_varlink_process()/sd_varlink_wait() loop removed below */
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach Varlink server to event loop: %m");
+
 r = sd_varlink_server_add_connection(server, fd, &vl);
 if (r < 0)
 return log_error_errno(r, "Failed to add connection: %m");
 TAKE_FD(fd);
 vl = sd_varlink_ref(vl);
- for (;;) {
- r = sd_varlink_process(vl);
- if (r == -ENOTCONN) {
- log_debug("Connection terminated.");
- break;
- }
- if (r < 0)
- return log_error_errno(r, "Failed to process connection: %m");
- if (r > 0)
- continue;
+ r = sd_event_loop(event); /* presumably returns once the server's exit-on-idle mode (enabled in run()) fires; NOTE(review): the removed loop gave up after CONNECTION_IDLE_USEC without traffic, no equivalent timeout is visible here */
+ if (r < 0)
+ return log_error_errno(r, "Failed to run event loop: %m"); /* NOTE(review): this return and the add_connection failure above skip the detach below, leaving 'server' attached; confirm the caller does not reuse it after an error */
- r = sd_varlink_wait(vl, CONNECTION_IDLE_USEC);
- if (r < 0)
- return log_error_errno(r, "Failed to wait for connection events: %m");
- if (r == 0)
- break;
- }
+ r = sd_varlink_server_detach_event(server); /* undo the attach above before 'event' goes out of scope */
+ if (r < 0)
+ return log_error_errno(r, "Failed to detach Varlink server from event loop: %m");
 return 0;
}
+static void context_free(Context *c) { /* Releases the members of *c but not the struct itself; run() uses it as the cleanup handler of a stack-allocated Context */
+ assert(c);
+
+ c->polkit_registry = hashmap_free(c->polkit_registry); /* field is overwritten with hashmap_free()'s return value (presumably NULL) */
+ c->bpf = userns_restrict_bpf_free(c->bpf); /* same pattern: free, then store the returned value back into the field */
+}
+
static int run(int argc, char *argv[]) {
- _cleanup_(userns_restrict_bpf_freep) struct userns_restrict_bpf *bpf = NULL;
usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
_cleanup_(sd_varlink_server_unrefp) sd_varlink_server *server = NULL;
+ _cleanup_(context_free) Context c = {};
_cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
unsigned n_iterations = 0;
int m, listen_fd, r;
&server,
SD_VARLINK_SERVER_INHERIT_USERDATA|
SD_VARLINK_SERVER_ALLOW_FD_PASSING_INPUT|SD_VARLINK_SERVER_ALLOW_FD_PASSING_OUTPUT,
- &bpf);
+ &c);
if (r < 0)
return log_error_errno(r, "Failed to allocate varlink server: %m");
if (r < 0)
return log_error_errno(r, "Failed to bind methods: %m");
+ r = sd_varlink_server_set_exit_on_idle(server, true);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enable exit-on-idle mode: %m");
+
r = getenv_bool("NSRESOURCE_FIXED_WORKER");
if (r < 0)
return log_error_errno(r, "Failed to parse NSRESOURCE_FIXED_WORKER: %m");