From: Serge Hallyn Date: Tue, 21 Jun 2022 12:50:53 +0000 (+0200) Subject: use systemd dbus StartTransientUnit for unpriv cgroup2 X-Git-Tag: lxc-5.0.1~15 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cfcbdb75f061108021cb233d221e8496ac40c30e;p=thirdparty%2Flxc.git use systemd dbus StartTransientUnit for unpriv cgroup2 If, when initializing cgroups for a container start, we detect that we are an unprivileged user on a unified-hierarchy-only system, then we ask systemd, through its D-Bus API, to create a new scope for us with delegation. Call the cgroup it creates for us P1. We then create P1/init and move ourselves into it, so that we can enable the controllers for delegation to P1's children through P1/cgroup.subtree_control. On attach, we ask systemd to attach us to the container's scope. We can't do that ourselves in the normal case, as root owns our login cgroups. Add a new command API through which the lxc monitor tells lxc-attach which systemd scope to attach to. 
Changelog: * free cgroup_meta.systemd_scope in lxc_conf_free (Thanks Tycho) * fix some indent * address some (not all) of brauner's feedback Signed-off-by: Serge Hallyn --- diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f01fdb3c9..0a6f406ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,7 +26,7 @@ jobs: run: | sudo apt-get update -qq sudo apt-get install -qq gcc clang meson llvm - sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x + sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev - name: Compiler version env: diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index 4457474b7..52d7cac72 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -25,7 +25,7 @@ jobs: run: | sudo apt-get update -qq sudo apt-get install -qq gcc clang - sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x + sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x libsystemd-dev - name: Compiler version run: | diff --git a/.github/workflows/sanitizers.sh b/.github/workflows/sanitizers.sh index 061061c0a..0144f153e 100755 --- a/.github/workflows/sanitizers.sh +++ b/.github/workflows/sanitizers.sh @@ -18,7 +18,7 @@ apt-get install --yes --no-install-recommends \ libpam0g-dev libseccomp-dev libselinux1-dev libtool linux-libc-dev \ llvm lsb-release make openssl pkg-config python3-all-dev \ python3-setuptools rsync squashfs-tools uidmap unzip uuid-runtime \ - wget xz-utils systemd-coredump + wget xz-utils systemd-coredump libsystemd-dev apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev ARGS="-Dprefix=/usr -Dtests=true -Dpam-cgroup=false -Dwerror=true -Dio-uring-event-loop=false -Db_lto_mode=default 
-Db_lundef=false" diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index 4a28c8e1c..ce50dfaec 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -22,7 +22,7 @@ jobs: run: | sudo apt-get update -qq sudo apt-get install -qq gcc clang meson llvm - sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x + sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev - name: Compiler version env: diff --git a/meson.build b/meson.build index 43362d37a..49a1c9a58 100644 --- a/meson.build +++ b/meson.build @@ -150,6 +150,7 @@ want_oss_fuzz = get_option('oss-fuzz') want_seccomp = get_option('seccomp') want_thread_safety = get_option('thread-safety') want_memfd_rexec = get_option('memfd-rexec') +want_sd_bus = get_option('sd-bus') srcconf.set_quoted('DEFAULT_CGROUP_PATTERN', cgrouppattern) if coverity @@ -255,6 +256,49 @@ else srcconf.set10('HAVE_LIBURING', false) endif +if not want_sd_bus.disabled() + has_sd_bus = true + sd_bus_optional = want_sd_bus.auto() + + libsystemd = dependency('libsystemd', required: not sd_bus_optional) + if not libsystemd.found() + if not sd_bus_optional + error('missing required libsystemd dependency') + endif + + has_sd_bus = false + endif + + if not cc.has_header('systemd/sd-bus.h') + if not sd_bus_optional + error('libsystemd misses required systemd/sd-bus.h header') + endif + + has_sd_bus = false + endif + + if not cc.has_header('systemd/sd-event.h') + if not sd_bus_optional + error('libsystemd misses required systemd/sd-event.h header') + endif + + has_sd_bus = false + endif + + if not cc.has_function('sd_bus_call_method_asyncv', prefix: '#include ', dependencies: libsystemd) + if not sd_bus_optional + error('libsystemd misses required sd_bus_call_method_asyncv function') + endif + + has_sd_bus = false + endif + + 
srcconf.set10('HAVE_LIBSYSTEMD', has_sd_bus) +else + has_sd_bus = false + srcconf.set10('HAVE_LIBSYSTEMD', false) +endif + ## Time EPOCH. sh = find_program('sh') date = find_program('date') @@ -638,6 +682,8 @@ endforeach found_headers = [] missing_headers = [] foreach tuple: [ + ['systemd/sd-bus.h'], + ['systemd/sd-event.h'], ['sys/resource.h'], ['sys/memfd.h'], ['sys/personality.h'], @@ -675,6 +721,7 @@ foreach tuple: [ ['pam'], ['openssl'], ['liburing'], + ['libsystemd'], ] if tuple.length() >= 2 @@ -749,6 +796,10 @@ if want_io_uring liblxc_dependencies += [liburing] endif +if has_sd_bus + liblxc_dependencies += [libsystemd] +endif + liblxc_link_whole = [liblxc_static] liblxc = shared_library( diff --git a/meson_options.txt b/meson_options.txt index 8f9b4e118..801ba4175 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -22,6 +22,9 @@ option('init-script', type : 'array', option('io-uring-event-loop', type: 'boolean', value: 'false', description: 'Enable io-uring based event loop') +option('sd-bus', type: 'feature', value: 'auto', + description: 'Enable linking against sd-bus') + # was --{disable,enable}-doc in autotools option('man', type: 'boolean', value: 'true', description: 'build and install manpages') diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index e39bde8df..ee4fc052f 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,11 @@ #include "strlcat.h" #endif +#if HAVE_LIBSYSTEMD +#include +#include +#endif + lxc_log_define(cgfsng, cgroup); /* @@ -947,6 +953,354 @@ static bool check_cgroup_dir_config(struct lxc_conf *conf) return true; } +#define SYSTEMD_SCOPE_FAILED 2 +#define SYSTEMD_SCOPE_UNSUPP 1 +#define SYSTEMD_SCOPE_SUCCESS 0 + +#if HAVE_LIBSYSTEMD +struct sd_callback_data { + char *scope_name; + bool job_complete; +}; + +static int systemd_jobremoved_callback(sd_bus_message *m, void *userdata, sd_bus_error *error) 
+{ + char *path, *unit, *result; + struct sd_callback_data *sd_data = userdata; + uint32_t id; + int r; + + r = sd_bus_message_read(m, "uoss", &id, &path, &unit, &result); + if (r < 0) + return log_error(-1, "bad message received in callback: %s", strerror(-r)); + + if (sd_data->scope_name && strcmp(unit, sd_data->scope_name) != 0) + return log_trace(-1, "unit was '%s' not '%s'", unit, sd_data->scope_name); + if (strcmp(result, "done") == 0) { + sd_data->job_complete = true; + return log_info(1, "job is done"); + } + return log_debug(0, "result was '%s', not 'done'", result); +} + +#define DESTINATION "org.freedesktop.systemd1" +#define PATH "/org/freedesktop/systemd1" +#define INTERFACE "org.freedesktop.systemd1.Manager" +#define MEMBER "StartTransientUnit" +static bool start_scope(sd_bus *bus, struct sd_callback_data *data, struct sd_event *event) +{ + __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; + __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; + __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; + char *path = NULL; + int r; + + r = sd_bus_message_new_method_call(bus, &m, + DESTINATION, PATH, INTERFACE, MEMBER); + if (r < 0) + return log_error(false, "Failed creating sdbus message"); + + r = sd_bus_message_append(m, "ss", data->scope_name, "fail"); + if (r < 0) + return log_error(false, "Failed setting systemd scope name"); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return log_error(false, "Failed allocating sdbus msg properties"); + + r = sd_bus_message_append(m, "(sv)(sv)(sv)", + "PIDs", "au", 1, getpid(), + "Delegate", "b", 1, + "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return log_error(false, "Failed setting properties on sdbus message"); + + r = sd_bus_message_close_container(m); + if (r < 0) + return log_error(false, "Failed closing sdbus message properties"); + + r = sd_bus_message_append(m, "a(sa(sv))", 0); 
+ if (r < 0) + return log_error(false, "Failed appending aux boilerplate\n"); + + r = sd_bus_call(NULL, m, 0, &error, &reply); + if (r < 0) + return log_error(false, "Failed sending sdbus message: %s", error.message); + + /* Parse the response message */ + r = sd_bus_message_read(reply, "o", &path); + if (r < 0) + return log_error(false, "Failed to parse response message: %s", strerror(-r)); + + /* Now spin up a mini-event-loop to wait for the "job completed" message */ + int tries = 0; + + while (!data->job_complete) { + r = sd_event_run(event, 1000 * 1000); + if (r < 0) { + log_debug(stderr, "Error waiting for JobRemoved: %s\n", strerror(-r)); + continue; + } + if (data->job_complete || tries == 5) + break; + if (r > 0) { + log_trace(stderr, "Debug: we processed an event (%d), but not the one we wanted\n", r); + continue; + } + if (r == 0) // timeout + tries++; + } + if (!data->job_complete) { + return log_error(false, "Error: %s job was never removed", data->scope_name); + } + return true; +} + +static bool string_pure_unified_system(char *contents) +{ + char *p; + bool first_line_read = false; + + lxc_iterate_parts(p, contents, "\n") { + if (first_line_read) // if >1 line, this is not pure unified + return false; + first_line_read = true; + + if (strlen(p) > 3 && strncmp(p, "0:", 2) == 0) + return true; + } + + return false; +} + +/* + * Only call get_current_unified_cgroup() when we are in a pure + * unified (v2-only) cgroup + */ +static char *get_current_unified_cgroup(void) +{ + __do_free char *buf = NULL; + __do_free_string_list char **list = NULL; + char *p; + + buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); + if (!buf) + return NULL; + + if (!string_pure_unified_system(buf)) + return NULL; + + // 0::/user.slice/user-1000.slice/session-136.scope + // Get past the "0::" + p = buf; + if (strnequal(p, "0::", STRLITERALLEN("0::"))) + p += STRLITERALLEN("0::"); + + return strdup(p); +} + +static bool pure_unified_system(void) +{ + __do_free 
char *buf = NULL; + + buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); + if (!buf) + return false; + + return string_pure_unified_system(buf); +} + +#define MEMBER_JOIN "AttachProcessesToUnit" +static bool enter_scope(char *scope_name, pid_t pid) +{ + __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; + __attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;; + __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL; + __attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL; + int r; + + r = sd_bus_open_user(&bus); + if (r < 0) + return log_error(false, "Failed to connect to user bus: %s", strerror(-r)); + + r = sd_bus_message_new_method_call(bus, &m, + DESTINATION, PATH, INTERFACE, MEMBER_JOIN); + if (r < 0) + return log_error(false, "Failed creating sdbus message"); + + r = sd_bus_message_append(m, "ssau", scope_name, "/init", 1, pid); + if (r < 0) + return log_error(false, "Failed setting systemd scope name"); + + + r = sd_bus_call(NULL, m, 0, &error, &reply); + if (r < 0) + return log_error(false, "Failed sending sdbus message: %s", error.message); + + return true; +} + +static bool enable_controllers_delegation(int fd_dir, char *cg) +{ + __do_free char *rbuf = NULL; + __do_free char *wbuf = NULL; + __do_free_string_list char **cpulist = NULL; + char *controller; + size_t full_len = 0; + bool first = true; + int ret; + + rbuf = read_file_at(fd_dir, "cgroup.controllers", PROTECT_OPEN, 0); + if (!rbuf) + return false; + + lxc_iterate_parts(controller, rbuf, " ") { + full_len += strlen(controller) + 2; + wbuf = must_realloc(wbuf, full_len); + if (first) { + wbuf[0] = '\0'; + first = false; + } else { + (void)strlcat(wbuf, " ", full_len + 1); + } + strlcat(wbuf, "+", full_len + 1); + strlcat(wbuf, controller, full_len + 1); + } + if (!wbuf) + return log_debug(true, "No controllers to delegate!"); + + ret = lxc_writeat(fd_dir, "cgroup.subtree_control", wbuf, 
strlen(wbuf)); + if (ret < 0) + return log_error_errno(false, errno, "Failed to write \"%s\" to %s/cgroup.subtree_control", wbuf, cg); + + return true; +} + +/* + * systemd places us in say .../lxc-1.scope. We create lxc-1.scope/init, + * move ourselves to there, then enable controllers in lxc-1.scope + */ +static bool move_and_delegate_unified(char *parent_cgroup) +{ + __do_free char *buf = NULL; + __do_close int fd_parent = -EBADF; + int ret; + + fd_parent = open_at(-EBADF, parent_cgroup, O_DIRECTORY, 0, 0); + if (fd_parent < 0) + return syserror_ret(false, "Failed opening cgroup dir \"%s\"", parent_cgroup); + + ret = mkdirat(fd_parent, "init", 0755); + if (ret < 0 && errno != EEXIST) + return syserror_ret(false, "Failed to create \"%d/init\" cgroup", fd_parent); + + buf = read_file_at(fd_parent, "cgroup.procs", PROTECT_OPEN, 0); + if (!buf) + return false; + + ret = lxc_writeat(fd_parent, "init/cgroup.procs", buf, strlen(buf)); + if (ret) + return syserror_ret(false, "Failed to escape to cgroup \"init/cgroup.procs\""); + + /* enable controllers in parent_cgroup */ + return enable_controllers_delegation(fd_parent, parent_cgroup); +} + +static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) +{ + __do_free char *full_scope_name = NULL; + __do_free char *fs_cg_path = NULL; + sd_event *event = NULL; + __attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; // free the bus before the names it references, just to be sure + struct sd_callback_data sd_data; + int idx = 0; + size_t len; + int r; + + if (geteuid() == 0) + return log_info(SYSTEMD_SCOPE_UNSUPP, "Running privileged, not using a systemd unit"); + // Pure_unified_layout() can't be used as that info is not yet setup. At + // the same time, we don't want to calculate current cgroups until after + // we optionally enter a new systemd user scope. 
So let's just do a quick + // check for pure unified cgroup system: single line /proc/self/cgroup with + // only index '0:' + if (!pure_unified_system()) + return log_info(SYSTEMD_SCOPE_UNSUPP, "Not in unified layout, not using a systemd unit"); + + r = sd_bus_open_user(&bus); + if (r < 0) + return log_error(SYSTEMD_SCOPE_FAILED, "Failed to connect to user bus: %s", strerror(-r)); + + r = sd_bus_call_method_asyncv(bus, NULL, DESTINATION, PATH, INTERFACE, "Subscribe", NULL, NULL, NULL, NULL); + if (r < 0) + return log_error(SYSTEMD_SCOPE_FAILED, "Failed to subscribe to signals: %s", strerror(-r)); + + sd_data.job_complete = false; + sd_data.scope_name = NULL; + r = sd_bus_match_signal(bus, + NULL, // no slot + DESTINATION, PATH, INTERFACE, "JobRemoved", + systemd_jobremoved_callback, &sd_data); + if (r < 0) + return log_error(SYSTEMD_SCOPE_FAILED, "Failed to register systemd event loop signal handler: %s", strerror(-r)); + + // NEXT: create and attach event + r = sd_event_new(&event); + if (r < 0) + return log_error(SYSTEMD_SCOPE_FAILED, "Failed allocating new event: %s\n", strerror(-r)); + r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) { + // bus won't clean up event since the attach failed + sd_event_unrefp(&event); + return log_error(SYSTEMD_SCOPE_FAILED, "Failed attaching event: %s\n", strerror(-r)); + } + + // "lxc-" + (conf->name) + "-NN" + ".scope" + '\0' + len = STRLITERALLEN("lxc-") + strlen(conf->name) + 3 + STRLITERALLEN(".scope") + 1; + full_scope_name = malloc(len); + if (!full_scope_name) + return syserror("Out of memory"); + + do { + snprintf(full_scope_name, len, "lxc-%s-%d.scope", conf->name, idx); + sd_data.scope_name = full_scope_name; + if (start_scope(bus, &sd_data, event)) { + conf->cgroup_meta.systemd_scope = get_current_unified_cgroup(); + if (!conf->cgroup_meta.systemd_scope) + return log_trace(SYSTEMD_SCOPE_FAILED, "Out of memory"); + fs_cg_path = must_make_path("/sys/fs/cgroup", conf->cgroup_meta.systemd_scope, 
NULL); + if (!move_and_delegate_unified(fs_cg_path)) + return log_error(SYSTEMD_SCOPE_FAILED, "Failed delegating the controllers to our cgroup"); + return log_trace(SYSTEMD_SCOPE_SUCCESS, "Created systemd scope %s", full_scope_name); + } + idx++; + } while (idx < 99); + + return SYSTEMD_SCOPE_FAILED; // failed, let's try old-school after all +} +#else /* !HAVE_LIBSYSTEMD */ +static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf) +{ + TRACE("unpriv_systemd_create_scope: no systemd support"); + return SYSTEMD_SCOPE_UNSUPP; // not supported +} +#endif /* HAVE_LIBSYSTEMD */ + +// Return a duplicate of cgroup path @cg without leading /, so +// that caller can own+free it and be certain it's not abspath. +static char *cgroup_relpath(char *cg) +{ + char *p; + + if (!cg || strequal(cg, "/")) + return NULL; + p = strdup(deabs(cg)); + if (!p) + return ERR_PTR(-ENOMEM); + + return p; +} + __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler) { __do_free char *monitor_cgroup = NULL; @@ -1176,14 +1530,19 @@ __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, if (ret) return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); - TRACE("Moved monitor into cgroup %d", h->dfd_mon); + TRACE("Moved monitor (%d) into cgroup %d", handler->monitor_pid, h->dfd_mon); if (handler->transient_pid <= 0) continue; ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len); - if (ret) - return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); + if (ret) { + // TODO: probably ask systemd to do the move for us instead + if (!handler->conf->cgroup_meta.systemd_scope) + return log_error_errno(false, errno, "Failed to enter pid %d into cgroup %d", handler->transient_pid, h->dfd_mon); + else + TRACE("Failed moving transient process into cgroup %d", h->dfd_mon); + } TRACE("Moved transient process into cgroup %d", h->dfd_mon); @@ -2184,14 +2543,30 @@ 
static int cgroup_attach_create_leaf(const struct lxc_conf *conf, } static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, + const char *lxcpath, int unified_fd, int *sk_fd, pid_t pid, bool unprivileged) { __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; size_t pidstr_len; +#if HAVE_LIBSYSTEMD + __do_free char *scope = NULL; +#endif ssize_t ret; +#if HAVE_LIBSYSTEMD + scope = lxc_cmd_get_systemd_scope(conf->name, lxcpath); + if (scope) { + TRACE("%s:%s is running under systemd-created scope '%s'. Attaching...", lxcpath, conf->name, scope); + if (enter_scope(scope, pid)) + TRACE("Successfully entered scope '%s'", scope); + else + ERROR("Failed entering scope '%s'", scope); + } else { + TRACE("%s:%s is not running under a systemd-created scope", lxcpath, conf->name); + } +#endif if (unprivileged) { ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1); if (ret < 0) @@ -2229,6 +2604,7 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, struct userns_exec_unified_attach_data { const struct lxc_conf *conf; + const char *lxcpath; int unified_fd; int sk_pair[2]; pid_t pid; @@ -2239,8 +2615,8 @@ static int cgroup_unified_attach_child_wrapper(void *data) { struct userns_exec_unified_attach_data *args = data; - if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || - args->sk_pair[0] < 0 || args->sk_pair[1] < 0) + if (!args->conf || !args->lxcpath || args->unified_fd < 0 || + args->pid <= 0 || args->sk_pair[0] < 0 || args->sk_pair[1] < 0) return ret_errno(EINVAL); close_prot_errno_disarm(args->sk_pair[0]); @@ -2257,7 +2633,8 @@ static int cgroup_unified_attach_parent_wrapper(void *data) return ret_errno(EINVAL); close_prot_errno_disarm(args->sk_pair[1]); - return cgroup_attach_move_into_leaf(args->conf, args->unified_fd, + return cgroup_attach_move_into_leaf(args->conf, args->lxcpath, + args->unified_fd, &args->sk_pair[0], args->pid, args->unprivileged); } 
@@ -2286,6 +2663,7 @@ static int __cg_unified_attach(const struct hierarchy *h, ret = cgroup_attach(conf, name, lxcpath, pid); if (ret == 0) return log_trace(0, "Attached to unified cgroup via command handler"); + TRACE("__cg_unified_attach: cgroup_attach returned %d", ret); if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2) return log_error_errno(ret, errno, "Failed to attach to unified cgroup"); @@ -2294,6 +2672,7 @@ static int __cg_unified_attach(const struct hierarchy *h, /* not running */ if (!cgroup) return 0; + TRACE("lxc_cmd_get_cgroup_path returned %s", cgroup); path = make_cgroup_path(h, cgroup, NULL); @@ -2307,6 +2686,7 @@ static int __cg_unified_attach(const struct hierarchy *h, .unified_fd = unified_fd, .pid = pid, .unprivileged = am_guest_unpriv(), + .lxcpath = lxcpath, }; ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); @@ -3152,12 +3532,19 @@ static const char *stable_order(const char *controllers) #define CGFSNG_LAYOUT_UNIFIED BIT(1) static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, - bool unprivileged) + bool unprivileged, struct lxc_conf *conf) { __do_free char *cgroup_info = NULL; unsigned int layout_mask = 0; + int ret; char *it; + ret = unpriv_systemd_create_scope(ops, conf); + if (ret < 0) + return ret_set_errno(false, ret); + else if (ret == 0) + TRACE("Entered an unpriv systemd scope"); + /* * Root spawned containers escape the current cgroup, so use init's * cgroups as our base in that case. @@ -3175,7 +3562,7 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, __do_free_string_list char **controller_list = NULL, **delegate = NULL; char *line; - int dfd, ret, type; + int dfd, type; /* Handle the unified cgroup hierarchy. 
*/ line = it; @@ -3185,7 +3572,10 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, type = UNIFIED_HIERARCHY; layout_mask |= CGFSNG_LAYOUT_UNIFIED; - current_cgroup = current_unified_cgroup(relative, line); + if (conf->cgroup_meta.systemd_scope) + current_cgroup = cgroup_relpath(conf->cgroup_meta.systemd_scope); + if (IS_ERR_OR_NULL(current_cgroup)) + current_cgroup = current_unified_cgroup(relative, line); if (IS_ERR(current_cgroup)) return PTR_ERR(current_cgroup); @@ -3429,7 +3819,7 @@ static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) */ ops->dfd_mnt = dfd; - ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map)); + ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map), conf); if (ret < 0) return syserror_ret(ret, "Failed to initialize cgroups"); @@ -3502,7 +3892,7 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) return move_ptr(cgfsng_ops); } -static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid) +static int __unified_attach_fd(const struct lxc_conf *conf, const char *lxcpath, int fd_unified, pid_t pid) { int ret; @@ -3512,6 +3902,7 @@ static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_ .unified_fd = fd_unified, .pid = pid, .unprivileged = am_guest_unpriv(), + .lxcpath = lxcpath, }; ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); @@ -3555,7 +3946,7 @@ static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name, int dfd_con = ctx->fd[idx]; if (unified_cgroup_fd(dfd_con)) - ret = __unified_attach_fd(conf, dfd_con, pid); + ret = __unified_attach_fd(conf, lxcpath, dfd_con, pid); else ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len); if (ret) @@ -3580,7 +3971,7 @@ static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name if (dfd_unified < 0) return ret_errno(ENOSYS); - return __unified_attach_fd(conf, 
dfd_unified, pid); + return __unified_attach_fd(conf, lxcpath, dfd_unified, pid); } int cgroup_attach(const struct lxc_conf *conf, const char *name, diff --git a/src/lxc/commands.c b/src/lxc/commands.c index 27861f25d..946c72e95 100644 --- a/src/lxc/commands.c +++ b/src/lxc/commands.c @@ -89,6 +89,7 @@ static const char *lxc_cmd_str(lxc_cmd_t cmd) [LXC_CMD_GET_CGROUP_CTX] = "get_cgroup_ctx", [LXC_CMD_GET_CGROUP_FD] = "get_cgroup_fd", [LXC_CMD_GET_LIMIT_CGROUP_FD] = "get_limit_cgroup_fd", + [LXC_CMD_GET_SYSTEMD_SCOPE] = "get_systemd_scope", }; if (cmd >= LXC_CMD_MAX) @@ -1316,6 +1317,55 @@ static int lxc_cmd_get_lxcpath_callback(int fd, struct lxc_cmd_req *req, return lxc_cmd_rsp_send_reap(fd, &rsp); } +char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath) +{ + bool stopped = false; + ssize_t ret; + struct lxc_cmd_rr cmd; + + lxc_cmd_init(&cmd, LXC_CMD_GET_SYSTEMD_SCOPE); + + ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL); + if (ret < 0) + return NULL; + + if (cmd.rsp.ret == 0) + return cmd.rsp.data; + + return NULL; +} + +static int lxc_cmd_get_systemd_scope_callback(int fd, struct lxc_cmd_req *req, + struct lxc_handler *handler, + struct lxc_async_descr *descr) +{ + __do_free char *scope = NULL; + struct lxc_cmd_rsp rsp = { + .ret = -EINVAL, + }; + + // cgroup_meta.systemd_scope is the full cgroup path to the scope. + // The caller just wants the actual scope name, that is, basename(). + // (XXX - or do we want the caller to massage it? 
I'm undecided) + if (handler->conf->cgroup_meta.systemd_scope) { + scope = strrchr(handler->conf->cgroup_meta.systemd_scope, '/'); + if (scope && *scope) + scope++; + if (scope && *scope) + scope = strdup(scope); + } + + if (!scope) + goto out; + + rsp.ret = 0; + rsp.data = scope; + rsp.datalen = strlen(scope) + 1; + +out: + return lxc_cmd_rsp_send_reap(fd, &rsp); +} + int lxc_cmd_add_state_client(const char *name, const char *lxcpath, lxc_state_t states[static MAX_STATE], int *state_client_fd) @@ -1900,6 +1950,7 @@ static int lxc_cmd_process(int fd, struct lxc_cmd_req *req, [LXC_CMD_GET_CGROUP_CTX] = lxc_cmd_get_cgroup_ctx_callback, [LXC_CMD_GET_CGROUP_FD] = lxc_cmd_get_cgroup_fd_callback, [LXC_CMD_GET_LIMIT_CGROUP_FD] = lxc_cmd_get_limit_cgroup_fd_callback, + [LXC_CMD_GET_SYSTEMD_SCOPE] = lxc_cmd_get_systemd_scope_callback, }; if (req->cmd >= LXC_CMD_MAX) diff --git a/src/lxc/commands.h b/src/lxc/commands.h index b4aac93a0..2a3974807 100644 --- a/src/lxc/commands.h +++ b/src/lxc/commands.h @@ -52,6 +52,7 @@ typedef enum { LXC_CMD_GET_CGROUP_CTX = 23, LXC_CMD_GET_CGROUP_FD = 24, LXC_CMD_GET_LIMIT_CGROUP_FD = 25, + LXC_CMD_GET_SYSTEMD_SCOPE = 26, LXC_CMD_MAX, } lxc_cmd_t; @@ -115,6 +116,7 @@ __hidden extern char *lxc_cmd_get_config_item(const char *name, const char *item const char *lxcpath); __hidden extern char *lxc_cmd_get_name(const char *hashed_sock); __hidden extern char *lxc_cmd_get_lxcpath(const char *hashed_sock); +__hidden extern char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath); __hidden extern pid_t lxc_cmd_get_init_pid(const char *name, const char *lxcpath); __hidden extern int lxc_cmd_get_init_pidfd(const char *name, const char *lxcpath); __hidden extern int lxc_cmd_get_state(const char *name, const char *lxcpath); diff --git a/src/lxc/conf.c b/src/lxc/conf.c index a3293a531..a24fdcc8f 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -4831,6 +4831,7 @@ void lxc_conf_free(struct lxc_conf *conf) 
free(conf->cgroup_meta.container_dir); free(conf->cgroup_meta.namespace_dir); free(conf->cgroup_meta.controllers); + free(conf->cgroup_meta.systemd_scope); free(conf->shmount.path_host); free(conf->shmount.path_cont); free(conf); diff --git a/src/lxc/conf.h b/src/lxc/conf.h index ccf59b47e..7dc2f15b6 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -74,6 +74,13 @@ struct lxc_cgroup { char *container_dir; char *namespace_dir; bool relative; + /* If an unpriv user in pure unified-only hierarchy + * starts a container, then we ask systemd to create + * a scope for us, and create the monitor and container + * cgroups under that. + * This will ignore the above things like monitor_dir + */ + char *systemd_scope; }; }; diff --git a/src/tests/oss-fuzz.sh b/src/tests/oss-fuzz.sh index 4a3920a77..2f95d34e5 100755 --- a/src/tests/oss-fuzz.sh +++ b/src/tests/oss-fuzz.sh @@ -24,7 +24,7 @@ mkdir -p $OUT apt-get update -qq apt-get install --yes --no-install-recommends \ build-essential docbook2x doxygen git \ - wget xz-utils systemd-coredump pkgconf + wget xz-utils systemd-coredump pkgconf libsystemd-dev apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev # make sure we have a new enough meson version