From: Christian Brauner Date: Sat, 20 Feb 2021 01:01:18 +0000 (+0100) Subject: cgroups: rework cgroup initialization X-Git-Tag: lxc-5.0.0~274^2~20 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=35ec1a385a2b0521b522d2c7cc82bfc7ef633e88;p=thirdparty%2Flxc.git cgroups: rework cgroup initialization Signed-off-by: Christian Brauner --- diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 8431007f4..a1e2ff95e 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -60,22 +60,29 @@ lxc_log_define(cgfsng, cgroup); -/* Given a pointer to a null-terminated array of pointers, realloc to add one +/* + * Given a pointer to a null-terminated array of pointers, realloc to add one * entry, and point the new entry to NULL. Do not fail. Return the index to the * second-to-last entry - that is, the one which is now available for use * (keeping the list null-terminated). */ -static int append_null_to_list(void ***list) +static int list_add(void ***list) { - int newentry = 0; + int idx = 0; + void **p; if (*list) - for (; (*list)[newentry]; newentry++) + for (; (*list)[idx]; idx++) ; - *list = must_realloc(*list, (newentry + 2) * sizeof(void **)); - (*list)[newentry + 1] = NULL; - return newentry; + p = realloc(*list, (idx + 2) * sizeof(void **)); + if (!p) + return ret_errno(ENOMEM); + + p[idx + 1] = NULL; + *list = p; + + return idx; } /* Given a null-terminated array of strings, check whether @entry is one of the @@ -93,59 +100,6 @@ static bool string_in_list(char **list, const char *entry) return false; } -/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into - * "name=systemd". Do not fail. 
- */ -static char *cg_legacy_must_prefix_named(char *entry) -{ - size_t len; - char *prefixed; - - len = strlen(entry); - prefixed = must_realloc(NULL, len + 6); - - memcpy(prefixed, "name=", STRLITERALLEN("name=")); - memcpy(prefixed + STRLITERALLEN("name="), entry, len); - prefixed[len + 5] = '\0'; - - return prefixed; -} - -/* Append an entry to the clist. Do not fail. @clist must be NULL the first time - * we are called. - * - * We also handle named subsystems here. Any controller which is not a kernel - * subsystem, we prefix "name=". Any which is both a kernel and named subsystem, - * we refuse to use because we're not sure which we have here. - * (TODO: We could work around this in some cases by just remounting to be - * unambiguous, or by comparing mountpoint contents with current cgroup.) - * - * The last entry will always be NULL. - */ -static void must_append_controller(char **klist, char **nlist, char ***clist, - char *entry) -{ - int newentry; - char *copy; - - if (string_in_list(klist, entry) && string_in_list(nlist, entry)) { - ERROR("Refusing to use ambiguous controller \"%s\"", entry); - ERROR("It is both a named and kernel subsystem"); - return; - } - - newentry = append_null_to_list((void ***)clist); - - if (strnequal(entry, "name=", 5)) - copy = must_copy_string(entry); - else if (string_in_list(klist, entry)) - copy = must_copy_string(entry); - else - copy = cg_legacy_must_prefix_named(entry); - - (*clist)[newentry] = copy; -} - /* Given a handler's cgroup data, return the struct hierarchy for the controller * @c, or NULL if there is none. */ @@ -318,37 +272,6 @@ static inline bool is_unified_hierarchy(const struct hierarchy *h) return h->version == CGROUP2_SUPER_MAGIC; } -/* Given two null-terminated lists of strings, return true if any string is in - * both. 
- */ -static bool controller_lists_intersect(char **l1, char **l2) -{ - if (!l1 || !l2) - return false; - - for (int i = 0; l1[i]; i++) - if (string_in_list(l2, l1[i])) - return true; - - return false; -} - -/* For a null-terminated list of controllers @clist, return true if any of those - * controllers is already listed the null-terminated list of hierarchies @hlist. - * Realistically, if one is present, all must be present. - */ -static bool controller_list_is_dup(struct hierarchy **hlist, char **clist) -{ - if (!hlist) - return false; - - for (int i = 0; hlist[i]; i++) - if (controller_lists_intersect(hlist[i]->controllers, clist)) - return true; - - return false; -} - /* Return true if the controller @entry is found in the null-terminated list of * hierarchies @hlist. */ @@ -367,7 +290,7 @@ static bool controller_found(struct hierarchy **hlist, char *entry) /* Return true if all of the controllers which we require have been found. The * required list is freezer and anything in lxc.cgroup.use. */ -static bool all_controllers_found(struct cgroup_ops *ops) +__lxc_unused static bool all_controllers_found(struct cgroup_ops *ops) { struct hierarchy **hlist; @@ -382,98 +305,51 @@ static bool all_controllers_found(struct cgroup_ops *ops) return true; } -/* Get the controllers from a mountinfo line There are other ways we could get - * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we - * could parse the mount options. But we simply assume that the mountpoint must - * be /sys/fs/cgroup/controller-list - */ -static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line, - int type) +static char **__controller_list_empty(void) { - /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list - * for legacy hierarchies. 
- */ __do_free_string_list char **aret = NULL; - int i; - char *p2, *tok; - char *p = line, *sep = ","; - - for (i = 0; i < 4; i++) { - p = strchr(p, ' '); - if (!p) - return NULL; - p++; - } + int newentry; - /* Note, if we change how mountinfo works, then our caller will need to - * verify /sys/fs/cgroup/ in this field. - */ - if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15)) - return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p); + newentry = list_add((void ***)&aret); + aret[newentry] = NULL; + return move_ptr(aret); +} - p += 15; - p2 = strchr(p, ' '); - if (!p2) - return log_error(NULL, "Corrupt mountinfo"); - *p2 = '\0'; +static char **__controller_list(char *controllers) +{ + __do_free_string_list char **controller_list = NULL; + char *it; - if (type == CGROUP_SUPER_MAGIC) { - __do_free char *dup = NULL; + lxc_iterate_parts(it, controllers, " \t\n") { + int idx; - /* strdup() here for v1 hierarchies. Otherwise - * lxc_iterate_parts() will destroy mountpoints such as - * "/sys/fs/cgroup/cpu,cpuacct". 
- */ - dup = must_copy_string(p); - if (!dup) + idx = list_add((void ***)&controller_list); + controller_list[idx] = strdup(it); + if (!controller_list[idx]) return NULL; - - lxc_iterate_parts(tok, dup, sep) - must_append_controller(klist, nlist, &aret, tok); } - *p2 = ' '; - return move_ptr(aret); -} - -static char **cg_unified_make_empty_controller(void) -{ - __do_free_string_list char **aret = NULL; - int newentry; + if (!controller_list) + return NULL; - newentry = append_null_to_list((void ***)&aret); - aret[newentry] = NULL; - return move_ptr(aret); + return move_ptr(controller_list); } -static char **cg_unified_get_controllers(int dfd, const char *file) +static char **unified_controllers(int dfd, const char *file) { __do_free char *buf = NULL; - __do_free_string_list char **aret = NULL; - char *sep = " \t\n"; - char *tok; buf = read_file_at(dfd, file, PROTECT_OPEN, 0); if (!buf) return NULL; - lxc_iterate_parts(tok, buf, sep) { - int newentry; - char *copy; - - newentry = append_null_to_list((void ***)&aret); - copy = must_copy_string(tok); - aret[newentry] = copy; - } - - return move_ptr(aret); + return __controller_list(buf); } -static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops, - char **controllers) +static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers) { if (!ops->cgroup_use) - return true; + return false; for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) { bool found = false; @@ -489,299 +365,60 @@ static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops, if (found) continue; - return false; + return true; } - return true; + return false; } -static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint, - char *container_base_path, int type) +static int add_hierarchy(struct cgroup_ops *ops, int dfd_mnt, char *mnt, + int dfd_base, char *base_cgroup, char **controllers, + int type) { - __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; __do_free struct 
hierarchy *new = NULL; - __do_free_string_list char **controllers = clist; int idx; - if (abspath(container_base_path)) + if (abspath(base_cgroup)) return syserrno_set(-EINVAL, "Container base path must be relative to controller mount"); - if (!controllers && type != CGROUP2_SUPER_MAGIC) - return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed"); - - dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); - if (dfd_mnt < 0) - return syserrno(-errno, "Failed to open %s", mountpoint); - - if (!is_empty_string(container_base_path)) { - dfd_base = open_at(dfd_mnt, container_base_path, - PROTECT_OPATH_DIRECTORY, - PROTECT_LOOKUP_BENEATH_XDEV, 0); - if (dfd_base < 0) - return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path); - } - - if (!controllers) { - /* - * We assume that the cgroup we're currently in has been delegated to - * us and we are free to further delege all of the controllers listed - * in cgroup.controllers further down the hierarchy. - */ - if (dfd_base < 0) - controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers"); - else - controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers"); - if (!controllers) - controllers = cg_unified_make_empty_controller(); - if (!controllers[0]) - TRACE("No controllers are enabled for delegation"); - } - - /* Exclude all controllers that cgroup use does not want. 
*/ - if (!cgroup_use_wants_controllers(ops, controllers)) - return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers"); - new = zalloc(sizeof(*new)); if (!new) return ret_errno(ENOMEM); - new->version = type; - new->controllers = move_ptr(controllers); - new->mountpoint = mountpoint; - new->container_base_path = container_base_path; new->cgfd_con = -EBADF; new->cgfd_limit = -EBADF; new->cgfd_mon = -EBADF; - TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s", - mountpoint, container_base_path); + new->version = type; + new->controllers = controllers; + new->mountpoint = mnt; + new->container_base_path = base_cgroup; + + new->dfd_mnt = dfd_mnt; + new->dfd_base = dfd_base; + + TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s", + mnt, maybe_empty(base_cgroup)); for (char *const *it = new->controllers; it && *it; it++) - TRACE("The detected hierarchy contains the %s controller", *it); + TRACE("The hierarchy contains the %s controller", *it); - idx = append_null_to_list((void ***)&ops->hierarchies); - if (dfd_base < 0) - new->dfd_base = dfd_mnt; - else - new->dfd_base = move_fd(dfd_base); - new->dfd_mnt = move_fd(dfd_mnt); + idx = list_add((void ***)&ops->hierarchies); if (type == CGROUP2_SUPER_MAGIC) ops->unified = new; (ops->hierarchies)[idx] = move_ptr(new); return 0; } -/* Get a copy of the mountpoint from @line, which is a line from - * /proc/self/mountinfo. - */ -static char *cg_hybrid_get_mountpoint(char *line) -{ - char *p = line, *sret = NULL; - size_t len; - char *p2; - - for (int i = 0; i < 4; i++) { - p = strchr(p, ' '); - if (!p) - return NULL; - p++; - } - - if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15)) - return NULL; - - p2 = strchr(p + 15, ' '); - if (!p2) - return NULL; - *p2 = '\0'; - - len = strlen(p); - sret = must_realloc(NULL, len + 1); - memcpy(sret, p, len); - sret[len] = '\0'; - - return sret; -} - -/* Given a multi-line string, return a null-terminated copy of the current line. 
*/ -static char *copy_to_eol(char *p) -{ - char *p2, *sret; - size_t len; - - p2 = strchr(p, '\n'); - if (!p2) - return NULL; - - len = p2 - p; - sret = must_realloc(NULL, len + 1); - memcpy(sret, p, len); - sret[len] = '\0'; - - return sret; -} - -/* cgline: pointer to character after the first ':' in a line in a \n-terminated - * /proc/self/cgroup file. Check whether controller c is present. - */ -static bool controller_in_clist(char *cgline, char *c) -{ - __do_free char *tmp = NULL; - char *tok, *eol; - size_t len; - - eol = strchr(cgline, ':'); - if (!eol) - return false; - - len = eol - cgline; - tmp = must_realloc(NULL, len + 1); - memcpy(tmp, cgline, len); - tmp[len] = '\0'; - - lxc_iterate_parts(tok, tmp, ",") - if (strequal(tok, c)) - return true; - - return false; -} - -static inline char *trim(char *s) -{ - size_t len; - - len = strlen(s); - while ((len > 1) && (s[len - 1] == '\n')) - s[--len] = '\0'; - - return s; -} - -/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for - * @controller. 
- */ -static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo, - char *controller, int type) -{ - char *base_cgroup = basecginfo; - - for (;;) { - bool is_cgv2_base_cgroup = false; - - /* cgroup v2 entry in "/proc//cgroup": "0::/some/path" */ - if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0')) - is_cgv2_base_cgroup = true; - - base_cgroup = strchr(base_cgroup, ':'); - if (!base_cgroup) - return NULL; - base_cgroup++; - - if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) { - __do_free char *copy = NULL; - - base_cgroup = strchr(base_cgroup, ':'); - if (!base_cgroup) - return NULL; - base_cgroup++; - - copy = copy_to_eol(base_cgroup); - if (!copy) - return NULL; - trim(copy); - - if (!relative) { - base_cgroup = prune_init_scope(copy); - if (!base_cgroup) - return NULL; - } else { - base_cgroup = copy; - } - - if (abspath(base_cgroup)) - base_cgroup = deabs(base_cgroup); - - /* We're allowing base_cgroup to be "". */ - return strdup(base_cgroup); - } - - base_cgroup = strchr(base_cgroup, '\n'); - if (!base_cgroup) - return NULL; - base_cgroup++; - } -} - static void must_append_string(char ***list, char *entry) { int newentry; char *copy; - newentry = append_null_to_list((void ***)list); + newentry = list_add((void ***)list); copy = must_copy_string(entry); (*list)[newentry] = copy; } -static int get_existing_subsystems(char ***klist, char ***nlist) -{ - __do_free char *line = NULL; - __do_fclose FILE *f = NULL; - size_t len = 0; - - f = fopen("/proc/self/cgroup", "re"); - if (!f) - return -1; - - while (getline(&line, &len, f) != -1) { - char *p, *p2, *tok; - p = strchr(line, ':'); - if (!p) - continue; - p++; - p2 = strchr(p, ':'); - if (!p2) - continue; - *p2 = '\0'; - - /* If the kernel has cgroup v2 support, then /proc/self/cgroup - * contains an entry of the form: - * - * 0::/some/path - * - * In this case we use "cgroup2" as controller name. 
- */ - if ((p2 - p) == 0) { - must_append_string(klist, "cgroup2"); - continue; - } - - lxc_iterate_parts(tok, p, ",") { - if (strnequal(tok, "name=", 5)) - must_append_string(nlist, tok); - else - must_append_string(klist, tok); - } - } - - return 0; -} - -static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist, - char **nlist) -{ - int k; - char **it; - - TRACE("basecginfo is:"); - TRACE("%s", basecginfo); - - for (k = 0, it = klist; it && *it; it++, k++) - TRACE("kernel subsystem %d: %s", k, *it); - - for (k = 0, it = nlist; it && *it; it++, k++) - TRACE("named subsystem %d: %s", k, *it); -} - static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune) { if (!path_prune || !hierarchies) @@ -1130,7 +767,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, !ops->setup_limits_legacy(ops, conf, true)) return log_error(false, "Failed to setup legacy device limits"); - limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL); + limit_path = make_cgroup_path(h, h->container_base_path, cgroup_limit_dir, NULL); path = must_make_path(limit_path, cgroup_leaf, NULL); /* @@ -1146,7 +783,7 @@ static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf, TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir); } } else { - path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL); + path = make_cgroup_path(h, h->container_base_path, cgroup_limit_dir, NULL); fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false); } @@ -1805,8 +1442,7 @@ static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarc INFO("Remounted %s read-only", controllerpath); } - sourcepath = must_make_path(h->mountpoint, h->container_base_path, - container_cgroup, NULL); + sourcepath = make_cgroup_path(h, h->container_base_path, container_cgroup, NULL); if (cgroup_automount_type == 
LXC_AUTO_CGROUP_RO) flags |= MS_RDONLY; @@ -2128,7 +1764,7 @@ __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, for (int i = 0; ops->hierarchies[i]; i++) { __do_free char *controllerpath = NULL, *path2 = NULL; struct hierarchy *h = ops->hierarchies[i]; - char *controller = strrchr(h->mountpoint, '/'); + char *controller = h->mountpoint; if (!controller) continue; @@ -2196,10 +1832,9 @@ __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops, __do_free char *fullpath = NULL; int ret; - fullpath = - must_make_path(ops->hierarchies[i]->mountpoint, - ops->hierarchies[i]->container_base_path, - "cgroup.procs", NULL); + fullpath = make_cgroup_path(ops->hierarchies[i], + ops->hierarchies[i]->container_base_path, + "cgroup.procs", NULL); ret = lxc_write_to_file(fullpath, "0", 2, false, 0666); if (ret != 0) return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath); @@ -2385,20 +2020,28 @@ static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops, const char *controller, bool limiting) { struct hierarchy *h; + size_t len; + const char *path; h = get_hierarchy(ops, controller); if (!h) - return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"", - controller ? controller : "(null)"); + return log_warn_errno(NULL, ENOENT, + "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller)); if (limiting) - return h->container_limit_path - ? h->container_limit_path + strlen(h->mountpoint) - : NULL; + path = h->container_limit_path; + else + path = h->container_full_path; + if (!path) + return NULL; - return h->container_full_path - ? 
h->container_full_path + strlen(h->mountpoint) - : NULL; + len = strlen(h->mountpoint); + if (!strnequal(h->mountpoint, DEFAULT_CGROUP_MOUNTPOINT, + STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) { + path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT); + path += strspn(path, "/"); + } + return path += len; } __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops, @@ -2420,7 +2063,7 @@ static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h, const char *inpath, const char *filename) { - return must_make_path(h->mountpoint, inpath, filename, NULL); + return make_cgroup_path(h, inpath, filename, NULL); } static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid) @@ -2619,7 +2262,7 @@ static int __cg_unified_attach(const struct hierarchy *h, if (!cgroup) return 0; - path = must_make_path(h->mountpoint, cgroup, NULL); + path = make_cgroup_path(h, cgroup, NULL); unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC); if (unified_fd < 0) @@ -3278,21 +2921,27 @@ __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops * return __cgfsng_delegate_controllers(ops, ops->container_cgroup); } -static void cg_unified_delegate(char ***delegate) +static int cg_unified_delegate(char ***delegate) { __do_free char *buf = NULL; - char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL}; + char *standard[] = { + "cgroup.procs", + "cgroup.threads", + "cgroup.subtree_control", + "memory.oom.group", + NULL, + }; char *token; int idx; buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0); if (!buf) { for (char **p = standard; p && *p; p++) { - idx = append_null_to_list((void ***)delegate); + idx = list_add((void ***)delegate); (*delegate)[idx] = must_copy_string(*p); } - SYSWARN("Failed to read /sys/kernel/cgroup/delegate"); - return; + + return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate"); } lxc_iterate_parts(token, buf, " \t\n") { @@ -3303,127 +2952,19 @@ static 
void cg_unified_delegate(char ***delegate) if (strequal(token, "cgroup.procs")) continue; - idx = append_null_to_list((void ***)delegate); + idx = list_add((void ***)delegate); (*delegate)[idx] = must_copy_string(token); } -} - -/* At startup, parse_hierarchies finds all the info we need about cgroup - * mountpoints and current cgroups, and stores it in @d. - */ -static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged) -{ - __do_free char *cgroup_info = NULL, *line = NULL; - __do_free_string_list char **klist = NULL, **nlist = NULL; - __do_fclose FILE *f = NULL; - int ret; - size_t len = 0; - - /* Root spawned containers escape the current cgroup, so use init's - * cgroups as our base in that case. - */ - if (!relative && (geteuid() == 0)) - cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); - else - cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); - if (!cgroup_info) - return ret_errno(ENOMEM); - - ret = get_existing_subsystems(&klist, &nlist); - if (ret < 0) - return syserrno(-errno, "Failed to retrieve available legacy cgroup controllers"); - - f = fopen("/proc/self/mountinfo", "re"); - if (!f) - return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\""); - - lxc_cgfsng_print_basecg_debuginfo(cgroup_info, klist, nlist); - - while (getline(&line, &len, f) != -1) { - __do_free char *base_cgroup = NULL, *mountpoint = NULL; - __do_free_string_list char **controller_list = NULL; - int type; - bool writeable; - - type = get_cgroup_version(line); - if (type == 0) - continue; - - if (type == CGROUP2_SUPER_MAGIC && ops->unified) - continue; - - if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) { - if (type == CGROUP2_SUPER_MAGIC) - ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; - else if (type == CGROUP_SUPER_MAGIC) - ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; - } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) { - if (type == CGROUP_SUPER_MAGIC) - ops->cgroup_layout = 
CGROUP_LAYOUT_HYBRID; - } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { - if (type == CGROUP2_SUPER_MAGIC) - ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; - } - - controller_list = cg_hybrid_get_controllers(klist, nlist, line, type); - if (!controller_list && type == CGROUP_SUPER_MAGIC) - continue; - - if (type == CGROUP_SUPER_MAGIC) - if (controller_list_is_dup(ops->hierarchies, controller_list)) { - TRACE("Skipping duplicating controller"); - continue; - } - - mountpoint = cg_hybrid_get_mountpoint(line); - if (!mountpoint) { - WARN("Failed parsing mountpoint from \"%s\"", line); - continue; - } - - if (type == CGROUP_SUPER_MAGIC) - base_cgroup = cg_hybrid_get_current_cgroup(relative, cgroup_info, controller_list[0], CGROUP_SUPER_MAGIC); - else - base_cgroup = cg_hybrid_get_current_cgroup(relative, cgroup_info, NULL, CGROUP2_SUPER_MAGIC); - if (!base_cgroup) { - WARN("Failed to find current cgroup"); - continue; - } - - if (type == CGROUP2_SUPER_MAGIC) - writeable = test_writeable_v2(mountpoint, base_cgroup); - else - writeable = test_writeable_v1(mountpoint, base_cgroup); - if (!writeable) { - TRACE("The %s group is not writeable", base_cgroup); - continue; - } - - if (type == CGROUP2_SUPER_MAGIC) - ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type); - else - ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type); - if (ret) - return syserrno(ret, "Failed to add cgroup hierarchy"); - if (ops->unified && unprivileged) - cg_unified_delegate(&(ops->unified)->cgroup2_chown); - } - - /* verify that all controllers in cgroup.use and all crucial - * controllers are accounted for - */ - if (!all_controllers_found(ops)) - return log_error_errno(-1, ENOENT, "Failed to find all required controllers"); return 0; } -static inline bool __current_cgroup_unified_line(const char *line) +static inline bool unified_cgroup(const char *line) { return *line == '0'; } -static inline char 
*__current_cgroup_unified(bool relative, char *line) +static inline char *current_unified_cgroup(bool relative, char *line) { char *current_cgroup; @@ -3446,70 +2987,209 @@ static inline char *__current_cgroup_unified(bool relative, char *line) return current_cgroup; } -/* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */ -static char *current_cgroup_unified(bool relative) +static inline const char *unprefix(const char *controllers) +{ + if (strnequal(controllers, "name=", STRLITERALLEN("name="))) + return controllers + STRLITERALLEN("name="); + return controllers; +} + +static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, + bool unprivileged) { __do_free char *cgroup_info = NULL; char *it; + /* + * Root spawned containers escape the current cgroup, so use init's + * cgroups as our base in that case. + */ if (!relative && (geteuid() == 0)) cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); else cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); if (!cgroup_info) - return NULL; + return ret_errno(ENOMEM); lxc_iterate_parts(it, cgroup_info, "\n") { - char *current_cgroup; + __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; + __do_free char *controllers = NULL, *current_cgroup = NULL; + __do_free_string_list char **controller_list = NULL, + **delegate = NULL; + char *line; + int dfd, ret, type; + + /* Handle the unified cgroup hierarchy. 
*/ + line = it; + if (unified_cgroup(line)) { + char *unified_mnt; + + current_cgroup = current_unified_cgroup(relative, line); + if (IS_ERR(current_cgroup)) + return PTR_ERR(current_cgroup); + + if (unified_cgroup_fd(ops->dfd_mnt_cgroupfs_host)) { + dfd_mnt = dup_cloexec(ops->dfd_mnt_cgroupfs_host); + unified_mnt = ""; + } else { + dfd_mnt = open_at(ops->dfd_mnt_cgroupfs_host, + "unified", + PROTECT_OPATH_DIRECTORY, + PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); + unified_mnt = "unified"; + } + if (dfd_mnt < 0) { + if (errno != ENOENT) + return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt_cgroupfs_host); - if (!__current_cgroup_unified_line(it)) - continue; + SYSTRACE("Unified cgroup not mounted"); + continue; + } + dfd = dfd_mnt; + + if (!is_empty_string(current_cgroup)) { + dfd_base = open_at(dfd_mnt, current_cgroup, + PROTECT_OPATH_DIRECTORY, + PROTECT_LOOKUP_BENEATH_XDEV, 0); + if (dfd_base < 0) + return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup); + dfd = dfd_base; + } - current_cgroup = __current_cgroup_unified(relative, it); - if (IS_ERR(current_cgroup)) - return NULL; + controller_list = unified_controllers(dfd, "cgroup.controllers"); + if (!controller_list) { + TRACE("No controllers are enabled for delegation in the unified hierarchy"); + controller_list = __controller_list_empty(); + } - return current_cgroup; - } + if (unprivileged) { + ret = cg_unified_delegate(&delegate); + if (ret < 0) + return syserrno(ret, "Failed to determine delegation requirements"); - return log_error(NULL, "Failed to retrieve current cgroup for %s process", - relative ? 
"current" : "init"); -} + for (char *const *d = delegate; d && *d; d++) { + if (faccessat(dfd, *d, W_OK, 0)) { + if (errno == ENOENT) + continue; -static int cg_unified_init(struct cgroup_ops *ops, bool relative, - bool unprivileged) -{ - __do_free char *base_cgroup = NULL; - int ret; + SYSINFO("Lacking write access to %s, skipping unified cgroup", *d); + break; + } + } + } - base_cgroup = current_cgroup_unified(relative); - if (!base_cgroup) - return ret_errno(EINVAL); + type = CGROUP2_SUPER_MAGIC; + controllers = strdup(unified_mnt); + if (!controllers) + return ret_errno(ENOMEM); + } else { + char *__controllers, *__current_cgroup; - /* TODO: If the user requested specific controllers via lxc.cgroup.use - * we should verify here. The reason I'm not doing it right is that I'm - * not convinced that lxc.cgroup.use will be the future since it is a - * global property. I much rather have an option that lets you request - * controllers per container. - */ + __controllers = strchr(line, ':'); + if (!__controllers) + return ret_errno(EINVAL); + __controllers++; + + __current_cgroup = strchr(__controllers, ':'); + if (!__current_cgroup) + return ret_errno(EINVAL); + *__current_cgroup = '\0'; + __current_cgroup++; + + controllers = strdup(unprefix(__controllers)); + if (!controllers) + return ret_errno(ENOMEM); + + dfd_mnt = open_at(ops->dfd_mnt_cgroupfs_host, + controllers, PROTECT_OPATH_DIRECTORY, + PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); + if (dfd_mnt < 0) { + if (errno != ENOENT) + return syserrno(-errno, "Failed to open %d/%s", + ops->dfd_mnt_cgroupfs_host, controllers); - ret = add_hierarchy(ops, NULL, - must_copy_string(DEFAULT_CGROUP_MOUNTPOINT), - move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC); - if (ret) - return syserrno(ret, "Failed to add unified cgroup hierarchy"); + SYSTRACE("%s not mounted", controllers); + continue; + } + dfd = dfd_mnt; + + if (!abspath(__current_cgroup)) + return ret_errno(EINVAL); + + /* remove init.scope */ + if (!relative) + __current_cgroup 
= prune_init_scope(__current_cgroup); + + /* create a relative path */ + __current_cgroup = deabs(__current_cgroup); - if (unprivileged) - cg_unified_delegate(&(ops->unified)->cgroup2_chown); + current_cgroup = strdup(__current_cgroup); + if (!current_cgroup) + return ret_errno(ENOMEM); - if (bpf_devices_cgroup_supported()) - ops->unified->bpf_device_controller = 1; + if (!is_empty_string(current_cgroup)) { + dfd_base = open_at(dfd_mnt, current_cgroup, + PROTECT_OPATH_DIRECTORY, + PROTECT_LOOKUP_BENEATH_XDEV, 0); + if (dfd_base < 0) + return syserrno(-errno, "Failed to open %d/%s", + dfd_mnt, current_cgroup); + dfd = dfd_base; + } - ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; - return CGROUP2_SUPER_MAGIC; + if (faccessat(dfd, "cgroup.procs", W_OK, 0)) { + if (errno == ENOENT) + continue; + + SYSINFO("Lacking write access to %s", controllers); + break; + } + + /* + * We intentionally pass __controllers here and not + * controllers because we would otherwise chop the + * mountpoint. + */ + controller_list = __controller_list(__controllers); + if (IS_ERR(controller_list)) + return PTR_ERR(controller_list); + + if (skip_hierarchy(ops, controller_list)) + continue; + + type = CGROUP_SUPER_MAGIC; + ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; + } + + ret = add_hierarchy(ops, dfd_mnt, controllers, dfd, current_cgroup, controller_list, type); + if (ret < 0) + return syserrno(ret, "Failed to add %s hierarchy", controllers); + + /* Transfer ownership. 
*/ + move_fd(dfd_mnt); + move_fd(dfd_base); + move_ptr(current_cgroup); + move_ptr(controllers); + move_ptr(controller_list); + if (type == CGROUP2_SUPER_MAGIC) + ops->unified->cgroup2_chown = move_ptr(delegate); + } + + /* determine cgroup layout */ + if (ops->unified) { + if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { + ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; + } else { + if (bpf_devices_cgroup_supported()) + ops->unified->bpf_device_controller = 1; + ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; + } + } + + return 0; } -static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf) +static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) { __do_close int dfd = -EBADF; int ret; @@ -3548,12 +3228,7 @@ static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf) */ ops->dfd_mnt_cgroupfs_host = dfd; - if (unified_cgroup_fd(dfd)) - ret = cg_unified_init(ops, conf->cgroup_meta.relative, - !lxc_list_empty(&conf->id_map)); - else - ret = cg_hybrid_init(ops, conf->cgroup_meta.relative, - !lxc_list_empty(&conf->id_map)); + ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map)); if (ret < 0) return syserrno(ret, "Failed to initialize cgroups"); @@ -3577,7 +3252,7 @@ __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) return 0; } -struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf) +struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) { __do_free struct cgroup_ops *cgfsng_ops = NULL; @@ -3588,7 +3263,7 @@ struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf) cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF; - if (__cgroup_init(cgfsng_ops, conf)) + if (initialize_cgroups(cgfsng_ops, conf)) return NULL; cgfsng_ops->data_init = cgfsng_data_init; diff --git a/src/lxc/cgroups/cgroup.c b/src/lxc/cgroups/cgroup.c index 68e3e3e25..469660655 100644 --- a/src/lxc/cgroups/cgroup.c +++ b/src/lxc/cgroups/cgroup.c @@ -21,7 +21,7 @@ 
lxc_log_define(cgroup, lxc); -__hidden extern struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf); +__hidden extern struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf); struct cgroup_ops *cgroup_init(struct lxc_conf *conf) { @@ -30,7 +30,7 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf) if (!conf) return log_error_errno(NULL, EINVAL, "No valid conf given"); - cgroup_ops = cgfsng_ops_init(conf); + cgroup_ops = cgroup_ops_init(conf); if (!cgroup_ops) return log_error_errno(NULL, errno, "Failed to initialize cgroup driver"); @@ -47,13 +47,13 @@ struct cgroup_ops *cgroup_init(struct lxc_conf *conf) TRACE("Initialized cgroup driver %s", cgroup_ops->driver); if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) - TRACE("Running with legacy cgroup layout"); + TRACE("Legacy cgroup layout"); else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_HYBRID) - TRACE("Running with hybrid cgroup layout"); + TRACE("Hybrid cgroup layout"); else if (cgroup_ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) - TRACE("Running with unified cgroup layout"); + TRACE("Unified cgroup layout"); else - WARN("Running with unknown cgroup layout"); + WARN("Unsupported cgroup layout"); return cgroup_ops; } diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index 3a8e3f3ef..1d08bdb55 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -230,4 +230,11 @@ static inline int cgroup_unified_fd(const struct cgroup_ops *ops) return ops->unified->cgfd_con; } +#define make_cgroup_path(__hierarchy, __first, ...) 
\ + ({ \ + const struct hierarchy *__h = __hierarchy; \ + must_make_path(DEFAULT_CGROUP_MOUNTPOINT, __h->mountpoint, \ + __first, __VA_ARGS__); \ + }) + #endif /* __LXC_CGROUP_H */ diff --git a/src/lxc/cgroups/cgroup_utils.c b/src/lxc/cgroups/cgroup_utils.c index 448fc9fa5..6beb57067 100644 --- a/src/lxc/cgroups/cgroup_utils.c +++ b/src/lxc/cgroups/cgroup_utils.c @@ -86,7 +86,7 @@ bool test_writeable_v2(char *mountpoint, char *path) return (access(cgroup_threads_file, W_OK) == 0); } -int unified_cgroup_fd(int fd) +bool unified_cgroup_fd(int fd) { int ret; diff --git a/src/lxc/cgroups/cgroup_utils.h b/src/lxc/cgroups/cgroup_utils.h index 77bf40c15..41a8a2199 100644 --- a/src/lxc/cgroups/cgroup_utils.h +++ b/src/lxc/cgroups/cgroup_utils.h @@ -29,7 +29,7 @@ __hidden extern bool test_writeable_v1(char *mountpoint, char *path); */ __hidden extern bool test_writeable_v2(char *mountpoint, char *path); -__hidden extern int unified_cgroup_fd(int fd); +__hidden extern bool unified_cgroup_fd(int fd); static inline bool cgns_supported(void) {