From: Serge Hallyn Date: Thu, 3 Mar 2016 18:31:23 +0000 (-0800) Subject: cgfsng: next generation filesystem-backed cgroup implementation X-Git-Tag: lxc-2.0.0.rc6~1^2~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ccb4cabe02e42e37faf7d3da089393777abe8ab1;p=thirdparty%2Flxc.git cgfsng: next generation filesystem-backed cgroup implementation This makes simplifying assumptions: all usable cgroups must be mounted under /sys/fs/cgroup/controller or /sys/fs/cgroup/contr1,contr2. Currently this will only work with cgroup namespaces, because lxc.mount.auto = cgroup is not implemented. So cgfsng_ops_init() returns NULL if cgroup namespaces are not enabled. Signed-off-by: Serge Hallyn --- diff --git a/src/lxc/Makefile.am b/src/lxc/Makefile.am index 9f7a29e83..0861fa3a8 100644 --- a/src/lxc/Makefile.am +++ b/src/lxc/Makefile.am @@ -89,6 +89,7 @@ liblxc_so_SOURCES = \ error.h error.c \ parse.c parse.h \ cgfs.c \ + cgfsng.c \ cgroup.c cgroup.h \ lxc.h \ initutils.c initutils.h \ diff --git a/src/lxc/cgfs.c b/src/lxc/cgfs.c index 05e7bcfd9..c493d5802 100644 --- a/src/lxc/cgfs.c +++ b/src/lxc/cgfs.c @@ -141,7 +141,6 @@ static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, cons static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow); static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices); static int cgroup_recursive_task_count(const char *cgroup_path); -static int count_lines(const char *fn); static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path); static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path); @@ -2116,7 +2115,7 @@ static int cgroup_recursive_task_count(const char *cgroup_path) if (r >= 0) n += r; } else if (!strcmp(dent->d_name, "tasks")) { - r = count_lines(sub_path); + r = lxc_count_file_lines(sub_path); if (r >= 0) n += r; } @@ -2128,25 +2127,6 @@ static int cgroup_recursive_task_count(const char *cgroup_path) return n; } -static int count_lines(const char *fn) -{ - FILE *f; - char *line = NULL; - size_t sz = 0; - int n = 0; - - f = fopen_cloexec(fn, "r"); - if (!f) - return -1; - - while (getline(&line, &sz, f) != -1) { - n++; - } - free(line); - fclose(f); - return n; -} - static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path) { @@ -2420,7 +2400,7 @@ static const char *cgfs_canonical_path(void *hdata) return path; } -static bool cgfs_escape(void) +static bool cgfs_escape(void *hdata) { struct cgroup_meta_data *md; int i; diff --git a/src/lxc/cgfsng.c b/src/lxc/cgfsng.c new file mode 100644 index 000000000..465434324 --- /dev/null +++ b/src/lxc/cgfsng.c @@ -0,0 +1,1466 @@ +/* + * lxc: linux Container library + * + * Copyright © 2016 Canonical Ltd. + * + * Authors: + * Serge Hallyn + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * cgfs-ng.c: this is a new, simplified implementation of a filesystem + * cgroup backend. The original cgfs.c was designed to be as flexible + * as possible. It would try to find cgroup filesystems no matter where + * or how you had them mounted, and deduce the most usable mount for + * each controller. It also was not designed for unprivileged use, as + * that was reserved for cgmanager. + * + * This new implementation assumes that cgroup filesystems are mounted + * under /sys/fs/cgroup/clist where clist is either the controller, or + * a comman-separated list of controllers. + */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "cgroup.h" +#include "utils.h" +#include "commands.h" + +lxc_log_define(lxc_cgfsng, lxc); + +static struct cgroup_ops cgfsng_ops; + +//#define EXTRADEBUG 1 + +/* + * A descriptor for a mounted hierarchy + * @controllers: either NULL, or a null-terminated list of all + * the co-mounted controllers + * @mountpoint: the mountpoint we will use. It will be either + * /sys/fs/cgroup/controller or /sys/fs/cgroup/controllerlist + * @base_cgroup: the cgroup under which the container cgroup path + is created. This will be either the caller's cgroup (if not + root), or init's cgroup (if root). + */ +struct hierarchy { + char **controllers; + char *mountpoint; + char *base_cgroup; + char *fullcgpath; +}; + +/* + * The cgroup data which is attached to the lxc_handler. + * @hierarchies - a NULL-terminated array of struct hierarchy, one per + * hierarchy. No duplicates. First sufficient, writeable mounted + * hierarchy wins + * @cgroup_use - a copy of the lxc.cgroup.use + * @cgroup_pattern - a copy of the lxc.cgroup.pattern + * @container_cgroup - if not null, the cgroup which was created for + * the container. For each hierarchy, it is created under the + * @hierarchy->base_cgroup directory. Relative to the base_cgroup + * it is the same for all hierarchies. + * @name - the container name + */ +struct cgfsng_handler_data { + struct hierarchy **hierarchies; + char *cgroup_use; + char *cgroup_pattern; + char *container_cgroup; // cgroup we created for the container + char *name; // container name +}; + +static void free_string_list(char **clist) +{ + if (clist) { + int i; + + for (i = 0; clist[i]; i++) + free(clist[i]); + free(clist); + } +} + +/* Re-alllocate a pointer, do not fail */ +static void *must_realloc(void *orig, size_t sz) +{ + void *ret; + + do { + ret = realloc(orig, sz); + } while (!ret); + return ret; +} + +/* Allocate a pointer, do not fail */ +static void *must_alloc(size_t sz) +{ + return must_realloc(NULL, sz); +} + +/* return copy of string @entry; do not fail. */ +static char *must_copy_string(const char *entry) +{ + char *ret; + + if (!entry) + return NULL; + do { + ret = strdup(entry); + } while (!ret); + return ret; +} + +/* + * This is a special case - return a copy of @entry + * prepending 'name='. I.e. turn systemd into name=systemd. + * Do not fail. + */ +static char *must_prefix_named(char *entry) +{ + char *ret; + size_t len = strlen(entry); + + ret = must_alloc(len + 6); + snprintf(ret, len + 6, "name=%s", entry); + return ret; +} + +/* + * Given a pointer to a null-terminated array of pointers, realloc to + * add one entry, and point the new entry to NULL. Do not fail. Return + * the index to the second-to-last entry - that is, the one which is + * now available for use (keeping the list null-terminated). + */ +static int append_null_to_list(void ***list) +{ + int newentry = 0; + + if (*list) + for (; (*list)[newentry]; newentry++); + + *list = must_realloc(*list, (newentry + 2) * sizeof(void **)); + (*list)[newentry + 1] = NULL; + return newentry; +} + +/* + * Given a null-terminated array of strings, check whether @entry + * is one of the strings + */ +static bool string_in_list(char **list, const char *entry) +{ + int i; + + if (!list) + return false; + for (i = 0; list[i]; i++) + if (strcmp(list[i], entry) == 0) + return true; + + return false; +} + +/* + * append an entry to the clist. Do not fail. + * *clist must be NULL the first time we are called. + * + * We also handle named subsystems here. Any controller which is not a + * kernel subsystem, we prefix 'name='. Any which is both a kernel and + * named subsystem, we refuse to use because we're not sure which we + * have here. (TODO - we could work around this in some cases by just + * remounting to be unambiguous, or by comparing mountpoint contents + * with current cgroup) + * + * The last entry will always be NULL. + */ +static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry) +{ + int newentry; + char *copy; + + if (string_in_list(klist, entry) && string_in_list(nlist, entry)) { + ERROR("Refusing to use ambiguous controller '%s'", entry); + ERROR("It is both a named and kernel subsystem"); + return; + } + + newentry = append_null_to_list((void ***)clist); + + if (strncmp(entry, "name=", 5) == 0) + copy = must_copy_string(entry); + else if (string_in_list(klist, entry)) + copy = must_copy_string(entry); + else + copy = must_prefix_named(entry); + + (*clist)[newentry] = copy; +} + +static void free_hierarchies(struct hierarchy **hlist) +{ + if (hlist) { + int i; + + for (i = 0; hlist[i]; i++) { + free(hlist[i]->mountpoint); + free(hlist[i]->base_cgroup); + free(hlist[i]->fullcgpath); + free_string_list(hlist[i]->controllers); + } + free(hlist); + } +} + +static void free_handler_data(struct cgfsng_handler_data *d) +{ + free_hierarchies(d->hierarchies); + free(d->cgroup_use); + free(d->cgroup_pattern); + free(d->container_cgroup); + free(d->name); + free(d); +} + +/* + * Given a handler's cgroup data, return the struct hierarchy for the + * controller @c, or NULL if there is none. + */ +struct hierarchy *get_hierarchy(struct cgfsng_handler_data *d, const char *c) +{ + int i; + + if (!d || !d->hierarchies) + return NULL; + for (i = 0; d->hierarchies[i]; i++) { + if (string_in_list(d->hierarchies[i]->controllers, c)) + return d->hierarchies[i]; + } + return NULL; +} + +/* + * Given two null-terminated lists of strings, return true if any string + * is in both. + */ +static bool controller_lists_intersect(char **l1, char **l2) +{ + int i; + + if (!l1 || !l2) + return false; + + for (i = 0; l1[i]; i++) { + if (string_in_list(l2, l1[i])) + return true; + } + return false; +} + +/* + * For a null-terminated list of controllers @clist, return true if any of + * those controllers is already listed the null-terminated list of + * hierarchies @hlist. Realistically, if one is present, all must be present. + */ +static bool controller_list_is_dup(struct hierarchy **hlist, char **clist) +{ + int i; + + if (!hlist) + return false; + for (i = 0; hlist[i]; i++) + if (controller_lists_intersect(hlist[i]->controllers, clist)) + return true; + return false; + +} + +/* + * Return true if the controller @entry is found in the null-terminated + * list of hierarchies @hlist + */ +static bool controller_found(struct hierarchy **hlist, char *entry) +{ + int i; + if (!hlist) + return false; + + for (i = 0; hlist[i]; i++) + if (string_in_list(hlist[i]->controllers, entry)) + return true; + return false; +} + +/* + * Return true if all of the controllers which we require have been + * found. The required list is systemd, freezer, and anything in + * lxc.cgroup.use. + */ +static bool all_controllers_found(struct cgfsng_handler_data *d) +{ + char *p, *saveptr = NULL; + struct hierarchy ** hlist = d->hierarchies; + + if (!controller_found(hlist, "name=systemd")) { + ERROR("no systemd controller mountpoint found"); + return false; + } + if (!controller_found(hlist, "freezer")) { + ERROR("no freezer controller mountpoint found"); + return false; + } + + if (!d->cgroup_use) + return true; + for (p = strtok_r(d->cgroup_use, ",", &saveptr); p; + p = strtok_r(NULL, ",", &saveptr)) { + if (!controller_found(hlist, p)) { + ERROR("no %s controller mountpoint found", p); + return false; + } + } + return true; +} + +/* Return true if the fs type is fuse.lxcfs */ +static bool is_lxcfs(const char *line) +{ + char *p = strstr(line, " - "); + if (!p) + return false; + return strncmp(p, " - fuse.lxcfs ", 14); +} + +/* + * Get the controllers from a mountinfo line + * There are other ways we could get this info. For lxcfs, field 3 + * is /cgroup/controller-list. For cgroupfs, we could parse the mount + * options. But we simply assume that the mountpoint must be + * /sys/fs/cgroup/controller-list + */ +static char **get_controllers(char **klist, char **nlist, char *line) +{ + // the fourth field is /sys/fs/cgroup/comma-delimited-controller-list + int i; + char *p = line, *p2, *tok, *saveptr = NULL; + char **aret = NULL; + + for (i = 0; i < 4; i++) { + p = index(p, ' '); + if (!p) + return NULL; + p++; + } + if (!p) + return NULL; + /* note - if we change how mountinfo works, then our caller + * will need to verify /sys/fs/cgroup/ in this field */ + if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) + return NULL; + p += 15; + p2 = index(p, ' '); + if (!p2) { + ERROR("corrupt mountinfo"); + return NULL; + } + *p2 = '\0'; + for (tok = strtok_r(p, ",", &saveptr); tok; + tok = strtok_r(NULL, ",", &saveptr)) { + must_append_controller(klist, nlist, &aret, tok); + } + + return aret; +} + +/* return true if the fstype is cgroup */ +static bool is_cgroupfs(char *line) +{ + char *p = strstr(line, " - "); + if (!p) + return false; + return strncmp(p, " - cgroup ", 10); +} + +/* Add a controller to our list of hierarchies */ +static void add_controller(struct cgfsng_handler_data *d, char **clist, + char *mountpoint, char *base_cgroup) +{ + struct hierarchy *new; + int newentry; + + new = must_alloc(sizeof(*new)); + new->controllers = clist; + new->mountpoint = mountpoint; + new->base_cgroup = base_cgroup; + new->fullcgpath = NULL; + + newentry = append_null_to_list((void ***)&d->hierarchies); + d->hierarchies[newentry] = new; +} + +/* + * Get a copy of the mountpoint from @line, which is a line from + * /proc/self/mountinfo + */ +static char *get_mountpoint(char *line) +{ + int i; + char *p = line, *sret; + size_t len; + + for (i = 0; i < 4; i++) { + p = index(p, ' '); + if (!p) + return NULL; + p++; + } + /* we've already stuck a \0 after the mountpoint */ + len = strlen(p); + sret = must_alloc(len + 1); + memcpy(sret, p, len); + sret[len] = '\0'; + return sret; +} + +/* + * Given a multi-line string, return a null-terminated copy of the + * current line. + */ +static char *copy_to_eol(char *p) +{ + char *p2 = index(p, '\n'), *sret; + size_t len; + + if (!p2) + return NULL; + + len = p2 - p; + sret = must_alloc(len + 1); + memcpy(sret, p, len); + sret[len] = '\0'; + return sret; +} + +/* + * cgline: pointer to character after the first ':' in a line in a + * \n-terminated /proc/self/cgroup file. Check whether * controller c is + * present. + */ +static bool controller_in_clist(char *cgline, char *c) +{ + char *tok, *saveptr = NULL, *eol, *tmp; + size_t len; + + eol = index(cgline, ':'); + if (!eol) + return false; + + len = eol - cgline; + tmp = alloca(len + 1); + memcpy(tmp, cgline, len); + tmp[len] = '\0'; + + for (tok = strtok_r(tmp, ",", &saveptr); tok; + tok = strtok_r(NULL, ",", &saveptr)) { + if (strcmp(tok, c) == 0) + return true; + } + return false; +} + +/* + * @basecginfo is a copy of /proc/$$/cgroup. Return the current + * cgroup for @controller + */ +static char *get_current_cgroup(char *basecginfo, char *controller) +{ + char *p = basecginfo; + + while (1) { + p = index(p, ':'); + if (!p) + return NULL; + p++; + if (controller_in_clist(p, controller)) { + p = index(p, ':'); + if (!p) + return NULL; + p++; + return copy_to_eol(p); + } + + p = index(p, '\n'); + if (!p) + return NULL; + p++; + } +} + +static void append_line(char **dest, size_t oldlen, char *new, size_t newlen) +{ + size_t full = oldlen + newlen; + + *dest = must_realloc(*dest, full + 1); + + strcat(*dest, new); +} + +/* Slurp in a whole file */ +static char *read_file(char *fnam) +{ + FILE *f; + char *line = NULL, *buf = NULL; + size_t len = 0, fulllen = 0; + + f = fopen(fnam, "r"); + if (!f) + return NULL; + while (getline(&line, &len, f) != -1) { + append_line(&buf, fulllen, line, len); + fulllen += len; + } + fclose(f); + free(line); + return buf; +} + +static char *must_make_path(const char *first, ...) __attribute__((sentinel)); + +/* + * Given a hierarchy @mountpoint and base @path, verify that we can create + * directories underneath it. + */ +static bool test_writeable(char *mountpoint, char *path) +{ + char *fullpath = must_make_path(mountpoint, path, NULL); + int ret; + + ret = access(fullpath, W_OK); + free(fullpath); + return ret == 0; +} + +static void must_append_string(char ***list, char *entry) +{ + int newentry = append_null_to_list((void ***)list); + char *copy; + + copy = must_copy_string(entry); + (*list)[newentry] = copy; +} + +static void get_existing_subsystems(char ***klist, char ***nlist) +{ + FILE *f; + char *line = NULL; + size_t len = 0; + + if ((f = fopen("/proc/self/cgroup", "r")) == NULL) + return; + while (getline(&line, &len, f) != -1) { + char *p, *p2, *tok, *saveptr = NULL; + p = index(line, ':'); + if (!p) + continue; + p++; + p2 = index(p, ':'); + if (!p2) + continue; + *p2 = '\0'; + for (tok = strtok_r(p, ",", &saveptr); tok; + tok = strtok_r(NULL, ",", &saveptr)) { + if (strncmp(tok, "name=", 5) == 0) + must_append_string(nlist, tok); + else + must_append_string(klist, tok); + } + } + + free(line); + fclose(f); +} + +static void trim(char *s) +{ + size_t len = strlen(s); + while (s[len-1] == '\n') + s[--len] = '\0'; +} + +#if EXTRADEBUG +static void print_init_debuginfo(struct cgfsng_handler_data *d) +{ + int i; + printf("Cgroup information:\n"); + printf(" container name: %s\n", d->name); + printf(" lxc.cgroup.use: %s\n", d->cgroup_use ? d->cgroup_use : "(none)"); + printf(" lxc.cgroup.pattern: %s\n", d->cgroup_pattern); + printf(" cgroup: %s\n", d->container_cgroup ? d->container_cgroup : "(none)"); + if (!d->hierarchies) { + printf(" No hierarchies found.\n"); + return; + } + printf(" Hierarchies:\n"); + for (i = 0; d->hierarchies[i]; i++) { + struct hierarchy *h = d->hierarchies[i]; + int j; + printf(" %d: base_cgroup %s\n", i, h->base_cgroup); + printf(" mountpoint %s\n", h->mountpoint); + printf(" controllers:\n"); + for (j = 0; h->controllers[j]; j++) + printf(" %d: %s\n", j, h->controllers[j]); + } +} +#else +#define print_init_debuginfo(d) +#endif + +/* + * At startup, parse_hierarchies finds all the info we need about + * cgroup mountpoints and current cgroups, and stores it in @d. + */ +static bool parse_hierarchies(struct cgfsng_handler_data *d) +{ + FILE *f; + char * line = NULL, *basecginfo; + char **klist = NULL, **nlist = NULL; + size_t len = 0; + + if (geteuid()) + basecginfo = read_file("/proc/self/cgroup"); + else + basecginfo = read_file("/proc/1/cgroup"); + if (!basecginfo) + return false; + + if ((f = fopen("/proc/self/mountinfo", "r")) == NULL) { + ERROR("Failed opening /proc/self/mountinfo"); + return false; + } + + get_existing_subsystems(&klist, &nlist); +#if EXTRADEBUG + printf("basecginfo is %s\n", basecginfo); + int k; + for (k = 0; klist[k]; k++) + printf("kernel subsystem %d: %s\n", k, klist[k]); + for (k = 0; nlist[k]; k++) + printf("named subsystem %d: %s\n", k, nlist[k]); +#endif + + /* we support simple cgroup mounts and lxcfs mounts */ + while (getline(&line, &len, f) != -1) { + char **controller_list = NULL; + char *mountpoint, *base_cgroup; + + if (!is_lxcfs(line) && !is_cgroupfs(line)) + continue; + + controller_list = get_controllers(klist, nlist, line); + if (!controller_list) + continue; + + if (controller_list_is_dup(d->hierarchies, controller_list)) { + free(controller_list); + continue; + } + + mountpoint = get_mountpoint(line); + if (!mountpoint) { + ERROR("Error reading mountinfo: bad line '%s'", line); + free_string_list(controller_list); + continue; + } + + base_cgroup = get_current_cgroup(basecginfo, controller_list[0]); + if (!base_cgroup) { + ERROR("Failed to find current cgroup for controller '%s'", controller_list[0]); + free_string_list(controller_list); + free(mountpoint); + continue; + } + trim(base_cgroup); + prune_init_scope(base_cgroup); + if (!test_writeable(mountpoint, base_cgroup)) { + free_string_list(controller_list); + free(mountpoint); + free(base_cgroup); + continue; + } + add_controller(d, controller_list, mountpoint, base_cgroup); + } + + free_string_list(klist); + free_string_list(nlist); + + free(basecginfo); + + fclose(f); + free(line); + + print_init_debuginfo(d); + + /* verify that all controllers in cgroup.use and all crucial + * controllers are accounted for + */ + if (!all_controllers_found(d)) + return false; + + return true; +} + +static void *cgfsng_init(const char *name) +{ + struct cgfsng_handler_data *d; + const char *cgroup_use, *cgroup_pattern; + + d = must_alloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + + d->name = must_copy_string(name); + + errno = 0; + cgroup_use = lxc_global_config_value("lxc.cgroup.use"); + if (!cgroup_use && errno != 0) { // lxc.cgroup.use can be NULL + SYSERROR("Error reading list of cgroups to use"); + goto out_free; + } + d->cgroup_use = must_copy_string(cgroup_use); + + cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern"); + if (!cgroup_pattern) { // lxc.cgroup.pattern is only NULL on error + ERROR("Error getting cgroup pattern"); + goto out_free; + } + d->cgroup_pattern = must_copy_string(cgroup_pattern); + + if (!parse_hierarchies(d)) + goto out_free; + + print_init_debuginfo(d); + + return d; + +out_free: + free_handler_data(d); + return NULL; +} + +/* + * Concatenate all passed-in strings into one path. Do not fail. If any piece is + * not prefixed with '/', add a '/'. + */ +static char *must_make_path(const char *first, ...) +{ + va_list args; + char *cur, *dest; + size_t full_len = strlen(first); + + dest = must_copy_string(first); + + va_start(args, first); + while ((cur = va_arg(args, char *)) != NULL) { + full_len += strlen(cur); + if (cur[0] != '/') + full_len++; + dest = must_realloc(dest, full_len + 1); + if (cur[0] != '/') + strcat(dest, "/"); + strcat(dest, cur); + } + va_end(args); + + return dest; +} + +static int cgroup_rmdir(char *dirname) +{ + struct dirent dirent, *direntp; + DIR *dir; + int r = 0; + + dir = opendir(dirname); + if (!dir) + return -1; + + while (!readdir_r(dir, &dirent, &direntp)) { + struct stat mystat; + char *pathname; + + if (!direntp) + break; + + if (!strcmp(direntp->d_name, ".") || + !strcmp(direntp->d_name, "..")) + continue; + + pathname = must_make_path(dirname, direntp->d_name, NULL); + + if (lstat(pathname, &mystat)) { + if (!r) + WARN("failed to stat %s\n", pathname); + r = -1; + goto next; + } + + if (!S_ISDIR(mystat.st_mode)) + goto next; + if (cgroup_rmdir(pathname) < 0) + r = -1; +next: + free(pathname); + } + + if (rmdir(dirname) < 0) { + if (!r) + WARN("%s: failed to delete %s: %m", __func__, dirname); + r = -1; + } + + if (closedir(dir) < 0) { + if (!r) + WARN("%s: failed to delete %s: %m", __func__, dirname); + r = -1; + } + return r; +} + +static int rmdir_wrapper(void *data) +{ + char *path = data; + + if (setresgid(0,0,0) < 0) + SYSERROR("Failed to setgid to 0"); + if (setresuid(0,0,0) < 0) + SYSERROR("Failed to setuid to 0"); + if (setgroups(0, NULL) < 0) + SYSERROR("Failed to clear groups"); + + return cgroup_rmdir(path); +} + +void recursive_destroy(char *path, struct lxc_conf *conf) +{ + int r; + if (conf && !lxc_list_empty(&conf->id_map)) + r = userns_exec_1(conf, rmdir_wrapper, path); + else + r = cgroup_rmdir(path); + + if (r < 0) + ERROR("Error destroying %s\n", path); +} + +static void cgfsng_destroy(void *hdata, struct lxc_conf *conf) +{ + struct cgfsng_handler_data *d = hdata; + + if (!d) + return; + + if (d->container_cgroup && d->hierarchies) { + int i; + for (i = 0; d->hierarchies[i]; i++) { + struct hierarchy *h = d->hierarchies[i]; + if (!h->fullcgpath) { + recursive_destroy(h->fullcgpath, conf); + free(h->fullcgpath); + h->fullcgpath = NULL; + } + } + } + + free_handler_data(d); +} + +struct cgroup_ops *cgfsng_ops_init(void) +{ + /* TODO - when cgroup_mount is implemented, drop this check */ + if (!file_exists("/proc/self/ns/cgroup")) + return NULL; + return &cgfsng_ops; +} + +static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname) +{ + char *fullpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL); + int ret; + + ret = mkdir_p(fullpath, 0755); + h->fullcgpath = fullpath; + return ret == 0; +} + +static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname) +{ + if (rmdir(h->fullcgpath) < 0) + SYSERROR("Failed to clean up cgroup %s from failed creation attempt", h->fullcgpath); + free(h->fullcgpath); + h->fullcgpath = NULL; +} + +/* + * Try to create the same cgrou pin all hierarchies. + * Start with cgroup_pattern; next cgroup_pattern-1, -2, ..., -999 + */ +static inline bool cgfsng_create(void *hdata) +{ + struct cgfsng_handler_data *d = hdata; + char *tmp, *cgname, *offset; + int i, idx = 0; + size_t len; + + if (!d) + return false; + if (d->container_cgroup) { + WARN("cgfsng_create called a second time"); + return false; + } + + tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern); + if (!tmp) { + ERROR("Failed expanding cgroup name pattern"); + return false; + } + len = strlen(tmp) + 5; // leave room for -NNN\0 + cgname = must_alloc(len); + strcpy(cgname, tmp); + free(tmp); + offset = cgname + len - 5; + +again: + if (idx == 1000) + goto out_free; + if (idx) + snprintf(offset, 5, "-%d", idx); + for (i = 0; d->hierarchies[i]; i++) { + if (!create_path_for_hierarchy(d->hierarchies[i], cgname)) { + int j; + SYSERROR("Failed to create %s: %s", d->hierarchies[i]->fullcgpath, strerror(errno)); + free(d->hierarchies[i]->fullcgpath); + d->hierarchies[i]->fullcgpath = NULL; + for (j = 0; j < i; j++) + remove_path_for_hierarchy(d->hierarchies[j], cgname); + idx++; + goto again; + } + } + /* Done */ + d->container_cgroup = cgname; + return true; + +out_free: + free(cgname); + return false; +} + +static const char *cgfsng_canonical_path(void *hdata) +{ + struct cgfsng_handler_data *d = hdata; + + return d->container_cgroup; +} + +static bool cgfsng_enter(void *hdata, pid_t pid) +{ + struct cgfsng_handler_data *d = hdata; + char pidstr[25]; + int i, len; + + len = snprintf(pidstr, 25, "%d", pid); + if (len < 0 || len > 25) + return false; + + for (i = 0; d->hierarchies[i]; i++) { + char *fullpath = must_make_path(d->hierarchies[i]->fullcgpath, + "cgroup.procs", NULL); + if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) { + ERROR("Failed to enter %s\n", fullpath); + free(fullpath); + return false; + } + free(fullpath); + } + + return true; +} + +struct chown_data { + struct cgfsng_handler_data *d; + uid_t origuid; // target uid in parent namespace +}; + +static int chown_cgroup_wrapper(void *data) +{ + struct chown_data *arg = data; + struct cgfsng_handler_data *d = arg->d; + uid_t destuid; + int i; + + if (setresgid(0,0,0) < 0) + SYSERROR("Failed to setgid to 0"); + if (setresuid(0,0,0) < 0) + SYSERROR("Failed to setuid to 0"); + if (setgroups(0, NULL) < 0) + SYSERROR("Failed to clear groups"); + + destuid = get_ns_uid(arg->origuid); + + for (i = 0; d->hierarchies[i]; i++) { + char *fullpath = must_make_path(d->hierarchies[i]->fullcgpath, NULL); + if (chown(fullpath, destuid, 0) < 0) { + SYSERROR("Error chowning %s", fullpath); + free(fullpath); + return -1; + } + // TODO - do we need to chown tasks and cgroup.procs too? + + free(fullpath); + } + + return 0; +} + +static bool cgfsns_chown(void *hdata, struct lxc_conf *conf) +{ + struct cgfsng_handler_data *d = hdata; + struct chown_data wrap; + + if (!d) + return false; + + if (lxc_list_empty(&conf->id_map)) + return true; + + wrap.d = d; + wrap.origuid = geteuid(); + + if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap) < 0) { + ERROR("Error requesting cgroup chown in new namespace"); + return false; + } + + return true; +} + +static bool cgfsng_mount(void *hdata, const char *root, int type) +{ + if (cgns_supported()) + return true; + // TODO - implement this. Not needed for cgroup namespaces + return false; +} + +static int recursive_count_nrtasks(char *dirname) +{ + struct dirent dirent, *direntp; + DIR *dir; + int count = 0, ret; + char *path; + + dir = opendir(dirname); + if (!dir) + return 0; + + while (!readdir_r(dir, &dirent, &direntp)) { + struct stat mystat; + + if (!direntp) + break; + + if (!strcmp(direntp->d_name, ".") || + !strcmp(direntp->d_name, "..")) + continue; + + path = must_make_path(dirname, direntp->d_name, NULL); + + if (lstat(path, &mystat)) + goto next; + + if (!S_ISDIR(mystat.st_mode)) + goto next; + + count += recursive_count_nrtasks(path); +next: + free(path); + } + + path = must_make_path(dirname, "cgroup.procs", NULL); + ret = lxc_count_file_lines(path); + if (ret != -1) + count += ret; + free(path); + + (void) closedir(dir); + + return count; +} + +static int cgfsng_nrtasks(void *hdata) { + struct cgfsng_handler_data *d = hdata; + char *path; + int count; + + if (!d || !d->container_cgroup || !d->hierarchies) + return -1; + path = must_make_path(d->hierarchies[0]->fullcgpath, NULL); + count = recursive_count_nrtasks(path); + free(path); + return count; +} + +/* Only root needs to escape to the cgroup of its init */ +static bool cgfsng_escape(void *hdata) +{ + struct cgfsng_handler_data *d = hdata; + int i; + + if (geteuid()) + return true; + + for (i = 0; d->hierarchies[i]; i++) { + char *fullpath = must_make_path(d->hierarchies[i]->mountpoint, + d->hierarchies[i]->base_cgroup, + "cgroup.procs", NULL); + if (lxc_write_to_file(fullpath, "0", 2, false) != 0) { + ERROR("Failed to enter %s\n", fullpath); + free(fullpath); + return false; + } + free(fullpath); + } + + return true; +} + +#define THAWED "THAWED" +#define THAWED_LEN (strlen(THAWED)) + +static bool cgfsng_unfreeze(void *hdata) +{ + struct cgfsng_handler_data *d = hdata; + char *fullpath; + struct hierarchy *h = get_hierarchy(d, "freezer"); + + if (!d || !h) + return false; + fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL); + if (lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false) != 0) { + free(fullpath); + return false; + } + free(fullpath); + return true; +} + +static const char *cgfsng_get_cgroup(void *hdata, const char *subsystem) +{ + struct cgfsng_handler_data *d = hdata; + struct hierarchy *h; + if (!d) + return NULL; + + h = get_hierarchy(d, subsystem); + if (!h) + return NULL; + + return h->fullcgpath; +} + +static bool cgfsng_attach(const char *name, const char *lxcpath, pid_t pid) +{ + struct cgfsng_handler_data *d; + char pidstr[25]; + int i, len; + + len = snprintf(pidstr, 25, "%d", pid); + if (len < 0 || len > 25) + return false; + + d = cgfsng_init(name); + if (!d) + return false; + + for (i = 0; d->hierarchies[i]; i++) { + char *path, *fullpath; + struct hierarchy *h = d->hierarchies[i]; + + path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); + if (!path) // not running + continue; + + fullpath = must_make_path(path, "cgroup.procs", NULL); + if (lxc_write_to_file(fullpath, pidstr, len, false) != 0) { + SYSERROR("Failed to attach %d to %s", (int)pid, fullpath); + free(fullpath); + free(path); + free_handler_data(d); + return false; + } + free(path); + free(fullpath); + } + + free_handler_data(d); + return true; +} + +/* + * Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. + * Here we don't have a cgroup_data set up, so we ask the running + * container through the commands API for the cgroup path + */ +static int cgfsng_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath) +{ + char *subsystem, *p, *path; + struct cgfsng_handler_data *d; + struct hierarchy *h; + int ret = -1; + + subsystem = alloca(strlen(filename) + 1); + strcpy(subsystem, filename); + if ((p = strchr(subsystem, '.')) != NULL) + *p = '\0'; + + path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem); + if (!path) // not running + return -1; + + d = cgfsng_init(name); + if (!d) { + free(path); + return false; + } + + h = get_hierarchy(d, subsystem); + if (h) { + char *fullpath = must_make_path(path, filename, NULL); + ret = lxc_read_from_file(fullpath, value, len); + free(fullpath); + } + + free_handler_data(d); + free(path); + + return ret; +} + +/* + * Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. + * Here we don't have a cgroup_data set up, so we ask the running + * container through the commands API for the cgroup path + */ +static int cgfsng_set(const char *filename, const char *value, const char *name, const char *lxcpath) +{ + char *subsystem, *p, *path; + struct cgfsng_handler_data *d; + struct hierarchy *h; + int ret = -1; + + subsystem = alloca(strlen(filename) + 1); + strcpy(subsystem, filename); + if ((p = strchr(subsystem, '.')) != NULL) + *p = '\0'; + + path = lxc_cmd_get_cgroup_path(name, lxcpath, subsystem); + if (!path) // not running + return -1; + + d = cgfsng_init(name); + if (!d) { + free(path); + return false; + } + + h = get_hierarchy(d, subsystem); + if (h) { + char *fullpath = must_make_path(path, filename, NULL); + ret = lxc_write_to_file(fullpath, value, strlen(value), false); + free(fullpath); + } + + free_handler_data(d); + free(path); + + return ret; +} + +/* + * Check whether a container already has a particular rule, as otherwise + * may end up with spurious permission errors. + */ +static bool cgroup_devices_has_allow_or_deny(struct cgfsng_handler_data *d, + char *v, bool for_allow, char *path) +{ + FILE *devices_list; + char *line = NULL; + size_t sz = 0; + bool ret = !for_allow; + + /* if it's a deny rule and container has all devices, then it doesn't + * yet have the deny rule */ + if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0) + return false; + + devices_list = fopen_cloexec(path, "r"); + if (!devices_list) { + free(path); + return false; + } + + while (getline(&line, &sz, devices_list) != -1) { + size_t len = strlen(line); + if (len > 0 && line[len-1] == '\n') + line[len-1] = '\0'; + if (strcmp(line, "a *:* rwm") == 0) { + /* if container has all access and we're adding allow rule, + * then already has it; if it has all access and we're + * adding a deny rule, then it does not. */ + ret = for_allow; + goto out; + } else if (for_allow && strcmp(line, v) == 0) { + /* if the line is there verbatim and it is an + * allow rule, then it already has it */ + ret = true; + goto out; + } + } + +out: + fclose(devices_list); + free(line); + return ret; +} + +/* + * Called from setup_limits - here we have the container's cgroup_data because + * we created the cgroups + */ +static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfsng_handler_data *d) +{ + char *subsystem = NULL, *p; + int ret = -1; + struct hierarchy *h; + + subsystem = alloca(strlen(filename) + 1); + strcpy(subsystem, filename); + if ((p = strchr(subsystem, '.')) != NULL) + *p = '\0'; + + h = get_hierarchy(d, subsystem); + if (h) { + char *fullpath = must_make_path(h->fullcgpath, filename, NULL); + ret = lxc_write_to_file(fullpath, value, strlen(value), false); + free(fullpath); + } + return ret; +} + +static bool cgfsng_setup_limits(void *hdata, struct lxc_list *cgroup_settings, + bool do_devices) +{ + struct cgfsng_handler_data *d = hdata; + struct lxc_list *iterator, *sorted_cgroup_settings, *next; + struct lxc_cgroup *cg; + struct hierarchy *h; + char *listpath = NULL; + bool ret = false; + + if (lxc_list_empty(cgroup_settings)) + return true; + + sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings); + if (!sorted_cgroup_settings) { + return false; + } + + if (do_devices) { + h = get_hierarchy(d, "devices"); + if (!h) { + ERROR("No devices cgroup setup for %s\n", d->name); + return false; + } + listpath = must_make_path(h->fullcgpath, "devices.list", NULL); + } + + lxc_list_for_each(iterator, sorted_cgroup_settings) { + cg = iterator->elem; + + if (do_devices == !strncmp("devices", cg->subsystem, 7)) { + if (strcmp(cg->subsystem, "devices.deny") == 0 && + cgroup_devices_has_allow_or_deny(d, cg->value, false, listpath)) + continue; + if (strcmp(cg->subsystem, "devices.allow") == 0 && + cgroup_devices_has_allow_or_deny(d, cg->value, true, listpath)) + continue; + if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) { + if (do_devices && (errno == EACCES || errno == EPERM)) { + WARN("Error setting %s to %s for %s", + cg->subsystem, cg->value, d->name); + continue; + } + SYSERROR("Error setting %s to %s for %s", + cg->subsystem, cg->value, d->name); + goto out; + } + } + + DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value); + } + + ret = true; + INFO("cgroup has been setup"); +out: + free(listpath); + lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) { + lxc_list_del(iterator); + free(iterator); + } + free(sorted_cgroup_settings); + return ret; +} + +static struct cgroup_ops cgfsng_ops = { + .init = cgfsng_init, + .destroy = cgfsng_destroy, + .create = cgfsng_create, + .enter = cgfsng_enter, + .canonical_path = cgfsng_canonical_path, + .escape = cgfsng_escape, + .get_cgroup = cgfsng_get_cgroup, + .get = cgfsng_get, + .set = cgfsng_set, + .unfreeze = cgfsng_unfreeze, + .setup_limits = cgfsng_setup_limits, + .name = "cgroupfs-ng", + .attach = cgfsng_attach, + .chown = cgfsns_chown, + .mount_cgroup = cgfsng_mount, + .nrtasks = cgfsng_nrtasks, + .driver = CGFSNG, + + /* unsupported */ + .create_legacy = NULL, +}; diff --git a/src/lxc/cgmanager.c b/src/lxc/cgmanager.c index 7a35d03aa..c387b00ce 100644 --- a/src/lxc/cgmanager.c +++ b/src/lxc/cgmanager.c @@ -299,7 +299,7 @@ static bool lxc_cgmanager_create(const char *controller, const char *cgroup_path * be in "/lxc/c1" rather than "/user/..../c1" * called internally with connection already open */ -static bool cgm_escape(void) +static bool cgm_escape(void *hdata) { bool ret = true, cgm_needs_disconnect = false; pid_t me = getpid(); @@ -1436,7 +1436,7 @@ struct cgroup_ops *cgm_ops_init(void) cgm_all_controllers_same = false; // if root, try to escape to root cgroup - if (geteuid() == 0 && !cgm_escape()) { + if (geteuid() == 0 && !cgm_escape(NULL)) { free_subsystems(); return NULL; } diff --git a/src/lxc/cgroup.c b/src/lxc/cgroup.c index 5d67bd36e..684a3c3df 100644 --- a/src/lxc/cgroup.c +++ b/src/lxc/cgroup.c @@ -34,6 +34,7 @@ lxc_log_define(lxc_cgroup, lxc); static struct cgroup_ops *ops = NULL; extern struct cgroup_ops *cgfs_ops_init(void); +extern struct cgroup_ops *cgfsng_ops_init(void); extern struct cgroup_ops *cgm_ops_init(void); __attribute__((constructor)) @@ -45,8 +46,10 @@ void cgroup_ops_init(void) } DEBUG("cgroup_init"); + ops = cgfsng_ops_init(); #if HAVE_CGMANAGER - ops = cgm_ops_init(); + if (!ops) + ops = cgm_ops_init(); #endif if (!ops) ops = cgfs_ops_init(); @@ -109,10 +112,10 @@ const char *cgroup_get_cgroup(struct lxc_handler *handler, const char *subsystem return NULL; } -bool cgroup_escape(void) +bool cgroup_escape(struct lxc_handler *handler) { if (ops) - return ops->escape(); + return ops->escape(handler->cgroup_data); return false; } diff --git a/src/lxc/cgroup.h b/src/lxc/cgroup.h index 9919486e3..ff3651e4c 100644 --- a/src/lxc/cgroup.h +++ b/src/lxc/cgroup.h @@ -35,6 +35,7 @@ struct lxc_list; typedef enum { CGFS, CGMANAGER, + CGFSNG, } cgroup_driver_t; struct cgroup_ops { @@ -47,7 +48,7 @@ struct cgroup_ops { bool (*create_legacy)(void *hdata, pid_t pid); const char *(*get_cgroup)(void *hdata, const char *subsystem); const char *(*canonical_path)(void *hdata); - bool (*escape)(void); + bool (*escape)(void *hdata); int (*set)(const char *filename, const char *value, const char *name, const char *lxcpath); int (*get)(const char *filename, char *value, size_t len, const char *name, const char *lxcpath); bool (*unfreeze)(void *hdata); @@ -72,7 +73,7 @@ extern void cgroup_cleanup(struct lxc_handler *handler); extern bool cgroup_create_legacy(struct lxc_handler *handler); extern int cgroup_nrtasks(struct lxc_handler *handler); extern const char *cgroup_get_cgroup(struct lxc_handler *handler, const char *subsystem); -extern bool cgroup_escape(void); +extern bool cgroup_escape(struct lxc_handler *handler); /* * Currently, this call only makes sense for privileged containers. diff --git a/src/lxc/criu.c b/src/lxc/criu.c index 6ef490589..25e8d7082 100644 --- a/src/lxc/criu.c +++ b/src/lxc/criu.c @@ -47,7 +47,7 @@ lxc_log_define(lxc_criu, lxc); -void exec_criu(struct criu_opts *opts) +void exec_criu(struct lxc_handler *handler, struct criu_opts *opts) { char **argv, log[PATH_MAX]; int static_args = 22, argc = 0, i, ret; @@ -63,7 +63,7 @@ void exec_criu(struct criu_opts *opts) * /actual/ root cgroup so that lxcfs thinks criu has enough rights to * see all cgroups. */ - if (!cgroup_escape()) { + if (!cgroup_escape(handler)) { ERROR("failed to escape cgroups"); return; } @@ -517,7 +517,7 @@ void do_restore(struct lxc_container *c, int pipe, char *directory, bool verbose os.cgroup_path = cgroup_canonical_path(handler); /* exec_criu() returning is an error */ - exec_criu(&os); + exec_criu(handler, &os); umount(rootfs->mount); rmdir(rootfs->mount); goto out_fini_handler; @@ -624,6 +624,16 @@ static bool do_dump(struct lxc_container *c, char *mode, char *directory, if (pid == 0) { struct criu_opts os; + struct lxc_handler *handler; + + handler = lxc_init(c->name, c->lxc_conf, c->config_path); + if (!handler) + exit(1); + + if (!cgroup_init(handler)) { + ERROR("failed initing cgroups"); + exit(1); + } os.action = mode; os.directory = directory; @@ -633,7 +643,7 @@ static bool do_dump(struct lxc_container *c, char *mode, char *directory, os.predump_dir = predump_dir; /* exec_criu() returning is an error */ - exec_criu(&os); + exec_criu(handler, &os); exit(1); } else { int status; diff --git a/src/lxc/criu.h b/src/lxc/criu.h index e35f98a81..75e63816d 100644 --- a/src/lxc/criu.h +++ b/src/lxc/criu.h @@ -58,7 +58,7 @@ struct criu_opts { const char *cgroup_path; }; -void exec_criu(struct criu_opts *opts); +void exec_criu(struct lxc_handler *handler, struct criu_opts *opts); /* Check and make sure the container has a configuration that we know CRIU can * dump. */ diff --git a/src/lxc/utils.c b/src/lxc/utils.c index 0bc7a2045..6bee69874 100644 --- a/src/lxc/utils.c +++ b/src/lxc/utils.c @@ -1771,3 +1771,25 @@ err: close(fd); return ret; } + +/* + * Return the number of lines in file @fn, or -1 on error + */ +int lxc_count_file_lines(const char *fn) +{ + FILE *f; + char *line = NULL; + size_t sz = 0; + int n = 0; + + f = fopen_cloexec(fn, "r"); + if (!f) + return -1; + + while (getline(&line, &sz, f) != -1) { + n++; + } + free(line); + fclose(f); + return n; +} diff --git a/src/lxc/utils.h b/src/lxc/utils.h index 96ec45c20..7d20a398c 100644 --- a/src/lxc/utils.h +++ b/src/lxc/utils.h @@ -284,4 +284,5 @@ int safe_mount(const char *src, const char *dest, const char *fstype, unsigned long flags, const void *data, const char *rootfs); int mount_proc_if_needed(const char *rootfs); int null_stdfds(void); +int lxc_count_file_lines(const char *fn); #endif /* __LXC_UTILS_H */