From: Christian Brauner Date: Mon, 29 Jun 2020 09:34:01 +0000 (+0200) Subject: lxc: support CLONE_INTO_CGROUP X-Git-Tag: lxc-5.0.0~404^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f7176c3ea944ce2b9968b7c4a18c266639927395;p=thirdparty%2Flxc.git lxc: support CLONE_INTO_CGROUP Signed-off-by: Christian Brauner --- diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 6c64c996c..bab4ba340 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops, struct hierarchy *h = ops->hierarchies[i]; int ret; + if (is_unified_hierarchy(h) && handler->clone_flags & CLONE_INTO_CGROUP) + continue; + ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len); if (ret != 0) return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path); diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index c5bf7941a..e3712b710 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops) return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED; } +static inline int cgroup_unified_fd(const struct cgroup_ops *ops) +{ + if (!ops->unified) + return -EBADF; + + return ops->unified->cgfd_con; +} + #endif diff --git a/src/lxc/process_utils.c b/src/lxc/process_utils.c index 7494def46..ccc4c0bf9 100644 --- a/src/lxc/process_utils.c +++ b/src/lxc/process_utils.c @@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc); * The nice thing about this is that we get fork() behavior. That is * lxc_raw_clone() returns 0 in the child and the child pid in the parent. */ -__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd) +__returns_twice pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd) { #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) @@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd) pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0); if (pid < 0 && errno == ENOSYS) { SYSTRACE("Falling back to legacy clone"); - return __lxc_raw_clone(flags, pidfd); + return lxc_raw_legacy_clone(flags, pidfd); } return pid; diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h index 4ea898a63..61b0e412b 100644 --- a/src/lxc/process_utils.h +++ b/src/lxc/process_utils.h @@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd); * The child must use lxc_raw_getpid() to retrieve its pid. */ extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd); +extern pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd); /* * lxc_raw_clone_cb() - create a new process diff --git a/src/lxc/start.c b/src/lxc/start.c index c49b249fb..244de39dd 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1081,8 +1081,7 @@ static int do_start(void *data) /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See * https://github.com/lxc/lxd/issues/1978. */ - if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) == - (CLONE_NEWNET | CLONE_NEWUSER)) { + if (handler->ns_unshare_flags & CLONE_NEWNET) { ret = unshare(CLONE_NEWNET); if (ret < 0) { SYSERROR("Failed to unshare CLONE_NEWNET"); @@ -1190,7 +1189,7 @@ static int do_start(void *data) * * 8:cpuset:/ */ - if (handler->ns_clone_flags & CLONE_NEWCGROUP) { + if (handler->ns_unshare_flags & CLONE_NEWCGROUP) { ret = unshare(CLONE_NEWCGROUP); if (ret < 0) { if (errno != EINVAL) { @@ -1205,7 +1204,7 @@ static int do_start(void *data) } } - if (handler->ns_clone_flags & CLONE_NEWTIME) { + if (handler->ns_unshare_flags & CLONE_NEWTIME) { ret = unshare(CLONE_NEWTIME); if (ret < 0) { if (errno != EINVAL) { @@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler) if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag)) return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets"); + /* Deal with namespaces that are unshared. */ + if (handler->ns_clone_flags & CLONE_NEWTIME) + handler->ns_unshare_flags |= CLONE_NEWTIME; + + if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP) + handler->ns_unshare_flags |= CLONE_NEWCGROUP; + + if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) == + (CLONE_NEWNET | CLONE_NEWUSER)) + handler->ns_unshare_flags |= CLONE_NEWNET; + + /* Deal with namespaces that are spawned. */ + handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags; + + handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD; + return 0; } @@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler) } /* Create a process in a new set of namespaces. */ - handler->ns_on_clone_flags = handler->ns_clone_flags; - if (handler->ns_clone_flags & CLONE_NEWUSER) { - /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to - * clone a new user namespace first and only later unshare our - * network namespace to ensure that network devices ownership is - * set up correctly. - */ - handler->ns_on_clone_flags &= ~CLONE_NEWNET; - } - /* The cgroup namespace gets unshare()ed not clone()ed. */ - handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP; - - /* The time namespace (currently) gets unshare()ed not clone()ed. */ - handler->ns_on_clone_flags &= ~CLONE_NEWTIME; - if (share_ns) { pid_t attacher_pid; @@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler) SYSERROR("Intermediate process failed"); goto out_delete_net; } + + if (handler->pid < 0) { + SYSERROR(LXC_CLONE_ERROR); + goto out_delete_net; + } } else { - handler->pid = lxc_raw_clone_cb(do_start, handler, - CLONE_PIDFD | handler->ns_on_clone_flags, - &handler->pidfd); - } - if (handler->pid < 0) { - SYSERROR(LXC_CLONE_ERROR); - goto out_delete_net; + int cgroup_fd; + + struct lxc_clone_args clone_args = { + .flags = handler->clone_flags, + .pidfd = ptr_to_u64(&handler->pidfd), + .exit_signal = SIGCHLD, + }; + + if (handler->ns_clone_flags & CLONE_NEWCGROUP) { + cgroup_fd = cgroup_unified_fd(cgroup_ops); + if (cgroup_fd >= 0) { + handler->clone_flags |= CLONE_INTO_CGROUP; + clone_args.flags |= CLONE_INTO_CGROUP; + clone_args.cgroup = cgroup_fd; + } + } + + /* Try to spawn directly into target cgroup. */ + handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2); + if (handler->pid < 0) { + SYSTRACE("Failed to spawn container directly into target cgroup"); + + /* Kernel might simply be too old for CLONE_INTO_CGROUP. */ + handler->clone_flags &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP); + handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP; + handler->ns_unshare_flags |= CLONE_NEWCGROUP; + + clone_args.flags = handler->clone_flags; + + handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0); + } else if (cgroup_fd >= 0) { + TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd); + } + + /* Kernel might be too old for clone3(). */ + if (handler->pid < 0) { + SYSTRACE("Failed to spawn container via clone3()"); + handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd); + } + + if (handler->pid < 0) { + SYSERROR(LXC_CLONE_ERROR); + goto out_delete_net; + } + + if (handler->pid == 0) { + (void)do_start(handler); + _exit(EXIT_FAILURE); + } } + if (handler->pidfd < 0) + handler->clone_flags &= ~CLONE_PIDFD; TRACE("Cloned child process %d", handler->pid); /* Verify that we can actually make use of pidfds. */ @@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler) } TRACE("Set up cgroup2 device controller limits"); - if (handler->ns_clone_flags & CLONE_NEWCGROUP) { + if (handler->ns_unshare_flags & CLONE_NEWCGROUP) { /* Now we're ready to preserve the cgroup namespace */ ret = lxc_try_preserve_ns(handler->pid, "cgroup"); if (ret < 0) { @@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler) cgroup_ops->payload_finalize(cgroup_ops); TRACE("Finished setting up cgroups"); - if (handler->ns_clone_flags & CLONE_NEWTIME) { + if (handler->ns_unshare_flags & CLONE_NEWTIME) { /* Now we're ready to preserve the cgroup namespace */ ret = lxc_try_preserve_ns(handler->pid, "time"); if (ret < 0) { diff --git a/src/lxc/start.h b/src/lxc/start.h index ece4aac47..6852f6e22 100644 --- a/src/lxc/start.h +++ b/src/lxc/start.h @@ -26,20 +26,18 @@ struct lxc_handler { * list the clone flags that were unshare()ed rather then clone()ed * because of ordering requirements (e.g. e.g. CLONE_NEWNET and * CLONE_NEWUSER) or implementation details. - * - * @ns_keep_flags; - * - The clone flags for the namespaces that the container will inherit - * from the parent. They are not recorded in the handler itself but - * are present in the container's config. * - * @ns_share_flags; - * - The clone flags for the namespaces that the container will share - * with another process. They are not recorded in the handler itself - * but are present in the container's config. + * @ns_unshare_flags + * - Flags for namespaces that were unshared, not cloned. + * + * @clone_flags + * - ns_on_clone flags | other flags used to create container. */ struct /* lxc_ns */ { - int ns_clone_flags; - int ns_on_clone_flags; + unsigned int ns_clone_flags; + unsigned int ns_on_clone_flags; + unsigned int ns_unshare_flags; + unsigned int clone_flags; }; /* File descriptor to pin the rootfs for privileged containers. */