lxc: support CLONE_INTO_CGROUP

author Christian Brauner <christian.brauner@ubuntu.com>

Mon, 29 Jun 2020 09:34:01 +0000 (11:34 +0200)

committer Christian Brauner <christian.brauner@ubuntu.com>

Mon, 29 Jun 2020 11:58:21 +0000 (13:58 +0200)
author Christian Brauner <christian.brauner@ubuntu.com>
Mon, 29 Jun 2020 09:34:01 +0000 (11:34 +0200)
committer Christian Brauner <christian.brauner@ubuntu.com>
Mon, 29 Jun 2020 11:58:21 +0000 (13:58 +0200)
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c

index 6c64c996c285b2f3c3bd737a6ab4f2f3f00e0f33..bab4ba34094cdea3d6177e2608e5644d0ded70a7 100644 (file)
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -1549,6 +1549,9 @@ __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
                 struct hierarchy *h = ops->hierarchies[i];
                 int ret;
  
+               if (is_unified_hierarchy(h) && handler->clone_flags & CLONE_INTO_CGROUP)
+                       continue;
+
                 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
                 if (ret != 0)
                         return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h

index c5bf7941ada88cb8f7a519bf4f3a6ed2500f9aac..e3712b710e499df7a7af7d61e8c3fc0bd9ccdca4 100644 (file)
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -194,4 +194,12 @@ static inline bool pure_unified_layout(const struct cgroup_ops *ops)
         return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
  }
  
+static inline int cgroup_unified_fd(const struct cgroup_ops *ops)
+{
+       if (!ops->unified)
+               return -EBADF;
+
+       return ops->unified->cgfd_con;
+}
+
  #endif
diff --git a/src/lxc/process_utils.c b/src/lxc/process_utils.c

index 7494def46b4897284504db5c214185112f6636bb..ccc4c0bf9860e1fbd9c9667b035ec3aa0d90431a 100644 (file)
--- a/src/lxc/process_utils.c
+++ b/src/lxc/process_utils.c
@@ -28,7 +28,7 @@ lxc_log_define(process_utils, lxc);
   * The nice thing about this is that we get fork() behavior. That is
   * lxc_raw_clone() returns 0 in the child and the child pid in the parent.
   */
-__returns_twice static pid_t __lxc_raw_clone(unsigned long flags, int *pidfd)
+__returns_twice pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd)
  {
  
  #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__)
@@ -108,7 +108,7 @@ __returns_twice pid_t lxc_raw_clone(unsigned long flags, int *pidfd)
         pid = lxc_clone3(&args, CLONE_ARGS_SIZE_VER0);
         if (pid < 0 && errno == ENOSYS) {
                 SYSTRACE("Falling back to legacy clone");
-               return __lxc_raw_clone(flags, pidfd);
+               return lxc_raw_legacy_clone(flags, pidfd);
         }
  
         return pid;
diff --git a/src/lxc/process_utils.h b/src/lxc/process_utils.h

index 4ea898a6331686d301ebb2dd3c1534d6608db7c2..61b0e412b7a5be6e27e9e9608020f8b936c35275 100644 (file)
--- a/src/lxc/process_utils.h
+++ b/src/lxc/process_utils.h
@@ -240,6 +240,7 @@ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags, int *pidfd);
   *   The child must use lxc_raw_getpid() to retrieve its pid.
   */
  extern pid_t lxc_raw_clone(unsigned long flags, int *pidfd);
+extern pid_t lxc_raw_legacy_clone(unsigned long flags, int *pidfd);
  
  /*
   * lxc_raw_clone_cb() - create a new process
diff --git a/src/lxc/start.c b/src/lxc/start.c

index c49b249fb3578223ada67fff343c8adfdf02a22a..244de39dd107a82d218853c186d35371f5e74b1e 100644 (file)
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1081,8 +1081,7 @@ static int do_start(void *data)
         /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
          * https://github.com/lxc/lxd/issues/1978.
          */
-       if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
-           (CLONE_NEWNET | CLONE_NEWUSER)) {
+       if (handler->ns_unshare_flags & CLONE_NEWNET) {
                 ret = unshare(CLONE_NEWNET);
                 if (ret < 0) {
                         SYSERROR("Failed to unshare CLONE_NEWNET");
@@ -1190,7 +1189,7 @@ static int do_start(void *data)
          *
          *      8:cpuset:/
          */
-       if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+       if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
                 ret = unshare(CLONE_NEWCGROUP);
                 if (ret < 0) {
                         if (errno != EINVAL) {
@@ -1205,7 +1204,7 @@ static int do_start(void *data)
                 }
         }
  
-       if (handler->ns_clone_flags & CLONE_NEWTIME) {
+       if (handler->ns_unshare_flags & CLONE_NEWTIME) {
                 ret = unshare(CLONE_NEWTIME);
                 if (ret < 0) {
                         if (errno != EINVAL) {
@@ -1537,6 +1536,22 @@ int resolve_clone_flags(struct lxc_handler *handler)
         if (wants_timens && (conf->ns_keep & ns_info[LXC_NS_TIME].clone_flag))
                 return log_trace_errno(-1, EINVAL, "Requested to keep time namespace while also specifying offsets");
  
+       /* Deal with namespaces that are unshared. */
+       if (handler->ns_clone_flags & CLONE_NEWTIME)
+               handler->ns_unshare_flags |= CLONE_NEWTIME;
+
+       if (!pure_unified_layout(handler->cgroup_ops) && handler->ns_clone_flags & CLONE_NEWCGROUP)
+               handler->ns_unshare_flags |= CLONE_NEWCGROUP;
+
+       if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
+           (CLONE_NEWNET | CLONE_NEWUSER))
+               handler->ns_unshare_flags |= CLONE_NEWNET;
+
+       /* Deal with namespaces that are spawned. */
+       handler->ns_on_clone_flags = handler->ns_clone_flags & ~handler->ns_unshare_flags;
+
+       handler->clone_flags = handler->ns_on_clone_flags | CLONE_PIDFD;
+
         return 0;
  }
  
@@ -1659,21 +1674,6 @@ static int lxc_spawn(struct lxc_handler *handler)
         }
  
         /* Create a process in a new set of namespaces. */
-       handler->ns_on_clone_flags = handler->ns_clone_flags;
-       if (handler->ns_clone_flags & CLONE_NEWUSER) {
-               /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
-                * clone a new user namespace first and only later unshare our
-                * network namespace to ensure that network devices ownership is
-                * set up correctly.
-                */
-               handler->ns_on_clone_flags &= ~CLONE_NEWNET;
-       }
-       /* The cgroup namespace gets unshare()ed not clone()ed. */
-       handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
-
-       /* The time namespace (currently) gets unshare()ed not clone()ed. */
-       handler->ns_on_clone_flags &= ~CLONE_NEWTIME;
-
         if (share_ns) {
                 pid_t attacher_pid;
  
@@ -1689,15 +1689,64 @@ static int lxc_spawn(struct lxc_handler *handler)
                         SYSERROR("Intermediate process failed");
                         goto out_delete_net;
                 }
+
+               if (handler->pid < 0) {
+                       SYSERROR(LXC_CLONE_ERROR);
+                       goto out_delete_net;
+               }
         } else {
-               handler->pid = lxc_raw_clone_cb(do_start, handler,
-                                               CLONE_PIDFD | handler->ns_on_clone_flags,
-                                               &handler->pidfd);
-       }
-       if (handler->pid < 0) {
-               SYSERROR(LXC_CLONE_ERROR);
-               goto out_delete_net;
+               int cgroup_fd;
+
+               struct lxc_clone_args clone_args = {
+                       .flags = handler->clone_flags,
+                       .pidfd = ptr_to_u64(&handler->pidfd),
+                       .exit_signal = SIGCHLD,
+               };
+
+               if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+                       cgroup_fd = cgroup_unified_fd(cgroup_ops);
+                       if (cgroup_fd >= 0) {
+                               handler->clone_flags    |= CLONE_INTO_CGROUP;
+                               clone_args.flags        |= CLONE_INTO_CGROUP;
+                               clone_args.cgroup       = cgroup_fd;
+                       }
+               }
+
+               /* Try to spawn directly into target cgroup. */
+               handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER2);
+               if (handler->pid < 0) {
+                       SYSTRACE("Failed to spawn container directly into target cgroup");
+
+                       /* Kernel might simply be too old for CLONE_INTO_CGROUP. */
+                       handler->clone_flags            &= ~(CLONE_INTO_CGROUP | CLONE_NEWCGROUP);
+                       handler->ns_on_clone_flags      &= ~CLONE_NEWCGROUP;
+                       handler->ns_unshare_flags       |= CLONE_NEWCGROUP;
+
+                       clone_args.flags                = handler->clone_flags;
+
+                       handler->pid = lxc_clone3(&clone_args, CLONE_ARGS_SIZE_VER0);
+               } else if (cgroup_fd >= 0) {
+                       TRACE("Spawned container directly into target cgroup via cgroup2 fd %d", cgroup_fd);
+               }
+
+               /* Kernel might be too old for clone3(). */
+               if (handler->pid < 0) {
+                       SYSTRACE("Failed to spawn container via clone3()");
+                       handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+               }
+
+               if (handler->pid < 0) {
+                       SYSERROR(LXC_CLONE_ERROR);
+                       goto out_delete_net;
+               }
+
+               if (handler->pid == 0) {
+                       (void)do_start(handler);
+                       _exit(EXIT_FAILURE);
+               }
         }
+       if (handler->pidfd < 0)
+               handler->clone_flags &= ~CLONE_PIDFD;
         TRACE("Cloned child process %d", handler->pid);
  
         /* Verify that we can actually make use of pidfds. */
@@ -1853,7 +1902,7 @@ static int lxc_spawn(struct lxc_handler *handler)
         }
         TRACE("Set up cgroup2 device controller limits");
  
-       if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
+       if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
                 /* Now we're ready to preserve the cgroup namespace */
                 ret = lxc_try_preserve_ns(handler->pid, "cgroup");
                 if (ret < 0) {
@@ -1870,7 +1919,7 @@ static int lxc_spawn(struct lxc_handler *handler)
         cgroup_ops->payload_finalize(cgroup_ops);
         TRACE("Finished setting up cgroups");
  
-       if (handler->ns_clone_flags & CLONE_NEWTIME) {
+       if (handler->ns_unshare_flags & CLONE_NEWTIME) {
                 /* Now we're ready to preserve the time namespace */
                 ret = lxc_try_preserve_ns(handler->pid, "time");
                 if (ret < 0) {
diff --git a/src/lxc/start.h b/src/lxc/start.h

index ece4aac472e493c1c785a6abc592bfb75c08f695..6852f6e22d6a952d0b3b196a9b484f535903723b 100644 (file)
--- a/src/lxc/start.h
+++ b/src/lxc/start.h
@@ -26,20 +26,18 @@ struct lxc_handler {
          *   list the clone flags that were unshare()ed rather than clone()ed
          *   because of ordering requirements (e.g. CLONE_NEWNET and
          *   CLONE_NEWUSER) or implementation details.
-         *
-        * @ns_keep_flags;
-        * - The clone flags for the namespaces that the container will inherit
-        *   from the parent. They are not recorded in the handler itself but
-        *   are present in the container's config.
          *
-        * @ns_share_flags;
-        * - The clone flags for the namespaces that the container will share
-        *   with another process.  They are not recorded in the handler itself
-        *   but are present in the container's config.
+        * @ns_unshare_flags
+        * - Flags for namespaces that were unshared, not cloned.
+        *
+        * @clone_flags
+        * - ns_on_clone_flags | other flags used to create the container.
          */
         struct /* lxc_ns */ {
-               int ns_clone_flags;
-               int ns_on_clone_flags;
+               unsigned int ns_clone_flags;
+               unsigned int ns_on_clone_flags;
+               unsigned int ns_unshare_flags;
+               unsigned int clone_flags;
         };
  
         /* File descriptor to pin the rootfs for privileged containers. */
author	Christian Brauner <christian.brauner@ubuntu.com>
	Mon, 29 Jun 2020 09:34:01 +0000 (11:34 +0200)
committer	Christian Brauner <christian.brauner@ubuntu.com>
	Mon, 29 Jun 2020 11:58:21 +0000 (13:58 +0200)
src/lxc/cgroups/cgfsng.c		patch \| blob \| blame \| history
src/lxc/cgroups/cgroup.h		patch \| blob \| blame \| history
src/lxc/process_utils.c		patch \| blob \| blame \| history
src/lxc/process_utils.h		patch \| blob \| blame \| history
src/lxc/start.c		patch \| blob \| blame \| history
src/lxc/start.h		patch \| blob \| blame \| history