]> git.ipfire.org Git - thirdparty/lxc.git/commitdiff
confile: add "force" to cgroup:{mixed,ro,rw}
author Shukui Yang <yangshukui@huawei.com>
Fri, 16 Feb 2018 04:16:40 +0000 (23:16 -0500)
committer Christian Brauner <christian.brauner@ubuntu.com>
Wed, 21 Feb 2018 14:45:55 +0000 (15:45 +0100)
This lets users specify

        lxc.mount.auto = cgroup:mixed:force
or
        lxc.mount.auto = cgroup:ro:force
or
        lxc.mount.auto = cgroup:rw:force

When cgroup namespaces are supported LXC will not mount cgroups for the
container since it assumes that the init system will mount cgroups itself if it
wants to. This assumption already broke when users wanted to run containers
without CAP_SYS_ADMIN. For example, systemd based containers wouldn't start
since systemd needs to mount cgroups (named systemd hierarchy for legacy
cgroups and the unified hierarchy for unified cgroups) to track processes. This
problem was solved by detecting whether the container had CAP_SYS_ADMIN. If it
didn't we performed the cgroup mounts for it.
However, there are more cases when we should be able to mount cgroups for the
container when cgroup namespaces are supported:
- init systems not mounting cgroups themselves:
  A init system that doesn't mount cgroups would not have cgroups available
  especially when combined with custom LSM profiles to prevent cgroup
  {u}mount()ing inside containers.
- application containers:
  Application containers will usually not mount cgroups by themselves.
- read-only cgroups:
  It is useful to be able to mount cgroups read-only to e.g. prevent
  changing cgroup limits from inside the container while at the same time
  allowing the applications to perform introspection on their own cgroups. This
  again is mostly useful for application containers. System containers running
  systemd will usually not work correctly when cgroups are mounted read-only.
To be fair, all of those use-cases could be covered by custom hooks or
lxc.mount.entry entries but exposing it through lxc.mount.auto takes care of
setting correct mount options and adding the necessary logic to e.g. mount
filesystems read-only correctly.

Currently we only extend this to cgroup:{mixed,ro,rw} but technically there's
no reason not to enable the same behavior for cgroup-full:{mixed,ro,rw} as
well. If someone requests this we can simply treat it as a bug and add "force"
for cgroup-full.

Replaces #2136.

Signed-off-by: Shukui Yang <yangshukui@huawei.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
doc/lxc.container.conf.sgml.in
src/lxc/cgroups/cgfsng.c
src/lxc/conf.c
src/lxc/conf.h
src/lxc/confile.c

index bbf0c681cd4a516b4978c7043084d26aa8718b16..a75bdba24c541dc3b6fa3a9bdbc43426127da06f 100644 (file)
@@ -930,36 +930,75 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
                   <filename>/sys</filename> as read-write
                 </para>
               </listitem>
+
               <listitem>
                 <para>
                   <option>cgroup:mixed</option>:
-                  mount a tmpfs to <filename>/sys/fs/cgroup</filename>,
-                  create directories for all hierarchies to which
-                  the container is added, create subdirectories
-                  there with the name of the cgroup, and bind-mount
-                  the container's own cgroup into that directory.
-                  The container will be able to write to its own
-                  cgroup directory, but not the parents, since they
-                  will be remounted read-only.
+                  Mount a tmpfs to <filename>/sys/fs/cgroup</filename>,
+                  create directories for all hierarchies to which the container
+                  is added, create subdirectories in those hierarchies with the
+                  name of the cgroup, and bind-mount the container's own cgroup
+                  into that directory. The container will be able to write to
+                  its own cgroup directory, but not the parents, since they will
+                  be remounted read-only.
                 </para>
               </listitem>
+
               <listitem>
                 <para>
-                  <option>cgroup:ro</option>: similar to
-                  <option>cgroup:mixed</option>, but everything will
-                be mounted read-only.
+                  <option>cgroup:mixed:force</option>:
+                  The <option>force</option> option will cause LXC to perform
+                  the cgroup mounts for the container under all circumstances.
+                  Otherwise it is similar to <option>cgroup:mixed</option>.
+                  This is mainly useful when the cgroup namespaces are enabled
+                  where LXC will normally leave mounting cgroups to the init
+                  binary of the container since it is perfectly safe to do so.
                 </para>
               </listitem>
+
+              <listitem>
+                <para>
+                  <option>cgroup:ro</option>:
+                  similar to <option>cgroup:mixed</option>, but everything will
+                  be mounted read-only.
+                </para>
+              </listitem>
+
+              <listitem>
+                <para>
+                  <option>cgroup:ro:force</option>:
+                  The <option>force</option> option will cause LXC to perform
+                  the cgroup mounts for the container under all circumstances.
+                  Otherwise it is similar to <option>cgroup:ro</option>.
+                  This is mainly useful when the cgroup namespaces are enabled
+                  where LXC will normally leave mounting cgroups to the init
+                  binary of the container since it is perfectly safe to do so.
+                </para>
+              </listitem>
+
               <listitem>
                 <para>
                   <option>cgroup:rw</option>: similar to
-                  <option>cgroup:mixed</option>, but everything will
-                  be mounted read-write. Note that the paths leading
-                  up to the container's own cgroup will be writable,
-                  but will not be a cgroup filesystem but just part
-                  of the tmpfs of <filename>/sys/fs/cgroup</filename>
+                  <option>cgroup:mixed</option>, but everything will be mounted
+                  read-write. Note that the paths leading up to the container's
+                  own cgroup will be writable, but will not be a cgroup
+                  filesystem but just part of the tmpfs of
+                  <filename>/sys/fs/cgroup</filename>
+                </para>
+              </listitem>
+
+              <listitem>
+                <para>
+                  <option>cgroup:rw:force</option>:
+                  The <option>force</option> option will cause LXC to perform
+                  the cgroup mounts for the container under all circumstances.
+                  Otherwise it is similar to <option>cgroup:rw</option>.
+                  This is mainly useful when the cgroup namespaces are enabled
+                  where LXC will normally leave mounting cgroups to the init
+                  binary of the container since it is perfectly safe to do so.
                 </para>
               </listitem>
+
               <listitem>
                 <para>
                   <option>cgroup</option> (without specifier):
index 380fb22f503c9061b202c38fc1a5cd1e6a74aabd..1d2d4e317cc27f3b55c6bf3b26e4963c436b47c9 100644 (file)
@@ -2023,26 +2023,31 @@ static int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
 
 static bool cgfsng_mount(void *hdata, const char *root, int type)
 {
-       int i;
+       int i, ret;
        char *tmpfspath = NULL;
        bool retval = false;
        struct lxc_handler *handler = hdata;
        struct cgfsng_handler_data *d = handler->cgroup_data;
-       bool has_cgns = false, has_sys_admin = true;
+       bool has_cgns = false, wants_force_mount = false;
 
        if ((type & LXC_AUTO_CGROUP_MASK) == 0)
                return true;
 
-       has_cgns = cgns_supported();
-       if (!lxc_list_empty(&handler->conf->keepcaps))
-               has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
-       else
-               has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
+       if (type & LXC_AUTO_CGROUP_FORCE) {
+               type &= ~LXC_AUTO_CGROUP_FORCE;
+               wants_force_mount = true;
+       }
 
-       if (has_cgns && has_sys_admin)
-               return true;
+       if (!wants_force_mount){
+               if (!lxc_list_empty(&handler->conf->keepcaps))
+                       wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
+               else
+                       wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
+       }
 
-       tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
+       has_cgns = cgns_supported();
+       if (has_cgns && !wants_force_mount)
+               return true;
 
        if (type == LXC_AUTO_CGROUP_NOSPEC)
                type = LXC_AUTO_CGROUP_MIXED;
@@ -2050,17 +2055,17 @@ static bool cgfsng_mount(void *hdata, const char *root, int type)
                type = LXC_AUTO_CGROUP_FULL_MIXED;
 
        /* Mount tmpfs */
-       if (safe_mount("cgroup_root", tmpfspath, "tmpfs",
-                       MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
-                       "size=10240k,mode=755",
-                       root) < 0)
-               goto  bad;
+       tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
+       ret = safe_mount("cgroup_root", tmpfspath, "tmpfs",
+                        MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
+                        "size=10240k,mode=755", root);
+       if (ret < 0)
+               goto on_error;
 
        for (i = 0; hierarchies[i]; i++) {
                char *controllerpath, *path2;
                struct hierarchy *h = hierarchies[i];
                char *controller = strrchr(h->mountpoint, '/');
-               int r;
 
                if (!controller)
                        continue;
@@ -2070,49 +2075,56 @@ static bool cgfsng_mount(void *hdata, const char *root, int type)
                        free(controllerpath);
                        continue;
                }
-               if (mkdir(controllerpath, 0755) < 0) {
+               ret = mkdir(controllerpath, 0755);
+               if (ret < 0) {
                        SYSERROR("Error creating cgroup path: %s", controllerpath);
                        free(controllerpath);
-                       goto bad;
+                       goto on_error;
                }
 
-               if (has_cgns && !has_sys_admin) {
+               if (has_cgns && wants_force_mount) {
                        /* If cgroup namespaces are supported but the container
                         * will not have CAP_SYS_ADMIN after it has started we
                         * need to mount the cgroups manually.
                         */
-                       r = cg_mount_in_cgroup_namespace(type, h, controllerpath);
+                       ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
                        free(controllerpath);
-                       if (r < 0)
-                               goto bad;
+                       if (ret < 0)
+                               goto on_error;
+
                        continue;
                }
 
-               if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) {
+               ret = mount_cgroup_full(type, h, controllerpath, d->container_cgroup);
+               if (ret < 0) {
                        free(controllerpath);
-                       goto bad;
+                       goto on_error;
                }
+
                if (!cg_mount_needs_subdirs(type)) {
                        free(controllerpath);
                        continue;
                }
-               path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL);
-               if (mkdir_p(path2, 0755) < 0) {
+
+               path2 = must_make_path(controllerpath, h->base_cgroup,
+                                      d->container_cgroup, NULL);
+               ret = mkdir_p(path2, 0755);
+               if (ret < 0) {
                        free(controllerpath);
                        free(path2);
-                       goto bad;
+                       goto on_error;
                }
 
-               r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2,
-                                                   d->container_cgroup);
+               ret = do_secondstage_mounts_if_needed(
+                   type, h, controllerpath, path2, d->container_cgroup);
                free(controllerpath);
                free(path2);
-               if (r < 0)
-                       goto bad;
+               if (ret < 0)
+                       goto on_error;
        }
        retval = true;
 
-bad:
+on_error:
        free(tmpfspath);
        return retval;
 }
index f2f326c46ef3a390984a0a0f8ab9b5e6efee9078..28d27878c6d5b2ef0a5578d5739fff92d59e450a 100644 (file)
@@ -570,7 +570,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
        if (flags & LXC_AUTO_CGROUP_MASK) {
                int cg_flags;
 
-               cg_flags = flags & LXC_AUTO_CGROUP_MASK;
+               cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
                /* If the type of cgroup mount was not specified, it depends on the
                 * container's capabilities as to what makes sense: if we have
                 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
@@ -592,7 +592,8 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha
                        else
                                cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
                }
-
+               if (flags & LXC_AUTO_CGROUP_FORCE)
+                               cg_flags |= LXC_AUTO_CGROUP_FORCE;
                if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
                        SYSERROR("error mounting /sys/fs/cgroup");
                        return -1;
@@ -3168,7 +3169,7 @@ int lxc_setup(struct lxc_handler *handler)
         * before, /sys could not have been mounted
         * (is either mounted automatically or via fstab entries)
         */
-       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
+       if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & (LXC_AUTO_CGROUP_MASK), handler) < 0) {
                ERROR("failed to setup the automatic mounts for '%s'", name);
                return -1;
        }
index 71df182d6e08f781e6ba227f33e401b7d7a09237..8f814895f19ca466f49299953f76cb0927ddad4b 100644 (file)
@@ -160,9 +160,9 @@ enum {
         * variants, which is safe. */
        LXC_AUTO_CGROUP_NOSPEC        = 0x0B0,   /* /sys/fs/cgroup (partial mount, r/w or mixed, depending on caps) */
        LXC_AUTO_CGROUP_FULL_NOSPEC   = 0x0E0,   /* /sys/fs/cgroup (full mount, r/w or mixed, depending on caps) */
-       LXC_AUTO_CGROUP_MASK          = 0x0F0,
-
-       LXC_AUTO_ALL_MASK             = 0x0FF,   /* all known settings */
+       LXC_AUTO_CGROUP_FORCE         = 0x100,   /* mount cgroups even when cgroup namespaces are supported */
+       LXC_AUTO_CGROUP_MASK          = 0x1F0,   /* all known cgroup options, including LXC_AUTO_CGROUP_FORCE */
+       LXC_AUTO_ALL_MASK             = 0x1FF,   /* all known settings */
 };
 
 /*
index ccd45a2fa6f3ba0b17ae537f4236884800f34d68..f6253114f34a748f5ace1ed2aaff8bb80acf8435 100644 (file)
@@ -1914,26 +1914,30 @@ static int set_config_mount_auto(const char *key, const char *value,
                int mask;
                int flag;
        } allowed_auto_mounts[] = {
-           { "proc",              LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_MIXED         },
-           { "proc:mixed",        LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_MIXED         },
-           { "proc:rw",           LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_RW            },
-           { "sys",               LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_MIXED          },
-           { "sys:ro",            LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_RO             },
-           { "sys:mixed",         LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_MIXED          },
-           { "sys:rw",            LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_RW             },
-           { "cgroup",            LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC      },
-           { "cgroup:mixed",      LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED       },
-           { "cgroup:ro",         LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO          },
-           { "cgroup:rw",         LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW          },
-           { "cgroup-full",       LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_NOSPEC },
-           { "cgroup-full:mixed", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_MIXED  },
-           { "cgroup-full:ro",    LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RO     },
-           { "cgroup-full:rw",    LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RW     },
-           /* NB: For adding anything that is just a single on/off, but has
-            *     no options: keep mask and flag identical and just define the
-            *     enum value as an unused bit so far
+           { "proc",                    LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_MIXED                            },
+           { "proc:mixed",              LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_MIXED                            },
+           { "proc:rw",                 LXC_AUTO_PROC_MASK,   LXC_AUTO_PROC_RW                               },
+           { "sys",                     LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_MIXED                             },
+           { "sys:ro",                  LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_RO                                },
+           { "sys:mixed",               LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_MIXED                             },
+           { "sys:rw",                  LXC_AUTO_SYS_MASK,    LXC_AUTO_SYS_RW                                },
+           { "cgroup",                  LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC                         },
+           { "cgroup:mixed",            LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED                          },
+           { "cgroup:ro",               LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO                             },
+           { "cgroup:rw",               LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW                             },
+           { "cgroup:force",            LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC | LXC_AUTO_CGROUP_FORCE },
+           { "cgroup:mixed:force",      LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED | LXC_AUTO_CGROUP_FORCE  },
+           { "cgroup:ro:force",         LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO | LXC_AUTO_CGROUP_FORCE     },
+           { "cgroup:rw:force",         LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW | LXC_AUTO_CGROUP_FORCE     },
+           { "cgroup-full",             LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_NOSPEC                    },
+           { "cgroup-full:mixed",       LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_MIXED                     },
+           { "cgroup-full:ro",          LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RO                        },
+           { "cgroup-full:rw",          LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RW                        },
+           /* For adding anything that is just a single on/off, but has no
+            * options: keep mask and flag identical and just define the enum
+            * value as an unused bit so far
             */
-           { NULL,                0,                    0                           }
+           { NULL,                      0,                     0                                             }
        };
 
        if (lxc_config_value_empty(value)) {