]> git.ipfire.org Git - thirdparty/lxc.git/commitdiff
conf: support idmapped lxc.mount.entry entries
author Christian Brauner <christian.brauner@ubuntu.com>
Mon, 17 May 2021 09:41:38 +0000 (11:41 +0200)
committer Christian Brauner <christian.brauner@ubuntu.com>
Wed, 19 May 2021 12:21:51 +0000 (14:21 +0200)
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
src/lxc/af_unix.c
src/lxc/af_unix.h
src/lxc/cgroups/cgfsng.c
src/lxc/conf.c
src/lxc/conf.h
src/lxc/confile.c
src/lxc/start.c
src/lxc/storage/storage.c
src/lxc/sync.c
src/lxc/sync.h

index b491b95078b71a24cfd28ca302f93c9297773dc3..14d3160cddfa7dc92cf77ab714b6c8d1991eb81c 100644 (file)
@@ -164,6 +164,16 @@ int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data,
        return lxc_abstract_unix_send_fds(fd, sendfds, num_sendfds, data, size);
 }
 
+/*
+ * Send exactly two file descriptors over the unix socket @fd with one
+ * lxc_abstract_unix_send_fds() call, together with the optional payload
+ * @data of @size bytes. The return value is passed through unchanged.
+ */
+int __lxc_abstract_unix_send_two_fds(int fd, int fd_first, int fd_second,
+                                    void *data, size_t size)
+{
+       int pair[] = { fd_first, fd_second };
+
+       return lxc_abstract_unix_send_fds(fd, pair, 2, data, size);
+}
+
 static ssize_t lxc_abstract_unix_recv_fds_iov(int fd,
                                              struct unix_fds *ret_fds,
                                              struct iovec *ret_iov,
@@ -355,13 +365,14 @@ ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd, void *ret_data,
        return ret;
 }
 
-ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd)
+ssize_t __lxc_abstract_unix_recv_two_fds(int fd, int *fd_first, int *fd_second,
+                                        void *data, size_t size)
 {
        call_cleaner(put_unix_fds) struct unix_fds *fds = NULL;
        char buf[1] = {};
        struct iovec iov = {
-           .iov_base   = buf,
-           .iov_len    = sizeof(buf),
+           .iov_base   = data ?: buf,
+           .iov_len    = size ?: sizeof(buf),
        };
        ssize_t ret;
 
@@ -377,11 +388,11 @@ ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd)
                return ret_errno(ENODATA);
 
        if (fds->fd_count_ret != fds->fd_count_max) {
-               ret_fd[0] = -EBADF;
-               ret_fd[1] = -EBADF;
+               *fd_first = -EBADF;
+               *fd_second = -EBADF;
        } else {
-               ret_fd[0] = move_fd(fds->fd[0]);
-               ret_fd[1] = move_fd(fds->fd[1]);
+               *fd_first = move_fd(fds->fd[0]);
+               *fd_second = move_fd(fds->fd[1]);
        }
 
        return 0;
index 7b979374348abff3a1512f4412b6ca91038ae0c1..77c115a3b55f6bacee269ba69c31ba45806831e7 100644 (file)
@@ -125,7 +125,24 @@ __hidden extern ssize_t lxc_abstract_unix_recv_one_fd(int fd, int *ret_fd,
                                                      size_t size_ret_data)
     __access_r(3, 4);
 
-__hidden extern ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *ret_fd);
+__hidden extern int __lxc_abstract_unix_send_two_fds(int fd, int fd_first,
+                                                    int fd_second, void *data,
+                                                    size_t size);
+
+/* Convenience wrapper: send two file descriptors without any payload. */
+static inline int lxc_abstract_unix_send_two_fds(int fd, int fd_first,
+                                                int fd_second)
+{
+       return __lxc_abstract_unix_send_two_fds(fd, fd_first, fd_second,
+                                               NULL, 0);
+}
+
+__hidden extern ssize_t __lxc_abstract_unix_recv_two_fds(int fd, int *fd_first,
+                                                        int *fd_second,
+                                                        void *data, size_t size);
+
+/*
+ * Convenience wrapper: receive two file descriptors without supplying a
+ * payload buffer of our own.
+ */
+static inline ssize_t lxc_abstract_unix_recv_two_fds(int fd, int *fd_first,
+                                                     int *fd_second)
+{
+       return __lxc_abstract_unix_recv_two_fds(fd, fd_first, fd_second,
+                                               NULL, 0);
+}
 
 __hidden extern int lxc_unix_send_fds(int fd, int *sendfds, int num_sendfds, void *data, size_t size);
 
index 078d47609caa27a8f0723a17ca9fa3f2bf0493fd..80fcbb93ca2859748e8495d2bf9ee987114a26b3 100644 (file)
@@ -2211,16 +2211,13 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
                                        int *sk_fd, pid_t pid)
 {
        __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
-       int target_fds[2];
        char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
        size_t pidstr_len;
        ssize_t ret;
 
-       ret = lxc_abstract_unix_recv_two_fds(sk, target_fds);
+       ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
-       target_fd0 = target_fds[0];
-       target_fd1 = target_fds[1];
 
        pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
 
index 1fc0b684697d0ef038a67adecebd3516647030ed..0852ce6bd58c3575833c0868d37ec585c11d7cf3 100644 (file)
@@ -15,6 +15,7 @@
 #include <netinet/in.h>
 #include <pwd.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -57,6 +58,7 @@
 #include "start.h"
 #include "storage/storage.h"
 #include "storage/overlay.h"
+#include "sync.h"
 #include "syscall_wrappers.h"
 #include "terminal.h"
 #include "utils.h"
@@ -126,6 +128,8 @@ char *lxchook_names[NUM_LXC_HOOKS] = {
 struct mount_opt { /* One row of the mount option lookup tables. */
        char *name; /* Option name as written in a mount options string. */
        int clear; /* Non-zero: the option lowers (clears) its flag instead of raising it. */
+       bool recursive; /* True for the recursive table rows ("rbind", "r*" propagation). */
+       __u64 flag; /* MOUNT_ATTR_* value in mount_opt, MS_* propagation type in propagation_opt; ~0 rows are ignored by parse_vfs_attr(). */
        int legacy_flag; /* Matching MS_* flag value. */
 };
 
@@ -140,45 +144,48 @@ struct limit_opt {
 };
 
 static struct mount_opt mount_opt[] = {
-       { "async",         1, MS_SYNCHRONOUS },
-       { "atime",         1, MS_NOATIME     },
-       { "bind",          0, MS_BIND        },
-       { "defaults",      0, 0              },
-       { "dev",           1, MS_NODEV       },
-       { "diratime",      1, MS_NODIRATIME  },
-       { "dirsync",       0, MS_DIRSYNC     },
-       { "exec",          1, MS_NOEXEC      },
-       { "lazytime",      0, MS_LAZYTIME    },
-       { "mand",          0, MS_MANDLOCK    },
-       { "noatime",       0, MS_NOATIME     },
-       { "nodev",         0, MS_NODEV       },
-       { "nodiratime",    0, MS_NODIRATIME  },
-       { "noexec",        0, MS_NOEXEC      },
-       { "nomand",        1, MS_MANDLOCK    },
-       { "norelatime",    1, MS_RELATIME    },
-       { "nostrictatime", 1, MS_STRICTATIME },
-       { "nosuid",        0, MS_NOSUID      },
-       { "rbind",         0, MS_BIND|MS_REC },
-       { "relatime",      0, MS_RELATIME    },
-       { "remount",       0, MS_REMOUNT     },
-       { "ro",            0, MS_RDONLY      },
-       { "rw",            1, MS_RDONLY      },
-       { "strictatime",   0, MS_STRICTATIME },
-       { "suid",          1, MS_NOSUID      },
-       { "sync",          0, MS_SYNCHRONOUS },
-       { NULL,            0, 0              },
+       { "atime",         1, false, MOUNT_ATTR_NOATIME,     MS_NOATIME       },
+       { "dev",           1, false, MOUNT_ATTR_NODEV,       MS_NODEV         },
+       { "diratime",      1, false, MOUNT_ATTR_NODIRATIME,  MS_NODIRATIME    },
+       { "exec",          1, false, MOUNT_ATTR_NOEXEC,      MS_NOEXEC        },
+       { "noatime",       0, false, MOUNT_ATTR_NOATIME,     MS_NOATIME       },
+       { "nodev",         0, false, MOUNT_ATTR_NODEV,       MS_NODEV         },
+       { "nodiratime",    0, false, MOUNT_ATTR_NODIRATIME,  MS_NODIRATIME    },
+       { "noexec",        0, false, MOUNT_ATTR_NOEXEC,      MS_NOEXEC        },
+       { "norelatime",    1, false, MOUNT_ATTR_RELATIME,    MS_RELATIME      },
+       { "nostrictatime", 1, false, MOUNT_ATTR_STRICTATIME, MS_STRICTATIME   },
+       { "nosuid",        0, false, MOUNT_ATTR_NOSUID,      MS_NOSUID        },
+       { "relatime",      0, false, MOUNT_ATTR_RELATIME,    MS_RELATIME      },
+       { "ro",            0, false, MOUNT_ATTR_RDONLY,      MS_RDONLY        },
+       { "rw",            1, false, MOUNT_ATTR_RDONLY,      MS_RDONLY        },
+       { "strictatime",   0, false, MOUNT_ATTR_STRICTATIME, MS_STRICTATIME   },
+       { "suid",          1, false, MOUNT_ATTR_NOSUID,      MS_NOSUID        },
+       /* Bind-mount selectors and "defaults": they carry no attribute flag of their own. */
+       { "bind",          0, false,  0,                     MS_BIND          },
+       { "defaults",      0, false,  0,                     0                },
+       { "rbind",         0, true,   0,                     MS_BIND | MS_REC },
+       /* Rows with flag ~0 have no mount attribute; parse_vfs_attr() logs and ignores them. */
+       { "sync",          0, false, ~0,                     MS_SYNCHRONOUS   },
+       { "async",         1, false, ~0,                     MS_SYNCHRONOUS   },
+       { "dirsync",       0, false, ~0,                     MS_DIRSYNC       },
+       { "lazytime",      0, false, ~0,                     MS_LAZYTIME      },
+       { "mand",          0, false, ~0,                     MS_MANDLOCK      },
+       { "nomand",        1, false, ~0,                     MS_MANDLOCK      },
+       { "remount",       0, false, ~0,                     MS_REMOUNT       },
+       /* Terminator: table walks stop at the first NULL name. */
+       { NULL,            0, false, ~0,                     ~0               },
 };
 
 static struct mount_opt propagation_opt[] = { /* Mount propagation types; the "r"-prefixed names add MS_REC to the legacy flag. */
-       { "private",     0, MS_PRIVATE           },
-       { "shared",      0, MS_SHARED            },
-       { "slave",       0, MS_SLAVE             },
-       { "unbindable",  0, MS_UNBINDABLE        },
-       { "rprivate",    0, MS_PRIVATE|MS_REC    },
-       { "rshared",     0, MS_SHARED|MS_REC     },
-       { "rslave",      0, MS_SLAVE|MS_REC      },
-       { "runbindable", 0, MS_UNBINDABLE|MS_REC },
-       { NULL,          0, 0                    },
+       { "private",     0, false, MS_PRIVATE,    MS_PRIVATE             },
+       { "shared",      0, false, MS_SHARED,     MS_SHARED              },
+       { "slave",       0, false, MS_SLAVE,      MS_SLAVE               },
+       { "unbindable",  0, false, MS_UNBINDABLE, MS_UNBINDABLE          },
+       { "rprivate",    0, true,  MS_PRIVATE,    MS_PRIVATE | MS_REC    },
+       { "rshared",     0, true,  MS_SHARED,     MS_SHARED | MS_REC     },
+       { "rslave",      0, true,  MS_SLAVE,      MS_SLAVE | MS_REC      },
+       { "runbindable", 0, true,  MS_UNBINDABLE, MS_UNBINDABLE | MS_REC },
+       { NULL,          0, 0                                            }, /* Terminator: NULL name; remaining members are zero-initialized. */
 };
 
 static struct caps_opt caps_opt[] = {
@@ -525,7 +532,7 @@ void lxc_storage_put(struct lxc_conf *conf)
  */
 int lxc_rootfs_init(struct lxc_conf *conf, bool userns)
 {
-       __do_close int dfd_path = -EBADF, fd_pin = -EBADF, fd_userns = -EBADF;
+       __do_close int dfd_path = -EBADF, fd_pin = -EBADF;
        int ret;
        struct stat st;
        struct statfs stfs;
@@ -596,7 +603,6 @@ int lxc_rootfs_init(struct lxc_conf *conf, bool userns)
 
 out:
        rootfs->fd_path_pin = move_fd(fd_pin);
-       rootfs->mnt_opts.userns_fd = move_fd(fd_userns);
        return 0;
 }
 
@@ -623,7 +629,7 @@ int lxc_rootfs_prepare_parent(struct lxc_handler *handler)
        if (!can_use_bind_mounts())
                return syserror_set(-EOPNOTSUPP, "Kernel does not support the new mount api");
 
-       if (rootfs->mnt_opts.userns_self)
+       if (strequal(rootfs->mnt_opts.userns_path, "container"))
                fd_userns = dup_cloexec(handler->nsfd[LXC_NS_USER]);
        else
                fd_userns = open_at(-EBADF, rootfs->mnt_opts.userns_path,
@@ -1105,7 +1111,7 @@ void lxc_delete_tty(struct lxc_tty_info *ttys)
        free_disarm(ttys->tty);
 }
 
-static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
+static int __lxc_send_ttys_to_parent(struct lxc_handler *handler)
 {
        int i;
        int ret = -1;
@@ -1150,12 +1156,6 @@ static int lxc_create_ttys(struct lxc_handler *handler)
                goto on_error;
        }
 
-       ret = lxc_send_ttys_to_parent(handler);
-       if (ret < 0) {
-               ERROR("Failed to send ttys to parent");
-               goto on_error;
-       }
-
        if (!conf->is_execute) {
                ret = lxc_setup_ttys(conf);
                if (ret < 0) {
@@ -1166,15 +1166,26 @@ static int lxc_create_ttys(struct lxc_handler *handler)
 
        if (conf->ttys.tty_names) {
                ret = setenv("container_ttys", conf->ttys.tty_names, 1);
-               if (ret < 0)
+               if (ret < 0) {
                        SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
+                       goto on_error;
+               }
        }
 
-       ret = 0;
+       return 0;
 
 on_error:
        lxc_delete_tty(&conf->ttys);
 
+       return -1;
+}
+
+/*
+ * Hand the ttys over to the parent via __lxc_send_ttys_to_parent() and then
+ * drop the local tty bookkeeping, whether or not sending succeeded. The
+ * result of the send is returned.
+ */
+int lxc_send_ttys_to_parent(struct lxc_handler *handler)
+{
+       int ret = __lxc_send_ttys_to_parent(handler);
+
+       lxc_delete_tty(&handler->conf->ttys);
        return ret;
 }
 
@@ -1648,7 +1659,6 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler)
        char default_devpts_mntopts[256] = "gid=5,newinstance,ptmxmode=0666,mode=0620";
        struct lxc_conf *conf = handler->conf;
        struct lxc_rootfs *rootfs = &conf->rootfs;
-       int sock = handler->data_sock[0];
 
        if (conf->pty_max <= 0)
                return log_debug(0, "No new devpts instance will be mounted since no pts devices are requested");
@@ -1695,14 +1705,9 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler)
        if (devpts_fd < 0) {
                devpts_fd = -EBADF;
                TRACE("Failed to create detached devpts mount");
-               ret = lxc_abstract_unix_send_fds(sock, NULL, 0, &devpts_fd, sizeof(int));
-       } else {
-               ret = lxc_abstract_unix_send_fds(sock, &devpts_fd, 1, NULL, 0);
        }
-       if (ret < 0)
-               return log_error_errno(-1, errno, "Failed to send devpts fd to parent");
 
-       TRACE("Sent devpts file descriptor %d to parent", devpts_fd);
+       handler->conf->devpts_fd = move_fd(devpts_fd);
 
        /* Remove any pre-existing /dev/ptmx file. */
        ret = unlinkat(rootfs->dfd_dev, "ptmx", 0);
@@ -1741,6 +1746,24 @@ static int lxc_setup_devpts_child(struct lxc_handler *handler)
        return 0;
 }
 
+/*
+ * Send the devpts file descriptor stored in handler->conf->devpts_fd to the
+ * parent over handler->data_sock[0] and close our copy afterwards.
+ *
+ * Nothing is sent when no pts devices were requested (pty_max <= 0).
+ *
+ * NOTE(review): a failed send is only logged and 0 is still returned; this
+ * looks like deliberate best-effort behavior - confirm against the caller.
+ */
+int lxc_send_devpts_to_parent(struct lxc_handler *handler)
+{
+       struct lxc_conf *conf = handler->conf;
+       int sent;
+
+       if (conf->pty_max <= 0)
+               return log_debug(0, "No devpts file descriptor will be sent since no pts devices are requested");
+
+       sent = lxc_abstract_unix_send_fds(handler->data_sock[0], &conf->devpts_fd, 1, NULL, 0);
+       if (sent >= 0)
+               TRACE("Sent devpts file descriptor %d to parent", conf->devpts_fd);
+       else
+               SYSERROR("Failed to send devpts file descriptor %d to parent", conf->devpts_fd);
+
+       /* The descriptor is not needed on this side once the send was attempted. */
+       close_prot_errno_disarm(conf->devpts_fd);
+
+       return 0;
+}
+
 static int setup_personality(personality_t persona)
 {
        int ret;
@@ -2031,6 +2054,104 @@ int parse_mntopts_legacy(const char *mntopts, unsigned long *mntflags, char **mn
        return 0;
 }
 
+/*
+ * Translate one generic (non "key=value") mount option into the new mount
+ * API representation kept in @opts.
+ *
+ * @opts: mount options to update (bind/recursive markers, attr.attr_set,
+ *        attr.attr_clr and attr.propagation)
+ * @opt:  a single option name
+ * @size: currently unused
+ *
+ * Table names are compared against the beginning of @opt only, i.e. @opt
+ * matches a row as soon as it starts with that row's name. Options found in
+ * neither table are silently accepted. Always returns 0.
+ */
+static int parse_vfs_attr(struct lxc_mount_options *opts, char *opt, size_t size)
+{
+       /* Generic mount attributes and the bind-mount selectors. */
+       for (struct mount_opt *mo = &mount_opt[0]; mo->name != NULL; mo++) {
+               if (!strnequal(opt, mo->name, strlen(mo->name)))
+                       continue;
+
+               /* This is a recursive bind-mount. */
+               if (strequal(mo->name, "rbind")) {
+                       opts->recursive = 1;
+                       opts->bind = 1;
+                       return 0;
+               }
+
+               /* This is a bind-mount. */
+               if (strequal(mo->name, "bind")) {
+                       opts->bind = 1;
+                       return 0;
+               }
+
+               /* Options without a mount attribute equivalent are skipped. */
+               if (mo->flag == ~0)
+                       return log_info(0, "Ignoring %s mount option", mo->name);
+
+               if (mo->clear) {
+                       opts->attr.attr_clr |= mo->flag;
+                       TRACE("Lowering %s", mo->name);
+               } else {
+                       opts->attr.attr_set |= mo->flag;
+                       TRACE("Raising %s", mo->name);
+               }
+
+               return 0;
+       }
+
+       /*
+        * Not a generic attribute, so check whether this is a propagation
+        * request. Those live in their own table; walking mount_opt a second
+        * time could never match anything the loop above had not returned on.
+        */
+       for (struct mount_opt *mo = &propagation_opt[0]; mo->name != NULL; mo++) {
+               if (!strnequal(opt, mo->name, strlen(mo->name)))
+                       continue;
+
+               /* TODO: Handle recursive propagation requests. */
+               opts->attr.propagation = mo->flag;
+               return 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Split the comma-separated mount option string @mntopts into generic vfs
+ * options and filesystem specific options.
+ *
+ * Generic options are handed to parse_vfs_attr(), which records them in
+ * @opts. Options containing '=' are treated as filesystem specific and are
+ * collected, comma-separated and in their original order, into a newly
+ * allocated string that is stored in opts->data (left untouched when there
+ * are none).
+ *
+ * Returns 0 on success (including @mntopts == NULL), -EINVAL if @opts is
+ * NULL and -ENOMEM if an allocation fails.
+ */
+static int parse_mount_attrs(struct lxc_mount_options *opts, const char *mntopts)
+{
+       __do_free char *mntopts_new = NULL, *mntopts_dup = NULL;
+       /*
+        * Write position inside mntopts_new. It has to survive across loop
+        * iterations, otherwise every filesystem specific option would
+        * overwrite the one collected before it.
+        */
+       char *end = NULL, *mntopt_cur = NULL;
+       int ret;
+       size_t size;
+
+       if (!opts)
+               return ret_errno(EINVAL);
+
+       if (!mntopts)
+               return 0;
+
+       mntopts_dup = strdup(mntopts);
+       if (!mntopts_dup)
+               return ret_errno(ENOMEM);
+
+       /* The collected subset can never be longer than the input string. */
+       size = strlen(mntopts_dup) + 1;
+       mntopts_new = zalloc(size);
+       if (!mntopts_new)
+               return ret_errno(ENOMEM);
+
+       lxc_iterate_parts(mntopt_cur, mntopts_dup, ",") {
+               /* This is a filesystem specific option. */
+               if (strchr(mntopt_cur, '=')) {
+                       if (!end) {
+                               end = stpcpy(mntopts_new, mntopt_cur);
+                       } else {
+                               end = stpcpy(end, ",");
+                               end = stpcpy(end, mntopt_cur);
+                       }
+
+                       continue;
+               }
+
+               /* This is a generic vfs option. */
+               ret = parse_vfs_attr(opts, mntopt_cur, size);
+               if (ret < 0)
+                       return syserror("Failed to parse mount attributes: \"%s\"", mntopt_cur);
+       }
+
+       /* Only hand over the buffer if at least one option was collected. */
+       if (*mntopts_new)
+               opts->data = move_ptr(mntopts_new);
+
+       return 0;
+}
+
 static void parse_propagationopt(char *opt, unsigned long *flags)
 {
        struct mount_opt *mo;
@@ -2204,7 +2325,7 @@ const char *lxc_mount_options_info[LXC_MOUNT_MAX] = {
 };
 
 /* Remove "optional", "create=dir", and "create=file" from mntopt */
-int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts)
+int parse_lxc_mount_attrs(struct lxc_mount_options *opts, char *mnt_opts)
 {
        for (size_t i = LXC_MOUNT_CREATE_DIR; i < LXC_MOUNT_MAX; i++) {
                __do_close int fd_userns = -EBADF;
@@ -2238,17 +2359,15 @@ int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts)
                        if (len >= sizeof(opts->userns_path))
                                return syserror_set(-EIO, "Excessive idmap path length for \"idmap=<path>\" LXC specific mount option");
 
-                       memcpy(opts->userns_path, opt_next, len);
+                       strlcpy(opts->userns_path, opt_next, len);
 
                        if (is_empty_string(opts->userns_path))
                                return syserror_set(-EINVAL, "Missing idmap path for \"idmap=<path>\" LXC specific mount option");
 
-                       if (strequal(opts->userns_path, "container")) {
-                               opts->userns_self = 1;
-                       } else {
+                       if (!strequal(opts->userns_path, "container")) {
                                fd_userns = open(opts->userns_path, O_RDONLY | O_NOCTTY | O_CLOEXEC);
                                if (fd_userns < 0)
-                                       return syserror("Failed to open user namespace");
+                                       return syserror("Failed to open user namespace %s", opts->userns_path);
                        }
 
                        TRACE("Parse LXC specific mount option %d->\"idmap=%s\"", fd_userns, opts->userns_path);
@@ -2343,12 +2462,19 @@ static inline int mount_entry_on_generic(struct mntent *mntent,
                return -1;
        }
 
-       ret = parse_lxc_mntopts(&opts, mntent->mnt_opts);
+       ret = parse_lxc_mount_attrs(&opts, mntent->mnt_opts);
        if (ret < 0)
                return ret;
 
+       /*
+        * Idmapped mount entries will be setup by the parent for us. Note that
+        * we rely on mount_entry_create_dir_file() above to have already
+        * created the target path for us. So the parent can just open the
+        * target and send us the target fd.
+        */
+       errno = EOPNOTSUPP;
        if (!is_empty_string(opts.userns_path))
-               return syserror_set(-EINVAL, "Idmapped mount entries not yet supported");
+               return systrace_ret(0, "Skipping idmapped mount entry");
 
        ret = parse_propagationopts(mntent->mnt_opts, &pflags);
        if (ret < 0)
@@ -2575,6 +2701,226 @@ static int setup_mount_entries(const struct lxc_conf *conf,
        return mount_file_entries(rootfs, f, lxc_name, lxc_path);
 }
 
+/*
+ * Child side of idmapped mount entry setup for the entries readable from @f.
+ *
+ * For every entry that requests an idmapping a detached copy of the source
+ * is created with open_tree() and sent to the parent together with the user
+ * namespace file descriptor. Once the parent has acknowledged, the remaining
+ * mount attributes are applied and the detached mount is attached to the
+ * target mountpoint. Entries without an idmap request are skipped here.
+ *
+ * Failures after option parsing and validation are tolerated for entries
+ * marked optional. Returns 0 on success and the value produced by
+ * syserror()/syserror_set() on failure.
+ */
+static int __lxc_idmapped_mounts_child(struct lxc_handler *handler, FILE *f)
+{
+       struct lxc_conf *conf = handler->conf;
+       struct lxc_rootfs *rootfs = &conf->rootfs;
+       int ret;
+       char buf[PATH_MAX];
+       struct mntent mntent;
+
+       while (getmntent_r(f, &mntent, buf, sizeof(buf))) {
+               __do_close int fd_from = -EBADF, fd_to = -EBADF,
+                              fd_userns = -EBADF;
+               __do_free char *__data = NULL;
+               struct lxc_mount_options opts = {};
+               int dfd_from, dfd_to;
+               const char *source_relative, *target_relative;
+
+               ret = parse_lxc_mount_attrs(&opts, mntent.mnt_opts);
+               if (ret < 0)
+                       return syserror("Failed to parse LXC specific mount options");
+               __data = opts.data;
+
+               ret = parse_mount_attrs(&opts, mntent.mnt_opts);
+               if (ret < 0)
+                       return syserror("Failed to parse mount options");
+
+               /*
+                * parse_mount_attrs() may have installed a freshly allocated
+                * data string. Take ownership of that one so it is released
+                * when this iteration ends.
+                */
+               if (__data != opts.data) {
+                       free(__data);
+                       __data = opts.data;
+               }
+
+               /* No idmapped mount entry so skip it. */
+               if (is_empty_string(opts.userns_path))
+                       continue;
+
+               if (!can_use_bind_mounts())
+                       return syserror_set(-EINVAL, "Kernel does not support idmapped mounts");
+
+               if (!opts.bind)
+                       return syserror_set(-EINVAL, "Only bind mounts can currently be idmapped");
+
+               /* We don't support new filesystem mounts yet. */
+               if (!is_empty_string(mntent.mnt_type) &&
+                   !strequal(mntent.mnt_type, "none"))
+                       return syserror_set(-EINVAL, "Only bind mounts can currently be idmapped");
+
+               /* Someone specified additional mount options for a bind-mount. */
+               if (!is_empty_string(opts.data))
+                       return syserror_set(-EINVAL, "Bind mounts don't support non-generic mount options");
+
+               /*
+                * The source path is supposed to be taken relative to the
+                * container's rootfs mount or - if the container does not have
+                * a separate rootfs - to the host's /.
+                */
+               source_relative = deabs(mntent.mnt_fsname);
+               if (opts.relative || !rootfs->path)
+                       dfd_from = rootfs->dfd_mnt;
+               else
+                       dfd_from = rootfs->dfd_host;
+               fd_from = open_tree(dfd_from, source_relative,
+                                   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+                                   (opts.recursive ? AT_RECURSIVE : 0));
+               if (fd_from < 0)
+                       return syserror("Failed to create detached %smount of %d/%s",
+                                       opts.recursive ? "recursive " : "",
+                                       dfd_from, source_relative);
+
+               if (strequal(opts.userns_path, "container"))
+                       fd_userns = openat(dfd_from, "proc/self/ns/user", O_RDONLY | O_CLOEXEC);
+               else
+                       fd_userns = open_at(-EBADF, opts.userns_path,
+                                           PROTECT_OPEN_WITH_TRAILING_SYMLINKS, 0, 0);
+               if (fd_userns < 0) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to open user namespace \"%s\" for detached %smount of %d/%s",
+                                       opts.userns_path, opts.recursive ? "recursive " : "",
+                                       dfd_from, source_relative);
+               }
+
+               /* Hand the detached mount and the user namespace to the parent. */
+               ret = __lxc_abstract_unix_send_two_fds(handler->data_sock[0],
+                                                      fd_from, fd_userns,
+                                                      &opts, sizeof(opts));
+               if (ret <= 0) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to send file descriptor %d for detached %smount of %d/%s and file descriptor %d of user namespace \"%s\" to parent",
+                                       fd_from, opts.recursive ? "recursive " : "",
+                                       dfd_from, source_relative, fd_userns,
+                                       opts.userns_path);
+               }
+
+               /* Wait until the parent reports that the mount was idmapped. */
+               ret = lxc_abstract_unix_rcv_credential(handler->data_sock[0], NULL, 0);
+               if (ret <= 0) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to receive notification that parent idmapped detached %smount %d/%s to user namespace %d",
+                                       opts.recursive ? "recursive " : "",
+                                       dfd_from, source_relative, fd_userns);
+               }
+
+               /* Set remaining mount options. */
+               ret = mount_setattr(fd_from, "", AT_EMPTY_PATH |
+                                   (opts.recursive ? AT_RECURSIVE : 0),
+                                   &opts.attr, sizeof(opts.attr));
+               if (ret < 0) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to apply mount attributes to detached idmapped %smount %d/%s",
+                                       opts.recursive ? "recursive " : "",
+                                       dfd_from, source_relative);
+               }
+
+               /*
+                * In contrast to the legacy mount codepath we will simplify
+                * our lives and just always treat the target mountpoint to be
+                * relative to the container's rootfs mountpoint or - if the
+                * container does not have a separate rootfs - to the host's /.
+                *
+                * A separate variable is used for the target directory fd so
+                * that dfd_from keeps naming the source in the messages below.
+                */
+
+               target_relative = deabs(mntent.mnt_dir);
+               if (rootfs->path)
+                       dfd_to = rootfs->dfd_mnt;
+               else
+                       dfd_to = rootfs->dfd_host;
+               fd_to = open_at(dfd_to, target_relative, PROTECT_OPATH_FILE, PROTECT_LOOKUP_BENEATH_WITH_SYMLINKS, 0);
+               if (fd_to < 0) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to open target mountpoint %d/%s for detached idmapped %smount %d:%d/%s",
+                                       dfd_to, target_relative,
+                                       opts.recursive ? "recursive " : "",
+                                       fd_userns, dfd_from, source_relative);
+               }
+
+               ret = move_detached_mount(fd_from, fd_to, "", 0, 0);
+               if (ret) {
+                       if (opts.optional) {
+                               TRACE("Skipping optional idmapped mount");
+                               continue;
+                       }
+
+                       return syserror("Failed to attach detached idmapped %smount %d:%d/%s to target mountpoint %d/%s",
+                                       opts.recursive ? "recursive " : "",
+                                       fd_userns, dfd_from, source_relative, dfd_to, target_relative);
+               }
+
+               TRACE("Attached detached idmapped %smount %d:%d/%s to target mountpoint %d/%s",
+                     opts.recursive ? "recursive " : "", fd_userns, dfd_from,
+                     source_relative, dfd_to, target_relative);
+       }
+
+       /* getmntent_r() stopping before end-of-file means a malformed entry. */
+       if (!feof(f) || ferror(f))
+               return syserror_set(-EINVAL, "Failed to parse mount entries");
+
+       return 0;
+}
+
+/*
+ * Set up all idmapped mount entries from the child: first the entries of
+ * conf->mount_list, then - if configured - those in the conf->fstab file.
+ *
+ * Whatever the outcome, a final message is sent on handler->data_sock[0]
+ * once processing stops. NOTE(review): presumably this is what tells the
+ * parent's receive loop that no further mounts will follow - confirm.
+ *
+ * Returns 0 on success, -1 if setting up the entries failed, or the
+ * syserror() value if the final notification could not be sent.
+ */
+static int lxc_idmapped_mounts_child(struct lxc_handler *handler)
+{
+       __do_fclose FILE *f_entries = NULL;
+       int fret = -1;
+       struct lxc_conf *conf = handler->conf;
+       const char *fstab = conf->fstab;
+       struct lxc_list *mount = &conf->mount_list;
+       int ret;
+
+       f_entries = make_anonymous_mount_file(mount, conf->lsm_aa_allow_nesting);
+       if (!f_entries) {
+               SYSERROR("Failed to create anonymous mount file");
+               goto out;
+       }
+
+       ret = __lxc_idmapped_mounts_child(handler, f_entries);
+       if (ret) {
+               SYSERROR("Failed to setup idmapped mount entries");
+               goto out;
+       }
+
+       TRACE("Finished setting up idmapped mounts");
+
+       if (fstab) {
+               __do_endmntent FILE *f_fstab = NULL;
+
+               f_fstab = setmntent(fstab, "re");
+               if (!f_fstab) {
+                       SYSERROR("Failed to open fstab format file \"%s\"", fstab);
+                       goto out;
+               }
+
+               ret = __lxc_idmapped_mounts_child(handler, f_fstab);
+               if (ret) {
+                       SYSERROR("Failed to setup idmapped mount entries specified in fstab");
+                       goto out;
+               }
+
+               TRACE("Finished setting up idmapped mounts specified in fstab");
+       }
+
+       fret = 0;
+
+out:
+       /* Reached on success and on failure alike so the parent always hears from us. */
+       ret = lxc_abstract_unix_send_credential(handler->data_sock[0], NULL, 0);
+       if (ret < 0)
+               return syserror("Failed to inform parent that we are done setting up mounts");
+       TRACE("Informed parent that we are done setting up mounts");
+
+       return fret;
+}
+
 static int parse_cap(const char *cap)
 {
        size_t i;
@@ -2841,7 +3187,6 @@ struct lxc_conf *lxc_conf_init(void)
        new->rootfs.dfd_host = -EBADF;
        new->rootfs.fd_path_pin = -EBADF;
        new->rootfs.dfd_idmapped = -EBADF;
-       new->rootfs.mnt_opts.userns_fd = -EBADF;
        new->logfd = -1;
        lxc_list_init(&new->cgroup);
        lxc_list_init(&new->cgroup2);
@@ -3613,6 +3958,45 @@ static int lxc_rootfs_prepare_child(struct lxc_handler *handler)
        return 0;
 }
 
+/*
+ * Parent-side service loop for idmapped mount requests sent by the child.
+ *
+ * Each iteration receives a detached mount fd, a user namespace fd and a
+ * struct lxc_mount_options over handler->data_sock[1]. It applies
+ * MOUNT_ATTR_IDMAP to the mount with mount_setattr(), recursively if
+ * opts.recursive is set, and then acknowledges the request to the child with
+ * lxc_abstract_unix_send_credential().
+ *
+ * The loop ends when a received message does not yield two valid file
+ * descriptors; the function then returns through log_trace(0, ...). Any
+ * receive, mount_setattr() or acknowledge failure returns the result of
+ * syserror().
+ */
+int lxc_idmapped_mounts_parent(struct lxc_handler *handler)
+{
+       const int sock = handler->data_sock[1];
+
+       for (;;) {
+               /* Both received fds are closed automatically every iteration. */
+               __do_close int fd_from = -EBADF, fd_userns = -EBADF;
+               struct lxc_mount_options opts = {};
+               struct lxc_mount_attr attr = {
+                       .attr_set = MOUNT_ATTR_IDMAP,
+               };
+               const char *recursive;
+               int flags = AT_EMPTY_PATH;
+               ssize_t ret;
+
+               ret = __lxc_abstract_unix_recv_two_fds(sock, &fd_from, &fd_userns,
+                                                      &opts, sizeof(opts));
+               if (ret < 0)
+                       return syserror("Failed to receive idmapped mount file descriptors from child");
+
+               /* A message without two valid fds ends the loop. */
+               if (fd_from < 0 || fd_userns < 0)
+                       return log_trace(0, "Finished receiving idmapped mount file descriptors from child");
+
+               if (opts.recursive) {
+                       flags |= AT_RECURSIVE;
+                       recursive = "recursive ";
+               } else {
+                       recursive = "";
+               }
+
+               attr.userns_fd = fd_userns;
+               ret = mount_setattr(fd_from, "", flags, &attr, sizeof(attr));
+               if (ret)
+                       return syserror("Failed to idmap detached %smount %d to %d",
+                                       recursive, fd_from, fd_userns);
+
+               /* Acknowledge so the child can go on with this mount. */
+               ret = lxc_abstract_unix_send_credential(sock, NULL, 0);
+               if (ret < 0)
+                       return syserror("Parent failed to notify child that detached %smount %d was idmapped to user namespace %d",
+                                       recursive, fd_from, fd_userns);
+
+               TRACE("Parent idmapped detached %smount %d to user namespace %d",
+                     recursive, fd_from, fd_userns);
+       }
+}
+
 int lxc_setup(struct lxc_handler *handler)
 {
        int ret;
@@ -3644,10 +4028,6 @@ int lxc_setup(struct lxc_handler *handler)
                                                            &lxc_conf->network);
                if (ret < 0)
                        return log_error(-1, "Failed to setup network");
-
-               ret = lxc_network_send_name_and_ifindex_to_parent(handler);
-               if (ret < 0)
-                       return log_error(-1, "Failed to send network device names and ifindices to parent");
        }
 
        if (lxc_conf->autodev > 0) {
@@ -3674,6 +4054,13 @@ int lxc_setup(struct lxc_handler *handler)
                        return log_error(-1, "Failed to setup mount entries");
        }
 
+       if (!lxc_sync_wake_parent(handler, START_SYNC_IDMAPPED_MOUNTS))
+               return -1;
+
+       ret = lxc_idmapped_mounts_child(handler);
+       if (ret)
+               return syserror("Failed to attached detached idmapped mounts");
+
        lxc_conf->rootfs.dfd_dev = open_at(lxc_conf->rootfs.dfd_mnt, "dev",
                                           PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
        if (lxc_conf->rootfs.dfd_dev < 0 && errno != ENOENT)
index da742bdd4bdad071231ff08069f9dc6a54d370af..a185b2023cd7c668ec53065191abb3d4708fdd32 100644 (file)
@@ -201,12 +201,13 @@ struct lxc_mount_options {
        int create_file : 1;
        int optional : 1;
        int relative : 1;
-       int userns_self : 1;
+       int recursive : 1;
+       int bind : 1;
        char userns_path[PATH_MAX];
-       int userns_fd;
        unsigned long mnt_flags;
        unsigned long prop_flags;
        char *data;
+       struct lxc_mount_attr attr;
 };
 
 /* Defines a structure to store the rootfs location, the
@@ -512,9 +513,12 @@ __hidden extern int lxc_rootfs_prepare(struct lxc_conf *conf, bool userns);
 __hidden extern void lxc_storage_put(struct lxc_conf *conf);
 __hidden extern int lxc_rootfs_init(struct lxc_conf *conf, bool userns);
 __hidden extern int lxc_rootfs_prepare_parent(struct lxc_handler *handler);
+__hidden extern int lxc_idmapped_mounts_parent(struct lxc_handler *handler);
 __hidden extern int lxc_map_ids(struct lxc_list *idmap, pid_t pid);
 __hidden extern int lxc_create_tty(const char *name, struct lxc_conf *conf);
 __hidden extern void lxc_delete_tty(struct lxc_tty_info *ttys);
+__hidden extern int lxc_send_ttys_to_parent(struct lxc_handler *handler);
+__hidden extern int lxc_send_devpts_to_parent(struct lxc_handler *handler);
 __hidden extern int lxc_clear_config_caps(struct lxc_conf *c);
 __hidden extern int lxc_clear_config_keepcaps(struct lxc_conf *c);
 __hidden extern int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version);
@@ -541,7 +545,7 @@ __hidden extern int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), v
                                     const char *fn_name);
 __hidden extern int parse_mntopts_legacy(const char *mntopts, unsigned long *mntflags, char **mntdata);
 __hidden extern int parse_propagationopts(const char *mntopts, unsigned long *pflags);
-__hidden extern int parse_lxc_mntopts(struct lxc_mount_options *opts, char *mnt_opts);
+__hidden extern int parse_lxc_mount_attrs(struct lxc_mount_options *opts, char *mnt_opts);
 __hidden extern void tmp_proc_unmount(struct lxc_conf *lxc_conf);
 __hidden extern void suggest_default_idmap(void);
 __hidden extern FILE *make_anonymous_mount_file(struct lxc_list *mount, bool include_nesting_helpers);
@@ -593,12 +597,10 @@ static inline void put_lxc_mount_options(struct lxc_mount_options *mnt_opts)
        mnt_opts->create_file = 0;
        mnt_opts->optional = 0;
        mnt_opts->relative = 0;
-       mnt_opts->userns_self = 0;
        mnt_opts->userns_path[0] = '\0';
        mnt_opts->mnt_flags = 0;
        mnt_opts->prop_flags = 0;
 
-       close_prot_errno_disarm(mnt_opts->userns_fd);
        free_disarm(mnt_opts->data);
 }
 
@@ -608,7 +610,6 @@ static inline void put_lxc_rootfs(struct lxc_rootfs *rootfs, bool unpin)
                close_prot_errno_disarm(rootfs->dfd_host);
                close_prot_errno_disarm(rootfs->dfd_mnt);
                close_prot_errno_disarm(rootfs->dfd_dev);
-               close_prot_errno_disarm(rootfs->mnt_opts.userns_fd);
                if (unpin)
                        close_prot_errno_disarm(rootfs->fd_path_pin);
                close_prot_errno_disarm(rootfs->dfd_idmapped);
index 1d74311daea3725cd217544b43f7572ed9e1d639..1a18737623a5bf1a5dbfacdf180590dc9ff865ec 100644 (file)
@@ -2799,7 +2799,7 @@ static int set_config_rootfs_options(const char *key, const char *value,
        if (!dup)
                return -ENOMEM;
 
-       ret = parse_lxc_mntopts(mnt_opts, dup);
+       ret = parse_lxc_mount_attrs(mnt_opts, dup);
        if (ret < 0)
                return ret;
 
index e3ed26d0a07ad38da756a23ca9b68b7063552cbc..21e70dce85a66fa09c159eddd8bd6ae29be310f8 100644 (file)
@@ -1291,12 +1291,6 @@ static int do_start(void *data)
        if (ret < 0)
                goto out_warn_father;
 
-       ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
-       if (ret < 0) {
-               SYSERROR("Failed to send seccomp notify fd to parent");
-               goto out_warn_father;
-       }
-
        ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
        if (ret < 0) {
                ERROR("Failed to run lxc.hook.start for container \"%s\"",
@@ -1336,6 +1330,35 @@ static int do_start(void *data)
        if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
                goto out_warn_father;
 
+       ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
+       if (ret < 0) {
+               SYSERROR("Failed to send seccomp notify fd to parent");
+               goto out_warn_father;
+       }
+
+       ret = lxc_send_devpts_to_parent(handler);
+       if (ret < 0) {
+               SYSERROR("Failed to send seccomp devpts fd to parent");
+               goto out_warn_father;
+       }
+
+       ret = lxc_send_ttys_to_parent(handler);
+       if (ret < 0) {
+               SYSERROR("Failed to send tty file descriptors to parent");
+               goto out_warn_father;
+       }
+
+       if (handler->ns_clone_flags & CLONE_NEWNET) {
+               ret = lxc_network_send_name_and_ifindex_to_parent(handler);
+               if (ret < 0) {
+                       SYSERROR("Failed to send network device names and ifindices to parent");
+                       goto out_warn_father;
+               }
+       }
+
+       if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
+               goto out_warn_father;
+
        /* Reset the environment variables the user requested in a clear
         * environment.
         */
@@ -1458,16 +1481,16 @@ static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
                return -1;
 
        for (i = 0; i < conf->ttys.max; i++) {
-               int ttyfds[2];
+               int ttyx = -EBADF, ttyy = -EBADF;
 
-               ret = lxc_abstract_unix_recv_two_fds(sock, ttyfds);
+               ret = lxc_abstract_unix_recv_two_fds(sock, &ttyx, &ttyy);
                if (ret < 0)
                        break;
 
                tty = &ttys->tty[i];
                tty->busy = -1;
-               tty->ptx = ttyfds[0];
-               tty->pty = ttyfds[1];
+               tty->ptx = ttyx;
+               tty->pty = ttyy;
                TRACE("Received pty with ptx fd %d and pty fd %d from child", tty->ptx, tty->pty);
        }
 
@@ -1875,6 +1898,15 @@ static int lxc_spawn(struct lxc_handler *handler)
        if (!lxc_sync_barrier_child(handler, START_SYNC_CGROUP_UNSHARE))
                goto out_delete_net;
 
+       ret = lxc_idmapped_mounts_parent(handler);
+       if (ret) {
+               ERROR("Failed to setup mount entries");
+               goto out_delete_net;
+       }
+
+       if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
+               goto out_delete_net;
+
        /*
         * With isolation the limiting devices cgroup was already setup, so
         * only setup devices here if we have no namespace directory.
@@ -1924,21 +1956,13 @@ static int lxc_spawn(struct lxc_handler *handler)
                goto out_delete_net;
        }
 
-       /* Tell the child to complete its initialization and wait for it to exec
-        * or return an error. (The child will never return
-        * START_SYNC_READY_START+1. It will either close the sync pipe, causing
-        * lxc_sync_barrier_child to return success, or return a different
-        * value, causing us to error out).
-        */
-       if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
+       if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
                goto out_delete_net;
 
-       if (handler->ns_clone_flags & CLONE_NEWNET) {
-               ret = lxc_network_recv_name_and_ifindex_from_child(handler);
-               if (ret < 0) {
-                       ERROR("Failed to receive names and ifindices for network devices from child");
-                       goto out_delete_net;
-               }
+       ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
+       if (ret < 0) {
+               SYSERROR("Failed to receive seccomp notify fd from child");
+               goto out_delete_net;
        }
 
        ret = lxc_setup_devpts_parent(handler);
@@ -1947,13 +1971,6 @@ static int lxc_spawn(struct lxc_handler *handler)
                goto out_delete_net;
        }
 
-       /* Now all networks are created, network devices are moved into place,
-        * and the correct names and ifindices in the respective namespaces have
-        * been recorded. The corresponding structs have now all been filled. So
-        * log them for debugging purposes.
-        */
-       lxc_log_configured_netdevs(conf);
-
        /* Read tty fds allocated by child. */
        ret = lxc_recv_ttys_from_child(handler);
        if (ret < 0) {
@@ -1961,12 +1978,31 @@ static int lxc_spawn(struct lxc_handler *handler)
                goto out_delete_net;
        }
 
-       ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
-       if (ret < 0) {
-               SYSERROR("Failed to receive seccomp notify fd from child");
-               goto out_delete_net;
+       if (handler->ns_clone_flags & CLONE_NEWNET) {
+               ret = lxc_network_recv_name_and_ifindex_from_child(handler);
+               if (ret < 0) {
+                       ERROR("Failed to receive names and ifindices for network devices from child");
+                       goto out_delete_net;
+               }
        }
 
+       /*
+        * Tell the child to complete its initialization and wait for it to
+        * exec or return an error. (The child will never return
+        * START_SYNC_READY_START+1. It will either close the sync pipe,
+        * causing lxc_sync_barrier_child to return success, or return a
+        * different value, causing us to error out).
+        */
+       if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
+               goto out_delete_net;
+
+       /* Now all networks are created, network devices are moved into place,
+        * and the correct names and ifindices in the respective namespaces have
+        * been recorded. The corresponding structs have now all been filled. So
+        * log them for debugging purposes.
+        */
+       lxc_log_configured_netdevs(conf);
+
        ret = handler->ops->post_start(handler, handler->data);
        if (ret < 0)
                goto out_abort;
index 514bdae30f8675ee8d576d8188ed8e04b4d6202a..f1bea3e30dd082c19b1ab1db45ffa840afdeca4c 100644 (file)
@@ -321,7 +321,6 @@ struct lxc_storage *storage_copy(struct lxc_container *c, const char *cname,
                .dfd_host               = -EBADF,
                .fd_path_pin            = -EBADF,
                .dfd_idmapped           = -EBADF,
-               .mnt_opts.userns_fd     = -EBADF,
        };
 
        if (!src) {
index 1d018387ec0827a3f4e17264899cd46199fc08dd..f194e6776127fed1fa1a238fec867c2b74d2fbe0 100644 (file)
@@ -74,6 +74,10 @@ static inline const char *start_sync_to_string(int state)
                return "cgroup-unshare";
        case START_SYNC_CGROUP_LIMITS:
                return "cgroup-limits";
+       case START_SYNC_IDMAPPED_MOUNTS:
+               return "idmapped-mounts";
+       case START_SYNC_FDS:
+               return "fds";
        case START_SYNC_READY_START:
                return "ready-start";
        case START_SYNC_RESTART:
@@ -109,13 +113,13 @@ bool lxc_sync_wake_parent(struct lxc_handler *handler, int sequence)
 
 bool lxc_sync_wait_parent(struct lxc_handler *handler, int sequence)
 {
-       TRACE("Parent waiting for child with sequence %s", start_sync_to_string(sequence));
+       TRACE("Child waiting for parent with sequence %s", start_sync_to_string(sequence));
        return sync_wait(handler->sync_sock[0], sequence);
 }
 
 bool lxc_sync_wait_child(struct lxc_handler *handler, int sequence)
 {
-       TRACE("Child waiting for parent with sequence %s", start_sync_to_string(sequence));
+       TRACE("Parent waiting for child with sequence %s", start_sync_to_string(sequence));
        return sync_wait(handler->sync_sock[1], sequence);
 }
 
index 57191c1cbba2dcc26ab9e024f64cbf0a46a8365c..e7b3b4d374147d4fbff2f8ca44bdd008a2f9032a 100644 (file)
@@ -19,10 +19,12 @@ enum /* start */ {
        START_SYNC_POST_CONFIGURE       =  2,
        START_SYNC_CGROUP               =  3,
        START_SYNC_CGROUP_UNSHARE       =  4,
-       START_SYNC_CGROUP_LIMITS        =  5,
-       START_SYNC_READY_START          =  6,
-       START_SYNC_RESTART              =  7,
-       START_SYNC_POST_RESTART         =  8,
+       START_SYNC_IDMAPPED_MOUNTS      =  5,
+       START_SYNC_CGROUP_LIMITS        =  6,
+       START_SYNC_FDS                  =  7,
+       START_SYNC_READY_START          =  8,
+       START_SYNC_RESTART              =  9,
+       START_SYNC_POST_RESTART         =  10,
 };
 
 enum /* attach */ {