container. This is useful to mount /etc, /var or /home for
examples.
</para>
+ <para>
+ NOTE - LXC will generally ensure that mount targets and relative
+ bind-mount sources are properly confined under the container
+ root, to avoid attacks involving over-mounting host directories
+ and files. (Symbolic links in absolute mount sources are ignored)
+ However, if the container configuration first mounts a directory which
+ is under the control of the container user, such as /home/joe, into
+ the container at some <filename>path</filename>, and then mounts
+ under <filename>path</filename>, then a TOCTTOU attack would be
+ possible where the container user modifies a symbolic link under
+ his home directory at just the right time.
+ </para>
<variablelist>
<varlistentry>
<term>
if (!path)
return false;
snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
- r = mount("cgroup_root", path, "tmpfs", MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME, "size=10240k,mode=755");
+ r = safe_mount("cgroup_root", path, "tmpfs",
+ MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
+ "size=10240k,mode=755",
+ root);
if (r < 0) {
SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
return false;
}
/* mount a tmpfs there so we can create subdirs */
- if (mount("cgroup", cgpath, "tmpfs", 0, "size=10000,mode=755")) {
+ if (safe_mount("cgroup", cgpath, "tmpfs", 0, "size=10000,mode=755", root)) {
SYSERROR("Failed to mount tmpfs at %s", cgpath);
return false;
}
return false;
}
- if (mount(dirname, cgpath, "none", MS_BIND, 0)) {
+ if (safe_mount(dirname, cgpath, "none", MS_BIND, 0, root)) {
SYSERROR("Failed to bind mount %s to %s", dirname, cgpath);
return false;
}
* 2.6.32...
*/
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
- { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/net", NULL, MS_BIND, NULL },
+ /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
+ { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
- { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/net", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
+ { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
}
mflags = add_required_remount_flags(source, destination,
default_mounts[i].flags);
- r = mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options);
+ r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
saved_errno = errno;
if (r < 0 && errno == ENOENT) {
INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
return 0;
}
- if (mount("none", path, "tmpfs", 0, "size=100000,mode=755")) {
+ if (safe_mount("none", path, "tmpfs", 0, "size=100000,mode=755",
+ rootfs->path ? rootfs->mount : NULL)) {
SYSERROR("Failed mounting tmpfs onto %s\n", path);
return false;
}
return -1;
}
fclose(pathfile);
- if (mount(hostpath, path, 0, MS_BIND, NULL) != 0) {
+ if (safe_mount(hostpath, path, 0, MS_BIND, NULL,
+ rootfs->path ? rootfs->mount : NULL) != 0) {
SYSERROR("Failed bind mounting device %s from host into container",
d->name);
return -1;
return -1;
}
- if (mount(console->name, path, "none", MS_BIND, 0)) {
+ if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount)) {
ERROR("failed to mount '%s' on '%s'", console->name, path);
return -1;
}
return 0;
}
- if (mount(console->name, lxcpath, "none", MS_BIND, 0)) {
+ if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount)) {
ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
return -1;
}
static int mount_entry(const char *fsname, const char *target,
const char *fstype, unsigned long mountflags,
- const char *data, int optional)
+ const char *data, int optional, const char *rootfs)
{
#ifdef HAVE_STATVFS
struct statvfs sb;
#endif
- if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data)) {
+ if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
if (optional) {
INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
target, strerror(errno));
#endif
if (mount(fsname, target, fstype,
- mountflags | MS_REMOUNT, data)) {
+ mountflags | MS_REMOUNT, data) < 0) {
if (optional) {
INFO("failed to mount '%s' on '%s' (optional): %s",
fsname, target, strerror(errno));
}
static inline int mount_entry_on_generic(struct mntent *mntent,
- const char* path)
+ const char* path, const char *rootfs)
{
unsigned long mntflags;
char *mntdata;
}
ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
- mntflags, mntdata, optional);
+ mntflags, mntdata, optional, rootfs);
free(mntdata);
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
- return mount_entry_on_generic(mntent, mntent->mnt_dir);
+ return mount_entry_on_generic(mntent, mntent->mnt_dir, NULL);
}
static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
return -1;
}
- return mount_entry_on_generic(mntent, path);
+ return mount_entry_on_generic(mntent, path, rootfs->mount);
}
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
return -1;
}
- return mount_entry_on_generic(mntent, path);
+ return mount_entry_on_generic(mntent, path, rootfs);
}
static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
fclose(pathfile);
}
- ret = mount(path, destpath, "none", MS_BIND, NULL);
+ ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
if (ret < 0)
SYSERROR("Failed to bind lxc.init.static into container");
INFO("lxc.init.static bound into container at %s", path);
return ret;
}
+/*
+ * @path: a pathname where / replaced with '\0'.
+ * @offsetp: pointer to int showing which path segment was last seen.
+ * Updated on return to reflect the next segment.
+ * @fulllen: full original path length.
+ * Returns a pointer to the next path segment, or NULL if done.
+ */
+static char *get_nextpath(char *path, int *offsetp, int fulllen)
+{
+ int offset = *offsetp;
+
+ if (offset >= fulllen)
+ return NULL;
+
+ while (path[offset] != '\0' && offset < fulllen)
+ offset++;
+ while (path[offset] == '\0' && offset < fulllen)
+ offset++;
+
+ *offsetp = offset;
+ return (offset < fulllen) ? &path[offset] : NULL;
+}
+
+/*
+ * Check that @subdir is a subdir of @dir. @len is the length of
+ * @dir (to avoid having to recalculate it).
+ */
+static bool is_subdir(const char *subdir, const char *dir, size_t len)
+{
+ size_t subdirlen = strlen(subdir);
+
+ if (subdirlen < len)
+ return false;
+ if (strncmp(subdir, dir, len) != 0)
+ return false;
+ if (dir[len-1] == '/')
+ return true;
+ if (subdir[len] == '/' || subdirlen == len)
+ return true;
+ return false;
+}
+
+/*
+ * Check if the open fd is a symlink. Return -ELOOP if it is. Return
+ * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
+ */
+static int check_symlink(int fd)
+{
+ struct stat sb;
+ int ret = fstat(fd, &sb);
+ if (ret < 0)
+ return -ENOENT;
+ if (S_ISLNK(sb.st_mode))
+ return -ELOOP;
+ return 0;
+}
+
+/*
+ * Open a file or directory, provided that it contains no symlinks.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+static int open_if_safe(int dirfd, const char *nextpath)
+{
+ int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW);
+ if (newfd >= 0) // was not a symlink, all good
+ return newfd;
+
+ if (errno == ELOOP)
+ return newfd;
+
+ if (errno == EPERM || errno == EACCES) {
+ /* we're not root (cause we got EPERM) so
+ try opening with O_PATH */
+ newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW);
+ if (newfd >= 0) {
+ /* O_PATH will return an fd for symlinks. We know
+ * nextpath wasn't a symlink at last openat, so if fd
+ * is now a link, then something * fishy is going on
+ */
+ int ret = check_symlink(newfd);
+ if (ret < 0) {
+ close(newfd);
+ newfd = ret;
+ }
+ }
+ }
+
+ return newfd;
+}
+
+/*
+ * Open a path intending for mounting, ensuring that the final path
+ * is inside the container's rootfs.
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ *
+ * @target: path to be opened
+ * @prefix_skip: a part of @target in which to ignore symbolic links. This
+ * would be the container's rootfs.
+ *
+ * Return an open fd for the path, or <0 on error.
+ */
+static int open_without_symlink(const char *target, const char *prefix_skip)
+{
+ int curlen = 0, dirfd, fulllen, i;
+ char *dup = NULL;
+
+ fulllen = strlen(target);
+
+ /* make sure prefix-skip makes sense */
+ if (prefix_skip) {
+ curlen = strlen(prefix_skip);
+ if (!is_subdir(target, prefix_skip, curlen)) {
+ ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
+ target, prefix_skip);
+ return -EINVAL;
+ }
+ /*
+ * get_nextpath() expects the curlen argument to be
+ * on a (turned into \0) / or before it, so decrement
+ * curlen to make sure that happens
+ */
+ if (curlen)
+ curlen--;
+ } else {
+ prefix_skip = "/";
+ curlen = 0;
+ }
+
+ /* Make a copy of target which we can hack up, and tokenize it */
+ if ((dup = strdup(target)) == NULL) {
+ SYSERROR("Out of memory checking for symbolic link");
+ return -ENOMEM;
+ }
+ for (i = 0; i < fulllen; i++) {
+ if (dup[i] == '/')
+ dup[i] = '\0';
+ }
+
+ dirfd = open(prefix_skip, O_RDONLY);
+ if (dirfd < 0)
+ goto out;
+ while (1) {
+ int newfd, saved_errno;
+ char *nextpath;
+
+ if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
+ goto out;
+ newfd = open_if_safe(dirfd, nextpath);
+ saved_errno = errno;
+ close(dirfd);
+ dirfd = newfd;
+ if (newfd < 0) {
+ errno = saved_errno;
+ if (errno == ELOOP)
+ SYSERROR("%s in %s was a symbolic link!", nextpath, target);
+ else
+ SYSERROR("Error examining %s in %s", nextpath, target);
+ goto out;
+ }
+ }
+
+out:
+ free(dup);
+ return dirfd;
+}
+
+/*
+ * Safely mount a path into a container, ensuring that the mount target
+ * is under the container's @rootfs. (If @rootfs is NULL, then the container
+ * uses the host's /)
+ *
+ * CAVEAT: This function must not be used for other purposes than container
+ * setup before executing the container's init
+ */
+int safe_mount(const char *src, const char *dest, const char *fstype,
+ unsigned long flags, const void *data, const char *rootfs)
+{
+ int srcfd = -1, destfd, ret, saved_errno;
+ char srcbuf[50], destbuf[50]; // only needs enough for /proc/self/fd/<fd>
+ const char *mntsrc = src;
+
+ if (!rootfs)
+ rootfs = "";
+
+ /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
+ if (flags & MS_BIND && src && src[0] != '/') {
+ INFO("this is a relative bind mount");
+ srcfd = open_without_symlink(src, NULL);
+ if (srcfd < 0)
+ return srcfd;
+ ret = snprintf(srcbuf, 50, "/proc/self/fd/%d", srcfd);
+ if (ret < 0 || ret > 50) {
+ close(srcfd);
+ ERROR("Out of memory");
+ return -EINVAL;
+ }
+ mntsrc = srcbuf;
+ }
+
+ destfd = open_without_symlink(dest, rootfs);
+ if (destfd < 0) {
+ if (srcfd != -1)
+ close(srcfd);
+ return destfd;
+ }
+
+ ret = snprintf(destbuf, 50, "/proc/self/fd/%d", destfd);
+ if (ret < 0 || ret > 50) {
+ if (srcfd != -1)
+ close(srcfd);
+ close(destfd);
+ ERROR("Out of memory");
+ return -EINVAL;
+ }
+
+ ret = mount(mntsrc, destbuf, fstype, flags, data);
+ saved_errno = errno;
+ if (srcfd != -1)
+ close(srcfd);
+ close(destfd);
+ if (ret < 0) {
+ errno = saved_errno;
+ SYSERROR("Failed to mount %s onto %s", src, dest);
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* Mount a proc under @rootfs if proc self points to a pid other than
* my own. This is needed to have a known-good proc mount for setting
return 0;
domount:
- if (mount("proc", path, "proc", 0, NULL))
+ if (safe_mount("proc", path, "proc", 0, NULL, rootfs) < 0)
return -1;
INFO("Mounted /proc in container for security transition");
return 1;
int is_dir(const char *path);
char *get_template_path(const char *t);
int setproctitle(char *title);
+int safe_mount(const char *src, const char *dest, const char *fstype,
+ unsigned long flags, const void *data, const char *rootfs);
int mount_proc_if_needed(const char *rootfs);
int null_stdfds(void);
#endif /* __LXC_UTILS_H */
lxc-test-apparmor-mount \
lxc-test-checkpoint-restore \
lxc-test-snapdeps \
+ lxc-test-symlink \
lxc-test-ubuntu \
lxc-test-unpriv \
lxc-test-usernic
lxc-test-cloneconfig \
lxc-test-createconfig \
lxc-test-snapdeps \
+ lxc-test-symlink \
lxc-test-ubuntu \
lxc-test-unpriv \
may_control.c \
--- /dev/null
+#!/bin/bash
+
+set -ex
+
+# lxc: linux Container library
+
+# Authors:
+# Serge Hallyn <serge.hallyn@ubuntu.com>
+#
+# This is a regression test for symbolic links
+
+dirname=`mktemp -d`
+fname=`mktemp`
+fname2=`mktemp`
+
+lxcpath=/var/lib/lxcsym1
+
+cleanup() {
+ lxc-destroy -P $lxcpath -f -n symtest1 || true
+ rm -f $lxcpath
+ rmdir $dirname || true
+ rm -f $fname || true
+ rm -f $fname2 || true
+}
+
+trap cleanup EXIT SIGHUP SIGINT SIGTERM
+
+testrun() {
+ expected=$1
+ run=$2
+ pass="pass"
+ lxc-start -P $lxcpath -n symtest1 -l trace -o $lxcpath/log || pass="fail"
+ [ $pass = "pass" ] && lxc-wait -P $lxcpath -n symtest1 -t 10 -s RUNNING || pass="fail"
+ if [ "$pass" != "$expected" ]; then
+ echo "Test $run: expected $expected but container did not. Start log:"
+ cat $lxcpath/log
+ echo "FAIL: Test $run: expected $expected but container did not."
+ false
+ fi
+ lxc-stop -P $lxcpath -n symtest1 -k || true
+}
+
+# make lxcpath a symlink - this should NOT cause failure
+ln -s /var/lib/lxc $lxcpath
+
+lxc-destroy -P $lxcpath -f -n symtest1 || true
+lxc-create -P $lxcpath -t busybox -n symtest1
+
+cat >> /var/lib/lxc/symtest1/config << EOF
+lxc.mount.entry = $dirname opt/xxx/dir none bind,create=dir
+lxc.mount.entry = $fname opt/xxx/file none bind,create=file
+lxc.mount.entry = $fname2 opt/xxx/file2 none bind
+EOF
+
+# Regular - should succeed
+mkdir -p /var/lib/lxc/symtest1/rootfs/opt/xxx
+touch /var/lib/lxc/symtest1/rootfs/opt/xxx/file2
+testrun pass 1
+
+# symlink - should fail
+rm -rf /var/lib/lxc/symtest1/rootfs/opt/xxx
+mkdir -p /var/lib/lxc/symtest1/rootfs/opt/xxx2
+ln -s /var/lib/lxc/symtest1/rootfs/opt/xxx2 /var/lib/lxc/symtest1/rootfs/opt/xxx
+touch /var/lib/lxc/symtest1/rootfs/opt/xxx/file2
+testrun fail 2
+
+# final final symlink - should fail
+rm -rf $lxcpath/symtest1/rootfs/opt/xxx
+mkdir -p $lxcpath/symtest1/rootfs/opt/xxx
+mkdir -p $lxcpath/symtest1/rootfs/opt/xxx/dir
+touch $lxcpath/symtest1/rootfs/opt/xxx/file
+touch $lxcpath/symtest1/rootfs/opt/xxx/file2src
+ln -s $lxcpath/symtest1/rootfs/opt/xxx/file2src $lxcpath/symtest1/rootfs/opt/xxx/file2
+testrun fail 3
+
+# Ideally we'd also try a loop device, but that won't work in nested containers
+# anyway - TODO
+
+# what about /proc itself
+
+rm -rf $lxcpath/symtest1/rootfs/opt/xxx
+mkdir -p $lxcpath/symtest1/rootfs/opt/xxx
+touch $lxcpath/symtest1/rootfs/opt/xxx/file2
+mv $lxcpath/symtest1/rootfs/proc $lxcpath/symtest1/rootfs/proc1
+ln -s $lxcpath/symtest1/rootfs/proc1 $lxcpath/symtest1/rootfs/proc
+testrun fail 4
+
+echo "all tests passed"