--- /dev/null
+From 81b6b06197606b4bef4e427a197aeb808e8d89e1 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Sat, 30 Aug 2014 18:32:05 -0400
+Subject: fix EBUSY on umount() from MNT_SHRINKABLE
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 81b6b06197606b4bef4e427a197aeb808e8d89e1 upstream.
+
+We need the parents of victims alive until namespace_unlock() gets to
+dput() of the (ex-)mountpoints. However, that screws up the "is it
+busy" checks in the case where we have shrinkable mounts that need to
+be killed. Solution: go ahead and decrement the refcounts of parents
+right in umount_tree(), and increment them again just before dropping
+rwsem in namespace_unlock() (letting the loop at the end of
+namespace_unlock() finally drop those references for good, as we do
+now). Parents can't get freed until we drop rwsem - at least one
+reference is kept until then, both when the parent is among the
+victims and when it is not. So they'll still be around when we get to
+namespace_unlock().
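+
+A minimal userspace analogue of that refcount dance (hypothetical
+names and values, not the kernel code): umount_tree() drops the
+parent's count so the busy check sees the true value,
+namespace_unlock() re-takes it before dropping the lock, and the
+final cleanup loop drops it for good.
+
+    #include <assert.h>
+
+    struct mnt { int count; };
+
+    int main(void)
+    {
+        struct mnt parent = { .count = 2 };  /* base ref + child's ref */
+
+        parent.count--;              /* umount_tree(): drop child's ref early */
+        assert(parent.count == 1);   /* "is it busy?" now sees 1, not 2 */
+
+        parent.count++;              /* namespace_unlock(): undo before up_write() */
+        /* ... up_write(&namespace_sem) would happen here ... */
+        parent.count--;              /* final loop: drop the ref for good */
+
+        assert(parent.count == 1);   /* only the base reference remains */
+        return 0;
+    }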
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1217,6 +1217,11 @@ static void namespace_unlock(void)
+ head.first->pprev = &head.first;
+ INIT_HLIST_HEAD(&unmounted);
+
++ /* undo decrements we'd done in umount_tree() */
++ hlist_for_each_entry(mnt, &head, mnt_hash)
++ if (mnt->mnt_ex_mountpoint.mnt)
++ mntget(mnt->mnt_ex_mountpoint.mnt);
++
+ up_write(&namespace_sem);
+
+ synchronize_rcu();
+@@ -1268,6 +1273,7 @@ void umount_tree(struct mount *mnt, int
+ p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+ if (mnt_has_parent(p)) {
+ put_mountpoint(p->mnt_mp);
++ mnt_add_count(p->mnt_parent, -1);
+ /* move the reference to mountpoint into ->mnt_ex_mountpoint */
+ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
+ p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
--- /dev/null
+From 88b368f27a094277143d8ecd5a056116f6a41520 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Mon, 18 Aug 2014 15:09:26 -0400
+Subject: get rid of propagate_umount() mistakenly treating slaves as busy.
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 88b368f27a094277143d8ecd5a056116f6a41520 upstream.
+
+The check in __propagate_umount() ("has somebody explicitly mounted
+something on that slave?") is done *before* taking the already doomed
+victims out of the child lists.
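+
+A toy illustration of the ordering (hypothetical structures, not
+kernel code): a slave looks busy if its child list is non-empty, so
+already-doomed children must be unlinked before the check runs, not
+after.
+
+    #include <stdbool.h>
+    #include <stdio.h>
+
+    struct mount { int nr_children; };
+
+    static bool may_umount(const struct mount *slave)
+    {
+        /* "has somebody explicitly mounted something on that slave?" */
+        return slave->nr_children == 0;
+    }
+
+    int main(void)
+    {
+        struct mount slave = { .nr_children = 1 };  /* one doomed child */
+
+        /* buggy order: check first - the slave wrongly looks busy */
+        printf("check before unlink: %s\n", may_umount(&slave) ? "ok" : "busy");
+
+        slave.nr_children--;  /* fixed order: unlink the doomed child first */
+        printf("check after unlink:  %s\n", may_umount(&slave) ? "ok" : "busy");
+        return 0;
+    }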
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 4 +++-
+ fs/pnode.c | 1 +
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1253,6 +1253,9 @@ void umount_tree(struct mount *mnt, int
+ hlist_add_head(&p->mnt_hash, &tmp_list);
+ }
+
++ hlist_for_each_entry(p, &tmp_list, mnt_hash)
++ list_del_init(&p->mnt_child);
++
+ if (how)
+ propagate_umount(&tmp_list);
+
+@@ -1263,7 +1266,6 @@ void umount_tree(struct mount *mnt, int
+ p->mnt_ns = NULL;
+ if (how < 2)
+ p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+- list_del_init(&p->mnt_child);
+ if (mnt_has_parent(p)) {
+ put_mountpoint(p->mnt_mp);
+ /* move the reference to mountpoint into ->mnt_ex_mountpoint */
+--- a/fs/pnode.c
++++ b/fs/pnode.c
+@@ -381,6 +381,7 @@ static void __propagate_umount(struct mo
+ * other children
+ */
+ if (child && list_empty(&child->mnt_mounts)) {
++ list_del_init(&child->mnt_child);
+ hlist_del_init_rcu(&child->mnt_hash);
+ hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
+ }
--- /dev/null
+From db181ce011e3c033328608299cd6fac06ea50130 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Tue, 29 Jul 2014 15:50:44 -0700
+Subject: mnt: Add tests for unprivileged remount cases that have been found to be faulty
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+commit db181ce011e3c033328608299cd6fac06ea50130 upstream.
+
+Kenton Varda <kenton@sandstorm.io> discovered that by remounting a
+read-only bind mount read-only in a user namespace the
+MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user
+to then remount a read-only mount read-write.
+
+Upon review of the code in remount it was discovered that the code
+allowed nosuid, noexec, and nodev to be cleared. It was also
+discovered that the code was allowing the per-mount atime flags to be
+changed.
+
+The first naive patch to fix these issues contained the flaw that
+using default atime settings when remounting a filesystem could be
+disallowed.
+
+To avoid these problems in the future, add tests to ensure
+unprivileged remounts succeed and fail at the appropriate times.
+
+Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/testing/selftests/Makefile | 1
+ tools/testing/selftests/mount/Makefile | 17
+ tools/testing/selftests/mount/unprivileged-remount-test.c | 242 ++++++++++++++
+ 3 files changed, 260 insertions(+)
+
+--- a/tools/testing/selftests/Makefile
++++ b/tools/testing/selftests/Makefile
+@@ -4,6 +4,7 @@ TARGETS += efivarfs
+ TARGETS += kcmp
+ TARGETS += memory-hotplug
+ TARGETS += mqueue
++TARGETS += mount
+ TARGETS += net
+ TARGETS += ptrace
+ TARGETS += timers
+--- /dev/null
++++ b/tools/testing/selftests/mount/Makefile
+@@ -0,0 +1,17 @@
++# Makefile for mount selftests.
++
++all: unprivileged-remount-test
++
++unprivileged-remount-test: unprivileged-remount-test.c
++ gcc -Wall -O2 unprivileged-remount-test.c -o unprivileged-remount-test
++
++# Allow specific tests to be selected.
++test_unprivileged_remount: unprivileged-remount-test
++ @if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi
++
++run_tests: all test_unprivileged_remount
++
++clean:
++ rm -f unprivileged-remount-test
++
++.PHONY: all test_unprivileged_remount
+--- /dev/null
++++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
+@@ -0,0 +1,242 @@
++#define _GNU_SOURCE
++#include <sched.h>
++#include <stdio.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/types.h>
++#include <sys/mount.h>
++#include <sys/wait.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <grp.h>
++#include <stdbool.h>
++#include <stdarg.h>
++
++#ifndef CLONE_NEWNS
++# define CLONE_NEWNS 0x00020000
++#endif
++#ifndef CLONE_NEWUTS
++# define CLONE_NEWUTS 0x04000000
++#endif
++#ifndef CLONE_NEWIPC
++# define CLONE_NEWIPC 0x08000000
++#endif
++#ifndef CLONE_NEWNET
++# define CLONE_NEWNET 0x40000000
++#endif
++#ifndef CLONE_NEWUSER
++# define CLONE_NEWUSER 0x10000000
++#endif
++#ifndef CLONE_NEWPID
++# define CLONE_NEWPID 0x20000000
++#endif
++
++#ifndef MS_RELATIME
++#define MS_RELATIME (1 << 21)
++#endif
++#ifndef MS_STRICTATIME
++#define MS_STRICTATIME (1 << 24)
++#endif
++
++static void die(char *fmt, ...)
++{
++ va_list ap;
++ va_start(ap, fmt);
++ vfprintf(stderr, fmt, ap);
++ va_end(ap);
++ exit(EXIT_FAILURE);
++}
++
++static void write_file(char *filename, char *fmt, ...)
++{
++ char buf[4096];
++ int fd;
++ ssize_t written;
++ int buf_len;
++ va_list ap;
++
++ va_start(ap, fmt);
++ buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
++ va_end(ap);
++ if (buf_len < 0) {
++ die("vsnprintf failed: %s\n",
++ strerror(errno));
++ }
++ if (buf_len >= sizeof(buf)) {
++ die("vsnprintf output truncated\n");
++ }
++
++ fd = open(filename, O_WRONLY);
++ if (fd < 0) {
++ die("open of %s failed: %s\n",
++ filename, strerror(errno));
++ }
++ written = write(fd, buf, buf_len);
++ if (written != buf_len) {
++ if (written >= 0) {
++ die("short write to %s\n", filename);
++ } else {
++ die("write to %s failed: %s\n",
++ filename, strerror(errno));
++ }
++ }
++ if (close(fd) != 0) {
++ die("close of %s failed: %s\n",
++ filename, strerror(errno));
++ }
++}
++
++static void create_and_enter_userns(void)
++{
++ uid_t uid;
++ gid_t gid;
++
++ uid = getuid();
++ gid = getgid();
++
++ if (unshare(CLONE_NEWUSER) !=0) {
++ die("unshare(CLONE_NEWUSER) failed: %s\n",
++ strerror(errno));
++ }
++
++ write_file("/proc/self/uid_map", "0 %d 1", uid);
++ write_file("/proc/self/gid_map", "0 %d 1", gid);
++
++ if (setgroups(0, NULL) != 0) {
++ die("setgroups failed: %s\n",
++ strerror(errno));
++ }
++ if (setgid(0) != 0) {
++ die ("setgid(0) failed %s\n",
++ strerror(errno));
++ }
++ if (setuid(0) != 0) {
++ die("setuid(0) failed %s\n",
++ strerror(errno));
++ }
++}
++
++static
++bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
++{
++ pid_t child;
++
++ child = fork();
++ if (child == -1) {
++ die("fork failed: %s\n",
++ strerror(errno));
++ }
++ if (child != 0) { /* parent */
++ pid_t pid;
++ int status;
++ pid = waitpid(child, &status, 0);
++ if (pid == -1) {
++ die("waitpid failed: %s\n",
++ strerror(errno));
++ }
++ if (pid != child) {
++ die("waited for %d got %d\n",
++ child, pid);
++ }
++ if (!WIFEXITED(status)) {
++ die("child did not terminate cleanly\n");
++ }
++ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
++ }
++
++ create_and_enter_userns();
++ if (unshare(CLONE_NEWNS) != 0) {
++ die("unshare(CLONE_NEWNS) failed: %s\n",
++ strerror(errno));
++ }
++
++ if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) {
++ die("mount of /tmp failed: %s\n",
++ strerror(errno));
++ }
++
++ create_and_enter_userns();
++
++ if (unshare(CLONE_NEWNS) != 0) {
++ die("unshare(CLONE_NEWNS) failed: %s\n",
++ strerror(errno));
++ }
++
++ if (mount("/tmp", "/tmp", "none",
++ MS_REMOUNT | MS_BIND | remount_flags, NULL) != 0) {
++ /* system("cat /proc/self/mounts"); */
++ die("remount of /tmp failed: %s\n",
++ strerror(errno));
++ }
++
++ if (mount("/tmp", "/tmp", "none",
++ MS_REMOUNT | MS_BIND | invalid_flags, NULL) == 0) {
++ /* system("cat /proc/self/mounts"); */
++ die("remount of /tmp with invalid flags "
++ "succeeded unexpectedly\n");
++ }
++ exit(EXIT_SUCCESS);
++}
++
++static bool test_unpriv_remount_simple(int mount_flags)
++{
++ return test_unpriv_remount(mount_flags, mount_flags, 0);
++}
++
++static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags)
++{
++ return test_unpriv_remount(mount_flags, mount_flags, invalid_flags);
++}
++
++int main(int argc, char **argv)
++{
++ if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) {
++ die("MS_RDONLY malfunctions\n");
++ }
++ if (!test_unpriv_remount_simple(MS_NODEV)) {
++ die("MS_NODEV malfunctions\n");
++ }
++ if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) {
++ die("MS_NOSUID malfunctions\n");
++ }
++ if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) {
++ die("MS_NOEXEC malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV,
++ MS_NOATIME|MS_NODEV))
++ {
++ die("MS_RELATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV,
++ MS_NOATIME|MS_NODEV))
++ {
++ die("MS_STRICTATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV,
++ MS_STRICTATIME|MS_NODEV))
++ {
++ die("MS_RELATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV,
++ MS_NOATIME|MS_NODEV))
++ {
++ die("MS_RELATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV,
++ MS_NOATIME|MS_NODEV))
++ {
++ die("MS_RELATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV,
++ MS_STRICTATIME|MS_NODEV))
++ {
++ die("MS_RELATIME malfunctions\n");
++ }
++ if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV,
++ MS_NOATIME|MS_NODEV))
++ {
++ die("Default atime malfunctions\n");
++ }
++ return EXIT_SUCCESS;
++}
--- /dev/null
+From ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 28 Jul 2014 17:36:04 -0700
+Subject: mnt: Change the default remount atime from relatime to the existing value
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+commit ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e upstream.
+
+Since March 2009 the kernel has defaulted to relatime when no
+MS_...ATIME flags are passed.
+
+Defaulting to relatime instead of the existing atime state during a
+remount is silly, and causes problems in practice for people who don't
+specify any MS_...ATIME flags and expect to get the default filesystem
+atime setting. Those users may encounter a permission error because
+the default atime setting does not work.
+
+A default that does not work and causes permission problems is
+ridiculous, so preserve the existing value to have a default
+atime setting that is always guaranteed to work.
+
+Using the default atime setting in this way is particularly
+interesting for applications built to run in restricted userspace
+environments without /proc mounted, as the existing atime mount
+options of a filesystem cannot be read from /proc/mounts.
+
+In practice this fixes user space programs that use the default atime
+setting on remount and that are broken by the permission checks
+keeping less privileged users from changing a more privileged user's
+atime settings.
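+
+A minimal sketch of the call this change affects (userspace, assuming
+a mount at /mnt that the caller is allowed to remount): passing no
+MS_...ATIME flag on a remount now keeps whatever atime setting the
+mount already had instead of silently switching it to relatime.
+
+    #include <stdio.h>
+    #include <sys/mount.h>
+
+    int main(void)
+    {
+        /* No MS_NOATIME/MS_NODIRATIME/MS_RELATIME/MS_STRICTATIME here:
+         * after this change the kernel copies the existing
+         * MNT_ATIME_MASK bits from the mount instead of defaulting
+         * to relatime. */
+        if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND, NULL) != 0)
+            perror("remount /mnt");
+        return 0;
+    }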
+
+Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -2464,6 +2464,14 @@ long do_mount(const char *dev_name, cons
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
+
++ /* The default atime for remount is preservation */
++ if ((flags & MS_REMOUNT) &&
++ ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
++ MS_STRICTATIME)) == 0)) {
++ mnt_flags &= ~MNT_ATIME_MASK;
++ mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
++ }
++
+ flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
+ MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
+ MS_STRICTATIME);
--- /dev/null
+From 9566d6742852c527bf5af38af5cbb878dad75705 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 28 Jul 2014 17:26:07 -0700
+Subject: mnt: Correct permission checks in do_remount
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+commit 9566d6742852c527bf5af38af5cbb878dad75705 upstream.
+
+While investigating the issue where "mount --bind -oremount,ro ..."
+would result in a later "mount --bind -oremount,rw" succeeding even if
+the mount started off locked, I realized that there are several
+additional mount flags that should be locked and are not.
+
+In particular MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, and the atime
+flags in addition to MNT_READONLY should all be locked. These
+flags are all per mount point, can all be changed with MS_BIND,
+and should not be changeable if set by a more privileged user.
+
+The following additions to the current logic are added in this patch.
+- nosuid may not be clearable by a less privileged user.
+- nodev may not be clearable by a less privileged user.
+- noexec may not be clearable by a less privileged user.
+- atime flags may not be changeable by a less privileged user.
+
+The logic with atime is that whether atime is updated on access is a
+global policy: backup software and auditing software could break if
+atime bits are not updated (when they are configured to be updated),
+and serious performance degradation could result (a DoS attack) if
+atime updates happen when they have been explicitly disabled.
+Therefore an unprivileged user should not be able to mess with the
+atime bits set by a more privileged user.
+
+The additional restrictions are implemented with the addition of
+MNT_LOCK_NOSUID, MNT_LOCK_NODEV, MNT_LOCK_NOEXEC, and MNT_LOCK_ATIME
+mnt flags.
+
+Taken together these changes and the fixes for MNT_LOCK_READONLY
+should make it safe for an unprivileged user to create a user
+namespace and to call "mount --bind -o remount,... ..." without
+the danger of mount flags being changed maliciously.
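+
+A hedged sketch of the expected failure mode (userspace; assumes the
+caller sits in an unprivileged user namespace holding a mount at /mnt
+whose nosuid flag was locked by the parent namespace): a remount that
+tries to drop the flag should now get EPERM.
+
+    #include <errno.h>
+    #include <stdio.h>
+    #include <sys/mount.h>
+
+    int main(void)
+    {
+        /* No MS_NOSUID in the new flags: with MNT_LOCK_NOSUID set,
+         * do_remount() now refuses to clear it. */
+        if (mount("/mnt", "/mnt", NULL, MS_REMOUNT | MS_BIND, NULL) == 0)
+            fprintf(stderr, "nosuid was cleared - kernel lacks the fix?\n");
+        else if (errno == EPERM)
+            printf("EPERM, as expected for a locked flag\n");
+        else
+            perror("remount");
+        return 0;
+    }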
+
+Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 36 +++++++++++++++++++++++++++++++++---
+ include/linux/mount.h | 5 +++++
+ 2 files changed, 38 insertions(+), 3 deletions(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -887,8 +887,21 @@ static struct mount *clone_mnt(struct mo
+
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
+ /* Don't allow unprivileged users to change mount flags */
+- if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
+- mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
++ if (flag & CL_UNPRIVILEGED) {
++ mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
++
++ if (mnt->mnt.mnt_flags & MNT_READONLY)
++ mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
++
++ if (mnt->mnt.mnt_flags & MNT_NODEV)
++ mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
++
++ if (mnt->mnt.mnt_flags & MNT_NOSUID)
++ mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
++
++ if (mnt->mnt.mnt_flags & MNT_NOEXEC)
++ mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
++ }
+
+ /* Don't allow unprivileged users to reveal what is under a mount */
+ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
+@@ -1922,6 +1935,23 @@ static int do_remount(struct path *path,
+ !(mnt_flags & MNT_READONLY)) {
+ return -EPERM;
+ }
++ if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
++ !(mnt_flags & MNT_NODEV)) {
++ return -EPERM;
++ }
++ if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
++ !(mnt_flags & MNT_NOSUID)) {
++ return -EPERM;
++ }
++ if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
++ !(mnt_flags & MNT_NOEXEC)) {
++ return -EPERM;
++ }
++ if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
++ ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
++ return -EPERM;
++ }
++
+ err = security_sb_remount(sb, data);
+ if (err)
+ return err;
+@@ -2120,7 +2150,7 @@ static int do_new_mount(struct path *pat
+ */
+ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+ flags |= MS_NODEV;
+- mnt_flags |= MNT_NODEV;
++ mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
+ }
+ }
+
+--- a/include/linux/mount.h
++++ b/include/linux/mount.h
+@@ -45,12 +45,17 @@ struct mnt_namespace;
+ #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
+ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
+ | MNT_READONLY)
++#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
+
+ #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
+ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
+
+ #define MNT_INTERNAL 0x4000
+
++#define MNT_LOCK_ATIME 0x040000
++#define MNT_LOCK_NOEXEC 0x080000
++#define MNT_LOCK_NOSUID 0x100000
++#define MNT_LOCK_NODEV 0x200000
+ #define MNT_LOCK_READONLY 0x400000
+ #define MNT_LOCKED 0x800000
+ #define MNT_DOOMED 0x1000000
--- /dev/null
+From 07b645589dcda8b7a5249e096fece2a67556f0f4 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 28 Jul 2014 17:10:56 -0700
+Subject: mnt: Move the test for MNT_LOCK_READONLY from change_mount_flags into do_remount
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+commit 07b645589dcda8b7a5249e096fece2a67556f0f4 upstream.
+
+There are no races as locked mount flags are guaranteed to never change.
+
+Moving the test into do_remount makes it more visible, and ensures all
+filesystem remounts pass the MNT_LOCK_READONLY permission check. This
+second case is not an issue today as filesystem remounts are guarded
+by capable(CAP_SYS_ADMIN) and thus will always fail in less privileged
+mount namespaces, but it could become an issue in the future.
+
+Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1887,9 +1887,6 @@ static int change_mount_flags(struct vfs
+ if (readonly_request == __mnt_is_readonly(mnt))
+ return 0;
+
+- if (mnt->mnt_flags & MNT_LOCK_READONLY)
+- return -EPERM;
+-
+ if (readonly_request)
+ error = mnt_make_readonly(real_mount(mnt));
+ else
+@@ -1915,6 +1912,16 @@ static int do_remount(struct path *path,
+ if (path->dentry != path->mnt->mnt_root)
+ return -EINVAL;
+
++ /* Don't allow changing of locked mnt flags.
++ *
++ * No locks need to be held here while testing the various
++ * MNT_LOCK flags because those flags can never be cleared
++ * once they are set.
++ */
++ if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
++ !(mnt_flags & MNT_READONLY)) {
++ return -EPERM;
++ }
+ err = security_sb_remount(sb, data);
+ if (err)
+ return err;
--- /dev/null
+From a6138db815df5ee542d848318e5dae681590fccd Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 28 Jul 2014 16:26:53 -0700
+Subject: mnt: Only change user settable mount flags in remount
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+commit a6138db815df5ee542d848318e5dae681590fccd upstream.
+
+Kenton Varda <kenton@sandstorm.io> discovered that by remounting a
+read-only bind mount read-only in a user namespace the
+MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user
+to then remount a read-only mount read-write.
+
+Correct this by replacing the mask of mount flags to preserve
+with a mask of mount flags that may be changed, and preserve
+all others. This ensures that any future bugs with this mask and
+remount will fail in an easy-to-detect way where new mount flags
+simply won't change.
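+
+A standalone sketch of why the inverted mask is safer (the flag
+values below mirror include/linux/mount.h, but the mask is a trimmed
+subset and this is an illustration, not kernel code): everything
+outside the user-settable set is carried over, so a flag the mask's
+author never heard of cannot be dropped.
+
+    #include <stdio.h>
+
+    #define MNT_NOSUID          0x01
+    #define MNT_NODEV           0x02
+    #define MNT_NOEXEC          0x04
+    #define MNT_READONLY        0x40
+    #define MNT_LOCK_READONLY   0x400000  /* not user settable */
+
+    #define MNT_USER_SETTABLE_MASK \
+        (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC | MNT_READONLY)
+
+    int main(void)
+    {
+        unsigned int old = MNT_READONLY | MNT_LOCK_READONLY;
+        unsigned int mnt_flags = 0;  /* user asked to clear everything */
+
+        /* preserve every flag the user may not touch */
+        mnt_flags |= old & ~MNT_USER_SETTABLE_MASK;
+
+        printf("MNT_LOCK_READONLY %s\n",
+               (mnt_flags & MNT_LOCK_READONLY) ? "preserved" : "lost");
+        return 0;
+    }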
+
+Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c | 2 +-
+ include/linux/mount.h | 4 +++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1928,7 +1928,7 @@ static int do_remount(struct path *path,
+ err = do_remount_sb(sb, flags, data, 0);
+ if (!err) {
+ lock_mount_hash();
+- mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
++ mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
+ mnt->mnt.mnt_flags = mnt_flags;
+ touch_mnt_namespace(mnt->mnt_ns);
+ unlock_mount_hash();
+--- a/include/linux/mount.h
++++ b/include/linux/mount.h
+@@ -42,7 +42,9 @@ struct mnt_namespace;
+ * flag, consider how it interacts with shared mounts.
+ */
+ #define MNT_SHARED_MASK (MNT_UNBINDABLE)
+-#define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE)
++#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
++ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
++ | MNT_READONLY)
+
+ #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
+ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
--- /dev/null
+From 651e22f2701b4113989237c3048d17337dd2185c Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Wed, 6 Aug 2014 14:11:33 -0400
+Subject: ring-buffer: Always reset iterator to reader page
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit 651e22f2701b4113989237c3048d17337dd2185c upstream.
+
+When performing a consuming read, the ring buffer swaps out a
+page from the ring buffer with a empty page and this page that
+was swapped out becomes the new reader page. The reader page
+is owned by the reader and since it was swapped out of the ring
+buffer, writers do not have access to it (there's an exception
+to that rule, but it's out of scope for this commit).
+
+When reading the "trace" file, it is a non-consuming read, which
+means that the data in the ring buffer will not be modified.
+When the trace file is opened, a ring buffer iterator is allocated
+and writes to the ring buffer are disabled, such that the iterator
+will not have issues iterating over the data.
+
+Although writes to the ring buffer are disabled, other reads, and
+even consuming reads, are not. If a consuming read happens, then
+the iterator is reset and starts reading from the beginning again.
+
+My tests would sometimes trigger this bug on my i386 box:
+
+WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa()
+Modules linked in:
+CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8
+Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006
+ 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b
+ f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0
+ ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80
+Call Trace:
+ [<c18796b3>] dump_stack+0x4b/0x75
+ [<c103a0e3>] warn_slowpath_common+0x7e/0x95
+ [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa
+ [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa
+ [<c103a185>] warn_slowpath_fmt+0x33/0x35
+ [<c10bd85a>] __trace_find_cmdline+0x66/0xaa
+ [<c10bed04>] trace_find_cmdline+0x40/0x64
+ [<c10c3c16>] trace_print_context+0x27/0xec
+ [<c10c4360>] ? trace_seq_printf+0x37/0x5b
+ [<c10c0b15>] print_trace_line+0x319/0x39b
+ [<c10ba3fb>] ? ring_buffer_read+0x47/0x50
+ [<c10c13b1>] s_show+0x192/0x1ab
+ [<c10bfd9a>] ? s_next+0x5a/0x7c
+ [<c112e76e>] seq_read+0x267/0x34c
+ [<c1115a25>] vfs_read+0x8c/0xef
+ [<c112e507>] ? seq_lseek+0x154/0x154
+ [<c1115ba2>] SyS_read+0x54/0x7f
+ [<c188488e>] syscall_call+0x7/0xb
+---[ end trace 3f507febd6b4cc83 ]---
+>>>> ##### CPU 1 buffer started ####
+
+Which was the __trace_find_cmdline() function complaining about the pid
+in the event record being negative.
+
+After adding more test cases, this would trigger more often. Strangely
+enough, it would never trigger on a single test, but instead would trigger
+only when running all the tests. I believe that was the case because it
+required one of the tests to be shutting down via delayed instances while
+a new test started up.
+
+After spending several days debugging this, I found that it was caused by
+the iterator becoming corrupted. Debugging further, I found out why
+the iterator became corrupted. It happened in rb_iter_reset().
+
+As consuming reads may read only part of the reader page rather than
+all of it, there's a "read" field recording where the last read took
+place. The iterator must also start at the read position. In the rb_iter_reset()
+code, if the reader page was disconnected from the ring buffer, the iterator
+would start at the head page within the ring buffer (where writes still
+happen). But the mistake there was that it still used the "read" field
+to start the iterator on the head page, where it should always start
+at zero because readers never read from within the ring buffer where
+writes occur.
+
+I originally wrote a patch to have it set the iter->head to 0 instead
+of iter->head_page->read, but then I questioned why it wasn't always
+setting the iter to point to the reader page, as the reader page is
+still valid. The list_empty(reader_page->list) just means that it was
+successful in swapping out. But the reader_page may still have data.
+
+There was a bug report a long time ago that was not reproducible that
+had something about trace_pipe (consuming read) not matching trace
+(iterator read). This may explain why that happened.
+
+Anyway, the correct answer to this bug is to always use the reader
+page and not reset the iterator to point inside the writable ring
+buffer.
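+
+A toy model of the fixed reset (hypothetical userspace structures,
+not the kernel code): the iterator now always starts on the reader
+page at its read offset, never on a page that writers still own.
+
+    #include <stddef.h>
+    #include <stdio.h>
+
+    struct buffer_page { size_t read; };  /* last consumed offset */
+    struct cpu_buffer { struct buffer_page *reader_page; };
+    struct ring_iter { struct buffer_page *head_page; size_t head; };
+
+    static void iter_reset(struct ring_iter *it, struct cpu_buffer *cb)
+    {
+        /* old code could land on the writable head page and reuse
+         * ->read there; the fix starts on the reader page always */
+        it->head_page = cb->reader_page;
+        it->head = cb->reader_page->read;
+    }
+
+    int main(void)
+    {
+        struct buffer_page rp = { .read = 12 };
+        struct cpu_buffer cb = { .reader_page = &rp };
+        struct ring_iter it;
+
+        iter_reset(&it, &cb);
+        printf("iterator starts at offset %zu of the reader page\n", it.head);
+        return 0;
+    }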
+
+Fixes: d769041f8653 "ring_buffer: implement new locking"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/ring_buffer.c | 17 ++++++-----------
+ 1 file changed, 6 insertions(+), 11 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -3354,21 +3354,16 @@ static void rb_iter_reset(struct ring_bu
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ /* Iterator usage is expected to have record disabled */
+- if (list_empty(&cpu_buffer->reader_page->list)) {
+- iter->head_page = rb_set_head_page(cpu_buffer);
+- if (unlikely(!iter->head_page))
+- return;
+- iter->head = iter->head_page->read;
+- } else {
+- iter->head_page = cpu_buffer->reader_page;
+- iter->head = cpu_buffer->reader_page->read;
+- }
++ iter->head_page = cpu_buffer->reader_page;
++ iter->head = cpu_buffer->reader_page->read;
++
++ iter->cache_reader_page = iter->head_page;
++ iter->cache_read = iter->head;
++
+ if (iter->head)
+ iter->read_stamp = cpu_buffer->read_stamp;
+ else
+ iter->read_stamp = iter->head_page->page->time_stamp;
+- iter->cache_reader_page = cpu_buffer->reader_page;
+- iter->cache_read = cpu_buffer->read;
+ }
+
+ /**
--- /dev/null
+From 021de3d904b88b1771a3a2cfc5b75023c391e646 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Wed, 6 Aug 2014 15:36:31 -0400
+Subject: ring-buffer: Up rb_iter_peek() loop count to 3
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit 021de3d904b88b1771a3a2cfc5b75023c391e646 upstream.
+
+After writing a test to try to trigger the bug that caused the
+ring buffer iterator to become corrupted, I hit another bug:
+
+ WARNING: CPU: 1 PID: 5281 at kernel/trace/ring_buffer.c:3766 rb_iter_peek+0x113/0x238()
+ Modules linked in: ipt_MASQUERADE sunrpc [...]
+ CPU: 1 PID: 5281 Comm: grep Tainted: G W 3.16.0-rc3-test+ #143
+ Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007
+ 0000000000000000 ffffffff81809a80 ffffffff81503fb0 0000000000000000
+ ffffffff81040ca1 ffff8800796d6010 ffffffff810c138d ffff8800796d6010
+ ffff880077438c80 ffff8800796d6010 ffff88007abbe600 0000000000000003
+ Call Trace:
+ [<ffffffff81503fb0>] ? dump_stack+0x4a/0x75
+ [<ffffffff81040ca1>] ? warn_slowpath_common+0x7e/0x97
+ [<ffffffff810c138d>] ? rb_iter_peek+0x113/0x238
+ [<ffffffff810c138d>] ? rb_iter_peek+0x113/0x238
+ [<ffffffff810c14df>] ? ring_buffer_iter_peek+0x2d/0x5c
+ [<ffffffff810c6f73>] ? tracing_iter_reset+0x6e/0x96
+ [<ffffffff810c74a3>] ? s_start+0xd7/0x17b
+ [<ffffffff8112b13e>] ? kmem_cache_alloc_trace+0xda/0xea
+ [<ffffffff8114cf94>] ? seq_read+0x148/0x361
+ [<ffffffff81132d98>] ? vfs_read+0x93/0xf1
+ [<ffffffff81132f1b>] ? SyS_read+0x60/0x8e
+ [<ffffffff8150bf9f>] ? tracesys+0xdd/0xe2
+
+Debugging this bug, which triggers when rb_iter_peek() loops too
+many times (more than twice), I discovered there's a case that can
+cause that function to legitimately loop 3 times!
+
+rb_iter_peek() is different from rb_buffer_peek(), as rb_buffer_peek()
+only deals with the reader page (it's for consuming reads). The
+rb_iter_peek() is for traversing the buffer without consuming it, and as
+such, it can loop for one more reason. That is, if we hit the end of
+the reader page or any page, it will go to the next page and try again.
+
+That is, we have this:
+
+ 1. iter->head > iter->head_page->page->commit
+ (rb_inc_iter() which moves the iter to the next page)
+ try again
+
+ 2. event = rb_iter_head_event()
+ event->type_len == RINGBUF_TYPE_TIME_EXTEND
+ rb_advance_iter()
+ try again
+
+ 3. read the event.
+
+But we never get to 3, because the count is greater than 2 and we
+cause the WARNING and return NULL.
+
+Up the counter to 3.
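+
+A self-contained sketch of the worst legitimate case (hypothetical,
+in the spirit of the loop rather than a copy of it): end of page,
+then a time extend, then the data event - three passes, so the
+warning threshold must be 3, not 2.
+
+    #include <stdio.h>
+
+    enum step { NEXT_PAGE, TIME_EXTEND, DATA };
+
+    int main(void)
+    {
+        enum step stream[] = { NEXT_PAGE, TIME_EXTEND, DATA };
+        int nr_loops = 0;
+
+        for (int i = 0; i < 3; i++) {
+            if (++nr_loops > 3) {  /* was "> 2" and warned here */
+                fprintf(stderr, "WARN: too many loops\n");
+                return 1;
+            }
+            if (stream[i] == DATA) {
+                printf("event read on pass %d\n", nr_loops);
+                break;
+            }
+            /* NEXT_PAGE or TIME_EXTEND: advance and try again */
+        }
+        return 0;
+    }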
+
+Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/ring_buffer.c | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -1981,7 +1981,7 @@ rb_add_time_stamp(struct ring_buffer_eve
+
+ /**
+ * rb_update_event - update event type and data
+- * @event: the even to update
++ * @event: the event to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+@@ -3756,12 +3756,14 @@ rb_iter_peek(struct ring_buffer_iter *it
+ return NULL;
+
+ /*
+- * We repeat when a time extend is encountered.
+- * Since the time extend is always attached to a data event,
+- * we should never loop more than once.
+- * (We never hit the following condition more than twice).
++ * We repeat when a time extend is encountered or we hit
++ * the end of the page. Since the time extend is always attached
++ * to a data event, we should never loop more than three times.
++ * Once for going to next page, once on time extend, and
++ * finally once to get the event.
++ * (We never hit the following condition more than thrice).
+ */
+- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
++ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
+ return NULL;
+
+ if (rb_per_cpu_empty(cpu_buffer))
acpi-run-fixed-event-device-notifications-in-process-context.patch
acpi-scan-not-cache-_sun-value-in-struct-acpi_device_pnp.patch
acpi-cpuidle-fix-deadlock-between-cpuidle_lock-and-cpu_hotplug.lock.patch
+xen-events-fifo-reset-control-block-and-local-heads-on-resume.patch
+ring-buffer-always-reset-iterator-to-reader-page.patch
+ring-buffer-up-rb_iter_peek-loop-count-to-3.patch
+mnt-only-change-user-settable-mount-flags-in-remount.patch
+mnt-move-the-test-for-mnt_lock_readonly-from-change_mount_flags-into-do_remount.patch
+mnt-correct-permission-checks-in-do_remount.patch
+mnt-change-the-default-remount-atime-from-relatime-to-the-existing-value.patch
+mnt-add-tests-for-unprivileged-remount-cases-that-have-found-to-be-faulty.patch
+get-rid-of-propagate_umount-mistakenly-treating-slaves-as-busy.patch
+fix-ebusy-on-umount-from-mnt_shrinkable.patch
--- /dev/null
+From c12784c3d14a2110468ec4d1383f60cfd2665576 Mon Sep 17 00:00:00 2001
+From: David Vrabel <david.vrabel@citrix.com>
+Date: Thu, 31 Jul 2014 16:22:24 +0100
+Subject: xen/events/fifo: reset control block and local HEADs on resume
+
+From: David Vrabel <david.vrabel@citrix.com>
+
+commit c12784c3d14a2110468ec4d1383f60cfd2665576 upstream.
+
+When using the FIFO-based event channel ABI, if the control block or
+the local HEADs are not reset after resuming, the guest may see stale
+HEAD values and will fail to traverse the FIFO correctly.
+
+This may prevent one or more VCPUs from receiving any events following
+a resume.
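+
+A compact analogue of the reset that boot and resume now share
+(hypothetical userspace code; the real function issues an
+EVTCHNOP_init_control hypercall afterwards): clear the shared control
+block and zero the cached per-queue HEADs before re-registering.
+
+    #include <string.h>
+
+    #define PAGE_SIZE               4096
+    #define EVTCHN_FIFO_MAX_QUEUES  16
+
+    struct evtchn_queue { unsigned int head[EVTCHN_FIFO_MAX_QUEUES]; };
+
+    static void reset_control_block(void *control_block,
+                                    struct evtchn_queue *q)
+    {
+        memset(control_block, 0, PAGE_SIZE);  /* clear_page() analogue */
+        for (unsigned int i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++)
+            q->head[i] = 0;  /* forget stale local HEADs */
+        /* ...then hand the page back via EVTCHNOP_init_control */
+    }
+
+    int main(void)
+    {
+        static char page[PAGE_SIZE];
+        struct evtchn_queue q = { .head = { 7 } };  /* stale HEAD */
+
+        reset_control_block(page, &q);
+        return 0;
+    }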
+
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/events/events_fifo.c | 48 ++++++++++++++++++++++-----------------
+ 1 file changed, 28 insertions(+), 20 deletions(-)
+
+--- a/drivers/xen/events/events_fifo.c
++++ b/drivers/xen/events/events_fifo.c
+@@ -99,6 +99,25 @@ static unsigned evtchn_fifo_nr_channels(
+ return event_array_pages * EVENT_WORDS_PER_PAGE;
+ }
+
++static int init_control_block(int cpu,
++ struct evtchn_fifo_control_block *control_block)
++{
++ struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
++ struct evtchn_init_control init_control;
++ unsigned int i;
++
++ /* Reset the control block and the local HEADs. */
++ clear_page(control_block);
++ for (i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++)
++ q->head[i] = 0;
++
++ init_control.control_gfn = virt_to_mfn(control_block);
++ init_control.offset = 0;
++ init_control.vcpu = cpu;
++
++ return HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
++}
++
+ static void free_unused_array_pages(void)
+ {
+ unsigned i;
+@@ -327,7 +346,6 @@ static void evtchn_fifo_resume(void)
+
+ for_each_possible_cpu(cpu) {
+ void *control_block = per_cpu(cpu_control_block, cpu);
+- struct evtchn_init_control init_control;
+ int ret;
+
+ if (!control_block)
+@@ -344,12 +362,7 @@ static void evtchn_fifo_resume(void)
+ continue;
+ }
+
+- init_control.control_gfn = virt_to_mfn(control_block);
+- init_control.offset = 0;
+- init_control.vcpu = cpu;
+-
+- ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control,
+- &init_control);
++ ret = init_control_block(cpu, control_block);
+ if (ret < 0)
+ BUG();
+ }
+@@ -377,30 +390,25 @@ static const struct evtchn_ops evtchn_op
+ .resume = evtchn_fifo_resume,
+ };
+
+-static int evtchn_fifo_init_control_block(unsigned cpu)
++static int evtchn_fifo_alloc_control_block(unsigned cpu)
+ {
+- struct page *control_block = NULL;
+- struct evtchn_init_control init_control;
++ void *control_block = NULL;
+ int ret = -ENOMEM;
+
+- control_block = alloc_page(GFP_KERNEL|__GFP_ZERO);
++ control_block = (void *)__get_free_page(GFP_KERNEL);
+ if (control_block == NULL)
+ goto error;
+
+- init_control.control_gfn = virt_to_mfn(page_address(control_block));
+- init_control.offset = 0;
+- init_control.vcpu = cpu;
+-
+- ret = HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control);
++ ret = init_control_block(cpu, control_block);
+ if (ret < 0)
+ goto error;
+
+- per_cpu(cpu_control_block, cpu) = page_address(control_block);
++ per_cpu(cpu_control_block, cpu) = control_block;
+
+ return 0;
+
+ error:
+- __free_page(control_block);
++ free_page((unsigned long)control_block);
+ return ret;
+ }
+
+@@ -414,7 +422,7 @@ static int evtchn_fifo_cpu_notification(
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!per_cpu(cpu_control_block, cpu))
+- ret = evtchn_fifo_init_control_block(cpu);
++ ret = evtchn_fifo_alloc_control_block(cpu);
+ break;
+ default:
+ break;
+@@ -431,7 +439,7 @@ int __init xen_evtchn_fifo_init(void)
+ int cpu = get_cpu();
+ int ret;
+
+- ret = evtchn_fifo_init_control_block(cpu);
++ ret = evtchn_fifo_alloc_control_block(cpu);
+ if (ret < 0)
+ goto out;
+