git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 4 Jan 2017 10:07:33 +0000 (11:07 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 4 Jan 2017 10:07:33 +0000 (11:07 +0100)
added patches:
block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch
crypto-caam-fix-aead-givenc-descriptors.patch
exec-ensure-mm-user_ns-contains-the-execed-files.patch
ext4-add-sanity-checking-to-count_overhead.patch
ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch
ext4-don-t-lock-buffer-in-ext4_commit_super-if-holding-spinlock.patch
ext4-fix-in-superblock-mount-options-processing.patch
ext4-fix-mballoc-breakage-with-64k-block-size.patch
ext4-fix-stack-memory-corruption-with-64k-block-size.patch
ext4-reject-inodes-with-negative-size.patch
ext4-return-enomem-instead-of-success.patch
ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch
f2fs-fix-overflow-due-to-condition-check-order.patch
f2fs-fix-to-determine-start_cp_addr-by-sbi-cur_cp_pack.patch
f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch
fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch
loop-return-proper-error-from-loop_queue_rq.patch
mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch
mm-page_alloc-keep-pcp-count-and-list-contents-in-sync-if-struct-page-is-corrupted.patch
mm-vmscan.c-set-correct-defer-count-for-shrinker.patch
nvmet-fix-possible-infinite-loop-triggered-on-hot-namespace-removal.patch
ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch
ptrace-don-t-allow-accessing-an-undumpable-mm.patch
revert-f2fs-use-percpu_counter-for-of-dirty-pages-in-inode.patch
splice-reinstate-sigpipe-epipe-handling.patch
vfs-mm-fix-return-value-of-read-at-s_maxbytes.patch

27 files changed:
queue-4.9/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch [new file with mode: 0644]
queue-4.9/crypto-caam-fix-aead-givenc-descriptors.patch [new file with mode: 0644]
queue-4.9/exec-ensure-mm-user_ns-contains-the-execed-files.patch [new file with mode: 0644]
queue-4.9/ext4-add-sanity-checking-to-count_overhead.patch [new file with mode: 0644]
queue-4.9/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch [new file with mode: 0644]
queue-4.9/ext4-don-t-lock-buffer-in-ext4_commit_super-if-holding-spinlock.patch [new file with mode: 0644]
queue-4.9/ext4-fix-in-superblock-mount-options-processing.patch [new file with mode: 0644]
queue-4.9/ext4-fix-mballoc-breakage-with-64k-block-size.patch [new file with mode: 0644]
queue-4.9/ext4-fix-stack-memory-corruption-with-64k-block-size.patch [new file with mode: 0644]
queue-4.9/ext4-reject-inodes-with-negative-size.patch [new file with mode: 0644]
queue-4.9/ext4-return-enomem-instead-of-success.patch [new file with mode: 0644]
queue-4.9/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch [new file with mode: 0644]
queue-4.9/f2fs-fix-overflow-due-to-condition-check-order.patch [new file with mode: 0644]
queue-4.9/f2fs-fix-to-determine-start_cp_addr-by-sbi-cur_cp_pack.patch [new file with mode: 0644]
queue-4.9/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch [new file with mode: 0644]
queue-4.9/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch [new file with mode: 0644]
queue-4.9/loop-return-proper-error-from-loop_queue_rq.patch [new file with mode: 0644]
queue-4.9/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch [new file with mode: 0644]
queue-4.9/mm-page_alloc-keep-pcp-count-and-list-contents-in-sync-if-struct-page-is-corrupted.patch [new file with mode: 0644]
queue-4.9/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch [new file with mode: 0644]
queue-4.9/nvmet-fix-possible-infinite-loop-triggered-on-hot-namespace-removal.patch [new file with mode: 0644]
queue-4.9/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch [new file with mode: 0644]
queue-4.9/ptrace-don-t-allow-accessing-an-undumpable-mm.patch [new file with mode: 0644]
queue-4.9/revert-f2fs-use-percpu_counter-for-of-dirty-pages-in-inode.patch [new file with mode: 0644]
queue-4.9/series
queue-4.9/splice-reinstate-sigpipe-epipe-handling.patch [new file with mode: 0644]
queue-4.9/vfs-mm-fix-return-value-of-read-at-s_maxbytes.patch [new file with mode: 0644]

diff --git a/queue-4.9/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch b/queue-4.9/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch
new file mode 100644 (file)
index 0000000..0d4c42f
--- /dev/null
@@ -0,0 +1,68 @@
+From bcc7f5b4bee8e327689a4d994022765855c807ff Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.com>
+Date: Mon, 12 Dec 2016 08:21:51 -0700
+Subject: block_dev: don't test bdev->bd_contains when it is not stable
+
+From: NeilBrown <neilb@suse.com>
+
+commit bcc7f5b4bee8e327689a4d994022765855c807ff upstream.
+
+bdev->bd_contains is not stable before calling __blkdev_get().
+When __blkdev_get() is called on a partition with ->bd_openers == 0
+it sets
+  bdev->bd_contains = bdev;
+which is not correct for a partition.
+After a call to __blkdev_get() succeeds, ->bd_openers will be > 0
+and then ->bd_contains is stable.
+
+When FMODE_EXCL is used, blkdev_get() calls
+   bd_start_claiming() ->  bd_prepare_to_claim() -> bd_may_claim()
+
+This call happens before __blkdev_get() is called, so ->bd_contains
+is not stable.  So bd_may_claim() cannot safely use ->bd_contains.
+It currently tries to use it, and this can lead to a BUG_ON().
+
+This happens when a whole device is already open with a bd_holder (in
+use by dm in my particular example) and two threads race to open a
+partition of that device for the first time, one opening with O_EXCL and
+one without.
+
+The thread that doesn't use O_EXCL gets through blkdev_get() to
+__blkdev_get(), gains the ->bd_mutex, and sets bdev->bd_contains = bdev;
+
+Immediately thereafter the other thread, using FMODE_EXCL, calls
+bd_start_claiming() from blkdev_get().  This should fail because the
+whole device has a holder, but because bdev->bd_contains == bdev
+bd_may_claim() incorrectly reports success.
+This thread continues and blocks on bd_mutex.
+
+The first thread then sets bdev->bd_contains correctly and drops the mutex.
+The thread using FMODE_EXCL then continues and when it calls bd_may_claim()
+again in:
+                       BUG_ON(!bd_may_claim(bdev, whole, holder));
+The BUG_ON fires.
+
+Fix this by removing the dependency on ->bd_contains in
+bd_may_claim().  As bd_may_claim() has direct access to the whole
+device, it can simply test if the target bdev is the whole device.
+
+Fixes: 6b4517a7913a ("block: implement bd_claiming and claiming block")
+Signed-off-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/block_dev.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/block_dev.c
++++ b/fs/block_dev.c
+@@ -832,7 +832,7 @@ static bool bd_may_claim(struct block_de
+               return true;     /* already a holder */
+       else if (bdev->bd_holder != NULL)
+               return false;    /* held by someone else */
+-      else if (bdev->bd_contains == bdev)
++      else if (whole == bdev)
+               return true;     /* is a whole device which isn't held */
+       else if (whole->bd_holder == bd_may_claim)
diff --git a/queue-4.9/crypto-caam-fix-aead-givenc-descriptors.patch b/queue-4.9/crypto-caam-fix-aead-givenc-descriptors.patch
new file mode 100644 (file)
index 0000000..52afcef
--- /dev/null
@@ -0,0 +1,48 @@
+From d128af17876d79b87edf048303f98b35f6a53dbc Mon Sep 17 00:00:00 2001
+From: Alex Porosanu <alexandru.porosanu@nxp.com>
+Date: Wed, 9 Nov 2016 10:46:11 +0200
+Subject: crypto: caam - fix AEAD givenc descriptors
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alex Porosanu <alexandru.porosanu@nxp.com>
+
+commit d128af17876d79b87edf048303f98b35f6a53dbc upstream.
+
+The AEAD givenc descriptor relies on moving the IV through the
+output FIFO and then back to the CTX2 for authentication. The
+SEQ FIFO STORE could be scheduled before the data can be
+read from OFIFO, especially since the SEQ FIFO LOAD needs
+to wait for the SEQ FIFO LOAD SKIP to finish first. The
+SKIP takes more time when the input is SG than when it's
+a contiguous buffer. If the SEQ FIFO LOAD is not scheduled
+before the STORE, the DECO will hang waiting for data
+to be available in the OFIFO so it can be transferred to C2.
+In order to overcome this, first force transfer of IV to C2
+by starting the "cryptlen" transfer first and then starting to
+store data from OFIFO to the output buffer.
+
+Fixes: 1acebad3d8db8 ("crypto: caam - faster aead implementation")
+Signed-off-by: Alex Porosanu <alexandru.porosanu@nxp.com>
+Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/crypto/caam/caamalg.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/crypto/caam/caamalg.c
++++ b/drivers/crypto/caam/caamalg.c
+@@ -736,7 +736,9 @@ copy_iv:
+       /* Will read cryptlen */
+       append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ);
+-      aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2);
++      append_seq_fifo_load(desc, 0, FIFOLD_CLASS_BOTH | KEY_VLF |
++                           FIFOLD_TYPE_MSG1OUT2 | FIFOLD_TYPE_LASTBOTH);
++      append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | KEY_VLF);
+       /* Write ICV */
+       append_seq_store(desc, ctx->authsize, LDST_CLASS_2_CCB |
diff --git a/queue-4.9/exec-ensure-mm-user_ns-contains-the-execed-files.patch b/queue-4.9/exec-ensure-mm-user_ns-contains-the-execed-files.patch
new file mode 100644 (file)
index 0000000..291704a
--- /dev/null
@@ -0,0 +1,117 @@
+From f84df2a6f268de584a201e8911384a2d244876e3 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Wed, 16 Nov 2016 22:06:51 -0600
+Subject: exec: Ensure mm->user_ns contains the execed files
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit f84df2a6f268de584a201e8911384a2d244876e3 upstream.
+
+When the user namespace support was merged the need to prevent
+ptrace from revealing the contents of an unreadable executable
+was overlooked.
+
+Correct this oversight by ensuring that the executed file
+or files are in mm->user_ns, by adjusting mm->user_ns.
+
+Use the new function privileged_wrt_inode_uidgid to see if
+the executable is a member of the user namespace, and as such
+if having CAP_SYS_PTRACE in the user namespace should allow
+tracing the executable.  If not update mm->user_ns to
+the parent user namespace until an appropriate parent is found.
+
+Reported-by: Jann Horn <jann@thejh.net>
+Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c                  |   19 +++++++++++++++++--
+ include/linux/capability.h |    1 +
+ kernel/capability.c        |   16 ++++++++++++++--
+ 3 files changed, 32 insertions(+), 4 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1275,8 +1275,22 @@ EXPORT_SYMBOL(flush_old_exec);
+ void would_dump(struct linux_binprm *bprm, struct file *file)
+ {
+-      if (inode_permission(file_inode(file), MAY_READ) < 0)
++      struct inode *inode = file_inode(file);
++      if (inode_permission(inode, MAY_READ) < 0) {
++              struct user_namespace *old, *user_ns;
+               bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
++
++              /* Ensure mm->user_ns contains the executable */
++              user_ns = old = bprm->mm->user_ns;
++              while ((user_ns != &init_user_ns) &&
++                     !privileged_wrt_inode_uidgid(user_ns, inode))
++                      user_ns = user_ns->parent;
++
++              if (old != user_ns) {
++                      bprm->mm->user_ns = get_user_ns(user_ns);
++                      put_user_ns(old);
++              }
++      }
+ }
+ EXPORT_SYMBOL(would_dump);
+@@ -1306,7 +1320,6 @@ void setup_new_exec(struct linux_binprm
+           !gid_eq(bprm->cred->gid, current_egid())) {
+               current->pdeath_signal = 0;
+       } else {
+-              would_dump(bprm, bprm->file);
+               if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
+                       set_dumpable(current->mm, suid_dumpable);
+       }
+@@ -1741,6 +1754,8 @@ static int do_execveat_common(int fd, st
+       if (retval < 0)
+               goto out;
++      would_dump(bprm, bprm->file);
++
+       retval = exec_binprm(bprm);
+       if (retval < 0)
+               goto out;
+--- a/include/linux/capability.h
++++ b/include/linux/capability.h
+@@ -240,6 +240,7 @@ static inline bool ns_capable_noaudit(st
+       return true;
+ }
+ #endif /* CONFIG_MULTIUSER */
++extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
+ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
+ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
+--- a/kernel/capability.c
++++ b/kernel/capability.c
+@@ -457,6 +457,19 @@ bool file_ns_capable(const struct file *
+ EXPORT_SYMBOL(file_ns_capable);
+ /**
++ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
++ * @ns: The user namespace in question
++ * @inode: The inode in question
++ *
++ * Return true if the inode uid and gid are within the namespace.
++ */
++bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode)
++{
++      return kuid_has_mapping(ns, inode->i_uid) &&
++              kgid_has_mapping(ns, inode->i_gid);
++}
++
++/**
+  * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+  * @inode: The inode in question
+  * @cap: The capability in question
+@@ -469,7 +482,6 @@ bool capable_wrt_inode_uidgid(const stru
+ {
+       struct user_namespace *ns = current_user_ns();
+-      return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
+-              kgid_has_mapping(ns, inode->i_gid);
++      return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+ }
+ EXPORT_SYMBOL(capable_wrt_inode_uidgid);
diff --git a/queue-4.9/ext4-add-sanity-checking-to-count_overhead.patch b/queue-4.9/ext4-add-sanity-checking-to-count_overhead.patch
new file mode 100644 (file)
index 0000000..4d50dbc
--- /dev/null
@@ -0,0 +1,42 @@
+From c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 18 Nov 2016 13:37:47 -0500
+Subject: ext4: add sanity checking to count_overhead()
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 upstream.
+
+The commit "ext4: sanity check the block and cluster size at mount
+time" should prevent any problems, but in case the superblock is
+modified while the file system is mounted, add an extra safety check
+to make sure we won't overrun the allocated buffer.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3193,10 +3193,15 @@ static int count_overhead(struct super_b
+                       ext4_set_bit(s++, buf);
+                       count++;
+               }
+-              for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
+-                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+-                      count++;
++              j = ext4_bg_num_gdb(sb, grp);
++              if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
++                      ext4_error(sb, "Invalid number of block group "
++                                 "descriptor blocks: %d", j);
++                      j = EXT4_BLOCKS_PER_GROUP(sb) - s;
+               }
++              count += j;
++              for (; j > 0; j--)
++                      ext4_set_bit(EXT4_B2C(sbi, s++), buf);
+       }
+       if (!count)
+               return 0;
diff --git a/queue-4.9/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch b/queue-4.9/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch
new file mode 100644 (file)
index 0000000..603c2a2
--- /dev/null
@@ -0,0 +1,93 @@
+From 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 Mon Sep 17 00:00:00 2001
+From: Sergey Karamov <skaramov@google.com>
+Date: Sat, 10 Dec 2016 17:54:58 -0500
+Subject: ext4: do not perform data journaling when data is encrypted
+
+From: Sergey Karamov <skaramov@google.com>
+
+commit 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 upstream.
+
+Currently data journalling is incompatible with encryption: enabling both
+at the same time has never been supported by design, and would result in
+unpredictable behavior. However, users are not precluded from turning on
+both features simultaneously. This change programmatically replaces data
+journaling for encrypted regular files with ordered data journaling mode.
+
+Background:
+Journaling encrypted data has not been supported because it operates on
+buffer heads of the page in the page cache. Namely, when the commit
+happens, which could be up to five seconds after caching, the commit
+thread uses the buffer heads attached to the page to copy the contents of
+the page to the journal. With encryption, it would have been required to
+keep the bounce buffer with ciphertext for up to the aforementioned five
+seconds, since the page cache can only hold plaintext and could not be
+used for journaling. Alternatively, it would be required to set up the
+journal to initiate a callback at the commit time to perform deferred
+encryption - in this case, not only would the data have to be written
+twice, but it would also have to be encrypted twice. This level of
+complexity was not justified for a mode that in practice is very rarely
+used because of the overhead from the data journalling.
+
+Solution:
+If data=journaled has been set as a mount option for a filesystem, or if
+journaling is enabled on a regular file, do not perform journaling if the
+file is also encrypted, instead fall back to the data=ordered mode for the
+file.
+
+Rationale:
+The intent is to allow seamless and proper filesystem operation when
+journaling and encryption have both been enabled, and have these two
+conflicting features gracefully resolved by the filesystem.
+
+Fixes: 4461471107b7
+Signed-off-by: Sergey Karamov <skaramov@google.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/ext4_jbd2.h |   14 ++++++++------
+ fs/ext4/super.c     |    5 +++++
+ 2 files changed, 13 insertions(+), 6 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mod
+               return EXT4_INODE_WRITEBACK_DATA_MODE;  /* writeback */
+       /* We do not support data journalling with delayed allocation */
+       if (!S_ISREG(inode->i_mode) ||
+-          test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+-              return EXT4_INODE_JOURNAL_DATA_MODE;    /* journal data */
+-      if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+-          !test_opt(inode->i_sb, DELALLOC))
++          test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
++          (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
++          !test_opt(inode->i_sb, DELALLOC))) {
++              /* We do not support data journalling for encrypted data */
++              if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
++                      return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
+               return EXT4_INODE_JOURNAL_DATA_MODE;    /* journal data */
++      }
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+               return EXT4_INODE_ORDERED_DATA_MODE;    /* ordered */
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+               return EXT4_INODE_WRITEBACK_DATA_MODE;  /* writeback */
+-      else
+-              BUG();
++      BUG();
+ }
+ static inline int ext4_should_journal_data(struct inode *inode)
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3516,6 +3516,11 @@ static int ext4_fill_super(struct super_
+                                "both data=journal and dax");
+                       goto failed_mount;
+               }
++              if (ext4_has_feature_encrypt(sb)) {
++                      ext4_msg(sb, KERN_WARNING,
++                               "encrypted files will use data=ordered "
++                               "instead of data journaling mode");
++              }
+               if (test_opt(sb, DELALLOC))
+                       clear_opt(sb, DELALLOC);
+       } else {
diff --git a/queue-4.9/ext4-don-t-lock-buffer-in-ext4_commit_super-if-holding-spinlock.patch b/queue-4.9/ext4-don-t-lock-buffer-in-ext4_commit_super-if-holding-spinlock.patch
new file mode 100644 (file)
index 0000000..861956d
--- /dev/null
@@ -0,0 +1,69 @@
+From 1566a48aaa10c6bb29b9a69dd8279f9a4fc41e35 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 13 Nov 2016 22:02:29 -0500
+Subject: ext4: don't lock buffer in ext4_commit_super if holding spinlock
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 1566a48aaa10c6bb29b9a69dd8279f9a4fc41e35 upstream.
+
+If there is an error reported in mballoc via ext4_grp_locked_error(),
+the code is holding a spinlock, so ext4_commit_super() must not try to
+lock the buffer head, or else it will trigger a BUG:
+
+  BUG: sleeping function called from invalid context at ./include/linux/buffer_head.h:358
+  in_atomic(): 1, irqs_disabled(): 0, pid: 993, name: mount
+  CPU: 0 PID: 993 Comm: mount Not tainted 4.9.0-rc1-clouder1 #62
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
+   ffff880006423548 ffffffff81318c89 ffffffff819ecdd0 0000000000000166
+   ffff880006423558 ffffffff810810b0 ffff880006423580 ffffffff81081153
+   ffff880006e5a1a0 ffff88000690e400 0000000000000000 ffff8800064235c0
+  Call Trace:
+    [<ffffffff81318c89>] dump_stack+0x67/0x9e
+    [<ffffffff810810b0>] ___might_sleep+0xf0/0x140
+    [<ffffffff81081153>] __might_sleep+0x53/0xb0
+    [<ffffffff8126c1dc>] ext4_commit_super+0x19c/0x290
+    [<ffffffff8126e61a>] __ext4_grp_locked_error+0x14a/0x230
+    [<ffffffff81081153>] ? __might_sleep+0x53/0xb0
+    [<ffffffff812822be>] ext4_mb_generate_buddy+0x1de/0x320
+
+Since ext4_grp_locked_error() calls ext4_commit_super with sync == 0
+(and it is the only caller which does so), avoid locking and unlocking
+the buffer in this case.
+
+This can result in races with ext4_commit_super() if there are other
+problems (which is what commit 4743f83990614 was trying to address),
+but a Warning is better than BUG.
+
+Fixes: 4743f83990614
+Reported-by: Nikolay Borisov <kernel@kyup.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4550,7 +4550,8 @@ static int ext4_commit_super(struct supe
+                               &EXT4_SB(sb)->s_freeinodes_counter));
+       BUFFER_TRACE(sbh, "marking dirty");
+       ext4_superblock_csum_set(sb);
+-      lock_buffer(sbh);
++      if (sync)
++              lock_buffer(sbh);
+       if (buffer_write_io_error(sbh)) {
+               /*
+                * Oh, dear.  A previous attempt to write the
+@@ -4566,8 +4567,8 @@ static int ext4_commit_super(struct supe
+               set_buffer_uptodate(sbh);
+       }
+       mark_buffer_dirty(sbh);
+-      unlock_buffer(sbh);
+       if (sync) {
++              unlock_buffer(sbh);
+               error = __sync_dirty_buffer(sbh,
+                       test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
+               if (error)
diff --git a/queue-4.9/ext4-fix-in-superblock-mount-options-processing.patch b/queue-4.9/ext4-fix-in-superblock-mount-options-processing.patch
new file mode 100644 (file)
index 0000000..1d09fc8
--- /dev/null
@@ -0,0 +1,105 @@
+From 5aee0f8a3f42c94c5012f1673420aee96315925a Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 18 Nov 2016 13:24:26 -0500
+Subject: ext4: fix in-superblock mount options processing
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 5aee0f8a3f42c94c5012f1673420aee96315925a upstream.
+
+Fix a large number of problems with how we handle mount options in the
+superblock.  For one, if the string in the superblock is long enough
+that it is not null terminated, we could run off the end of the string
+and try to interpret superblock fields as characters.  It's unlikely
+this will cause a security problem, but it could result in an invalid
+parse.  Also, parse_options is destructive to the string, so in some
+cases if there is a comma-separated string, it would be modified in
+the superblock.  (Fortunately it only happens on file systems with a
+1k block size.)
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c |   38 +++++++++++++++++++++++---------------
+ 1 file changed, 23 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3301,7 +3301,7 @@ static int ext4_fill_super(struct super_
+       char *orig_data = kstrdup(data, GFP_KERNEL);
+       struct buffer_head *bh;
+       struct ext4_super_block *es = NULL;
+-      struct ext4_sb_info *sbi;
++      struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+       ext4_fsblk_t block;
+       ext4_fsblk_t sb_block = get_sb_block(&data);
+       ext4_fsblk_t logical_sb_block;
+@@ -3320,16 +3320,14 @@ static int ext4_fill_super(struct super_
+       unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+       ext4_group_t first_not_zeroed;
+-      sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+-      if (!sbi)
+-              goto out_free_orig;
++      if ((data && !orig_data) || !sbi)
++              goto out_free_base;
+       sbi->s_blockgroup_lock =
+               kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+-      if (!sbi->s_blockgroup_lock) {
+-              kfree(sbi);
+-              goto out_free_orig;
+-      }
++      if (!sbi->s_blockgroup_lock)
++              goto out_free_base;
++
+       sb->s_fs_info = sbi;
+       sbi->s_sb = sb;
+       sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+@@ -3475,11 +3473,19 @@ static int ext4_fill_super(struct super_
+        */
+       sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+-      if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+-                         &journal_devnum, &journal_ioprio, 0)) {
+-              ext4_msg(sb, KERN_WARNING,
+-                       "failed to parse options in superblock: %s",
+-                       sbi->s_es->s_mount_opts);
++      if (sbi->s_es->s_mount_opts[0]) {
++              char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
++                                            sizeof(sbi->s_es->s_mount_opts),
++                                            GFP_KERNEL);
++              if (!s_mount_opts)
++                      goto failed_mount;
++              if (!parse_options(s_mount_opts, sb, &journal_devnum,
++                                 &journal_ioprio, 0)) {
++                      ext4_msg(sb, KERN_WARNING,
++                               "failed to parse options in superblock: %s",
++                               s_mount_opts);
++              }
++              kfree(s_mount_opts);
+       }
+       sbi->s_def_mount_opt = sbi->s_mount_opt;
+       if (!parse_options((char *) data, sb, &journal_devnum,
+@@ -4157,7 +4163,9 @@ no_journal:
+       if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+               ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+-                       "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
++                       "Opts: %.*s%s%s", descr,
++                       (int) sizeof(sbi->s_es->s_mount_opts),
++                       sbi->s_es->s_mount_opts,
+                        *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+       if (es->s_error_count)
+@@ -4236,8 +4244,8 @@ failed_mount:
+ out_fail:
+       sb->s_fs_info = NULL;
+       kfree(sbi->s_blockgroup_lock);
++out_free_base:
+       kfree(sbi);
+-out_free_orig:
+       kfree(orig_data);
+       return err ? err : ret;
+ }
diff --git a/queue-4.9/ext4-fix-mballoc-breakage-with-64k-block-size.patch b/queue-4.9/ext4-fix-mballoc-breakage-with-64k-block-size.patch
new file mode 100644 (file)
index 0000000..74c0e3e
--- /dev/null
@@ -0,0 +1,35 @@
+From 69e43e8cc971a79dd1ee5d4343d8e63f82725123 Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Mon, 14 Nov 2016 21:04:37 -0500
+Subject: ext4: fix mballoc breakage with 64k block size
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit 69e43e8cc971a79dd1ee5d4343d8e63f82725123 upstream.
+
+'border' variable is set to a value of 2 times the block size of the
+underlying filesystem. With 64k block size, the resulting value won't
+fit into a 16-bit variable. Hence this commit changes the data type of
+'border' to 'unsigned int'.
+
+Fixes: c9de560ded61f
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/mballoc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(str
+       ext4_grpblk_t min;
+       ext4_grpblk_t max;
+       ext4_grpblk_t chunk;
+-      unsigned short border;
++      unsigned int border;
+       BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
diff --git a/queue-4.9/ext4-fix-stack-memory-corruption-with-64k-block-size.patch b/queue-4.9/ext4-fix-stack-memory-corruption-with-64k-block-size.patch
new file mode 100644 (file)
index 0000000..1c30ffe
--- /dev/null
@@ -0,0 +1,36 @@
+From 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Mon, 14 Nov 2016 21:26:26 -0500
+Subject: ext4: fix stack memory corruption with 64k block size
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 upstream.
+
+The number of 'counters' elements needed in 'struct sg' is
+super_block->s_blocksize_bits + 2. Presently we have 16 'counters'
+elements in the array. This is insufficient for block sizes >= 32k. In
+such cases the memcpy operation performed in ext4_mb_seq_groups_show()
+would cause stack memory corruption.
+
+Fixes: c9de560ded61f
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/mballoc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struc
+       struct ext4_group_info *grinfo;
+       struct sg {
+               struct ext4_group_info info;
+-              ext4_grpblk_t counters[16];
++              ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
+       } sg;
+       group--;
diff --git a/queue-4.9/ext4-reject-inodes-with-negative-size.patch b/queue-4.9/ext4-reject-inodes-with-negative-size.patch
new file mode 100644 (file)
index 0000000..c8bad8e
--- /dev/null
@@ -0,0 +1,45 @@
+From 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Sat, 10 Dec 2016 09:55:01 -0500
+Subject: ext4: reject inodes with negative size
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 upstream.
+
+Don't load an inode with a negative size; this causes integer overflow
+problems in the VFS.
+
+[ Added EXT4_ERROR_INODE() to mark file system as corrupted. -TYT]
+
+Fixes: a48380f769df (ext4: rename i_dir_acl to i_size_high)
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/inode.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4434,6 +4434,7 @@ struct inode *ext4_iget(struct super_blo
+       struct inode *inode;
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+       long ret;
++      loff_t size;
+       int block;
+       uid_t i_uid;
+       gid_t i_gid;
+@@ -4534,6 +4535,11 @@ struct inode *ext4_iget(struct super_blo
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+       inode->i_size = ext4_isize(raw_inode);
++      if ((size = i_size_read(inode)) < 0) {
++              EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
++              ret = -EFSCORRUPTED;
++              goto bad_inode;
++      }
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
diff --git a/queue-4.9/ext4-return-enomem-instead-of-success.patch b/queue-4.9/ext4-return-enomem-instead-of-success.patch
new file mode 100644 (file)
index 0000000..6d8f344
--- /dev/null
@@ -0,0 +1,34 @@
+From 578620f451f836389424833f1454eeeb2ffc9e9f Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Sat, 10 Dec 2016 09:56:01 -0500
+Subject: ext4: return -ENOMEM instead of success
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 578620f451f836389424833f1454eeeb2ffc9e9f upstream.
+
+We should set the error code if kzalloc() fails.
+
+Fixes: 67cf5b09a46f ("ext4: add the basic function for inline data support")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/inline.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -336,8 +336,10 @@ static int ext4_update_inline_data(handl
+       len -= EXT4_MIN_INLINE_DATA_SIZE;
+       value = kzalloc(len, GFP_NOFS);
+-      if (!value)
++      if (!value) {
++              error = -ENOMEM;
+               goto out;
++      }
+       error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+                                    value, len);
diff --git a/queue-4.9/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch b/queue-4.9/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch
new file mode 100644 (file)
index 0000000..dbde9a4
--- /dev/null
@@ -0,0 +1,55 @@
+From cd6bb35bf7f6d7d922509bf50265383a0ceabe96 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 18 Nov 2016 13:28:30 -0500
+Subject: ext4: use more strict checks for inodes_per_block on mount
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit cd6bb35bf7f6d7d922509bf50265383a0ceabe96 upstream.
+
+Centralize the checks for inodes_per_block and be more strict to make
+sure the inodes_per_block_group can't end up being zero.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c |   15 ++++++---------
+ 1 file changed, 6 insertions(+), 9 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3660,12 +3660,16 @@ static int ext4_fill_super(struct super_
+       sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+       sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+-      if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
+-              goto cantfind_ext4;
+       sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
+       if (sbi->s_inodes_per_block == 0)
+               goto cantfind_ext4;
++      if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
++          sbi->s_inodes_per_group > blocksize * 8) {
++              ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
++                       sbi->s_blocks_per_group);
++              goto failed_mount;
++      }
+       sbi->s_itb_per_group = sbi->s_inodes_per_group /
+                                       sbi->s_inodes_per_block;
+       sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
+@@ -3748,13 +3752,6 @@ static int ext4_fill_super(struct super_
+       }
+       sbi->s_cluster_ratio = clustersize / blocksize;
+-      if (sbi->s_inodes_per_group > blocksize * 8) {
+-              ext4_msg(sb, KERN_ERR,
+-                     "#inodes per group too big: %lu",
+-                     sbi->s_inodes_per_group);
+-              goto failed_mount;
+-      }
+-
+       /* Do we have standard group size of clustersize * 8 blocks ? */
+       if (sbi->s_blocks_per_group == clustersize << 3)
+               set_opt2(sb, STD_GROUP_SIZE);
diff --git a/queue-4.9/f2fs-fix-overflow-due-to-condition-check-order.patch b/queue-4.9/f2fs-fix-overflow-due-to-condition-check-order.patch
new file mode 100644 (file)
index 0000000..9f6a446
--- /dev/null
@@ -0,0 +1,32 @@
+From e87f7329bbd6760c2acc4f1eb423362b08851a71 Mon Sep 17 00:00:00 2001
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+Date: Wed, 23 Nov 2016 10:51:17 -0800
+Subject: f2fs: fix overflow due to condition check order
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+commit e87f7329bbd6760c2acc4f1eb423362b08851a71 upstream.
+
+In the last ilen case, i was already increased, resulting in access to an
+out-of-bounds entry of do_replace and blkaddr.
+Fix this by checking ilen first to exit the loop.
+
+Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up")
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/f2fs/file.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -967,7 +967,7 @@ static int __clone_blkaddrs(struct inode
+                               new_size = (dst + i) << PAGE_SHIFT;
+                               if (dst_inode->i_size < new_size)
+                                       f2fs_i_size_write(dst_inode, new_size);
+-                      } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
++                      } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
+                       f2fs_put_dnode(&dn);
+               } else {
diff --git a/queue-4.9/f2fs-fix-to-determine-start_cp_addr-by-sbi-cur_cp_pack.patch b/queue-4.9/f2fs-fix-to-determine-start_cp_addr-by-sbi-cur_cp_pack.patch
new file mode 100644 (file)
index 0000000..d0dcdea
--- /dev/null
@@ -0,0 +1,99 @@
+From 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc Mon Sep 17 00:00:00 2001
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+Date: Thu, 24 Nov 2016 12:45:15 -0800
+Subject: f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+commit 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc upstream.
+
+We don't guarantee cp_addr is fixed by cp_version.
+This is to sync with f2fs-tools.
+
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/f2fs/checkpoint.c |    8 +++++++-
+ fs/f2fs/f2fs.h       |   26 ++++++++++++++++----------
+ 2 files changed, 23 insertions(+), 11 deletions(-)
+
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -772,6 +772,11 @@ int get_valid_checkpoint(struct f2fs_sb_
+       if (sanity_check_ckpt(sbi))
+               goto fail_no_cp;
++      if (cur_page == cp1)
++              sbi->cur_cp_pack = 1;
++      else
++              sbi->cur_cp_pack = 2;
++
+       if (cp_blks <= 1)
+               goto done;
+@@ -1123,7 +1128,7 @@ static int do_checkpoint(struct f2fs_sb_
+                               le32_to_cpu(ckpt->checksum_offset)))
+                               = cpu_to_le32(crc32);
+-      start_blk = __start_cp_addr(sbi);
++      start_blk = __start_cp_next_addr(sbi);
+       /* need to wait for end_io results */
+       wait_on_all_pages_writeback(sbi);
+@@ -1187,6 +1192,7 @@ static int do_checkpoint(struct f2fs_sb_
+       clear_prefree_segments(sbi, cpc);
+       clear_sbi_flag(sbi, SBI_IS_DIRTY);
+       clear_sbi_flag(sbi, SBI_NEED_CP);
++      __set_cp_next_pack(sbi);
+       /*
+        * redirty superblock if metadata like node page or inode cache is
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -764,6 +764,7 @@ struct f2fs_sb_info {
+       /* for checkpoint */
+       struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
++      int cur_cp_pack;                        /* remain current cp pack */
+       spinlock_t cp_lock;                     /* for flag in ckpt */
+       struct inode *meta_inode;               /* cache meta blocks */
+       struct mutex cp_mutex;                  /* checkpoint procedure lock */
+@@ -1329,22 +1330,27 @@ static inline void *__bitmap_ptr(struct
+ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+ {
+-      block_t start_addr;
+-      struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+-      unsigned long long ckpt_version = cur_cp_version(ckpt);
+-
+-      start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
++      block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+-      /*
+-       * odd numbered checkpoint should at cp segment 0
+-       * and even segment must be at cp segment 1
+-       */
+-      if (!(ckpt_version & 1))
++      if (sbi->cur_cp_pack == 2)
+               start_addr += sbi->blocks_per_seg;
++      return start_addr;
++}
++static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
++{
++      block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
++
++      if (sbi->cur_cp_pack == 1)
++              start_addr += sbi->blocks_per_seg;
+       return start_addr;
+ }
++static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi)
++{
++      sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1;
++}
++
+ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+ {
+       return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
diff --git a/queue-4.9/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch b/queue-4.9/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch
new file mode 100644 (file)
index 0000000..f18a10f
--- /dev/null
@@ -0,0 +1,63 @@
+From 05e6ea2685c964db1e675a24a4f4e2adc22d2388 Mon Sep 17 00:00:00 2001
+From: Nicolai Stange <nicstange@gmail.com>
+Date: Sun, 20 Nov 2016 19:57:23 +0100
+Subject: f2fs: set ->owner for debugfs status file's file_operations
+
+From: Nicolai Stange <nicstange@gmail.com>
+
+commit 05e6ea2685c964db1e675a24a4f4e2adc22d2388 upstream.
+
+The struct file_operations instance serving the f2fs/status debugfs file
+lacks an initialization of its ->owner.
+
+This means that although that file might have been opened, the f2fs module
+can still get removed. Any further operation on that opened file, releasing
+included,  will cause accesses to unmapped memory.
+
+Indeed, Mike Marshall reported the following:
+
+  BUG: unable to handle kernel paging request at ffffffffa0307430
+  IP: [<ffffffff8132a224>] full_proxy_release+0x24/0x90
+  <...>
+  Call Trace:
+   [] __fput+0xdf/0x1d0
+   [] ____fput+0xe/0x10
+   [] task_work_run+0x8e/0xc0
+   [] do_exit+0x2ae/0xae0
+   [] ? __audit_syscall_entry+0xae/0x100
+   [] ? syscall_trace_enter+0x1ca/0x310
+   [] do_group_exit+0x44/0xc0
+   [] SyS_exit_group+0x14/0x20
+   [] do_syscall_64+0x61/0x150
+   [] entry_SYSCALL64_slow_path+0x25/0x25
+  <...>
+  ---[ end trace f22ae883fa3ea6b8 ]---
+  Fixing recursive fault but reboot is needed!
+
+Fix this by initializing the f2fs/status file_operations' ->owner with
+THIS_MODULE.
+
+This will allow debugfs to grab a reference to the f2fs module upon any
+open on that file, thus preventing it from getting removed.
+
+Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs")
+Reported-by: Mike Marshall <hubcap@omnibond.com>
+Reported-by: Martin Brandenburg <martin@omnibond.com>
+Signed-off-by: Nicolai Stange <nicstange@gmail.com>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/f2fs/debug.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/f2fs/debug.c
++++ b/fs/f2fs/debug.c
+@@ -373,6 +373,7 @@ static int stat_open(struct inode *inode
+ }
+ static const struct file_operations stat_fops = {
++      .owner = THIS_MODULE,
+       .open = stat_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
diff --git a/queue-4.9/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch b/queue-4.9/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch
new file mode 100644 (file)
index 0000000..00c7af4
--- /dev/null
@@ -0,0 +1,79 @@
+From 613cc2b6f272c1a8ad33aefa21cad77af23139f7 Mon Sep 17 00:00:00 2001
+From: Aleksa Sarai <asarai@suse.de>
+Date: Wed, 21 Dec 2016 16:26:24 +1100
+Subject: fs: exec: apply CLOEXEC before changing dumpable task flags
+
+From: Aleksa Sarai <asarai@suse.de>
+
+commit 613cc2b6f272c1a8ad33aefa21cad77af23139f7 upstream.
+
+If you have a process that has set itself to be non-dumpable, and it
+then undergoes exec(2), any CLOEXEC file descriptors it has open are
+"exposed" during a race window between the dumpable flags of the process
+being reset for exec(2) and CLOEXEC being applied to the file
+descriptors. This can be exploited by a process by attempting to access
+/proc/<pid>/fd/... during this window, without requiring CAP_SYS_PTRACE.
+
+The race in question is after set_dumpable has been called (for get_link,
+though the trace is basically the same for readlink):
+
+[vfs]
+-> proc_pid_link_inode_operations.get_link
+   -> proc_pid_get_link
+      -> proc_fd_access_allowed
+         -> ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+
+Which will return 0, during the race window and CLOEXEC file descriptors
+will still be open during this window because do_close_on_exec has not
+been called yet. As a result, the ordering of these calls should be
+reversed to avoid this race window.
+
+This is of particular concern to container runtimes, where joining a
+PID namespace with file descriptors referring to the host filesystem
+can result in security issues (since PRCTL_SET_DUMPABLE doesn't protect
+against access of CLOEXEC file descriptors -- file descriptors which may
+reference filesystem objects the container shouldn't have access to).
+
+Cc: dev@opencontainers.org
+Reported-by: Michael Crosby <crosbymichael@gmail.com>
+Signed-off-by: Aleksa Sarai <asarai@suse.de>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -19,7 +19,7 @@
+  * current->executable is only used by the procfs.  This allows a dispatch
+  * table to check for several different types  of binary formats.  We keep
+  * trying until we recognize the file or we run out of supported binary
+- * formats. 
++ * formats.
+  */
+ #include <linux/slab.h>
+@@ -1266,6 +1266,13 @@ int flush_old_exec(struct linux_binprm *
+       flush_thread();
+       current->personality &= ~bprm->per_clear;
++      /*
++       * We have to apply CLOEXEC before we change whether the process is
++       * dumpable (in setup_new_exec) to avoid a race with a process in userspace
++       * trying to access the should-be-closed file descriptors of a process
++       * undergoing exec(2).
++       */
++      do_close_on_exec(current->files);
+       return 0;
+ out:
+@@ -1328,7 +1335,6 @@ void setup_new_exec(struct linux_binprm
+          group */
+       current->self_exec_id++;
+       flush_signal_handlers(current, 0);
+-      do_close_on_exec(current->files);
+ }
+ EXPORT_SYMBOL(setup_new_exec);
diff --git a/queue-4.9/loop-return-proper-error-from-loop_queue_rq.patch b/queue-4.9/loop-return-proper-error-from-loop_queue_rq.patch
new file mode 100644 (file)
index 0000000..7274d5b
--- /dev/null
@@ -0,0 +1,32 @@
+From b4a567e8114327518c09f5632339a5954ab975a3 Mon Sep 17 00:00:00 2001
+From: Omar Sandoval <osandov@fb.com>
+Date: Mon, 14 Nov 2016 14:56:17 -0800
+Subject: loop: return proper error from loop_queue_rq()
+
+From: Omar Sandoval <osandov@fb.com>
+
+commit b4a567e8114327518c09f5632339a5954ab975a3 upstream.
+
+->queue_rq() should return one of the BLK_MQ_RQ_QUEUE_* constants, not
+an errno.
+
+Fixes: f4aa4c7bbac6 ("block: loop: convert to per-device workqueue")
+Signed-off-by: Omar Sandoval <osandov@fb.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/loop.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/block/loop.c
++++ b/drivers/block/loop.c
+@@ -1646,7 +1646,7 @@ static int loop_queue_rq(struct blk_mq_h
+       blk_mq_start_request(bd->rq);
+       if (lo->lo_state != Lo_bound)
+-              return -EIO;
++              return BLK_MQ_RQ_QUEUE_ERROR;
+       switch (req_op(cmd->rq)) {
+       case REQ_OP_FLUSH:
diff --git a/queue-4.9/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch b/queue-4.9/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch
new file mode 100644 (file)
index 0000000..352bd80
--- /dev/null
@@ -0,0 +1,181 @@
+From bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Thu, 13 Oct 2016 21:23:16 -0500
+Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 upstream.
+
+During exec dumpable is cleared if the file that is being executed is
+not readable by the user executing the file.  A bug in
+ptrace_may_access allows reading the file if the executable happens to
+enter into a subordinate user namespace (aka clone(CLONE_NEWUSER),
+unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER)).
+
+This problem is fixed with only necessary userspace breakage by adding
+a user namespace owner to mm_struct, captured at the time of exec, so
+it is clear in which user namespace CAP_SYS_PTRACE must be present in
+to be able to safely give read permission to the executable.
+
+The function ptrace_may_access is modified to verify that the ptracer
+has CAP_SYS_PTRACE in task->mm->user_ns instead of task->cred->user_ns.
+This ensures that if the task changes its cred into a subordinate
+user namespace it does not become ptraceable.
+
+The function ptrace_attach is modified to only set PT_PTRACE_CAP when
+CAP_SYS_PTRACE is held over task->mm->user_ns.  The intent of
+PT_PTRACE_CAP is to be a flag to note that whatever permission changes
+the task might go through the tracer has sufficient permissions for
+it not to be an issue.  task->cred->user_ns is always the same
+as or descendent of mm->user_ns.  Which guarantees that having
+CAP_SYS_PTRACE over mm->user_ns is the worst case for the tasks
+credentials.
+
+To prevent regressions, mm->dumpable and mm->user_ns are not considered
+when a task has no mm, as simply failing ptrace_may_access causes
+regressions in privileged applications attempting to read things
+such as /proc/<pid>/stat.
+
+Acked-by: Kees Cook <keescook@chromium.org>
+Tested-by: Cyrill Gorcunov <gorcunov@openvz.org>
+Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces")
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm_types.h |    1 +
+ kernel/fork.c            |    9 ++++++---
+ kernel/ptrace.c          |   26 +++++++++++---------------
+ mm/init-mm.c             |    2 ++
+ 4 files changed, 20 insertions(+), 18 deletions(-)
+
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -473,6 +473,7 @@ struct mm_struct {
+        */
+       struct task_struct __rcu *owner;
+ #endif
++      struct user_namespace *user_ns;
+       /* store ref to file /proc/<pid>/exe symlink points to */
+       struct file __rcu *exe_file;
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -745,7 +745,8 @@ static void mm_init_owner(struct mm_stru
+ #endif
+ }
+-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
++static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
++      struct user_namespace *user_ns)
+ {
+       mm->mmap = NULL;
+       mm->mm_rb = RB_ROOT;
+@@ -785,6 +786,7 @@ static struct mm_struct *mm_init(struct
+       if (init_new_context(p, mm))
+               goto fail_nocontext;
++      mm->user_ns = get_user_ns(user_ns);
+       return mm;
+ fail_nocontext:
+@@ -830,7 +832,7 @@ struct mm_struct *mm_alloc(void)
+               return NULL;
+       memset(mm, 0, sizeof(*mm));
+-      return mm_init(mm, current);
++      return mm_init(mm, current, current_user_ns());
+ }
+ /*
+@@ -845,6 +847,7 @@ void __mmdrop(struct mm_struct *mm)
+       destroy_context(mm);
+       mmu_notifier_mm_destroy(mm);
+       check_mm(mm);
++      put_user_ns(mm->user_ns);
+       free_mm(mm);
+ }
+ EXPORT_SYMBOL_GPL(__mmdrop);
+@@ -1126,7 +1129,7 @@ static struct mm_struct *dup_mm(struct t
+       memcpy(mm, oldmm, sizeof(*mm));
+-      if (!mm_init(mm, tsk))
++      if (!mm_init(mm, tsk, mm->user_ns))
+               goto fail_nomem;
+       err = dup_mmap(mm, oldmm);
+--- a/kernel/ptrace.c
++++ b/kernel/ptrace.c
+@@ -220,7 +220,7 @@ static int ptrace_has_cap(struct user_na
+ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
+ {
+       const struct cred *cred = current_cred(), *tcred;
+-      int dumpable = 0;
++      struct mm_struct *mm;
+       kuid_t caller_uid;
+       kgid_t caller_gid;
+@@ -271,16 +271,11 @@ static int __ptrace_may_access(struct ta
+       return -EPERM;
+ ok:
+       rcu_read_unlock();
+-      smp_rmb();
+-      if (task->mm)
+-              dumpable = get_dumpable(task->mm);
+-      rcu_read_lock();
+-      if (dumpable != SUID_DUMP_USER &&
+-          !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+-              rcu_read_unlock();
+-              return -EPERM;
+-      }
+-      rcu_read_unlock();
++      mm = task->mm;
++      if (mm &&
++          ((get_dumpable(mm) != SUID_DUMP_USER) &&
++           !ptrace_has_cap(mm->user_ns, mode)))
++          return -EPERM;
+       return security_ptrace_access_check(task, mode);
+ }
+@@ -331,6 +326,11 @@ static int ptrace_attach(struct task_str
+       task_lock(task);
+       retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
++      if (!retval) {
++              struct mm_struct *mm = task->mm;
++              if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE))
++                      flags |= PT_PTRACE_CAP;
++      }
+       task_unlock(task);
+       if (retval)
+               goto unlock_creds;
+@@ -344,10 +344,6 @@ static int ptrace_attach(struct task_str
+       if (seize)
+               flags |= PT_SEIZED;
+-      rcu_read_lock();
+-      if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
+-              flags |= PT_PTRACE_CAP;
+-      rcu_read_unlock();
+       task->ptrace = flags;
+       __ptrace_link(task, current);
+--- a/mm/init-mm.c
++++ b/mm/init-mm.c
+@@ -6,6 +6,7 @@
+ #include <linux/cpumask.h>
+ #include <linux/atomic.h>
++#include <linux/user_namespace.h>
+ #include <asm/pgtable.h>
+ #include <asm/mmu.h>
+@@ -21,5 +22,6 @@ struct mm_struct init_mm = {
+       .mmap_sem       = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+       .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+       .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
++      .user_ns        = &init_user_ns,
+       INIT_MM_CONTEXT(init_mm)
+ };
diff --git a/queue-4.9/mm-page_alloc-keep-pcp-count-and-list-contents-in-sync-if-struct-page-is-corrupted.patch b/queue-4.9/mm-page_alloc-keep-pcp-count-and-list-contents-in-sync-if-struct-page-is-corrupted.patch
new file mode 100644 (file)
index 0000000..6fa4466
--- /dev/null
@@ -0,0 +1,82 @@
+From a6de734bc002fe2027ccc074fbbd87d72957b7a4 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@techsingularity.net>
+Date: Mon, 12 Dec 2016 16:44:41 -0800
+Subject: mm, page_alloc: keep pcp count and list contents in sync if struct page is corrupted
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+commit a6de734bc002fe2027ccc074fbbd87d72957b7a4 upstream.
+
+Vlastimil Babka pointed out that commit 479f854a207c ("mm, page_alloc:
+defer debugging checks of pages allocated from the PCP") will allow the
+per-cpu list counter to be out of sync with the per-cpu list contents if
+a struct page is corrupted.
+
+The consequence is an infinite loop if the per-cpu lists get fully
+drained by free_pcppages_bulk because all the lists are empty but the
+count is positive.  The infinite loop occurs here
+
+                do {
+                        batch_free++;
+                        if (++migratetype == MIGRATE_PCPTYPES)
+                                migratetype = 0;
+                        list = &pcp->lists[migratetype];
+                } while (list_empty(list));
+
+What the user sees is a bad page warning followed by a soft lockup with
+interrupts disabled in free_pcppages_bulk().
+
+This patch keeps the accounting in sync.
+
+Fixes: 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP")
+Link: http://lkml.kernel.org/r/20161202112951.23346-2-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Jesper Dangaard Brouer <brouer@redhat.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2192,7 +2192,7 @@ static int rmqueue_bulk(struct zone *zon
+                       unsigned long count, struct list_head *list,
+                       int migratetype, bool cold)
+ {
+-      int i;
++      int i, alloced = 0;
+       spin_lock(&zone->lock);
+       for (i = 0; i < count; ++i) {
+@@ -2217,13 +2217,21 @@ static int rmqueue_bulk(struct zone *zon
+               else
+                       list_add_tail(&page->lru, list);
+               list = &page->lru;
++              alloced++;
+               if (is_migrate_cma(get_pcppage_migratetype(page)))
+                       __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+                                             -(1 << order));
+       }
++
++      /*
++       * i pages were removed from the buddy list even if some leak due
++       * to check_pcp_refill failing so adjust NR_FREE_PAGES based
++       * on i. Do not confuse with 'alloced' which is the number of
++       * pages added to the pcp list.
++       */
+       __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
+       spin_unlock(&zone->lock);
+-      return i;
++      return alloced;
+ }
+ #ifdef CONFIG_NUMA
diff --git a/queue-4.9/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch b/queue-4.9/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch
new file mode 100644 (file)
index 0000000..021fcec
--- /dev/null
@@ -0,0 +1,85 @@
+From 5f33a0803bbd781de916f5c7448cbbbbc763d911 Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@fb.com>
+Date: Mon, 12 Dec 2016 16:41:50 -0800
+Subject: mm/vmscan.c: set correct defer count for shrinker
+
+From: Shaohua Li <shli@fb.com>
+
+commit 5f33a0803bbd781de916f5c7448cbbbbc763d911 upstream.
+
+Our system uses significantly more slab memory with memcg enabled with
+the latest kernel.  With 3.10 kernel, slab uses 2G memory, while with
+4.6 kernel, 6G memory is used.  The shrinker has a problem.  Let's say we
+have two memcgs for one shrinker.  In do_shrink_slab:
+
+1. Check cg1.  nr_deferred = 0, assume total_scan = 700.  batch size
+   is 1024, then no memory is freed.  nr_deferred = 700
+
+2. Check cg2.  nr_deferred = 700.  Assume freeable = 20, then
+   total_scan = 10 or 40.  Let's assume it's 10.  No memory is freed.
+   nr_deferred = 10.
+
+The deferred share of cg1 is lost in this case.  kswapd will free no
+memory even if the above steps are run again and again.
+
+The fix makes sure one memcg's deferred share isn't lost.
+
+Link: http://lkml.kernel.org/r/2414be961b5d25892060315fbb56bb19d81d0c07.1476227351.git.shli@fb.com
+Signed-off-by: Shaohua Li <shli@fb.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov@parallels.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(stru
+       int nid = shrinkctl->nid;
+       long batch_size = shrinker->batch ? shrinker->batch
+                                         : SHRINK_BATCH;
++      long scanned = 0, next_deferred;
+       freeable = shrinker->count_objects(shrinker, shrinkctl);
+       if (freeable == 0)
+@@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(stru
+               pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+                      shrinker->scan_objects, total_scan);
+               total_scan = freeable;
+-      }
++              next_deferred = nr;
++      } else
++              next_deferred = total_scan;
+       /*
+        * We need to avoid excessive windup on filesystem shrinkers
+@@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(stru
+               count_vm_events(SLABS_SCANNED, nr_to_scan);
+               total_scan -= nr_to_scan;
++              scanned += nr_to_scan;
+               cond_resched();
+       }
++      if (next_deferred >= scanned)
++              next_deferred -= scanned;
++      else
++              next_deferred = 0;
+       /*
+        * move the unused scan count back into the shrinker in a
+        * manner that handles concurrent updates. If we exhausted the
+        * scan, there is no need to do an update.
+        */
+-      if (total_scan > 0)
+-              new_nr = atomic_long_add_return(total_scan,
++      if (next_deferred > 0)
++              new_nr = atomic_long_add_return(next_deferred,
+                                               &shrinker->nr_deferred[nid]);
+       else
+               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
diff --git a/queue-4.9/nvmet-fix-possible-infinite-loop-triggered-on-hot-namespace-removal.patch b/queue-4.9/nvmet-fix-possible-infinite-loop-triggered-on-hot-namespace-removal.patch
new file mode 100644 (file)
index 0000000..2fd5c3a
--- /dev/null
@@ -0,0 +1,129 @@
+From e4fcf07cca6a3b6c4be00df16f08be894325eaa3 Mon Sep 17 00:00:00 2001
+From: Solganik Alexander <sashas@lightbitslabs.com>
+Date: Sun, 30 Oct 2016 10:35:15 +0200
+Subject: nvmet: Fix possible infinite loop triggered on hot namespace removal
+
+From: Solganik Alexander <sashas@lightbitslabs.com>
+
+commit e4fcf07cca6a3b6c4be00df16f08be894325eaa3 upstream.
+
+When removing a namespace we delete it from the subsystem namespaces
+list with list_del_init which allows us to know if it is enabled or
+not.
+
+The problem is that list_del_init initializes the list next pointer and
+does not respect the RCU list-traversal we do on the IO path for locating
+a namespace. Instead we need to use list_del_rcu which is allowed to
+run concurrently with the _rcu list-traversal primitives (keeps list
+next intact) and guarantees concurrent nvmet_find_namespace forward
+progress.
+
+By changing that, we cannot rely on ns->dev_link for knowing if the
+namespace is enabled, so add an enabled indicator entry to nvmet_ns for
+that.
+
+Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
+Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/nvme/target/configfs.c |    6 +++---
+ drivers/nvme/target/core.c     |   14 ++++++++------
+ drivers/nvme/target/nvmet.h    |    6 +-----
+ 3 files changed, 12 insertions(+), 14 deletions(-)
+
+--- a/drivers/nvme/target/configfs.c
++++ b/drivers/nvme/target/configfs.c
+@@ -271,7 +271,7 @@ static ssize_t nvmet_ns_device_path_stor
+       mutex_lock(&subsys->lock);
+       ret = -EBUSY;
+-      if (nvmet_ns_enabled(ns))
++      if (ns->enabled)
+               goto out_unlock;
+       kfree(ns->device_path);
+@@ -307,7 +307,7 @@ static ssize_t nvmet_ns_device_nguid_sto
+       int ret = 0;
+       mutex_lock(&subsys->lock);
+-      if (nvmet_ns_enabled(ns)) {
++      if (ns->enabled) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+@@ -339,7 +339,7 @@ CONFIGFS_ATTR(nvmet_ns_, device_nguid);
+ static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
+ {
+-      return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item)));
++      return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
+ }
+ static ssize_t nvmet_ns_enable_store(struct config_item *item,
+--- a/drivers/nvme/target/core.c
++++ b/drivers/nvme/target/core.c
+@@ -264,7 +264,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
+       int ret = 0;
+       mutex_lock(&subsys->lock);
+-      if (!list_empty(&ns->dev_link))
++      if (ns->enabled)
+               goto out_unlock;
+       ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
+@@ -309,6 +309,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
+       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+               nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
++      ns->enabled = true;
+       ret = 0;
+ out_unlock:
+       mutex_unlock(&subsys->lock);
+@@ -325,11 +326,11 @@ void nvmet_ns_disable(struct nvmet_ns *n
+       struct nvmet_ctrl *ctrl;
+       mutex_lock(&subsys->lock);
+-      if (list_empty(&ns->dev_link)) {
+-              mutex_unlock(&subsys->lock);
+-              return;
+-      }
+-      list_del_init(&ns->dev_link);
++      if (!ns->enabled)
++              goto out_unlock;
++
++      ns->enabled = false;
++      list_del_rcu(&ns->dev_link);
+       mutex_unlock(&subsys->lock);
+       /*
+@@ -351,6 +352,7 @@ void nvmet_ns_disable(struct nvmet_ns *n
+       if (ns->bdev)
+               blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
++out_unlock:
+       mutex_unlock(&subsys->lock);
+ }
+--- a/drivers/nvme/target/nvmet.h
++++ b/drivers/nvme/target/nvmet.h
+@@ -47,6 +47,7 @@ struct nvmet_ns {
+       loff_t                  size;
+       u8                      nguid[16];
++      bool                    enabled;
+       struct nvmet_subsys     *subsys;
+       const char              *device_path;
+@@ -61,11 +62,6 @@ static inline struct nvmet_ns *to_nvmet_
+       return container_of(to_config_group(item), struct nvmet_ns, group);
+ }
+-static inline bool nvmet_ns_enabled(struct nvmet_ns *ns)
+-{
+-      return !list_empty_careful(&ns->dev_link);
+-}
+-
+ struct nvmet_cq {
+       u16                     qid;
+       u16                     size;
diff --git a/queue-4.9/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch b/queue-4.9/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch
new file mode 100644 (file)
index 0000000..2cff45b
--- /dev/null
@@ -0,0 +1,148 @@
+From 64b875f7ac8a5d60a4e191479299e931ee949b67 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Mon, 14 Nov 2016 18:48:07 -0600
+Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit 64b875f7ac8a5d60a4e191479299e931ee949b67 upstream.
+
+When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was
+overlooked.  This can result in incorrect behavior when an application
+like strace traces an exec of a setuid executable.
+
+Further PT_PTRACE_CAP does not have enough information for making good
+security decisions as it does not report which user namespace the
+capability is in.  This has already allowed one mistake through
+insufficient granularity.
+
+I found this issue when I was testing another corner case of exec and
+discovered that I could not get strace to set PT_PTRACE_CAP even when
+running strace as root with a full set of caps.
+
+This change fixes the above issue with strace allowing stracing as
+root a setuid executable without disabling setuid.  More fundamentally
+this change allows what is allowable at all times, by using the correct
+information in its decision.
+
+Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12")
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/exec.c                  |    2 +-
+ include/linux/capability.h |    1 +
+ include/linux/ptrace.h     |    1 -
+ include/linux/sched.h      |    1 +
+ kernel/capability.c        |   20 ++++++++++++++++++++
+ kernel/ptrace.c            |   12 +++++++-----
+ 6 files changed, 30 insertions(+), 7 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1425,7 +1425,7 @@ static void check_unsafe_exec(struct lin
+       unsigned n_fs;
+       if (p->ptrace) {
+-              if (p->ptrace & PT_PTRACE_CAP)
++              if (ptracer_capable(p, current_user_ns()))
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
+               else
+                       bprm->unsafe |= LSM_UNSAFE_PTRACE;
+--- a/include/linux/capability.h
++++ b/include/linux/capability.h
+@@ -243,6 +243,7 @@ static inline bool ns_capable_noaudit(st
+ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
+ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
+ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
++extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
+ /* audit system wants to get cap info from files as well */
+ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+--- a/include/linux/ptrace.h
++++ b/include/linux/ptrace.h
+@@ -19,7 +19,6 @@
+ #define PT_SEIZED     0x00010000      /* SEIZE used, enable new behavior */
+ #define PT_PTRACED    0x00000001
+ #define PT_DTRACE     0x00000002      /* delayed trace (used on m68k, i386) */
+-#define PT_PTRACE_CAP 0x00000004      /* ptracer can follow suid-exec */
+ #define PT_OPT_FLAG_SHIFT     3
+ /* PT_TRACE_* event enable flags */
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1656,6 +1656,7 @@ struct task_struct {
+       struct list_head cpu_timers[3];
+ /* process credentials */
++      const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
+       const struct cred __rcu *real_cred; /* objective and real subjective task
+                                        * credentials (COW) */
+       const struct cred __rcu *cred;  /* effective (overridable) subjective task
+--- a/kernel/capability.c
++++ b/kernel/capability.c
+@@ -485,3 +485,23 @@ bool capable_wrt_inode_uidgid(const stru
+       return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode);
+ }
+ EXPORT_SYMBOL(capable_wrt_inode_uidgid);
++
++/**
++ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
++ * @tsk: The task that may be ptraced
++ * @ns: The user namespace to search for CAP_SYS_PTRACE in
++ *
++ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
++ * in the specified user namespace.
++ */
++bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
++{
++      int ret = 0;  /* An absent tracer adds no restrictions */
++      const struct cred *cred;
++      rcu_read_lock();
++      cred = rcu_dereference(tsk->ptracer_cred);
++      if (cred)
++              ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
++      rcu_read_unlock();
++      return (ret == 0);
++}
+--- a/kernel/ptrace.c
++++ b/kernel/ptrace.c
+@@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *c
+       BUG_ON(!list_empty(&child->ptrace_entry));
+       list_add(&child->ptrace_entry, &new_parent->ptraced);
+       child->parent = new_parent;
++      rcu_read_lock();
++      child->ptracer_cred = get_cred(__task_cred(new_parent));
++      rcu_read_unlock();
+ }
+ /**
+@@ -71,12 +74,16 @@ void __ptrace_link(struct task_struct *c
+  */
+ void __ptrace_unlink(struct task_struct *child)
+ {
++      const struct cred *old_cred;
+       BUG_ON(!child->ptrace);
+       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+       child->parent = child->real_parent;
+       list_del_init(&child->ptrace_entry);
++      old_cred = child->ptracer_cred;
++      child->ptracer_cred = NULL;
++      put_cred(old_cred);
+       spin_lock(&child->sighand->siglock);
+       child->ptrace = 0;
+@@ -326,11 +333,6 @@ static int ptrace_attach(struct task_str
+       task_lock(task);
+       retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+-      if (!retval) {
+-              struct mm_struct *mm = task->mm;
+-              if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE))
+-                      flags |= PT_PTRACE_CAP;
+-      }
+       task_unlock(task);
+       if (retval)
+               goto unlock_creds;
diff --git a/queue-4.9/ptrace-don-t-allow-accessing-an-undumpable-mm.patch b/queue-4.9/ptrace-don-t-allow-accessing-an-undumpable-mm.patch
new file mode 100644 (file)
index 0000000..7e10707
--- /dev/null
@@ -0,0 +1,279 @@
+From 84d77d3f06e7e8dea057d10e8ec77ad71f721be3 Mon Sep 17 00:00:00 2001
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Tue, 22 Nov 2016 12:06:50 -0600
+Subject: ptrace: Don't allow accessing an undumpable mm
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit 84d77d3f06e7e8dea057d10e8ec77ad71f721be3 upstream.
+
+It is the reasonable expectation that if an executable file is not
+readable there will be no way for a user without special privileges to
+read the file.  This is enforced in ptrace_attach but if ptrace
+is already attached before exec there is no enforcement for read-only
+executables.
+
+As the only way to read such an mm is through access_process_vm,
+spin a variant called ptrace_access_vm that will fail if the
+target process is not being ptraced by the current process, or
+the current process did not have sufficient privileges when ptracing
+began to read the target process's mm.
+
+In the ptrace implementations replace access_process_vm by
+ptrace_access_vm.  There remain several ptrace sites that still use
+access_process_vm as they are reading the target executable's
+instructions (for kernel consumption) or register stacks.  As such it
+does not appear necessary to add a permission check to those calls.
+
+This bug has always existed in Linux.
+
+Fixes: v1.0
+Reported-by: Andy Lutomirski <luto@amacapital.net>
+Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/alpha/kernel/ptrace.c         |    2 -
+ arch/blackfin/kernel/ptrace.c      |    4 +--
+ arch/cris/arch-v32/kernel/ptrace.c |    2 -
+ arch/ia64/kernel/ptrace.c          |    2 -
+ arch/mips/kernel/ptrace32.c        |    4 +--
+ arch/powerpc/kernel/ptrace32.c     |    4 +--
+ include/linux/mm.h                 |    2 +
+ include/linux/ptrace.h             |    3 ++
+ kernel/ptrace.c                    |   42 +++++++++++++++++++++++++++++++------
+ mm/memory.c                        |    2 -
+ mm/nommu.c                         |    2 -
+ 11 files changed, 52 insertions(+), 17 deletions(-)
+
+--- a/arch/alpha/kernel/ptrace.c
++++ b/arch/alpha/kernel/ptrace.c
+@@ -283,7 +283,7 @@ long arch_ptrace(struct task_struct *chi
+       /* When I and D space are separate, these will need to be fixed.  */
+       case PTRACE_PEEKTEXT: /* read word at location addr. */
+       case PTRACE_PEEKDATA:
+-              copied = access_process_vm(child, addr, &tmp, sizeof(tmp),
++              copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp),
+                               FOLL_FORCE);
+               ret = -EIO;
+               if (copied != sizeof(tmp))
+--- a/arch/blackfin/kernel/ptrace.c
++++ b/arch/blackfin/kernel/ptrace.c
+@@ -270,7 +270,7 @@ long arch_ptrace(struct task_struct *chi
+                       switch (bfin_mem_access_type(addr, to_copy)) {
+                       case BFIN_MEM_ACCESS_CORE:
+                       case BFIN_MEM_ACCESS_CORE_ONLY:
+-                              copied = access_process_vm(child, addr, &tmp,
++                              copied = ptrace_access_vm(child, addr, &tmp,
+                                                          to_copy, FOLL_FORCE);
+                               if (copied)
+                                       break;
+@@ -323,7 +323,7 @@ long arch_ptrace(struct task_struct *chi
+                       switch (bfin_mem_access_type(addr, to_copy)) {
+                       case BFIN_MEM_ACCESS_CORE:
+                       case BFIN_MEM_ACCESS_CORE_ONLY:
+-                              copied = access_process_vm(child, addr, &data,
++                              copied = ptrace_access_vm(child, addr, &data,
+                                                          to_copy,
+                                                          FOLL_FORCE | FOLL_WRITE);
+                               break;
+--- a/arch/cris/arch-v32/kernel/ptrace.c
++++ b/arch/cris/arch-v32/kernel/ptrace.c
+@@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *chi
+                               /* The trampoline page is globally mapped, no page table to traverse.*/
+                               tmp = *(unsigned long*)addr;
+                       } else {
+-                              copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
++                              copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+                               if (copied != sizeof(tmp))
+                                       break;
+--- a/arch/ia64/kernel/ptrace.c
++++ b/arch/ia64/kernel/ptrace.c
+@@ -1159,7 +1159,7 @@ arch_ptrace (struct task_struct *child,
+       case PTRACE_PEEKTEXT:
+       case PTRACE_PEEKDATA:
+               /* read word at location addr */
+-              if (access_process_vm(child, addr, &data, sizeof(data),
++              if (ptrace_access_vm(child, addr, &data, sizeof(data),
+                               FOLL_FORCE)
+                   != sizeof(data))
+                       return -EIO;
+--- a/arch/mips/kernel/ptrace32.c
++++ b/arch/mips/kernel/ptrace32.c
+@@ -69,7 +69,7 @@ long compat_arch_ptrace(struct task_stru
+               if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
+                       break;
+-              copied = access_process_vm(child, (u64)addrOthers, &tmp,
++              copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
+                               sizeof(tmp), FOLL_FORCE);
+               if (copied != sizeof(tmp))
+                       break;
+@@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_stru
+               if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0)
+                       break;
+               ret = 0;
+-              if (access_process_vm(child, (u64)addrOthers, &data,
++              if (ptrace_access_vm(child, (u64)addrOthers, &data,
+                                       sizeof(data),
+                                       FOLL_FORCE | FOLL_WRITE) == sizeof(data))
+                       break;
+--- a/arch/powerpc/kernel/ptrace32.c
++++ b/arch/powerpc/kernel/ptrace32.c
+@@ -73,7 +73,7 @@ long compat_arch_ptrace(struct task_stru
+               if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+                       break;
+-              copied = access_process_vm(child, (u64)addrOthers, &tmp,
++              copied = ptrace_access_vm(child, (u64)addrOthers, &tmp,
+                               sizeof(tmp), FOLL_FORCE);
+               if (copied != sizeof(tmp))
+                       break;
+@@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_stru
+               if (get_user(addrOthers, (u32 __user * __user *)addr) != 0)
+                       break;
+               ret = 0;
+-              if (access_process_vm(child, (u64)addrOthers, &tmp,
++              if (ptrace_access_vm(child, (u64)addrOthers, &tmp,
+                                       sizeof(tmp),
+                                       FOLL_FORCE | FOLL_WRITE) == sizeof(tmp))
+                       break;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1270,6 +1270,8 @@ extern int access_process_vm(struct task
+               unsigned int gup_flags);
+ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+               void *buf, int len, unsigned int gup_flags);
++extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
++              unsigned long addr, void *buf, int len, unsigned int gup_flags);
+ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+                           unsigned long start, unsigned long nr_pages,
+--- a/include/linux/ptrace.h
++++ b/include/linux/ptrace.h
+@@ -8,6 +8,9 @@
+ #include <linux/pid_namespace.h>      /* For task_active_pid_ns.  */
+ #include <uapi/linux/ptrace.h>
++extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
++                          void *buf, int len, unsigned int gup_flags);
++
+ /*
+  * Ptrace flags
+  *
+--- a/kernel/ptrace.c
++++ b/kernel/ptrace.c
+@@ -27,6 +27,35 @@
+ #include <linux/cn_proc.h>
+ #include <linux/compat.h>
++/*
++ * Access another process' address space via ptrace.
++ * Source/target buffer must be kernel space,
++ * Do not walk the page table directly, use get_user_pages
++ */
++int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
++                   void *buf, int len, unsigned int gup_flags)
++{
++      struct mm_struct *mm;
++      int ret;
++
++      mm = get_task_mm(tsk);
++      if (!mm)
++              return 0;
++
++      if (!tsk->ptrace ||
++          (current != tsk->parent) ||
++          ((get_dumpable(mm) != SUID_DUMP_USER) &&
++           !ptracer_capable(tsk, mm->user_ns))) {
++              mmput(mm);
++              return 0;
++      }
++
++      ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
++      mmput(mm);
++
++      return ret;
++}
++
+ /*
+  * ptrace a task: make the debugger its new parent and
+@@ -535,7 +564,8 @@ int ptrace_readdata(struct task_struct *
+               int this_len, retval;
+               this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+-              retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
++              retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
++
+               if (!retval) {
+                       if (copied)
+                               break;
+@@ -562,7 +592,7 @@ int ptrace_writedata(struct task_struct
+               this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
+               if (copy_from_user(buf, src, this_len))
+                       return -EFAULT;
+-              retval = access_process_vm(tsk, dst, buf, this_len,
++              retval = ptrace_access_vm(tsk, dst, buf, this_len,
+                               FOLL_FORCE | FOLL_WRITE);
+               if (!retval) {
+                       if (copied)
+@@ -1126,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_
+       unsigned long tmp;
+       int copied;
+-      copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
++      copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
+       if (copied != sizeof(tmp))
+               return -EIO;
+       return put_user(tmp, (unsigned long __user *)data);
+@@ -1137,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_
+ {
+       int copied;
+-      copied = access_process_vm(tsk, addr, &data, sizeof(data),
++      copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
+                       FOLL_FORCE | FOLL_WRITE);
+       return (copied == sizeof(data)) ? 0 : -EIO;
+ }
+@@ -1155,7 +1185,7 @@ int compat_ptrace_request(struct task_st
+       switch (request) {
+       case PTRACE_PEEKTEXT:
+       case PTRACE_PEEKDATA:
+-              ret = access_process_vm(child, addr, &word, sizeof(word),
++              ret = ptrace_access_vm(child, addr, &word, sizeof(word),
+                               FOLL_FORCE);
+               if (ret != sizeof(word))
+                       ret = -EIO;
+@@ -1165,7 +1195,7 @@ int compat_ptrace_request(struct task_st
+       case PTRACE_POKETEXT:
+       case PTRACE_POKEDATA:
+-              ret = access_process_vm(child, addr, &data, sizeof(data),
++              ret = ptrace_access_vm(child, addr, &data, sizeof(data),
+                               FOLL_FORCE | FOLL_WRITE);
+               ret = (ret != sizeof(data) ? -EIO : 0);
+               break;
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
+  * Access another process' address space as given in mm.  If non-NULL, use the
+  * given task for page fault accounting.
+  */
+-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
++int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long addr, void *buf, int len, unsigned int gup_flags)
+ {
+       struct vm_area_struct *vma;
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -1808,7 +1808,7 @@ void filemap_map_pages(struct fault_env
+ }
+ EXPORT_SYMBOL(filemap_map_pages);
+-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
++int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long addr, void *buf, int len, unsigned int gup_flags)
+ {
+       struct vm_area_struct *vma;
diff --git a/queue-4.9/revert-f2fs-use-percpu_counter-for-of-dirty-pages-in-inode.patch b/queue-4.9/revert-f2fs-use-percpu_counter-for-of-dirty-pages-in-inode.patch
new file mode 100644 (file)
index 0000000..9073f92
--- /dev/null
@@ -0,0 +1,100 @@
+From 204706c7accfabb67b97eef9f9a28361b6201199 Mon Sep 17 00:00:00 2001
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+Date: Fri, 2 Dec 2016 15:11:32 -0800
+Subject: Revert "f2fs: use percpu_counter for # of dirty pages in inode"
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+commit 204706c7accfabb67b97eef9f9a28361b6201199 upstream.
+
+This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76.
+
+The percpu_counter doesn't provide atomicity on a single core and consumes
+more DRAM. That incurs fs_mark test failure due to ENOMEM.
+
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/f2fs/f2fs.h  |   10 +++++-----
+ fs/f2fs/file.c  |    2 +-
+ fs/f2fs/super.c |    7 +------
+ 3 files changed, 7 insertions(+), 12 deletions(-)
+
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -428,7 +428,7 @@ struct f2fs_inode_info {
+       /* Use below internally in f2fs*/
+       unsigned long flags;            /* use to pass per-file flags */
+       struct rw_semaphore i_sem;      /* protect fi info */
+-      struct percpu_counter dirty_pages;      /* # of dirty pages */
++      atomic_t dirty_pages;           /* # of dirty pages */
+       f2fs_hash_t chash;              /* hash value of given file name */
+       unsigned int clevel;            /* maximum level of given file name */
+       nid_t i_xattr_nid;              /* node id that contains xattrs */
+@@ -1242,7 +1242,7 @@ static inline void inc_page_count(struct
+ static inline void inode_inc_dirty_pages(struct inode *inode)
+ {
+-      percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
++      atomic_inc(&F2FS_I(inode)->dirty_pages);
+       inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+                               F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+ }
+@@ -1258,7 +1258,7 @@ static inline void inode_dec_dirty_pages
+                       !S_ISLNK(inode->i_mode))
+               return;
+-      percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
++      atomic_dec(&F2FS_I(inode)->dirty_pages);
+       dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+                               F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+ }
+@@ -1268,9 +1268,9 @@ static inline s64 get_pages(struct f2fs_
+       return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
+ }
+-static inline s64 get_dirty_pages(struct inode *inode)
++static inline int get_dirty_pages(struct inode *inode)
+ {
+-      return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
++      return atomic_read(&F2FS_I(inode)->dirty_pages);
+ }
+ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -1526,7 +1526,7 @@ static int f2fs_ioc_start_atomic_write(s
+               goto out;
+       f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+-              "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
++              "Unexpected flush for atomic writes: ino=%lu, npages=%u",
+                                       inode->i_ino, get_dirty_pages(inode));
+       ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+       if (ret)
+--- a/fs/f2fs/super.c
++++ b/fs/f2fs/super.c
+@@ -558,13 +558,9 @@ static struct inode *f2fs_alloc_inode(st
+       init_once((void *) fi);
+-      if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) {
+-              kmem_cache_free(f2fs_inode_cachep, fi);
+-              return NULL;
+-      }
+-
+       /* Initialize f2fs-specific inode info */
+       fi->vfs_inode.i_version = 1;
++      atomic_set(&fi->dirty_pages, 0);
+       fi->i_current_depth = 1;
+       fi->i_advise = 0;
+       init_rwsem(&fi->i_sem);
+@@ -687,7 +683,6 @@ static void f2fs_i_callback(struct rcu_h
+ static void f2fs_destroy_inode(struct inode *inode)
+ {
+-      percpu_counter_destroy(&F2FS_I(inode)->dirty_pages);
+       call_rcu(&inode->i_rcu, f2fs_i_callback);
+ }
index e158e8848eb6358f172093ea2719c33e53fc2c88..aae541eab2e8ba0d0011d9af03abdafc27382778 100644 (file)
@@ -26,3 +26,29 @@ alsa-hda-fix-headset-mic-problem-on-a-dell-laptop.patch
 alsa-hda-gate-the-mic-jack-on-hp-z1-gen3-aio.patch
 alsa-hda-when-comparing-pin-configurations-ignore-assoc-in-addition-to-seq.patch
 clk-ti-omap36xx-work-around-sprz319-advisory-2.1.patch
+exec-ensure-mm-user_ns-contains-the-execed-files.patch
+fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch
+splice-reinstate-sigpipe-epipe-handling.patch
+block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch
+mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch
+vfs-mm-fix-return-value-of-read-at-s_maxbytes.patch
+ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch
+ptrace-don-t-allow-accessing-an-undumpable-mm.patch
+crypto-caam-fix-aead-givenc-descriptors.patch
+ext4-don-t-lock-buffer-in-ext4_commit_super-if-holding-spinlock.patch
+ext4-fix-mballoc-breakage-with-64k-block-size.patch
+ext4-fix-stack-memory-corruption-with-64k-block-size.patch
+ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch
+ext4-fix-in-superblock-mount-options-processing.patch
+ext4-add-sanity-checking-to-count_overhead.patch
+ext4-reject-inodes-with-negative-size.patch
+ext4-return-enomem-instead-of-success.patch
+ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch
+revert-f2fs-use-percpu_counter-for-of-dirty-pages-in-inode.patch
+f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch
+f2fs-fix-overflow-due-to-condition-check-order.patch
+f2fs-fix-to-determine-start_cp_addr-by-sbi-cur_cp_pack.patch
+loop-return-proper-error-from-loop_queue_rq.patch
+nvmet-fix-possible-infinite-loop-triggered-on-hot-namespace-removal.patch
+mm-vmscan.c-set-correct-defer-count-for-shrinker.patch
+mm-page_alloc-keep-pcp-count-and-list-contents-in-sync-if-struct-page-is-corrupted.patch
diff --git a/queue-4.9/splice-reinstate-sigpipe-epipe-handling.patch b/queue-4.9/splice-reinstate-sigpipe-epipe-handling.patch
new file mode 100644 (file)
index 0000000..e954b4e
--- /dev/null
@@ -0,0 +1,50 @@
+From 52bce91165e5f2db422b2b972e83d389e5e4725c Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 21 Dec 2016 10:59:34 -0800
+Subject: splice: reinstate SIGPIPE/EPIPE handling
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit 52bce91165e5f2db422b2b972e83d389e5e4725c upstream.
+
+Commit 8924feff66f3 ("splice: lift pipe_lock out of splice_to_pipe()")
+caused a regression when there were no more readers left on a pipe that
+was being spliced into: rather than the expected SIGPIPE and -EPIPE
+return value, the writer would end up waiting forever for space to free
+up (which obviously was not going to happen with no readers around).
+
+Fixes: 8924feff66f3 ("splice: lift pipe_lock out of splice_to_pipe()")
+Reported-and-tested-by: Andreas Schwab <schwab@linux-m68k.org>
+Debugged-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/splice.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/fs/splice.c
++++ b/fs/splice.c
+@@ -1086,7 +1086,13 @@ EXPORT_SYMBOL(do_splice_direct);
+ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
+ {
+-      while (pipe->nrbufs == pipe->buffers) {
++      for (;;) {
++              if (unlikely(!pipe->readers)) {
++                      send_sig(SIGPIPE, current, 0);
++                      return -EPIPE;
++              }
++              if (pipe->nrbufs != pipe->buffers)
++                      return 0;
+               if (flags & SPLICE_F_NONBLOCK)
+                       return -EAGAIN;
+               if (signal_pending(current))
+@@ -1095,7 +1101,6 @@ static int wait_for_space(struct pipe_in
+               pipe_wait(pipe);
+               pipe->waiting_writers--;
+       }
+-      return 0;
+ }
+ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
diff --git a/queue-4.9/vfs-mm-fix-return-value-of-read-at-s_maxbytes.patch b/queue-4.9/vfs-mm-fix-return-value-of-read-at-s_maxbytes.patch
new file mode 100644 (file)
index 0000000..950f404
--- /dev/null
@@ -0,0 +1,49 @@
+From d05c5f7ba164aed3db02fb188c26d0dd94f5455b Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Wed, 14 Dec 2016 12:45:25 -0800
+Subject: vfs,mm: fix return value of read() at s_maxbytes
+
+From: Linus Torvalds <torvalds@linux-foundation.org>
+
+commit d05c5f7ba164aed3db02fb188c26d0dd94f5455b upstream.
+
+We truncated the possible read iterator to s_maxbytes in commit
+c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()"),
+but our end condition handling was wrong: it's not an error to try to
+read at the end of the file.
+
+Reading past the end should return EOF (0), not EINVAL.
+
+See for example
+
+  https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1649342
+  http://lists.gnu.org/archive/html/bug-coreutils/2016-12/msg00008.html
+
+where a md5sum of a maximally sized file fails because the final read is
+exactly at s_maxbytes.
+
+Fixes: c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()")
+Reported-by: Joseph Salisbury <joseph.salisbury@canonical.com>
+Cc: Wei Fang <fangwei1@huawei.com>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/filemap.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -1686,7 +1686,7 @@ static ssize_t do_generic_file_read(stru
+       int error = 0;
+       if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+-              return -EINVAL;
++              return 0;
+       iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+       index = *ppos >> PAGE_SHIFT;