From: Greg Kroah-Hartman Date: Wed, 4 Jan 2017 10:05:46 +0000 (+0100) Subject: 4.4-stable patches X-Git-Tag: v4.9.1~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=238bc08a1dcf2cf86a4dd5fe5d5fc085e97d1bfa;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch crypto-caam-fix-aead-givenc-descriptors.patch exec-ensure-mm-user_ns-contains-the-execed-files.patch ext4-add-sanity-checking-to-count_overhead.patch ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch ext4-fix-in-superblock-mount-options-processing.patch ext4-fix-mballoc-breakage-with-64k-block-size.patch ext4-fix-stack-memory-corruption-with-64k-block-size.patch ext4-reject-inodes-with-negative-size.patch ext4-return-enomem-instead-of-success.patch ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch loop-return-proper-error-from-loop_queue_rq.patch mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch mm-vmscan.c-set-correct-defer-count-for-shrinker.patch ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch --- diff --git a/queue-4.4/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch b/queue-4.4/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch new file mode 100644 index 00000000000..0d0e7b87a02 --- /dev/null +++ b/queue-4.4/block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch @@ -0,0 +1,68 @@ +From bcc7f5b4bee8e327689a4d994022765855c807ff Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Mon, 12 Dec 2016 08:21:51 -0700 +Subject: block_dev: don't test bdev->bd_contains when it is not stable + +From: NeilBrown + +commit bcc7f5b4bee8e327689a4d994022765855c807ff upstream. + +bdev->bd_contains is not stable before calling __blkdev_get(). 
+When __blkdev_get() is called on a partition with ->bd_openers == 0 +it sets + bdev->bd_contains = bdev; +which is not correct for a partition. +After a call to __blkdev_get() succeeds, ->bd_openers will be > 0 +and then ->bd_contains is stable. + +When FMODE_EXCL is used, blkdev_get() calls + bd_start_claiming() -> bd_prepare_to_claim() -> bd_may_claim() + +This call happens before __blkdev_get() is called, so ->bd_contains +is not stable. So bd_may_claim() cannot safely use ->bd_contains. +It currently tries to use it, and this can lead to a BUG_ON(). + +This happens when a whole device is already open with a bd_holder (in +use by dm in my particular example) and two threads race to open a +partition of that device for the first time, one opening with O_EXCL and +one without. + +The thread that doesn't use O_EXCL gets through blkdev_get() to +__blkdev_get(), gains the ->bd_mutex, and sets bdev->bd_contains = bdev; + +Immediately thereafter the other thread, using FMODE_EXCL, calls +bd_start_claiming() from blkdev_get(). This should fail because the +whole device has a holder, but because bdev->bd_contains == bdev +bd_may_claim() incorrectly reports success. +This thread continues and blocks on bd_mutex. + +The first thread then sets bdev->bd_contains correctly and drops the mutex. +The thread using FMODE_EXCL then continues and when it calls bd_may_claim() +again in: + BUG_ON(!bd_may_claim(bdev, whole, holder)); +The BUG_ON fires. + +Fix this by removing the dependency on ->bd_contains in +bd_may_claim(). As bd_may_claim() has direct access to the whole +device, it can simply test if the target bdev is the whole device. 
+ +Fixes: 6b4517a7913a ("block: implement bd_claiming and claiming block") +Signed-off-by: NeilBrown +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + fs/block_dev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/block_dev.c ++++ b/fs/block_dev.c +@@ -759,7 +759,7 @@ static bool bd_may_claim(struct block_de + return true; /* already a holder */ + else if (bdev->bd_holder != NULL) + return false; /* held by someone else */ +- else if (bdev->bd_contains == bdev) ++ else if (whole == bdev) + return true; /* is a whole device which isn't held */ + + else if (whole->bd_holder == bd_may_claim) diff --git a/queue-4.4/crypto-caam-fix-aead-givenc-descriptors.patch b/queue-4.4/crypto-caam-fix-aead-givenc-descriptors.patch new file mode 100644 index 00000000000..3ec2b0c85cc --- /dev/null +++ b/queue-4.4/crypto-caam-fix-aead-givenc-descriptors.patch @@ -0,0 +1,48 @@ +From d128af17876d79b87edf048303f98b35f6a53dbc Mon Sep 17 00:00:00 2001 +From: Alex Porosanu +Date: Wed, 9 Nov 2016 10:46:11 +0200 +Subject: crypto: caam - fix AEAD givenc descriptors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Alex Porosanu + +commit d128af17876d79b87edf048303f98b35f6a53dbc upstream. + +The AEAD givenc descriptor relies on moving the IV through the +output FIFO and then back to the CTX2 for authentication. The +SEQ FIFO STORE could be scheduled before the data can be +read from OFIFO, especially since the SEQ FIFO LOAD needs +to wait for the SEQ FIFO LOAD SKIP to finish first. The +SKIP takes more time when the input is SG than when it's +a contiguous buffer. If the SEQ FIFO LOAD is not scheduled +before the STORE, the DECO will hang waiting for data +to be available in the OFIFO so it can be transferred to C2. +In order to overcome this, first force transfer of IV to C2 +by starting the "cryptlen" transfer first and then starting to +store data from OFIFO to the output buffer. 
+ +Fixes: 1acebad3d8db8 ("crypto: caam - faster aead implementation") +Signed-off-by: Alex Porosanu +Signed-off-by: Horia Geantă +Signed-off-by: Herbert Xu +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/crypto/caam/caamalg.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/crypto/caam/caamalg.c ++++ b/drivers/crypto/caam/caamalg.c +@@ -702,7 +702,9 @@ copy_iv: + + /* Will read cryptlen */ + append_math_add(desc, VARSEQINLEN, SEQINLEN, REG0, CAAM_CMD_SZ); +- aead_append_src_dst(desc, FIFOLD_TYPE_MSG1OUT2); ++ append_seq_fifo_load(desc, 0, FIFOLD_CLASS_BOTH | KEY_VLF | ++ FIFOLD_TYPE_MSG1OUT2 | FIFOLD_TYPE_LASTBOTH); ++ append_seq_fifo_store(desc, 0, FIFOST_TYPE_MESSAGE_DATA | KEY_VLF); + + /* Write ICV */ + append_seq_store(desc, ctx->authsize, LDST_CLASS_2_CCB | diff --git a/queue-4.4/exec-ensure-mm-user_ns-contains-the-execed-files.patch b/queue-4.4/exec-ensure-mm-user_ns-contains-the-execed-files.patch new file mode 100644 index 00000000000..0dbacf6f9af --- /dev/null +++ b/queue-4.4/exec-ensure-mm-user_ns-contains-the-execed-files.patch @@ -0,0 +1,117 @@ +From f84df2a6f268de584a201e8911384a2d244876e3 Mon Sep 17 00:00:00 2001 +From: "Eric W. Biederman" +Date: Wed, 16 Nov 2016 22:06:51 -0600 +Subject: exec: Ensure mm->user_ns contains the execed files + +From: Eric W. Biederman + +commit f84df2a6f268de584a201e8911384a2d244876e3 upstream. + +When the user namespace support was merged the need to prevent +ptrace from revealing the contents of an unreadable executable +was overlooked. + +Correct this oversight by ensuring that the executed file +or files are in mm->user_ns, by adjusting mm->user_ns. + +Use the new function privileged_wrt_inode_uidgid to see if +the executable is a member of the user namespace, and as such +if having CAP_SYS_PTRACE in the user namespace should allow +tracing the executable. If not update mm->user_ns to +the parent user namespace until an appropriate parent is found. 
+ +Reported-by: Jann Horn +Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.") +Signed-off-by: "Eric W. Biederman" +Signed-off-by: Greg Kroah-Hartman + +--- + fs/exec.c | 19 +++++++++++++++++-- + include/linux/capability.h | 1 + + kernel/capability.c | 16 ++++++++++++++-- + 3 files changed, 32 insertions(+), 4 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1123,8 +1123,22 @@ EXPORT_SYMBOL(flush_old_exec); + + void would_dump(struct linux_binprm *bprm, struct file *file) + { +- if (inode_permission(file_inode(file), MAY_READ) < 0) ++ struct inode *inode = file_inode(file); ++ if (inode_permission(inode, MAY_READ) < 0) { ++ struct user_namespace *old, *user_ns; + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; ++ ++ /* Ensure mm->user_ns contains the executable */ ++ user_ns = old = bprm->mm->user_ns; ++ while ((user_ns != &init_user_ns) && ++ !privileged_wrt_inode_uidgid(user_ns, inode)) ++ user_ns = user_ns->parent; ++ ++ if (old != user_ns) { ++ bprm->mm->user_ns = get_user_ns(user_ns); ++ put_user_ns(old); ++ } ++ } + } + EXPORT_SYMBOL(would_dump); + +@@ -1154,7 +1168,6 @@ void setup_new_exec(struct linux_binprm + !gid_eq(bprm->cred->gid, current_egid())) { + current->pdeath_signal = 0; + } else { +- would_dump(bprm, bprm->file); + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) + set_dumpable(current->mm, suid_dumpable); + } +@@ -1587,6 +1600,8 @@ static int do_execveat_common(int fd, st + if (retval < 0) + goto out; + ++ would_dump(bprm, bprm->file); ++ + retval = exec_binprm(bprm); + if (retval < 0) + goto out; +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -247,6 +247,7 @@ static inline bool ns_capable_noaudit(st + return true; + } + #endif /* CONFIG_MULTIUSER */ ++extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); + extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); + extern bool 
file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); + +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -457,6 +457,19 @@ bool file_ns_capable(const struct file * + EXPORT_SYMBOL(file_ns_capable); + + /** ++ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? ++ * @ns: The user namespace in question ++ * @inode: The inode in question ++ * ++ * Return true if the inode uid and gid are within the namespace. ++ */ ++bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) ++{ ++ return kuid_has_mapping(ns, inode->i_uid) && ++ kgid_has_mapping(ns, inode->i_gid); ++} ++ ++/** + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped + * @inode: The inode in question + * @cap: The capability in question +@@ -469,7 +482,6 @@ bool capable_wrt_inode_uidgid(const stru + { + struct user_namespace *ns = current_user_ns(); + +- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && +- kgid_has_mapping(ns, inode->i_gid); ++ return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); + } + EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/queue-4.4/ext4-add-sanity-checking-to-count_overhead.patch b/queue-4.4/ext4-add-sanity-checking-to-count_overhead.patch new file mode 100644 index 00000000000..58703d48388 --- /dev/null +++ b/queue-4.4/ext4-add-sanity-checking-to-count_overhead.patch @@ -0,0 +1,42 @@ +From c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 18 Nov 2016 13:37:47 -0500 +Subject: ext4: add sanity checking to count_overhead() + +From: Theodore Ts'o + +commit c48ae41bafe31e9a66d8be2ced4e42a6b57fa814 upstream. + +The commit "ext4: sanity check the block and cluster size at mount +time" should prevent any problems, but in case the superblock is +modified while the file system is mounted, add an extra safety check +to make sure we won't overrun the allocated buffer. 
+ +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3037,10 +3037,15 @@ static int count_overhead(struct super_b + ext4_set_bit(s++, buf); + count++; + } +- for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { +- ext4_set_bit(EXT4_B2C(sbi, s++), buf); +- count++; ++ j = ext4_bg_num_gdb(sb, grp); ++ if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ++ ext4_error(sb, "Invalid number of block group " ++ "descriptor blocks: %d", j); ++ j = EXT4_BLOCKS_PER_GROUP(sb) - s; + } ++ count += j; ++ for (; j > 0; j--) ++ ext4_set_bit(EXT4_B2C(sbi, s++), buf); + } + if (!count) + return 0; diff --git a/queue-4.4/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch b/queue-4.4/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch new file mode 100644 index 00000000000..1ecca43bac2 --- /dev/null +++ b/queue-4.4/ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch @@ -0,0 +1,93 @@ +From 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 Mon Sep 17 00:00:00 2001 +From: Sergey Karamov +Date: Sat, 10 Dec 2016 17:54:58 -0500 +Subject: ext4: do not perform data journaling when data is encrypted + +From: Sergey Karamov + +commit 73b92a2a5e97d17cc4d5c4fe9d724d3273fb6fd2 upstream. + +Currently data journalling is incompatible with encryption: enabling both +at the same time has never been supported by design, and would result in +unpredictable behavior. However, users are not precluded from turning on +both features simultaneously. This change programmatically replaces data +journaling for encrypted regular files with ordered data journaling mode. + +Background: +Journaling encrypted data has not been supported because it operates on +buffer heads of the page in the page cache. 
Namely, when the commit +happens, which could be up to five seconds after caching, the commit +thread uses the buffer heads attached to the page to copy the contents of +the page to the journal. With encryption, it would have been required to +keep the bounce buffer with ciphertext for up to the aforementioned five +seconds, since the page cache can only hold plaintext and could not be +used for journaling. Alternatively, it would be required to setup the +journal to initiate a callback at the commit time to perform deferred +encryption - in this case, not only would the data have to be written +twice, but it would also have to be encrypted twice. This level of +complexity was not justified for a mode that in practice is very rarely +used because of the overhead from the data journalling. + +Solution: +If data=journaled has been set as a mount option for a filesystem, or if +journaling is enabled on a regular file, do not perform journaling if the +file is also encrypted, instead fall back to the data=ordered mode for the +file. + +Rationale: +The intent is to allow seamless and proper filesystem operation when +journaling and encryption have both been enabled, and have these two +conflicting features gracefully resolved by the filesystem. 
+ +Fixes: 4461471107b7 +Signed-off-by: Sergey Karamov +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4_jbd2.h | 14 ++++++++------ + fs/ext4/super.c | 5 +++++ + 2 files changed, 13 insertions(+), 6 deletions(-) + +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -395,17 +395,19 @@ static inline int ext4_inode_journal_mod + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || +- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) +- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ +- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && +- !test_opt(inode->i_sb, DELALLOC)) ++ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || ++ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && ++ !test_opt(inode->i_sb, DELALLOC))) { ++ /* We do not support data journalling for encrypted data */ ++ if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) ++ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ ++ } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ +- else +- BUG(); ++ BUG(); + } + + static inline int ext4_should_journal_data(struct inode *inode) +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3345,6 +3345,11 @@ static int ext4_fill_super(struct super_ + "both data=journal and dax"); + goto failed_mount; + } ++ if (ext4_has_feature_encrypt(sb)) { ++ ext4_msg(sb, KERN_WARNING, ++ "encrypted files will use data=ordered " ++ "instead of data journaling mode"); ++ } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } else { diff --git a/queue-4.4/ext4-fix-in-superblock-mount-options-processing.patch 
b/queue-4.4/ext4-fix-in-superblock-mount-options-processing.patch new file mode 100644 index 00000000000..f156d61346d --- /dev/null +++ b/queue-4.4/ext4-fix-in-superblock-mount-options-processing.patch @@ -0,0 +1,105 @@ +From 5aee0f8a3f42c94c5012f1673420aee96315925a Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 18 Nov 2016 13:24:26 -0500 +Subject: ext4: fix in-superblock mount options processing + +From: Theodore Ts'o + +commit 5aee0f8a3f42c94c5012f1673420aee96315925a upstream. + +Fix a large number of problems with how we handle mount options in the +superblock. For one, if the string in the superblock is long enough +that it is not null terminated, we could run off the end of the string +and try to interpret superblocks fields as characters. It's unlikely +this will cause a security problem, but it could result in an invalid +parse. Also, parse_options is destructive to the string, so in some +cases if there is a comma-separated string, it would be modified in +the superblock. (Fortunately it only happens on file systems with a +1k block size.) 
+ +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 38 +++++++++++++++++++++++--------------- + 1 file changed, 23 insertions(+), 15 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3130,7 +3130,7 @@ static int ext4_fill_super(struct super_ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; + struct ext4_super_block *es = NULL; +- struct ext4_sb_info *sbi; ++ struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; +@@ -3149,16 +3149,14 @@ static int ext4_fill_super(struct super_ + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; + +- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); +- if (!sbi) +- goto out_free_orig; ++ if ((data && !orig_data) || !sbi) ++ goto out_free_base; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); +- if (!sbi->s_blockgroup_lock) { +- kfree(sbi); +- goto out_free_orig; +- } ++ if (!sbi->s_blockgroup_lock) ++ goto out_free_base; ++ + sb->s_fs_info = sbi; + sbi->s_sb = sb; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; +@@ -3304,11 +3302,19 @@ static int ext4_fill_super(struct super_ + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + +- if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, +- &journal_devnum, &journal_ioprio, 0)) { +- ext4_msg(sb, KERN_WARNING, +- "failed to parse options in superblock: %s", +- sbi->s_es->s_mount_opts); ++ if (sbi->s_es->s_mount_opts[0]) { ++ char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, ++ sizeof(sbi->s_es->s_mount_opts), ++ GFP_KERNEL); ++ if (!s_mount_opts) ++ goto failed_mount; ++ if (!parse_options(s_mount_opts, sb, &journal_devnum, ++ &journal_ioprio, 0)) { ++ ext4_msg(sb, KERN_WARNING, ++ "failed to parse options in superblock: %s", ++ s_mount_opts); ++ } ++ kfree(s_mount_opts); + } + sbi->s_def_mount_opt = 
sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, +@@ -3991,7 +3997,9 @@ no_journal: + + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " +- "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, ++ "Opts: %.*s%s%s", descr, ++ (int) sizeof(sbi->s_es->s_mount_opts), ++ sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + if (es->s_error_count) +@@ -4061,8 +4069,8 @@ failed_mount: + out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); ++out_free_base: + kfree(sbi); +-out_free_orig: + kfree(orig_data); + return err ? err : ret; + } diff --git a/queue-4.4/ext4-fix-mballoc-breakage-with-64k-block-size.patch b/queue-4.4/ext4-fix-mballoc-breakage-with-64k-block-size.patch new file mode 100644 index 00000000000..74c0e3e6ca3 --- /dev/null +++ b/queue-4.4/ext4-fix-mballoc-breakage-with-64k-block-size.patch @@ -0,0 +1,35 @@ +From 69e43e8cc971a79dd1ee5d4343d8e63f82725123 Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Mon, 14 Nov 2016 21:04:37 -0500 +Subject: ext4: fix mballoc breakage with 64k block size + +From: Chandan Rajendra + +commit 69e43e8cc971a79dd1ee5d4343d8e63f82725123 upstream. + +'border' variable is set to a value of 2 times the block size of the +underlying filesystem. With 64k block size, the resulting value won't +fit into a 16-bit variable. Hence this commit changes the data type of +'border' to 'unsigned int'. 
+ +Fixes: c9de560ded61f +Signed-off-by: Chandan Rajendra +Signed-off-by: Theodore Ts'o +Reviewed-by: Andreas Dilger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/mballoc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(str + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; +- unsigned short border; ++ unsigned int border; + + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); + diff --git a/queue-4.4/ext4-fix-stack-memory-corruption-with-64k-block-size.patch b/queue-4.4/ext4-fix-stack-memory-corruption-with-64k-block-size.patch new file mode 100644 index 00000000000..1c30ffe046b --- /dev/null +++ b/queue-4.4/ext4-fix-stack-memory-corruption-with-64k-block-size.patch @@ -0,0 +1,36 @@ +From 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Mon, 14 Nov 2016 21:26:26 -0500 +Subject: ext4: fix stack memory corruption with 64k block size + +From: Chandan Rajendra + +commit 30a9d7afe70ed6bd9191d3000e2ef1a34fb58493 upstream. + +The number of 'counters' elements needed in 'struct sg' is +super_block->s_blocksize_bits + 2. Presently we have 16 'counters' +elements in the array. This is insufficient for block sizes >= 32k. In +such cases the memcpy operation performed in ext4_mb_seq_groups_show() +would cause stack memory corruption. 
+ +Fixes: c9de560ded61f +Signed-off-by: Chandan Rajendra +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/mballoc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struc + struct ext4_group_info *grinfo; + struct sg { + struct ext4_group_info info; +- ext4_grpblk_t counters[16]; ++ ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; + } sg; + + group--; diff --git a/queue-4.4/ext4-reject-inodes-with-negative-size.patch b/queue-4.4/ext4-reject-inodes-with-negative-size.patch new file mode 100644 index 00000000000..5281ac5cd7c --- /dev/null +++ b/queue-4.4/ext4-reject-inodes-with-negative-size.patch @@ -0,0 +1,45 @@ +From 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Sat, 10 Dec 2016 09:55:01 -0500 +Subject: ext4: reject inodes with negative size + +From: Darrick J. Wong + +commit 7e6e1ef48fc02f3ac5d0edecbb0c6087cd758d58 upstream. + +Don't load an inode with a negative size; this causes integer overflow +problems in the VFS. + +[ Added EXT4_ERROR_INODE() to mark file system as corrupted. -TYT] + +Fixes: a48380f769df (ext4: rename i_dir_acl to i_size_high) +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4175,6 +4175,7 @@ struct inode *ext4_iget(struct super_blo + struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; ++ loff_t size; + int block; + uid_t i_uid; + gid_t i_gid; +@@ -4266,6 +4267,11 @@ struct inode *ext4_iget(struct super_blo + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + inode->i_size = ext4_isize(raw_inode); ++ if ((size = i_size_read(inode)) < 0) { ++ EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); ++ ret = -EFSCORRUPTED; ++ goto bad_inode; ++ } + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; diff --git a/queue-4.4/ext4-return-enomem-instead-of-success.patch b/queue-4.4/ext4-return-enomem-instead-of-success.patch new file mode 100644 index 00000000000..6d8f344b5b9 --- /dev/null +++ b/queue-4.4/ext4-return-enomem-instead-of-success.patch @@ -0,0 +1,34 @@ +From 578620f451f836389424833f1454eeeb2ffc9e9f Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Sat, 10 Dec 2016 09:56:01 -0500 +Subject: ext4: return -ENOMEM instead of success + +From: Dan Carpenter + +commit 578620f451f836389424833f1454eeeb2ffc9e9f upstream. + +We should set the error code if kzalloc() fails. 
+ +Fixes: 67cf5b09a46f ("ext4: add the basic function for inline data support") +Signed-off-by: Dan Carpenter +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inline.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -336,8 +336,10 @@ static int ext4_update_inline_data(handl + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); +- if (!value) ++ if (!value) { ++ error = -ENOMEM; + goto out; ++ } + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); diff --git a/queue-4.4/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch b/queue-4.4/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch new file mode 100644 index 00000000000..c134b6ebe3c --- /dev/null +++ b/queue-4.4/ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch @@ -0,0 +1,55 @@ +From cd6bb35bf7f6d7d922509bf50265383a0ceabe96 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 18 Nov 2016 13:28:30 -0500 +Subject: ext4: use more strict checks for inodes_per_block on mount + +From: Theodore Ts'o + +commit cd6bb35bf7f6d7d922509bf50265383a0ceabe96 upstream. + +Centralize the checks for inodes_per_block and be more strict to make +sure the inodes_per_block_group can't end up being zero. 
+ +Signed-off-by: Theodore Ts'o +Reviewed-by: Andreas Dilger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 15 ++++++--------- + 1 file changed, 6 insertions(+), 9 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3496,12 +3496,16 @@ static int ext4_fill_super(struct super_ + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); +- if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) +- goto cantfind_ext4; + + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; ++ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || ++ sbi->s_inodes_per_group > blocksize * 8) { ++ ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", ++ sbi->s_blocks_per_group); ++ goto failed_mount; ++ } + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); +@@ -3584,13 +3588,6 @@ static int ext4_fill_super(struct super_ + } + sbi->s_cluster_ratio = clustersize / blocksize; + +- if (sbi->s_inodes_per_group > blocksize * 8) { +- ext4_msg(sb, KERN_ERR, +- "#inodes per group too big: %lu", +- sbi->s_inodes_per_group); +- goto failed_mount; +- } +- + /* Do we have standard group size of clustersize * 8 blocks ? 
*/ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); diff --git a/queue-4.4/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch b/queue-4.4/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch new file mode 100644 index 00000000000..0dbb25bfdb2 --- /dev/null +++ b/queue-4.4/f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch @@ -0,0 +1,63 @@ +From 05e6ea2685c964db1e675a24a4f4e2adc22d2388 Mon Sep 17 00:00:00 2001 +From: Nicolai Stange +Date: Sun, 20 Nov 2016 19:57:23 +0100 +Subject: f2fs: set ->owner for debugfs status file's file_operations + +From: Nicolai Stange + +commit 05e6ea2685c964db1e675a24a4f4e2adc22d2388 upstream. + +The struct file_operations instance serving the f2fs/status debugfs file +lacks an initialization of its ->owner. + +This means that although that file might have been opened, the f2fs module +can still get removed. Any further operation on that opened file, releasing +included, will cause accesses to unmapped memory. + +Indeed, Mike Marshall reported the following: + + BUG: unable to handle kernel paging request at ffffffffa0307430 + IP: [] full_proxy_release+0x24/0x90 + <...> + Call Trace: + [] __fput+0xdf/0x1d0 + [] ____fput+0xe/0x10 + [] task_work_run+0x8e/0xc0 + [] do_exit+0x2ae/0xae0 + [] ? __audit_syscall_entry+0xae/0x100 + [] ? syscall_trace_enter+0x1ca/0x310 + [] do_group_exit+0x44/0xc0 + [] SyS_exit_group+0x14/0x20 + [] do_syscall_64+0x61/0x150 + [] entry_SYSCALL64_slow_path+0x25/0x25 + <...> + ---[ end trace f22ae883fa3ea6b8 ]--- + Fixing recursive fault but reboot is needed! + +Fix this by initializing the f2fs/status file_operations' ->owner with +THIS_MODULE. + +This will allow debugfs to grab a reference to the f2fs module upon any +open on that file, thus preventing it from getting removed. 
+ +Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs") +Reported-by: Mike Marshall +Reported-by: Martin Brandenburg +Signed-off-by: Nicolai Stange +Signed-off-by: Jaegeuk Kim +Signed-off-by: Greg Kroah-Hartman + +--- + fs/f2fs/debug.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/f2fs/debug.c ++++ b/fs/f2fs/debug.c +@@ -352,6 +352,7 @@ static int stat_open(struct inode *inode + } + + static const struct file_operations stat_fops = { ++ .owner = THIS_MODULE, + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, diff --git a/queue-4.4/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch b/queue-4.4/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch new file mode 100644 index 00000000000..a100826d531 --- /dev/null +++ b/queue-4.4/fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch @@ -0,0 +1,79 @@ +From 613cc2b6f272c1a8ad33aefa21cad77af23139f7 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Wed, 21 Dec 2016 16:26:24 +1100 +Subject: fs: exec: apply CLOEXEC before changing dumpable task flags + +From: Aleksa Sarai + +commit 613cc2b6f272c1a8ad33aefa21cad77af23139f7 upstream. + +If you have a process that has set itself to be non-dumpable, and it +then undergoes exec(2), any CLOEXEC file descriptors it has open are +"exposed" during a race window between the dumpable flags of the process +being reset for exec(2) and CLOEXEC being applied to the file +descriptors. This can be exploited by a process by attempting to access +/proc/<pid>/fd/... during this window, without requiring CAP_SYS_PTRACE. 
+ +The race in question is after set_dumpable has been called (for get_link, +though the trace is basically the same for readlink): + +[vfs] +-> proc_pid_link_inode_operations.get_link + -> proc_pid_get_link + -> proc_fd_access_allowed + -> ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + +Which will return 0, during the race window and CLOEXEC file descriptors +will still be open during this window because do_close_on_exec has not +been called yet. As a result, the ordering of these calls should be +reversed to avoid this race window. + +This is of particular concern to container runtimes, where joining a +PID namespace with file descriptors referring to the host filesystem +can result in security issues (since PRCTL_SET_DUMPABLE doesn't protect +against access of CLOEXEC file descriptors -- file descriptors which may +reference filesystem objects the container shouldn't have access to). + +Cc: dev@opencontainers.org +Reported-by: Michael Crosby +Signed-off-by: Aleksa Sarai +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/exec.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -19,7 +19,7 @@ + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary +- * formats. + * formats. + */ + + #include <linux/slab.h> +@@ -1114,6 +1114,13 @@ int flush_old_exec(struct linux_binprm * + flush_thread(); + current->personality &= ~bprm->per_clear; + ++ /* ++ * We have to apply CLOEXEC before we change whether the process is ++ * dumpable (in setup_new_exec) to avoid a race with a process in userspace ++ * trying to access the should-be-closed file descriptors of a process ++ * undergoing exec(2). 
++ */ ++ do_close_on_exec(current->files); + return 0; + + out: +@@ -1176,7 +1183,6 @@ void setup_new_exec(struct linux_binprm + group */ + current->self_exec_id++; + flush_signal_handlers(current, 0); +- do_close_on_exec(current->files); + } + EXPORT_SYMBOL(setup_new_exec); + diff --git a/queue-4.4/loop-return-proper-error-from-loop_queue_rq.patch b/queue-4.4/loop-return-proper-error-from-loop_queue_rq.patch new file mode 100644 index 00000000000..fc1fd8bd44d --- /dev/null +++ b/queue-4.4/loop-return-proper-error-from-loop_queue_rq.patch @@ -0,0 +1,32 @@ +From b4a567e8114327518c09f5632339a5954ab975a3 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Mon, 14 Nov 2016 14:56:17 -0800 +Subject: loop: return proper error from loop_queue_rq() + +From: Omar Sandoval + +commit b4a567e8114327518c09f5632339a5954ab975a3 upstream. + +->queue_rq() should return one of the BLK_MQ_RQ_QUEUE_* constants, not +an errno. + +Fixes: f4aa4c7bbac6 ("block: loop: convert to per-device workqueue") +Signed-off-by: Omar Sandoval +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/loop.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -1657,7 +1657,7 @@ static int loop_queue_rq(struct blk_mq_h + blk_mq_start_request(bd->rq); + + if (lo->lo_state != Lo_bound) +- return -EIO; ++ return BLK_MQ_RQ_QUEUE_ERROR; + + if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH | + REQ_DISCARD))) diff --git a/queue-4.4/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch b/queue-4.4/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch new file mode 100644 index 00000000000..e879a726276 --- /dev/null +++ b/queue-4.4/mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch @@ -0,0 +1,181 @@ +From bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Mon Sep 17 00:00:00 2001 +From: "Eric W. 
Biederman" +Date: Thu, 13 Oct 2016 21:23:16 -0500 +Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks + +From: Eric W. Biederman + +commit bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 upstream. + +During exec dumpable is cleared if the file that is being executed is +not readable by the user executing the file. A bug in +ptrace_may_access allows reading the file if the executable happens to +enter into a subordinate user namespace (aka clone(CLONE_NEWUSER), +unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER)). + +This problem is fixed with only necessary userspace breakage by adding +a user namespace owner to mm_struct, captured at the time of exec, so +it is clear in which user namespace CAP_SYS_PTRACE must be present in +to be able to safely give read permission to the executable. + +The function ptrace_may_access is modified to verify that the ptracer +has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns. +This ensures that if the task changes its cred into a subordinate +user namespace it does not become ptraceable. + +The function ptrace_attach is modified to only set PT_PTRACE_CAP when +CAP_SYS_PTRACE is held over task->mm->user_ns. The intent of +PT_PTRACE_CAP is to be a flag to note that whatever permission changes +the task might go through the tracer has sufficient permissions for +it not to be an issue. task->cred->user_ns is always the same +as or descendent of mm->user_ns. Which guarantees that having +CAP_SYS_PTRACE over mm->user_ns is the worst case for the task's +credentials. + +To prevent regressions mm->dumpable and mm->user_ns are not considered +when a task has no mm. As simply failing ptrace_may_access causes +regressions in privileged applications attempting to read things +such as /proc/<pid>/stat + +Acked-by: Kees Cook +Tested-by: Cyrill Gorcunov +Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces") +Signed-off-by: "Eric W.
Biederman" +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mm_types.h | 1 + + kernel/fork.c | 9 ++++++--- + kernel/ptrace.c | 26 +++++++++++--------------- + mm/init-mm.c | 2 ++ + 4 files changed, 20 insertions(+), 18 deletions(-) + +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -469,6 +469,7 @@ struct mm_struct { + */ + struct task_struct __rcu *owner; + #endif ++ struct user_namespace *user_ns; + + /* store ref to file /proc/<pid>/exe symlink points to */ + struct file __rcu *exe_file; +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -585,7 +585,8 @@ static void mm_init_owner(struct mm_stru + #endif + } + +-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) ++static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, ++ struct user_namespace *user_ns) + { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; +@@ -625,6 +626,7 @@ static struct mm_struct *mm_init(struct + if (init_new_context(p, mm)) + goto fail_nocontext; + ++ mm->user_ns = get_user_ns(user_ns); + return mm; + + fail_nocontext: +@@ -670,7 +672,7 @@ struct mm_struct *mm_alloc(void) + return NULL; + + memset(mm, 0, sizeof(*mm)); +- return mm_init(mm, current); ++ return mm_init(mm, current, current_user_ns()); + } + + /* +@@ -685,6 +687,7 @@ void __mmdrop(struct mm_struct *mm) + destroy_context(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); ++ put_user_ns(mm->user_ns); + free_mm(mm); + } + EXPORT_SYMBOL_GPL(__mmdrop); +@@ -942,7 +945,7 @@ static struct mm_struct *dup_mm(struct t + + memcpy(mm, oldmm, sizeof(*mm)); + +- if (!mm_init(mm, tsk)) ++ if (!mm_init(mm, tsk, mm->user_ns)) + goto fail_nomem; + + err = dup_mmap(mm, oldmm); +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -219,7 +219,7 @@ static int ptrace_has_cap(struct user_na + static int __ptrace_may_access(struct task_struct *task, unsigned int mode) + { + const struct cred *cred = current_cred(), *tcred; +- int dumpable = 0; ++ struct mm_struct *mm; + kuid_t caller_uid; +
kgid_t caller_gid; + +@@ -270,16 +270,11 @@ static int __ptrace_may_access(struct ta + return -EPERM; + ok: + rcu_read_unlock(); +- smp_rmb(); +- if (task->mm) +- dumpable = get_dumpable(task->mm); +- rcu_read_lock(); +- if (dumpable != SUID_DUMP_USER && +- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { +- rcu_read_unlock(); +- return -EPERM; +- } +- rcu_read_unlock(); ++ mm = task->mm; ++ if (mm && ++ ((get_dumpable(mm) != SUID_DUMP_USER) && ++ !ptrace_has_cap(mm->user_ns, mode))) ++ return -EPERM; + + return security_ptrace_access_check(task, mode); + } +@@ -330,6 +325,11 @@ static int ptrace_attach(struct task_str + + task_lock(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); ++ if (!retval) { ++ struct mm_struct *mm = task->mm; ++ if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) ++ flags |= PT_PTRACE_CAP; ++ } + task_unlock(task); + if (retval) + goto unlock_creds; +@@ -343,10 +343,6 @@ static int ptrace_attach(struct task_str + + if (seize) + flags |= PT_SEIZED; +- rcu_read_lock(); +- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) +- flags |= PT_PTRACE_CAP; +- rcu_read_unlock(); + task->ptrace = flags; + + __ptrace_link(task, current); +--- a/mm/init-mm.c ++++ b/mm/init-mm.c +@@ -6,6 +6,7 @@ + #include <linux/cpumask.h> + + #include <linux/atomic.h> ++#include <linux/user_namespace.h> + #include <asm/pgtable.h> + #include <asm/mmu.h> + +@@ -21,5 +22,6 @@ struct mm_struct init_mm = { + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), ++ .user_ns = &init_user_ns, + INIT_MM_CONTEXT(init_mm) + }; diff --git a/queue-4.4/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch b/queue-4.4/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch new file mode 100644 index 00000000000..71395f67c04 --- /dev/null +++ b/queue-4.4/mm-vmscan.c-set-correct-defer-count-for-shrinker.patch @@ -0,0 +1,85 @@ +From 5f33a0803bbd781de916f5c7448cbbbbc763d911 Mon Sep 17 00:00:00 2001 +From: Shaohua Li +Date: Mon, 12
Dec 2016 16:41:50 -0800 +Subject: mm/vmscan.c: set correct defer count for shrinker + +From: Shaohua Li + +commit 5f33a0803bbd781de916f5c7448cbbbbc763d911 upstream. + +Our system uses significantly more slab memory with memcg enabled with +the latest kernel. With 3.10 kernel, slab uses 2G memory, while with +4.6 kernel, 6G memory is used. The shrinker has problem. Let's see we +have two memcg for one shrinker. In do_shrink_slab: + +1. Check cg1. nr_deferred = 0, assume total_scan = 700. batch size + is 1024, then no memory is freed. nr_deferred = 700 + +2. Check cg2. nr_deferred = 700. Assume freeable = 20, then + total_scan = 10 or 40. Let's assume it's 10. No memory is freed. + nr_deferred = 10. + +The deferred share of cg1 is lost in this case. kswapd will free no +memory even run above steps again and again. + +The fix makes sure one memcg's deferred share isn't lost. + +Link: http://lkml.kernel.org/r/2414be961b5d25892060315fbb56bb19d81d0c07.1476227351.git.shli@fb.com +Signed-off-by: Shaohua Li +Cc: Johannes Weiner +Cc: Michal Hocko +Cc: Vladimir Davydov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -277,6 +277,7 @@ static unsigned long do_shrink_slab(stru + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? 
shrinker->batch + : SHRINK_BATCH; ++ long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) +@@ -298,7 +299,9 @@ static unsigned long do_shrink_slab(stru + pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; +- } ++ next_deferred = nr; ++ } else ++ next_deferred = total_scan; + + /* + * We need to avoid excessive windup on filesystem shrinkers +@@ -355,17 +358,22 @@ static unsigned long do_shrink_slab(stru + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; ++ scanned += nr_to_scan; + + cond_resched(); + } + ++ if (next_deferred >= scanned) ++ next_deferred -= scanned; ++ else ++ next_deferred = 0; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ +- if (total_scan > 0) +- new_nr = atomic_long_add_return(total_scan, ++ if (next_deferred > 0) ++ new_nr = atomic_long_add_return(next_deferred, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); diff --git a/queue-4.4/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch b/queue-4.4/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch new file mode 100644 index 00000000000..7d3687415bc --- /dev/null +++ b/queue-4.4/ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch @@ -0,0 +1,147 @@ +From 64b875f7ac8a5d60a4e191479299e931ee949b67 Mon Sep 17 00:00:00 2001 +From: "Eric W. Biederman" +Date: Mon, 14 Nov 2016 18:48:07 -0600 +Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP + +From: Eric W. Biederman + +commit 64b875f7ac8a5d60a4e191479299e931ee949b67 upstream. + +When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was +overlooked. This can result in incorrect behavior when an application +like strace traces an exec of a setuid executable. 
+ +Further PT_PTRACE_CAP does not have enough information for making good +security decisions as it does not report which user namespace the +capability is in. This has already allowed one mistake through +insufficient granularity. + +I found this issue when I was testing another corner case of exec and +discovered that I could not get strace to set PT_PTRACE_CAP even when +running strace as root with a full set of caps. + +This change fixes the above issue with strace allowing stracing as +root a setuid executable without disabling setuid. More fundamentally +this change allows what is allowable at all times, by using the correct +information in its decision. + +Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12") +Signed-off-by: "Eric W. Biederman" +Signed-off-by: Greg Kroah-Hartman + +--- + fs/exec.c | 2 +- + include/linux/capability.h | 1 + + include/linux/ptrace.h | 1 - + include/linux/sched.h | 1 + + kernel/capability.c | 20 ++++++++++++++++++++ + kernel/ptrace.c | 12 +++++++----- + 6 files changed, 30 insertions(+), 7 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1273,7 +1273,7 @@ static void check_unsafe_exec(struct lin + unsigned n_fs; + + if (p->ptrace) { +- if (p->ptrace & PT_PTRACE_CAP) ++ if (ptracer_capable(p, current_user_ns())) + bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; + else + bprm->unsafe |= LSM_UNSAFE_PTRACE; +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -250,6 +250,7 @@ static inline bool ns_capable_noaudit(st + extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); + extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); + extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); ++extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); + + /* audit system wants to get cap info from files as well */ + extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +---
a/include/linux/ptrace.h ++++ b/include/linux/ptrace.h +@@ -19,7 +19,6 @@ + #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ + #define PT_PTRACED 0x00000001 + #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ +-#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ + + #define PT_OPT_FLAG_SHIFT 3 + /* PT_TRACE_* event enable flags */ +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1540,6 +1540,7 @@ struct task_struct { + struct list_head cpu_timers[3]; + + /* process credentials */ ++ const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ + const struct cred __rcu *real_cred; /* objective and real subjective task + * credentials (COW) */ + const struct cred __rcu *cred; /* effective (overridable) subjective task +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -485,3 +485,23 @@ bool capable_wrt_inode_uidgid(const stru + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); + } + EXPORT_SYMBOL(capable_wrt_inode_uidgid); ++ ++/** ++ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace ++ * @tsk: The task that may be ptraced ++ * @ns: The user namespace to search for CAP_SYS_PTRACE in ++ * ++ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE ++ * in the specified user namespace. 
++ */ ++bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) ++{ ++ int ret = 0; /* An absent tracer adds no restrictions */ ++ const struct cred *cred; ++ rcu_read_lock(); ++ cred = rcu_dereference(tsk->ptracer_cred); ++ if (cred) ++ ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); ++ rcu_read_unlock(); ++ return (ret == 0); ++} +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *c + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; ++ rcu_read_lock(); ++ child->ptracer_cred = get_cred(__task_cred(new_parent)); ++ rcu_read_unlock(); + } + + /** +@@ -71,11 +74,15 @@ void __ptrace_link(struct task_struct *c + */ + void __ptrace_unlink(struct task_struct *child) + { ++ const struct cred *old_cred; + BUG_ON(!child->ptrace); + + child->ptrace = 0; + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); ++ old_cred = child->ptracer_cred; ++ child->ptracer_cred = NULL; ++ put_cred(old_cred); + + spin_lock(&child->sighand->siglock); + +@@ -325,11 +332,6 @@ static int ptrace_attach(struct task_str + + task_lock(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); +- if (!retval) { +- struct mm_struct *mm = task->mm; +- if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) +- flags |= PT_PTRACE_CAP; +- } + task_unlock(task); + if (retval) + goto unlock_creds; diff --git a/queue-4.4/series b/queue-4.4/series index ea92807d004..f10b3234d47 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -25,3 +25,20 @@ btrfs-don-t-leak-reloc-root-nodes-on-error.patch btrfs-fix-memory-leak-in-do_walk_down.patch btrfs-don-t-bug-during-drop-snapshot.patch btrfs-make-file-clone-aware-of-fatal-signals.patch +exec-ensure-mm-user_ns-contains-the-execed-files.patch +fs-exec-apply-cloexec-before-changing-dumpable-task-flags.patch +block_dev-don-t-test-bdev-bd_contains-when-it-is-not-stable.patch 
+mm-add-a-user_ns-owner-to-mm_struct-and-fix-ptrace-permission-checks.patch +ptrace-capture-the-ptracer-s-creds-not-pt_ptrace_cap.patch +crypto-caam-fix-aead-givenc-descriptors.patch +ext4-fix-mballoc-breakage-with-64k-block-size.patch +ext4-fix-stack-memory-corruption-with-64k-block-size.patch +ext4-use-more-strict-checks-for-inodes_per_block-on-mount.patch +ext4-fix-in-superblock-mount-options-processing.patch +ext4-add-sanity-checking-to-count_overhead.patch +ext4-reject-inodes-with-negative-size.patch +ext4-return-enomem-instead-of-success.patch +ext4-do-not-perform-data-journaling-when-data-is-encrypted.patch +f2fs-set-owner-for-debugfs-status-file-s-file_operations.patch +loop-return-proper-error-from-loop_queue_rq.patch +mm-vmscan.c-set-correct-defer-count-for-shrinker.patch