From: Greg Kroah-Hartman Date: Fri, 27 Jun 2014 21:38:55 +0000 (-0700) Subject: 3.10-stable patches X-Git-Tag: v3.4.96~55 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9d5d1a0e4dc0f84fd3c48d055f0949f062870f2c;p=thirdparty%2Fkernel%2Fstable-queue.git 3.10-stable patches added patches: arm64-ptrace-change-fs-when-passing-kernel-pointer-to-regset-code.patch ext4-fix-wrong-assert-in-ext4_mb_normalize_request.patch ext4-fix-zeroing-of-page-during-writeback.patch hid-core-fix-validation-of-report-id-0.patch idr-fix-overflow-bug-during-maximum-id-calculation-at-maximum-height.patch mm-fix-sleeping-function-warning-from-__put_anon_vma.patch mm-memory-failure.c-don-t-let-collect_procs-skip-over-processes-for-mf_action_required.patch mm-memory-failure.c-failure-send-right-signal-code-to-correct-thread.patch mm-vmscan-clear-kswapd-s-special-reclaim-powers-before-exiting.patch mm-vmscan-do-not-throttle-based-on-pfmemalloc-reserves-if-node-has-no-zone_normal.patch ptrace-fix-fork-event-messages-across-pid-namespaces.patch s390-lowcore-reserve-96-bytes-for-irb-in-lowcore.patch --- diff --git a/queue-3.10/arm64-ptrace-change-fs-when-passing-kernel-pointer-to-regset-code.patch b/queue-3.10/arm64-ptrace-change-fs-when-passing-kernel-pointer-to-regset-code.patch new file mode 100644 index 00000000000..ebece6f09d4 --- /dev/null +++ b/queue-3.10/arm64-ptrace-change-fs-when-passing-kernel-pointer-to-regset-code.patch @@ -0,0 +1,51 @@ +From c168870704bcde6bb63d05f7882b620dd3985a46 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Mon, 2 Jun 2014 11:47:23 +0100 +Subject: arm64: ptrace: change fs when passing kernel pointer to regset code + +From: Will Deacon + +commit c168870704bcde6bb63d05f7882b620dd3985a46 upstream. + +Our compat PTRACE_POKEUSR implementation simply passes the user data to +regset_copy_from_user after some simple range checking. 
Unfortunately, +the data in question has already been copied to the kernel stack by this +point, so the subsequent access_ok check fails and the ptrace request +returns -EFAULT. This causes problems tracing fork() with older versions +of strace. + +This patch briefly changes the fs to KERNEL_DS, so that the access_ok +check passes even with a kernel address. + +Signed-off-by: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/kernel/ptrace.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/arm64/kernel/ptrace.c ++++ b/arch/arm64/kernel/ptrace.c +@@ -872,6 +872,7 @@ static int compat_ptrace_write_user(stru + compat_ulong_t val) + { + int ret; ++ mm_segment_t old_fs = get_fs(); + + if (off & 3 || off >= COMPAT_USER_SZ) + return -EIO; +@@ -879,10 +880,13 @@ static int compat_ptrace_write_user(stru + if (off >= sizeof(compat_elf_gregset_t)) + return 0; + ++ set_fs(KERNEL_DS); + ret = copy_regset_from_user(tsk, &user_aarch32_view, + REGSET_COMPAT_GPR, off, + sizeof(compat_ulong_t), + &val); ++ set_fs(old_fs); ++ + return ret; + } + diff --git a/queue-3.10/ext4-fix-wrong-assert-in-ext4_mb_normalize_request.patch b/queue-3.10/ext4-fix-wrong-assert-in-ext4_mb_normalize_request.patch new file mode 100644 index 00000000000..25046e71430 --- /dev/null +++ b/queue-3.10/ext4-fix-wrong-assert-in-ext4_mb_normalize_request.patch @@ -0,0 +1,32 @@ +From b5b60778558cafad17bbcbf63e0310bd3c68eb17 Mon Sep 17 00:00:00 2001 +From: Maurizio Lombardi +Date: Tue, 27 May 2014 12:48:56 -0400 +Subject: ext4: fix wrong assert in ext4_mb_normalize_request() + +From: Maurizio Lombardi + +commit b5b60778558cafad17bbcbf63e0310bd3c68eb17 upstream. + +The variable "size" is expressed as number of blocks and not as +number of clusters, this could trigger a kernel panic when using +ext4 with the size of a cluster different from the size of a block. 
+ +Signed-off-by: Maurizio Lombardi +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/mballoc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3116,7 +3116,7 @@ ext4_mb_normalize_request(struct ext4_al + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); +- BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); ++ BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + diff --git a/queue-3.10/ext4-fix-zeroing-of-page-during-writeback.patch b/queue-3.10/ext4-fix-zeroing-of-page-during-writeback.patch new file mode 100644 index 00000000000..ca72a2a0962 --- /dev/null +++ b/queue-3.10/ext4-fix-zeroing-of-page-during-writeback.patch @@ -0,0 +1,70 @@ +From eeece469dedadf3918bad50ad80f4616a0064e90 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 27 May 2014 12:48:55 -0400 +Subject: ext4: fix zeroing of page during writeback + +From: Jan Kara + +commit eeece469dedadf3918bad50ad80f4616a0064e90 upstream. + +Tail of a page straddling inode size must be zeroed when being written +out due to POSIX requirement that modifications of mmaped page beyond +inode size must not be written to the file. ext4_bio_write_page() did +this only for blocks fully beyond inode size but didn't properly zero +blocks partially beyond inode size. Fix this. + +The problem has been uncovered by mmap_11-4 test in openposix test suite +(part of LTP). 
+ +Reported-by: Xiaoguang Wang +Fixes: 5a0dc7365c240 +Fixes: bd2d0210cf22f +CC: stable@vger.kernel.org +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/page-io.c | 24 +++++++++++------------- + 1 file changed, 11 insertions(+), 13 deletions(-) + +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -384,6 +384,17 @@ int ext4_bio_write_page(struct ext4_io_s + ClearPageError(page); + + /* ++ * Comments copied from block_write_full_page_endio: ++ * ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." ++ */ ++ if (len < PAGE_CACHE_SIZE) ++ zero_user_segment(page, len, PAGE_CACHE_SIZE); ++ /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO +@@ -394,19 +405,6 @@ int ext4_bio_write_page(struct ext4_io_s + do { + block_start = bh_offset(bh); + if (block_start >= len) { +- /* +- * Comments copied from block_write_full_page_endio: +- * +- * The page straddles i_size. It must be zeroed out on +- * each and every writepage invocation because it may +- * be mmapped. "A file is mapped in multiples of the +- * page size. For a file that is not a multiple of +- * the page size, the remaining memory is zeroed when +- * mapped, and writes to that region are not written +- * out to the file." 
+- */ +- zero_user_segment(page, block_start, +- block_start + blocksize); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; diff --git a/queue-3.10/hid-core-fix-validation-of-report-id-0.patch b/queue-3.10/hid-core-fix-validation-of-report-id-0.patch new file mode 100644 index 00000000000..7f7393c86a4 --- /dev/null +++ b/queue-3.10/hid-core-fix-validation-of-report-id-0.patch @@ -0,0 +1,54 @@ +From 1b15d2e5b8077670b1e6a33250a0d9577efff4a5 Mon Sep 17 00:00:00 2001 +From: Kees Cook +Date: Thu, 17 Apr 2014 13:22:09 -0700 +Subject: HID: core: fix validation of report id 0 + +From: Kees Cook + +commit 1b15d2e5b8077670b1e6a33250a0d9577efff4a5 upstream. + +Some drivers use the first HID report in the list instead of using an +index. In these cases, validation uses ID 0, which was supposed to mean +"first known report". This fixes the problem, which was causing at least +the lgff family of devices to stop working since hid_validate_values +was being called with ID 0, but the devices used single numbered IDs +for their reports: + +0x05, 0x01, /* Usage Page (Desktop), */ +0x09, 0x05, /* Usage (Gamepad), */ +0xA1, 0x01, /* Collection (Application), */ +0xA1, 0x02, /* Collection (Logical), */ +0x85, 0x01, /* Report ID (1), */ +... + +Reported-by: Simon Wood +Signed-off-by: Kees Cook +Reviewed-by: Benjamin Tissoires +Signed-off-by: Jiri Kosina +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hid/hid-core.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/drivers/hid/hid-core.c ++++ b/drivers/hid/hid-core.c +@@ -796,7 +796,17 @@ struct hid_report *hid_validate_values(s + * ->numbered being checked, which may not always be the case when + * drivers go to access report values. + */ +- report = hid->report_enum[type].report_id_hash[id]; ++ if (id == 0) { ++ /* ++ * Validating on id 0 means we should examine the first ++ * report in the list. 
++ */ ++ report = list_entry( ++ hid->report_enum[type].report_list.next, ++ struct hid_report, list); ++ } else { ++ report = hid->report_enum[type].report_id_hash[id]; ++ } + if (!report) { + hid_err(hid, "missing %s %u\n", hid_report_names[type], id); + return NULL; diff --git a/queue-3.10/idr-fix-overflow-bug-during-maximum-id-calculation-at-maximum-height.patch b/queue-3.10/idr-fix-overflow-bug-during-maximum-id-calculation-at-maximum-height.patch new file mode 100644 index 00000000000..1a7532691df --- /dev/null +++ b/queue-3.10/idr-fix-overflow-bug-during-maximum-id-calculation-at-maximum-height.patch @@ -0,0 +1,77 @@ +From 3afb69cb5572b3c8c898c00880803cf1a49852c4 Mon Sep 17 00:00:00 2001 +From: Lai Jiangshan +Date: Fri, 6 Jun 2014 14:37:10 -0700 +Subject: idr: fix overflow bug during maximum ID calculation at maximum height + +From: Lai Jiangshan + +commit 3afb69cb5572b3c8c898c00880803cf1a49852c4 upstream. + +idr_replace() open-codes the logic to calculate the maximum valid ID +given the height of the idr tree; unfortunately, the open-coded logic +doesn't account for the fact that the top layer may have unused slots +and over-shifts the limit to zero when the tree is at its maximum +height. + +The following test code shows it fails to replace the value for +id=((1<<27)+42): + + static void test5(void) + { + int id; + DEFINE_IDR(test_idr); + #define TEST5_START ((1<<27)+42) /* use the highest layer */ + + printk(KERN_INFO "Start test5\n"); + id = idr_alloc(&test_idr, (void *)1, TEST5_START, 0, GFP_KERNEL); + BUG_ON(id != TEST5_START); + TEST_BUG_ON(idr_replace(&test_idr, (void *)2, TEST5_START) != (void *)1); + idr_destroy(&test_idr); + printk(KERN_INFO "End of test5\n"); + } + +Fix the bug by using idr_max() which correctly takes into account the +maximum allowed shift. 
+ +sub_alloc() shares the same problem and may incorrectly fail with +-EAGAIN; however, this bug doesn't affect correct operation because +idr_get_empty_slot(), which already uses idr_max(), retries with the +increased @id in such cases. + +[tj@kernel.org: Updated patch description.] +Signed-off-by: Lai Jiangshan +Acked-by: Tejun Heo +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + lib/idr.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/lib/idr.c ++++ b/lib/idr.c +@@ -250,7 +250,7 @@ static int sub_alloc(struct idr *idp, in + id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1; + + /* if already at the top layer, we need to grow */ +- if (id >= 1 << (idp->layers * IDR_BITS)) { ++ if (id > idr_max(idp->layers)) { + *starting_id = id; + return -EAGAIN; + } +@@ -829,12 +829,10 @@ void *idr_replace(struct idr *idp, void + if (!p) + return ERR_PTR(-EINVAL); + +- n = (p->layer+1) * IDR_BITS; +- +- if (id >= (1 << n)) ++ if (id > idr_max(p->layer + 1)) + return ERR_PTR(-EINVAL); + +- n -= IDR_BITS; ++ n = p->layer * IDR_BITS; + while ((n > 0) && p) { + p = p->ary[(id >> n) & IDR_MASK]; + n -= IDR_BITS; diff --git a/queue-3.10/mm-fix-sleeping-function-warning-from-__put_anon_vma.patch b/queue-3.10/mm-fix-sleeping-function-warning-from-__put_anon_vma.patch new file mode 100644 index 00000000000..d35c97adf38 --- /dev/null +++ b/queue-3.10/mm-fix-sleeping-function-warning-from-__put_anon_vma.patch @@ -0,0 +1,66 @@ +From 7f39dda9d86fb4f4f17af0de170decf125726f8c Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Wed, 4 Jun 2014 16:05:33 -0700 +Subject: mm: fix sleeping function warning from __put_anon_vma + +From: Hugh Dickins + +commit 7f39dda9d86fb4f4f17af0de170decf125726f8c upstream. 
+ +Trinity reports BUG: + + sleeping function called from invalid context at kernel/locking/rwsem.c:47 + in_atomic(): 0, irqs_disabled(): 0, pid: 5787, name: trinity-c27 + +__might_sleep < down_write < __put_anon_vma < page_get_anon_vma < +migrate_pages < compact_zone < compact_zone_order < try_to_compact_pages .. + +Right, since conversion to mutex then rwsem, we should not put_anon_vma() +from inside an rcu_read_lock()ed section: fix the two places that did so. +And add might_sleep() to anon_vma_free(), as suggested by Peter Zijlstra. + +Fixes: 88c22088bf23 ("mm: optimize page_lock_anon_vma() fast-path") +Reported-by: Dave Jones +Signed-off-by: Hugh Dickins +Cc: Peter Zijlstra +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/rmap.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct + * LOCK should suffice since the actual taking of the lock must + * happen _before_ what follows. + */ ++ might_sleep(); + if (rwsem_is_locked(&anon_vma->root->rwsem)) { + anon_vma_lock_write(anon_vma); + anon_vma_unlock_write(anon_vma); +@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struc + * above cannot corrupt). 
+ */ + if (!page_mapped(page)) { ++ rcu_read_unlock(); + put_anon_vma(anon_vma); +- anon_vma = NULL; ++ return NULL; + } + out: + rcu_read_unlock(); +@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read + } + + if (!page_mapped(page)) { ++ rcu_read_unlock(); + put_anon_vma(anon_vma); +- anon_vma = NULL; +- goto out; ++ return NULL; + } + + /* we pinned the anon_vma, its safe to sleep */ diff --git a/queue-3.10/mm-memory-failure.c-don-t-let-collect_procs-skip-over-processes-for-mf_action_required.patch b/queue-3.10/mm-memory-failure.c-don-t-let-collect_procs-skip-over-processes-for-mf_action_required.patch new file mode 100644 index 00000000000..4c6e78df270 --- /dev/null +++ b/queue-3.10/mm-memory-failure.c-don-t-let-collect_procs-skip-over-processes-for-mf_action_required.patch @@ -0,0 +1,123 @@ +From 74614de17db6fb472370c426d4f934d8d616edf2 Mon Sep 17 00:00:00 2001 +From: Tony Luck +Date: Wed, 4 Jun 2014 16:11:01 -0700 +Subject: mm/memory-failure.c: don't let collect_procs() skip over processes for MF_ACTION_REQUIRED + +From: Tony Luck + +commit 74614de17db6fb472370c426d4f934d8d616edf2 upstream. + +When Linux sees an "action optional" machine check (where h/w has reported +an error that is not in the current execution path) we generally do not +want to signal a process, since most processes do not have a SIGBUS +handler - we'd just prematurely terminate the process for a problem that +they might never actually see. + +task_early_kill() decides whether to consider a process - and it checks +whether this specific process has been marked for early signals with +"prctl", or if the system administrator has requested early signals for +all processes using /proc/sys/vm/memory_failure_early_kill. + +But for MF_ACTION_REQUIRED case we must not defer. The error is in the +execution path of the current thread so we must send the SIGBUS +immediately. 
+ +Fix by passing a flag argument through collect_procs*() to +task_early_kill() so it knows whether we can defer or must take action. + +Signed-off-by: Tony Luck +Signed-off-by: Naoya Horiguchi +Cc: Andi Kleen +Cc: Borislav Petkov +Cc: Chen Gong +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -382,10 +382,12 @@ static void kill_procs(struct list_head + } + } + +-static int task_early_kill(struct task_struct *tsk) ++static int task_early_kill(struct task_struct *tsk, int force_early) + { + if (!tsk->mm) + return 0; ++ if (force_early) ++ return 1; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +@@ -395,7 +397,7 @@ static int task_early_kill(struct task_s + * Collect processes when the error hit an anonymous page. + */ + static void collect_procs_anon(struct page *page, struct list_head *to_kill, +- struct to_kill **tkc) ++ struct to_kill **tkc, int force_early) + { + struct vm_area_struct *vma; + struct task_struct *tsk; +@@ -411,7 +413,7 @@ static void collect_procs_anon(struct pa + for_each_process (tsk) { + struct anon_vma_chain *vmac; + +- if (!task_early_kill(tsk)) ++ if (!task_early_kill(tsk, force_early)) + continue; + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { +@@ -430,7 +432,7 @@ static void collect_procs_anon(struct pa + * Collect processes when the error hit a file mapped page. 
+ */ + static void collect_procs_file(struct page *page, struct list_head *to_kill, +- struct to_kill **tkc) ++ struct to_kill **tkc, int force_early) + { + struct vm_area_struct *vma; + struct task_struct *tsk; +@@ -441,7 +443,7 @@ static void collect_procs_file(struct pa + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + +- if (!task_early_kill(tsk)) ++ if (!task_early_kill(tsk, force_early)) + continue; + + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, +@@ -467,7 +469,8 @@ static void collect_procs_file(struct pa + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +-static void collect_procs(struct page *page, struct list_head *tokill) ++static void collect_procs(struct page *page, struct list_head *tokill, ++ int force_early) + { + struct to_kill *tk; + +@@ -478,9 +481,9 @@ static void collect_procs(struct page *p + if (!tk) + return; + if (PageAnon(page)) +- collect_procs_anon(page, tokill, &tk); ++ collect_procs_anon(page, tokill, &tk, force_early); + else +- collect_procs_file(page, tokill, &tk); ++ collect_procs_file(page, tokill, &tk, force_early); + kfree(tk); + } + +@@ -965,7 +968,7 @@ static int hwpoison_user_mappings(struct + * there's nothing that can be done. 
+ */ + if (kill) +- collect_procs(ppage, &tokill); ++ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + + ret = try_to_unmap(ppage, ttu); + if (ret != SWAP_SUCCESS) diff --git a/queue-3.10/mm-memory-failure.c-failure-send-right-signal-code-to-correct-thread.patch b/queue-3.10/mm-memory-failure.c-failure-send-right-signal-code-to-correct-thread.patch new file mode 100644 index 00000000000..aea244f7fdd --- /dev/null +++ b/queue-3.10/mm-memory-failure.c-failure-send-right-signal-code-to-correct-thread.patch @@ -0,0 +1,53 @@ +From a70ffcac741d31a406c1d2b832ae43d658e7e1cf Mon Sep 17 00:00:00 2001 +From: Tony Luck +Date: Wed, 4 Jun 2014 16:10:59 -0700 +Subject: mm/memory-failure.c-failure: send right signal code to correct thread + +From: Tony Luck + +commit a70ffcac741d31a406c1d2b832ae43d658e7e1cf upstream. + +When a thread in a multi-threaded application hits a machine check because +of an uncorrectable error in memory - we want to send the SIGBUS with +si.si_code = BUS_MCEERR_AR to that thread. Currently we fail to do that +if the active thread is not the primary thread in the process. +collect_procs() just finds primary threads and this test: + + if ((flags & MF_ACTION_REQUIRED) && t == current) { + +will see that the thread we found isn't the current thread and so send a +si.si_code = BUS_MCEERR_AO to the primary (and nothing to the active +thread at this time). + +We can fix this by checking whether "current" shares the same mm with the +process that collect_procs() said owned the page. If so, we send the +SIGBUS to current (with code BUS_MCEERR_AR). 
+ +Signed-off-by: Tony Luck +Signed-off-by: Naoya Horiguchi +Reported-by: Otto Bruggeman +Cc: Andi Kleen +Cc: Borislav Petkov +Cc: Chen Gong +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -208,9 +208,9 @@ static int kill_proc(struct task_struct + #endif + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + +- if ((flags & MF_ACTION_REQUIRED) && t == current) { ++ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { + si.si_code = BUS_MCEERR_AR; +- ret = force_sig_info(SIGBUS, &si, t); ++ ret = force_sig_info(SIGBUS, &si, current); + } else { + /* + * Don't use force here, it's convenient if the signal diff --git a/queue-3.10/mm-vmscan-clear-kswapd-s-special-reclaim-powers-before-exiting.patch b/queue-3.10/mm-vmscan-clear-kswapd-s-special-reclaim-powers-before-exiting.patch new file mode 100644 index 00000000000..d059b40b286 --- /dev/null +++ b/queue-3.10/mm-vmscan-clear-kswapd-s-special-reclaim-powers-before-exiting.patch @@ -0,0 +1,94 @@ +From 71abdc15adf8c702a1dd535f8e30df50758848d2 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Fri, 6 Jun 2014 14:35:35 -0700 +Subject: mm: vmscan: clear kswapd's special reclaim powers before exiting + +From: Johannes Weiner + +commit 71abdc15adf8c702a1dd535f8e30df50758848d2 upstream. + +When kswapd exits, it can end up taking locks that were previously held +by allocating tasks while they waited for reclaim. Lockdep currently +warns about this: + +On Wed, May 28, 2014 at 06:06:34PM +0800, Gu Zheng wrote: +> inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-R} usage. 
+> kswapd2/1151 [HC0[0]:SC0[0]:HE1:SE1] takes: +> (&sig->group_rwsem){+++++?}, at: exit_signals+0x24/0x130 +> {RECLAIM_FS-ON-W} state was registered at: +> mark_held_locks+0xb9/0x140 +> lockdep_trace_alloc+0x7a/0xe0 +> kmem_cache_alloc_trace+0x37/0x240 +> flex_array_alloc+0x99/0x1a0 +> cgroup_attach_task+0x63/0x430 +> attach_task_by_pid+0x210/0x280 +> cgroup_procs_write+0x16/0x20 +> cgroup_file_write+0x120/0x2c0 +> vfs_write+0xc0/0x1f0 +> SyS_write+0x4c/0xa0 +> tracesys+0xdd/0xe2 +> irq event stamp: 49 +> hardirqs last enabled at (49): _raw_spin_unlock_irqrestore+0x36/0x70 +> hardirqs last disabled at (48): _raw_spin_lock_irqsave+0x2b/0xa0 +> softirqs last enabled at (0): copy_process.part.24+0x627/0x15f0 +> softirqs last disabled at (0): (null) +> +> other info that might help us debug this: +> Possible unsafe locking scenario: +> +> CPU0 +> ---- +> lock(&sig->group_rwsem); +> +> lock(&sig->group_rwsem); +> +> *** DEADLOCK *** +> +> no locks held by kswapd2/1151. +> +> stack backtrace: +> CPU: 30 PID: 1151 Comm: kswapd2 Not tainted 3.10.39+ #4 +> Call Trace: +> dump_stack+0x19/0x1b +> print_usage_bug+0x1f7/0x208 +> mark_lock+0x21d/0x2a0 +> __lock_acquire+0x52a/0xb60 +> lock_acquire+0xa2/0x140 +> down_read+0x51/0xa0 +> exit_signals+0x24/0x130 +> do_exit+0xb5/0xa50 +> kthread+0xdb/0x100 +> ret_from_fork+0x7c/0xb0 + +This is because the kswapd thread is still marked as a reclaimer at the +time of exit. But because it is exiting, nobody is actually waiting on +it to make reclaim progress anymore, and it's nothing but a regular +thread at this point. Be tidy and strip it of all its powers +(PF_MEMALLOC, PF_SWAPWRITE, PF_KSWAPD, and the lockdep reclaim state) +before returning from the thread function. 
+ +Signed-off-by: Johannes Weiner +Reported-by: Gu Zheng +Cc: Yasuaki Ishimatsu +Cc: Tang Chen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -3090,7 +3090,10 @@ static int kswapd(void *p) + } + } + ++ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); + current->reclaim_state = NULL; ++ lockdep_clear_current_reclaim_state(); ++ + return 0; + } + diff --git a/queue-3.10/mm-vmscan-do-not-throttle-based-on-pfmemalloc-reserves-if-node-has-no-zone_normal.patch b/queue-3.10/mm-vmscan-do-not-throttle-based-on-pfmemalloc-reserves-if-node-has-no-zone_normal.patch new file mode 100644 index 00000000000..3683b9c9ef6 --- /dev/null +++ b/queue-3.10/mm-vmscan-do-not-throttle-based-on-pfmemalloc-reserves-if-node-has-no-zone_normal.patch @@ -0,0 +1,106 @@ +From 675becce15f320337499bc1a9356260409a5ba29 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Wed, 4 Jun 2014 16:07:35 -0700 +Subject: mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL + +From: Mel Gorman + +commit 675becce15f320337499bc1a9356260409a5ba29 upstream. + +throttle_direct_reclaim() is meant to trigger during swap-over-network +during which the min watermark is treated as a pfmemalloc reserve. It +throttles on the first node in the zonelist but this is flawed. + +The user-visible impact is that a process running on CPU whose local +memory node has no ZONE_NORMAL will stall for prolonged periods of time, +possibly indefinitely. This is due to throttle_direct_reclaim thinking the +pfmemalloc reserves are depleted when in fact they don't exist on that +node. + +On a NUMA machine running a 32-bit kernel (I know) allocation requests +from CPUs on node 1 would detect no pfmemalloc reserves and the process +gets throttled. 
This patch adjusts throttling of direct reclaim to +throttle based on the first node in the zonelist that has a usable +ZONE_NORMAL or lower zone. + +[akpm@linux-foundation.org: coding-style fixes] +Signed-off-by: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 37 insertions(+), 6 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2286,10 +2286,17 @@ static bool pfmemalloc_watermark_ok(pg_d + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; ++ if (!populated_zone(zone)) ++ continue; ++ + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + ++ /* If there are no reserves (unexpected config) then do not throttle */ ++ if (!pfmemalloc_reserve) ++ return true; ++ + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ +@@ -2314,9 +2321,9 @@ static bool pfmemalloc_watermark_ok(pg_d + static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) + { ++ struct zoneref *z; + struct zone *zone; +- int high_zoneidx = gfp_zone(gfp_mask); +- pg_data_t *pgdat; ++ pg_data_t *pgdat = NULL; + + /* + * Kernel threads should not be throttled as they may be indirectly +@@ -2335,10 +2342,34 @@ static bool throttle_direct_reclaim(gfp_ + if (fatal_signal_pending(current)) + goto out; + +- /* Check if the pfmemalloc reserves are ok */ +- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); +- pgdat = zone->zone_pgdat; +- if (pfmemalloc_watermark_ok(pgdat)) ++ /* ++ * Check if the pfmemalloc reserves are ok by finding the first node ++ * with a usable ZONE_NORMAL or lower zone. The expectation is that ++ * GFP_KERNEL will be required for allocating network buffers when ++ * swapping over the network so ZONE_HIGHMEM is unusable. 
++ * ++ * Throttling is based on the first usable node and throttled processes ++ * wait on a queue until kswapd makes progress and wakes them. There ++ * is an affinity then between processes waking up and where reclaim ++ * progress has been made assuming the process wakes on the same node. ++ * More importantly, processes running on remote nodes will not compete ++ * for remote pfmemalloc reserves and processes on different nodes ++ * should make reasonable progress. ++ */ ++ for_each_zone_zonelist_nodemask(zone, z, zonelist, ++ gfp_mask, nodemask) { ++ if (zone_idx(zone) > ZONE_NORMAL) ++ continue; ++ ++ /* Throttle based on the first usable node */ ++ pgdat = zone->zone_pgdat; ++ if (pfmemalloc_watermark_ok(pgdat)) ++ goto out; ++ break; ++ } ++ ++ /* If no zone was usable by the allocation flags then do not throttle */ ++ if (!pgdat) + goto out; + + /* Account for the throttling */ diff --git a/queue-3.10/ptrace-fix-fork-event-messages-across-pid-namespaces.patch b/queue-3.10/ptrace-fix-fork-event-messages-across-pid-namespaces.patch new file mode 100644 index 00000000000..d868fdfc645 --- /dev/null +++ b/queue-3.10/ptrace-fix-fork-event-messages-across-pid-namespaces.patch @@ -0,0 +1,118 @@ +From 4e52365f279564cef0ddd41db5237f0471381093 Mon Sep 17 00:00:00 2001 +From: Matthew Dempsky +Date: Fri, 6 Jun 2014 14:36:42 -0700 +Subject: ptrace: fix fork event messages across pid namespaces + +From: Matthew Dempsky + +commit 4e52365f279564cef0ddd41db5237f0471381093 upstream. + +When tracing a process in another pid namespace, it's important for fork +event messages to contain the child's pid as seen from the tracer's pid +namespace, not the parent's. Otherwise, the tracer won't be able to +correlate the fork event with later SIGTRAP signals it receives from the +child. + +We still risk a race condition if a ptracer from a different pid +namespace attaches after we compute the pid_t value. 
However, sending a +bogus fork event message in this unlikely scenario is still a vast +improvement over the status quo where we always send bogus fork event +messages to debuggers in a different pid namespace than the forking +process. + +Signed-off-by: Matthew Dempsky +Acked-by: Oleg Nesterov +Cc: Kees Cook +Cc: Julien Tinnes +Cc: Roland McGrath +Cc: Jan Kratochvil +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/ptrace.h | 32 ++++++++++++++++++++++++++++++++ + kernel/fork.c | 10 +++++++--- + 2 files changed, 39 insertions(+), 3 deletions(-) + +--- a/include/linux/ptrace.h ++++ b/include/linux/ptrace.h +@@ -5,6 +5,7 @@ + #include <linux/sched.h> /* For struct task_struct. */ + #include <linux/err.h> /* for IS_ERR_VALUE */ + #include <linux/bug.h> /* For BUG_ON. */ ++#include <linux/pid_namespace.h> /* For task_active_pid_ns. */ + #include <uapi/linux/ptrace.h> + + /* +@@ -129,6 +130,37 @@ static inline void ptrace_event(int even + } + + /** ++ * ptrace_event_pid - possibly stop for a ptrace event notification ++ * @event: %PTRACE_EVENT_* value to report ++ * @pid: process identifier for %PTRACE_GETEVENTMSG to return ++ * ++ * Check whether @event is enabled and, if so, report @event and @pid ++ * to the ptrace parent. @pid is reported as the pid_t seen from the ++ * ptrace parent's pid namespace. ++ * ++ * Called without locks. ++ */ ++static inline void ptrace_event_pid(int event, struct pid *pid) ++{ ++ /* ++ * FIXME: There's a potential race if a ptracer in a different pid ++ * namespace than parent attaches between computing message below and ++ * when we acquire tasklist_lock in ptrace_stop(). If this happens, ++ * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. 
++ */ ++ unsigned long message = 0; ++ struct pid_namespace *ns; ++ ++ rcu_read_lock(); ++ ns = task_active_pid_ns(rcu_dereference(current->parent)); ++ if (ns) ++ message = pid_nr_ns(pid, ns); ++ rcu_read_unlock(); ++ ++ ptrace_event(event, message); ++} ++ ++/** + * ptrace_init_task - initialize ptrace state for a new child + * @child: new child task + * @ptrace: true if child should be ptrace'd by parent's tracer +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1607,10 +1607,12 @@ long do_fork(unsigned long clone_flags, + */ + if (!IS_ERR(p)) { + struct completion vfork; ++ struct pid *pid; + + trace_sched_process_fork(current, p); + +- nr = task_pid_vnr(p); ++ pid = get_task_pid(p, PIDTYPE_PID); ++ nr = pid_vnr(pid); + + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); +@@ -1625,12 +1627,14 @@ long do_fork(unsigned long clone_flags, + + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) +- ptrace_event(trace, nr); ++ ptrace_event_pid(trace, pid); + + if (clone_flags & CLONE_VFORK) { + if (!wait_for_vfork_done(p, &vfork)) +- ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); ++ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); + } ++ ++ put_pid(pid); + } else { + nr = PTR_ERR(p); + } diff --git a/queue-3.10/s390-lowcore-reserve-96-bytes-for-irb-in-lowcore.patch b/queue-3.10/s390-lowcore-reserve-96-bytes-for-irb-in-lowcore.patch new file mode 100644 index 00000000000..4b2f08c0eef --- /dev/null +++ b/queue-3.10/s390-lowcore-reserve-96-bytes-for-irb-in-lowcore.patch @@ -0,0 +1,63 @@ +From 993072ee67aa179c48c85eb19869804e68887d86 Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Mon, 26 May 2014 21:55:08 +0200 +Subject: s390/lowcore: reserve 96 bytes for IRB in lowcore + +From: Christian Borntraeger + +commit 993072ee67aa179c48c85eb19869804e68887d86 upstream. + +The IRB might be 96 bytes if the extended-I/O-measurement facility is +used. 
This feature is currently not used by Linux, but struct irb +already has the emw defined. So let's make the irb in lowcore match the +size of the internal data structure to be future proof. +We also have to add a pad, to correctly align the paste. + +The bigger irb field also circumvents a bug in some QEMU versions that +always write the emw field on test subchannel and therefore destroy the +paste definitions of this CPU. Running under these QEMU versions broke +some timing functions in the VDSO and all users of these functions, +e.g. some JREs. + +Signed-off-by: Christian Borntraeger +Signed-off-by: Martin Schwidefsky +Cc: Heiko Carstens +Cc: Sebastian Ott +Cc: Cornelia Huck +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/include/asm/lowcore.h | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/arch/s390/include/asm/lowcore.h ++++ b/arch/s390/include/asm/lowcore.h +@@ -142,9 +142,9 @@ struct _lowcore { + __u8 pad_0x02fc[0x0300-0x02fc]; /* 0x02fc */ + + /* Interrupt response block */ +- __u8 irb[64]; /* 0x0300 */ ++ __u8 irb[96]; /* 0x0300 */ + +- __u8 pad_0x0340[0x0e00-0x0340]; /* 0x0340 */ ++ __u8 pad_0x0360[0x0e00-0x0360]; /* 0x0360 */ + + /* + * 0xe00 contains the address of the IPL Parameter Information +@@ -288,12 +288,13 @@ struct _lowcore { + __u8 pad_0x03a0[0x0400-0x03a0]; /* 0x03a0 */ + + /* Interrupt response block.
*/ +- __u8 irb[64]; /* 0x0400 */ ++ __u8 irb[96]; /* 0x0400 */ ++ __u8 pad_0x0460[0x0480-0x0460]; /* 0x0460 */ + + /* Per cpu primary space access list */ +- __u32 paste[16]; /* 0x0440 */ ++ __u32 paste[16]; /* 0x0480 */ + +- __u8 pad_0x0480[0x0e00-0x0480]; /* 0x0480 */ ++ __u8 pad_0x04c0[0x0e00-0x04c0]; /* 0x04c0 */ + + /* + * 0xe00 contains the address of the IPL Parameter Information diff --git a/queue-3.10/series b/queue-3.10/series index 677ad57d1eb..5467d47256e 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -11,3 +11,15 @@ usb-dwc3-gadget-clear-stall-when-disabling-endpoint.patch arm-omap-replace-checks-for-config_usb_gadget_omap.patch usb-ehci-avoid-bios-handover-on-the-hasee-e200.patch usb-option-fix-runtime-pm-handling.patch +mm-vmscan-do-not-throttle-based-on-pfmemalloc-reserves-if-node-has-no-zone_normal.patch +mm-memory-failure.c-failure-send-right-signal-code-to-correct-thread.patch +mm-memory-failure.c-don-t-let-collect_procs-skip-over-processes-for-mf_action_required.patch +mm-fix-sleeping-function-warning-from-__put_anon_vma.patch +hid-core-fix-validation-of-report-id-0.patch +mm-vmscan-clear-kswapd-s-special-reclaim-powers-before-exiting.patch +ptrace-fix-fork-event-messages-across-pid-namespaces.patch +arm64-ptrace-change-fs-when-passing-kernel-pointer-to-regset-code.patch +idr-fix-overflow-bug-during-maximum-id-calculation-at-maximum-height.patch +s390-lowcore-reserve-96-bytes-for-irb-in-lowcore.patch +ext4-fix-zeroing-of-page-during-writeback.patch +ext4-fix-wrong-assert-in-ext4_mb_normalize_request.patch