From cae0d5cf4e7d585ca774a44871ce5e07a7b3f018 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Fri, 12 Feb 2016 13:00:34 -0800
Subject: [PATCH] 4.3-stable patches

added patches:
	fat-fix-fake_offset-handling-on-error-path.patch
	fs-seqfile-always-allow-oom-killer.patch
	kernel-signal.c-unexport-sigsuspend.patch
	lib-hexdump.c-truncate-output-in-case-of-overflow.patch
	memcg-fix-thresholds-for-32b-architectures.patch
	mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
	mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
	mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
	mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
	mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
	mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
	mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
	ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
	ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
	ocfs2-fix-bug-when-calculate-new-backup-super.patch
	ocfs2-fix-sgid-not-inherited-issue.patch
	proc-actually-make-proc_fd_permission-thread-friendly.patch
	proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
	remoteproc-avoid-stack-overflow-in-debugfs-file.patch
	sh64-fix-__nr_fgetxattr.patch
---
 ...x-fake_offset-handling-on-error-path.patch |  80 +++++++++
 .../fs-seqfile-always-allow-oom-killer.patch  |  64 +++++++
 .../kernel-signal.c-unexport-sigsuspend.patch |  64 +++++++
 ...-truncate-output-in-case-of-overflow.patch |  51 ++++++
 ...fix-thresholds-for-32b-architectures.patch | 105 +++++++++++
 ...-huge_pte_alloc-only-if-ptep-is-null.patch |  61 +++++++
 ...y-leak-caused-by-wrong-reserve-count.patch |  59 +++++++
 ...-memory-leak-for-placeholder-entries.patch |  92 ++++++++++
 ...ocate-hole-punch-of-areas-with-holes.patch | 167 ++++++++++++++++++
 ...tting-tif_memdie-and-sending-sigkill.patch |  77 ++++++++
 ...r-sizes-larger-than-kmalloc_min_size.patch |  87 +++++++++
 ...ry-reclaim-doesn-t-make-any-progress.patch | 122 +++++++++++++
 ...k-while-doing-local-recovery-cleanup.patch |  38 ++++
 ...ning-the-migration-mle-that-is-inuse.patch |  97 ++++++++++
 ...-bug-when-calculate-new-backup-super.patch |  98 ++++++++++
 .../ocfs2-fix-sgid-not-inherited-issue.patch  |  44 +++++
 ...e-proc_fd_permission-thread-friendly.patch |  53 ++++++
 ...-writing-to-proc-pid-coredump_filter.patch |  40 +++++
 ...avoid-stack-overflow-in-debugfs-file.patch |  40 +++++
 queue-4.3/series                              |  20 +++
 queue-4.3/sh64-fix-__nr_fgetxattr.patch       |  37 ++++
 21 files changed, 1496 insertions(+)
 create mode 100644 queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch
 create mode 100644 queue-4.3/fs-seqfile-always-allow-oom-killer.patch
 create mode 100644 queue-4.3/kernel-signal.c-unexport-sigsuspend.patch
 create mode 100644 queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch
 create mode 100644 queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch
 create mode 100644 queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
 create mode 100644 queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
 create mode 100644 queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
 create mode 100644 queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
 create mode 100644 queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
 create mode 100644 queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
 create mode 100644 queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
 create mode 100644 queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
 create mode 100644 queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
 create mode 100644 queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch
 create mode 100644 queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch
 create mode 100644 queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch
 create mode 100644 queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
 create mode 100644 queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch
 create mode 100644 queue-4.3/sh64-fix-__nr_fgetxattr.patch

diff --git a/queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch b/queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch
new file mode 100644
index 00000000000..155daff64de
--- /dev/null
+++ b/queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch
@@ -0,0 +1,80 @@
+From 928a477102c4fc6739883415b66987207e3502f4 Mon Sep 17 00:00:00 2001
+From: OGAWA Hirofumi
+Date: Fri, 20 Nov 2015 15:57:15 -0800
+Subject: fat: fix fake_offset handling on error path
+
+From: OGAWA Hirofumi
+
+commit 928a477102c4fc6739883415b66987207e3502f4 upstream.
+
+For the root directory, . and .. are faked (using dir_emit_dots()) and
+ctx->pos is reset from 2 to 0.
+
+A corrupted root directory could cause fat_get_entry() to fail, but
+->iterate() (fat_readdir()) reports progress to the VFS (with ctx->pos
+rewound to 0), so any following calls to ->iterate() continue to return
+the same entries again and again.
+
+The result is that userspace will never see the end of the directory,
+causing e.g. 'ls' to hang in a getdents() loop.
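+
+For illustration, this is the shape of the userspace loop that spins (a
+minimal sketch, not taken from the report; the mount point is made up,
+and readdir() is simply the libc wrapper around getdents()):
+
+  #include <dirent.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          DIR *d = opendir("/mnt/fat");   /* FAT fs with corrupted root */
+          struct dirent *de;
+
+          if (!d)
+                  return 1;
+          /* readdir() stops only when getdents() reports no entries;
+           * with ctx->pos rewound to 0 on every error, the same entries
+           * come back forever and this loop never terminates. */
+          while ((de = readdir(d)) != NULL)
+                  printf("%s\n", de->d_name);
+          closedir(d);
+          return 0;
+  }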
+
+[hirofumi@mail.parknet.co.jp: cleanup and make sure to correct fake_offset]
+Reported-by: Vegard Nossum
+Tested-by: Vegard Nossum
+Signed-off-by: Richard Weinberger
+Signed-off-by: OGAWA Hirofumi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/fat/dir.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/fs/fat/dir.c
++++ b/fs/fat/dir.c
+@@ -610,9 +610,9 @@ parse_record:
+ 		int status = fat_parse_long(inode, &cpos, &bh, &de,
+ 					    &unicode, &nr_slots);
+ 		if (status < 0) {
+-			ctx->pos = cpos;
++			bh = NULL;
+ 			ret = status;
+-			goto out;
++			goto end_of_dir;
+ 		} else if (status == PARSE_INVALID)
+ 			goto record_end;
+ 		else if (status == PARSE_NOT_LONGNAME)
+@@ -654,8 +654,9 @@ parse_record:
+ 		fill_len = short_len;
+ 
+ start_filldir:
+-	if (!fake_offset)
+-		ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++	ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++	if (fake_offset && ctx->pos < 2)
++		ctx->pos = 2;
+ 
+ 	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+ 		if (!dir_emit_dot(file, ctx))
+@@ -681,14 +682,19 @@ record_end:
+ 		fake_offset = 0;
+ 		ctx->pos = cpos;
+ 		goto get_new;
++
+ end_of_dir:
+-	ctx->pos = cpos;
++	if (fake_offset && cpos < 2)
++		ctx->pos = 2;
++	else
++		ctx->pos = cpos;
+ fill_failed:
+ 	brelse(bh);
+ 	if (unicode)
+ 		__putname(unicode);
+ out:
+ 	mutex_unlock(&sbi->s_lock);
++
+ 	return ret;
+ }
+
diff --git a/queue-4.3/fs-seqfile-always-allow-oom-killer.patch b/queue-4.3/fs-seqfile-always-allow-oom-killer.patch
new file mode 100644
index 00000000000..c4919c40532
--- /dev/null
+++ b/queue-4.3/fs-seqfile-always-allow-oom-killer.patch
@@ -0,0 +1,64 @@
+From 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 Mon Sep 17 00:00:00 2001
+From: Greg Thelen
+Date: Fri, 6 Nov 2015 16:32:42 -0800
+Subject: fs, seqfile: always allow oom killer
+
+From: Greg Thelen
+
+commit 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 upstream.
+
+Since 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill
+processes") seq_buf_alloc() avoids calling the oom killer for PAGE_SIZE or
+smaller allocations; but larger allocations can use the oom killer via
+vmalloc(). Thus reads of small files can return ENOMEM, but larger files
+use the oom killer to avoid ENOMEM.
+
+The effect of this bug is that reads from /proc and other virtual
+filesystems can return ENOMEM instead of the preferred behavior - oom
+killing something (possibly the calling process). I don't know of anyone
+except Google who has noticed the issue.
+
+I suspect the fix is needed more on smaller systems where there isn't any
+reclaimable memory. But these seem like the kinds of systems which
+probably don't use the oom killer for production situations.
+
+Memory overcommit requires use of the oom killer to select a victim
+regardless of file size.
+
+Enable oom killer for small seq_buf_alloc() allocations.
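+
+Restated, the allocation policy this patch implements (derived entirely
+from the hunk below) is:
+
+  size <= PAGE_SIZE: kmalloc(GFP_KERNEL), may oom kill, no vmalloc fallback
+  size >  PAGE_SIZE: kmalloc(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN),
+                     then vmalloc(size) if that fails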
+
+Fixes: 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes")
+Signed-off-by: David Rientjes
+Signed-off-by: Greg Thelen
+Acked-by: Eric Dumazet
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/seq_file.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/seq_file.c
++++ b/fs/seq_file.c
+@@ -25,12 +25,17 @@ static void seq_set_overflow(struct seq_
+ static void *seq_buf_alloc(unsigned long size)
+ {
+ 	void *buf;
++	gfp_t gfp = GFP_KERNEL;
+ 
+ 	/*
+-	 * __GFP_NORETRY to avoid oom-killings with high-order allocations -
+-	 * it's better to fall back to vmalloc() than to kill things.
++	 * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
++	 * it's better to fall back to vmalloc() than to kill things. For small
++	 * allocations, just use GFP_KERNEL which will oom kill, thus no need
++	 * for vmalloc fallback.
+ 	 */
+-	buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
++	if (size > PAGE_SIZE)
++		gfp |= __GFP_NORETRY | __GFP_NOWARN;
++	buf = kmalloc(size, gfp);
+ 	if (!buf && size > PAGE_SIZE)
+ 		buf = vmalloc(size);
+ 	return buf;
diff --git a/queue-4.3/kernel-signal.c-unexport-sigsuspend.patch b/queue-4.3/kernel-signal.c-unexport-sigsuspend.patch
new file mode 100644
index 00000000000..82422de58b8
--- /dev/null
+++ b/queue-4.3/kernel-signal.c-unexport-sigsuspend.patch
@@ -0,0 +1,64 @@
+From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001
+From: Richard Weinberger
+Date: Fri, 20 Nov 2015 15:57:21 -0800
+Subject: kernel/signal.c: unexport sigsuspend()
+
+From: Richard Weinberger
+
+commit 9d8a765211335cfdad464b90fb19f546af5706ae upstream.
+
+sigsuspend() is nowhere used except in signal.c itself, so we can mark it
+static to avoid polluting the global namespace.
+
+But this patch is more than a boring cleanup patch, it fixes a real issue
+on UserModeLinux. UML has a special console driver to display ttys using
+xterm, or other terminal emulators, on the host side. Vegard reported
+that sometimes UML is unable to spawn an xterm and he's facing the
+following warning:
+
+  WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0()
+
+It turned out that this warning makes absolutely no sense, as the UML
+xterm code calls sigsuspend() on the host side, or at least it tries. But
+as the kernel itself offers a sigsuspend() symbol, the linker chose this
+one instead of the glibc wrapper. Interestingly this code had always
+worked, but it always blocked signals on the wrong side. Some recent
+kernel change made the WARN_ON() trigger and uncovered the bug.
+
+It is a wonderful example of how much works by chance on computers. :-)
+
+Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()")
+Signed-off-by: Richard Weinberger
+Reported-by: Vegard Nossum
+Tested-by: Vegard Nossum
+Acked-by: Oleg Nesterov
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/signal.h | 1 -
+ kernel/signal.c | 2 +-
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+--- a/include/linux/signal.h
++++ b/include/linux/signal.h
+@@ -239,7 +239,6 @@ extern int sigprocmask(int, sigset_t *,
+ extern void set_current_blocked(sigset_t *);
+ extern void __set_current_blocked(const sigset_t *);
+ extern int show_unhandled_signals;
+-extern int sigsuspend(sigset_t *);
+ 
+ struct sigaction {
+ #ifndef __ARCH_HAS_IRIX_SIGACTION
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -3552,7 +3552,7 @@ SYSCALL_DEFINE0(pause)
+ 
+ #endif
+ 
+-int sigsuspend(sigset_t *set)
++static int sigsuspend(sigset_t *set)
+ {
+ 	current->saved_sigmask = current->blocked;
+ 	set_current_blocked(set);
diff --git a/queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch b/queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch
new file mode 100644
index 00000000000..6a09620add3
--- /dev/null
+++ b/queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch
@@ -0,0 +1,51 @@
+From 9f029f540c2f7e010e4922d44ba0dfd05da79f88 Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko
+Date: Fri, 6 Nov 2015 16:31:31 -0800
+Subject: lib/hexdump.c: truncate output in case of overflow
+
+From: Andy Shevchenko
+
+commit 9f029f540c2f7e010e4922d44ba0dfd05da79f88 upstream.
+
+There is a classical off-by-one error in the case when we try to place,
+for example, 1+1 bytes as hex in a buffer of size 6. The expected result
+is a truncated output, but in reality we get 6 bytes filled followed by
+a terminating NUL.
+
+Change the logic of how we fill the output in case of byte dumping into
+limited space. This follows the snprintf() behaviour by truncating
+output even on half bytes.
+
+Fixes: 114fc1afb2de (hexdump: make it return number of bytes placed in buffer)
+Signed-off-by: Andy Shevchenko
+Reported-by: Aaro Koskinen
+Tested-by: Aaro Koskinen
+Cc: Al Viro
+Cc: Catalin Marinas
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ lib/hexdump.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/lib/hexdump.c
++++ b/lib/hexdump.c
+@@ -169,11 +169,15 @@ int hex_dump_to_buffer(const void *buf,
+ 		}
+ 	} else {
+ 		for (j = 0; j < len; j++) {
+-			if (linebuflen < lx + 3)
++			if (linebuflen < lx + 2)
+ 				goto overflow2;
+ 			ch = ptr[j];
+ 			linebuf[lx++] = hex_asc_hi(ch);
++			if (linebuflen < lx + 2)
++				goto overflow2;
+ 			linebuf[lx++] = hex_asc_lo(ch);
++			if (linebuflen < lx + 2)
++				goto overflow2;
+ 			linebuf[lx++] = ' ';
+ 		}
+ 		if (j)
diff --git a/queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch b/queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch
new file mode 100644
index 00000000000..4e05dc06479
--- /dev/null
+++ b/queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch
@@ -0,0 +1,105 @@
+From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001
+From: Michal Hocko
+Date: Thu, 5 Nov 2015 18:50:29 -0800
+Subject: memcg: fix thresholds for 32b architectures.
+
+From: Michal Hocko
+
+commit c12176d3368b9b36ae484d323d41e94be26f9b65 upstream.
+
+Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a
+regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page
+counters") where thresholds were silently converted to use page units
+rather than bytes when interpreting the user input.
+
+The fix is not complete, though, as properly pointed out by Ben Hutchings
+during stable backport review. The page count is converted to bytes, but
+an unsigned long is used to hold the value, which is obviously not
+sufficient for 32b systems with more than 4G thresholds. The same applies
+to the usage as taken from mem_cgroup_usage, which might overflow.
+
+Let's remove this bytes vs. pages internal tracking difference and
+handle thresholds in page units internally. Change mem_cgroup_usage() to
+return the value in page units and revert 424cdc141380, because this should
+be sufficient for consistent handling. mem_cgroup_read_u64, as the
+only user of mem_cgroup_usage outside of the threshold handling code, is
+converted to give the proper result in bytes. It is doing that already
+for the page_counter output, so this is more consistent as well.
+
+The value presented to userspace is still in byte units.
+
+Fixes: 424cdc141380 ("memcg: convert threshold to bytes")
+Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters")
+Signed-off-by: Michal Hocko
+Reported-by: Ben Hutchings
+Reviewed-by: Vladimir Davydov
+Acked-by: Johannes Weiner
+From: Michal Hocko
+Subject: memcg: fix thresholds for 32b architectures.
+Signed-off-by: Greg Kroah-Hartman
+
+Cc: Ben Hutchings
+Cc: Vladimir Davydov
+Cc: Johannes Weiner
+From: Andrew Morton
+Subject: memcg: fix thresholds for 32b architectures.
+
+don't attempt to inline mem_cgroup_usage()
+
+The compiler ignores the inline anyway. And __always_inlining it adds 600
+bytes of goop to the .o file.
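+
+The 32b overflow, as plain arithmetic (illustrative values, assuming 4K
+pages):
+
+  threshold = 5 GiB = 5368709120 bytes    /* > ULONG_MAX (4294967295) on 32b */
+  threshold in pages = 5368709120 >> 12 = 1310720    /* fits easily */
+
+so thresholds are tracked in pages internally and widened to u64 only at
+the reporting boundary, e.g. (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE.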
+
+Cc: Ben Hutchings
+Cc: Johannes Weiner
+Cc: Michal Hocko
+Cc: Vladimir Davydov
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+
+---
+ mm/memcontrol.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2836,9 +2836,9 @@ static unsigned long tree_stat(struct me
+ 	return val;
+ }
+ 
+-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
++static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+ {
+-	u64 val;
++	unsigned long val;
+ 
+ 	if (mem_cgroup_is_root(memcg)) {
+ 		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+@@ -2851,7 +2851,7 @@ static inline u64 mem_cgroup_usage(struc
+ 		else
+ 			val = page_counter_read(&memcg->memsw);
+ 	}
+-	return val << PAGE_SHIFT;
++	return val;
+ }
+ 
+ enum {
+@@ -2885,9 +2885,9 @@ static u64 mem_cgroup_read_u64(struct cg
+ 	switch (MEMFILE_ATTR(cft->private)) {
+ 	case RES_USAGE:
+ 		if (counter == &memcg->memory)
+-			return mem_cgroup_usage(memcg, false);
++			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
+ 		if (counter == &memcg->memsw)
+-			return mem_cgroup_usage(memcg, true);
++			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
+ 		return (u64)page_counter_read(counter) * PAGE_SIZE;
+ 	case RES_LIMIT:
+ 		return (u64)counter->limit * PAGE_SIZE;
+@@ -3387,7 +3387,6 @@ static int __mem_cgroup_usage_register_e
+ 	ret = page_counter_memparse(args, "-1", &threshold);
+ 	if (ret)
+ 		return ret;
+-	threshold <<= PAGE_SHIFT;
+ 
+ 	mutex_lock(&memcg->thresholds_lock);
+
diff --git a/queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch b/queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
new file mode 100644
index 00000000000..79a23e5533d
--- /dev/null
+++ b/queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
@@ -0,0 +1,61 @@
+From 0d777df5d8953293be090d9ab5a355db893e8357 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Fri, 11 Dec 2015 13:40:49 -0800
+Subject: mm: hugetlb: call huge_pte_alloc() only if ptep is null
+
+From: Naoya Horiguchi
+
+commit 0d777df5d8953293be090d9ab5a355db893e8357 upstream.
+
+Currently at the beginning of hugetlb_fault(), we call huge_pte_offset()
+and check whether the obtained *ptep is a migration/hwpoison entry or
+not. And if not, then we get to call huge_pte_alloc(). This is racy
+because the *ptep could turn into a migration/hwpoison entry after the
+huge_pte_offset() check. This race results in a BUG_ON in
+huge_pte_alloc().
+
+We don't have to call huge_pte_alloc() when huge_pte_offset()
+returns non-NULL, so let's fix this bug by moving the code into the
+else block.
+
+Note that the *ptep could turn into a migration/hwpoison entry after
+this block, but that's not a problem because we have another
+!pte_present check later (we never go into hugetlb_no_page() in that
+case.)
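+
+Condensed, the fixed control flow at the top of hugetlb_fault() becomes
+(paraphrased from the hunk below, with the error paths trimmed):
+
+  ptep = huge_pte_offset(mm, address);
+  if (ptep) {
+          /* existing pte: it may be a migration/hwpoison entry */
+          entry = huge_ptep_get(ptep);
+          if (unlikely(is_hugetlb_entry_migration(entry)))
+                  /* wait for migration to finish, then retry */ ;
+          else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                  return VM_FAULT_HWPOISON_LARGE | ...;
+  } else {
+          /* no pte yet: only now is huge_pte_alloc() safe to call */
+          ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+          if (!ptep)
+                  return VM_FAULT_OOM;
+  }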
+
+Fixes: 290408d4a250 ("hugetlb: hugepage migration core")
+Signed-off-by: Naoya Horiguchi
+Acked-by: Hillf Danton
+Acked-by: David Rientjes
+Cc: Hugh Dickins
+Cc: Dave Hansen
+Cc: Mel Gorman
+Cc: Joonsoo Kim
+Cc: Mike Kravetz
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/hugetlb.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3590,12 +3590,12 @@ int hugetlb_fault(struct mm_struct *mm,
+ 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ 			return VM_FAULT_HWPOISON_LARGE |
+ 				VM_FAULT_SET_HINDEX(hstate_index(h));
++	} else {
++		ptep = huge_pte_alloc(mm, address, huge_page_size(h));
++		if (!ptep)
++			return VM_FAULT_OOM;
+ 	}
+ 
+-	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+-	if (!ptep)
+-		return VM_FAULT_OOM;
+-
+ 	mapping = vma->vm_file->f_mapping;
+ 	idx = vma_hugecache_offset(h, vma, address);
+
diff --git a/queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch b/queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
new file mode 100644
index 00000000000..fbab19957bb
--- /dev/null
+++ b/queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
@@ -0,0 +1,59 @@
+From a88c769548047b21f76fd71e04b6a3300ff17160 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Fri, 11 Dec 2015 13:40:24 -0800
+Subject: mm: hugetlb: fix hugepage memory leak caused by wrong reserve count
+
+From: Naoya Horiguchi
+
+commit a88c769548047b21f76fd71e04b6a3300ff17160 upstream.
+
+When dequeue_huge_page_vma() in alloc_huge_page() fails, we fall back on
+alloc_buddy_huge_page() to directly create a hugepage from the buddy
+allocator.
+
+In that case, however, if alloc_buddy_huge_page() succeeds we don't
+decrement h->resv_huge_pages, which means that a successful
+hugetlb_fault() returns without releasing the reserve count. As a
+result, subsequent hugetlb_fault() calls might fail even though there
+are still free hugepages.
+
+This patch simply adds decrementing code on that code path, as shown in
+the sketch below the list.
+
+I reproduced this problem when testing a v4.3 kernel in the following situation:
+ - the test machine/VM is a NUMA system,
+ - hugepage overcommitting is enabled,
+ - most of the hugepages are allocated and there's only one free hugepage
+   which is on node 0 (for example),
+ - another program, which calls set_mempolicy(MPOL_BIND) to bind itself to
+   node 1, tries to allocate a hugepage,
+ - the allocation should fail, but the reserve count is still held.
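+
+The added accounting, shown in isolation (this mirrors the hunk below): a
+page handed out by the buddy allocator while a reservation exists must
+consume that reservation, exactly as a dequeued page would:
+
+  if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
+          SetPagePrivate(page);   /* remember: this page holds a reserve */
+          h->resv_huge_pages--;
+  }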
+
+Signed-off-by: Naoya Horiguchi
+Cc: David Rientjes
+Cc: Dave Hansen
+Cc: Mel Gorman
+Cc: Joonsoo Kim
+Cc: Hillf Danton
+Cc: Mike Kravetz
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/hugetlb.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1790,7 +1790,10 @@ struct page *alloc_huge_page(struct vm_a
+ 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ 		if (!page)
+ 			goto out_uncharge_cgroup;
+-
++		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
++			SetPagePrivate(page);
++			h->resv_huge_pages--;
++		}
+ 		spin_lock(&hugetlb_lock);
+ 		list_move(&page->lru, &h->hugepage_activelist);
+ 		/* Fall through */
diff --git a/queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch b/queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
new file mode 100644
index 00000000000..e4a9a089b10
--- /dev/null
+++ b/queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
@@ -0,0 +1,92 @@
+From dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz
+Date: Fri, 11 Dec 2015 13:40:52 -0800
+Subject: mm/hugetlb.c: fix resv map memory leak for placeholder entries
+
+From: Mike Kravetz
+
+commit dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 upstream.
+
+Dmitry Vyukov reported the following memory leak
+
+unreferenced object 0xffff88002eaafd88 (size 32):
+  comm "a.out", pid 5063, jiffies 4295774645 (age 15.810s)
+  hex dump (first 32 bytes):
+    28 e9 4e 63 00 88 ff ff 28 e9 4e 63 00 88 ff ff  (.Nc....(.Nc....
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+  backtrace:
+    kmalloc include/linux/slab.h:458
+    region_chg+0x2d4/0x6b0 mm/hugetlb.c:398
+    __vma_reservation_common+0x2c3/0x390 mm/hugetlb.c:1791
+    vma_needs_reservation mm/hugetlb.c:1813
+    alloc_huge_page+0x19e/0xc70 mm/hugetlb.c:1845
+    hugetlb_no_page mm/hugetlb.c:3543
+    hugetlb_fault+0x7a1/0x1250 mm/hugetlb.c:3717
+    follow_hugetlb_page+0x339/0xc70 mm/hugetlb.c:3880
+    __get_user_pages+0x542/0xf30 mm/gup.c:497
+    populate_vma_page_range+0xde/0x110 mm/gup.c:919
+    __mm_populate+0x1c7/0x310 mm/gup.c:969
+    do_mlock+0x291/0x360 mm/mlock.c:637
+    SYSC_mlock2 mm/mlock.c:658
+    SyS_mlock2+0x4b/0x70 mm/mlock.c:648
+
+Dmitry identified a potential memory leak in the routine region_chg,
+where a region descriptor is not freed on an error path.
+
+However, the root cause for the above memory leak resides in region_del.
+In this specific case, a "placeholder" entry is created in region_chg.
+The associated page allocation fails, and the placeholder entry is left
+in the reserve map. This is "by design" as the entry should be deleted
+when the map is released. The bug is in the region_del routine which is
+used to delete entries within a specific range (and when the map is
+released). region_del did not handle the case where a placeholder entry
+exactly matched the start of the range to be deleted. In this
+case, the entry would not be deleted and leaked. The fix is to take
+these special placeholder entries into account in region_del.
+
+The region_chg error path leak is also fixed.
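+
+Spelled out, a placeholder is a zero-length file_region with from == to.
+For a delete of the range [f, t) starting exactly at such an entry, the
+old test
+
+  if (rg->to <= f)
+          continue;
+
+skipped it (rg->to == f), so the entry leaked; the fixed test in the hunk
+below keeps the entry in consideration when rg->from == rg->to == f.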
+
+Fixes: feba16e25a57 ("mm/hugetlb: add region_del() to delete a specific range of entries")
+Signed-off-by: Mike Kravetz
+Reported-by: Dmitry Vyukov
+Acked-by: Hillf Danton
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/hugetlb.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -372,8 +372,10 @@ retry_locked:
+ 		spin_unlock(&resv->lock);
+ 
+ 		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+-		if (!trg)
++		if (!trg) {
++			kfree(nrg);
+ 			return -ENOMEM;
++		}
+ 
+ 		spin_lock(&resv->lock);
+ 		list_add(&trg->link, &resv->region_cache);
+@@ -483,8 +485,16 @@ static long region_del(struct resv_map *
+ retry:
+ 	spin_lock(&resv->lock);
+ 	list_for_each_entry_safe(rg, trg, head, link) {
+-		if (rg->to <= f)
++		/*
++		 * Skip regions before the range to be deleted. file_region
++		 * ranges are normally of the form [from, to). However, there
++		 * may be a "placeholder" entry in the map which is of the form
++		 * (from, to) with from == to. Check for placeholder entries
++		 * at the beginning of the range to be deleted.
++		 */
++		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
+ 			continue;
++
+ 		if (rg->from >= t)
+ 			break;
+
diff --git a/queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch b/queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
new file mode 100644
index 00000000000..f6bcd4031f8
--- /dev/null
+++ b/queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
@@ -0,0 +1,167 @@
+From 1817889e3b2cc1db8abb595712095129ff9156c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz
+Date: Fri, 20 Nov 2015 15:57:13 -0800
+Subject: mm/hugetlbfs: fix bugs in fallocate hole punch of areas with holes
+
+From: Mike Kravetz
+
+commit 1817889e3b2cc1db8abb595712095129ff9156c1 upstream.
+
+Hugh Dickins pointed out problems with the new hugetlbfs fallocate hole
+punch code. These problems are in the routine remove_inode_hugepages and
+mostly occur in the case where there are holes in the range of pages to be
+removed. These holes could be the result of a previous hole punch or
+simply sparse allocation. The current code could access pages outside the
+specified range.
+
+remove_inode_hugepages handles both hole punch and truncate operations.
+Page index handling was fixed/cleaned up so that the loop index always
+matches the page being processed. The code now only makes a single pass
+through the range of pages as it was determined page faults could not race
+with truncate. A cond_resched() was added after removing up to
+PAGEVEC_SIZE pages.
+
+Some totally unnecessary code in hugetlbfs_fallocate() that remained from
+early development was also removed.
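+
+For reference, the operation being fixed is the hole-punch mode of
+fallocate(2); a minimal userspace sketch (fd is assumed to refer to a
+file on a hugetlbfs mount):
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <linux/falloc.h>
+
+  /* Punch a hole over [offset, offset + len) without changing i_size. */
+  int punch_hole(int fd, off_t offset, off_t len)
+  {
+          return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                           offset, len);
+  }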
+
+Tested with fallocate tests submitted here:
+http://librelist.com/browser//libhugetlbfs/2015/6/25/patch-tests-add-tests-for-fallocate-system-call/
+And, some ftruncate tests under development
+
+Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
+Signed-off-by: Mike Kravetz
+Acked-by: Hugh Dickins
+Cc: Dave Hansen
+Cc: Naoya Horiguchi
+Cc: Davidlohr Bueso
+Cc: "Hillf Danton"
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/hugetlbfs/inode.c | 65 +++++++++++++++++++++++++--------------------------
+ 1 file changed, 32 insertions(+), 33 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -332,12 +332,17 @@ static void remove_huge_page(struct page
+  * truncation is indicated by end of range being LLONG_MAX
+  *	In this case, we first scan the range and release found pages.
+  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+- *	maps and global counts.
++ *	maps and global counts. Page faults can not race with truncation
++ *	in this routine. hugetlb_no_page() prevents page faults in the
++ *	truncated range. It checks i_size before allocation, and again after
++ *	with the page table lock for the page held. The same lock must be
++ *	acquired to unmap a page.
+  * hole punch is indicated if end is not LLONG_MAX
+  *	In the hole punch case we scan the range and release found pages.
+  *	Only when releasing a page is the associated region/reserv map
+  *	deleted. The region/reserv map for ranges without associated
+- *	pages are not modified.
++ *	pages are not modified. Page faults can race with hole punch.
++ *	This is indicated if we find a mapped page.
+  * Note: If the passed end of range value is beyond the end of file, but
+  * not LLONG_MAX this routine still performs a hole punch operation.
+@@ -361,46 +366,37 @@ static void remove_inode_hugepages(struc
+ 	next = start;
+ 	while (next < end) {
+ 		/*
+-		 * Make sure to never grab more pages that we
+-		 * might possibly need.
++		 * Don't grab more pages than the number left in the range.
+ 		 */
+ 		if (end - next < lookup_nr)
+ 			lookup_nr = end - next;
+ 
+ 		/*
+-		 * This pagevec_lookup() may return pages past 'end',
+-		 * so we must check for page->index > end.
++		 * When no more pages are found, we are done.
+ 		 */
+-		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
+-			if (next == start)
+-				break;
+-			next = start;
+-			continue;
+-		}
++		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
++			break;
+ 
+ 		for (i = 0; i < pagevec_count(&pvec); ++i) {
+ 			struct page *page = pvec.pages[i];
+ 			u32 hash;
+ 
++			/*
++			 * The page (index) could be beyond end. This is
++			 * only possible in the punch hole case as end is
++			 * max page offset in the truncate case.
++			 */
++			next = page->index;
++			if (next >= end)
++				break;
++
+ 			hash = hugetlb_fault_mutex_hash(h, current->mm,
+ 							&pseudo_vma,
+ 							mapping, next, 0);
+ 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ 
+ 			lock_page(page);
+-			if (page->index >= end) {
+-				unlock_page(page);
+-				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+-				next = end;	/* we are done */
+-				break;
+-			}
+-
+-			/*
+-			 * If page is mapped, it was faulted in after being
+-			 * unmapped. Do nothing in this race case. In the
+-			 * normal case page is not mapped.
+-			 */
+-			if (!page_mapped(page)) {
++			if (likely(!page_mapped(page))) {
+ 				bool rsv_on_error = !PagePrivate(page);
+ 				/*
+ 				 * We must free the huge page and remove
+@@ -421,17 +417,23 @@ static void remove_inode_hugepages(struc
+ 					hugetlb_fix_reserve_counts(
+ 						inode, rsv_on_error);
+ 				}
++			} else {
++				/*
++				 * If page is mapped, it was faulted in after
++				 * being unmapped. It indicates a race between
++				 * hole punch and page fault. Do nothing in
++				 * this case. Getting here in a truncate
++				 * operation is a bug.
++				 */
++				BUG_ON(truncate_op);
+ 			}
+ 
+-			if (page->index > next)
+-				next = page->index;
+-
+-			++next;
+ 			unlock_page(page);
+-
+ 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ 		}
++		++next;
+ 		huge_pagevec_release(&pvec);
++		cond_resched();
+ 	}
+ 
+ 	if (truncate_op)
+@@ -647,9 +649,6 @@ static long hugetlbfs_fallocate(struct f
+ 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ 		i_size_write(inode, offset + len);
+ 	inode->i_ctime = CURRENT_TIME;
+-	spin_lock(&inode->i_lock);
+-	inode->i_private = NULL;
+-	spin_unlock(&inode->i_lock);
+ out:
+ 	mutex_unlock(&inode->i_mutex);
+ 	return error;
diff --git a/queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch b/queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
new file mode 100644
index 00000000000..bc6b68a49fe
--- /dev/null
+++ b/queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
@@ -0,0 +1,77 @@
+From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa
+Date: Thu, 5 Nov 2015 18:47:44 -0800
+Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL
+
+From: Tetsuo Handa
+
+commit 426fb5e72d92b868912e47a1e3ca2df6eabc3872 upstream.
+
+It was confirmed that a local unprivileged user can consume all memory
+reserves and hang up the system by exploiting the time lag between when
+the OOM killer sets TIF_MEMDIE on an OOM victim and when it sends SIGKILL
+to that victim, because the printk() inside the for_each_process() loop
+at oom_kill_process() can take many seconds when there are many thread
+groups sharing the same memory.
+
+Before starting oom-depleter process:
+
+  Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB
+  Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB
+
+As of invoking the OOM killer:
+
+  Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB
+  Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB
+
+Between the thread group leader got TIF_MEMDIE and receives SIGKILL:
+
+  Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+  Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+
+The oom-depleter's thread group leader which got TIF_MEMDIE started
+memset() in user space after the OOM killer set TIF_MEMDIE, and it was
+free to abuse ALLOC_NO_WATERMARKS by TIF_MEMDIE for memset() in user space
+until SIGKILL was delivered. If SIGKILL is delivered before TIF_MEMDIE is
+set, the oom-depleter can terminate without touching memory reserves.
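+
+The ordering after the fix, condensed from the hunk below:
+
+  /* make the SIGKILL pending first ... */
+  do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+  /* ... and only then grant access to memory reserves */
+  mark_oom_victim(victim);        /* sets TIF_MEMDIE */
+
+so the victim can no longer run on in user space holding TIF_MEMDIE but
+without a pending SIGKILL.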
+
+Although the possibility of hitting this time lag is very small for 3.19
+and earlier kernels because TIF_MEMDIE is set immediately before sending
+SIGKILL, preemption or long interrupts (an extreme example is SysRq-t) can
+step between and allow memory allocations which are not needed for
+terminating the OOM victim.
+
+Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock")
+Signed-off-by: Tetsuo Handa
+Acked-by: Michal Hocko
+Cc: David Rientjes
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/oom_kill.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control
+ 
+ 	/* mm cannot safely be dereferenced after task_unlock(victim) */
+ 	mm = victim->mm;
++	/*
++	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
++	 * the OOM victim from depleting the memory reserves from the user
++	 * space under its control.
++	 */
++	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ 	mark_oom_victim(victim);
+ 	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ 		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+@@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control
+ 	}
+ 	rcu_read_unlock();
+ 
+-	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ 	put_task_struct(victim);
+ }
+ #undef K
diff --git a/queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch b/queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
new file mode 100644
index 00000000000..4c020025fa9
--- /dev/null
+++ b/queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
@@ -0,0 +1,87 @@
+From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001
+From: Catalin Marinas
+Date: Thu, 5 Nov 2015 18:45:54 -0800
+Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE
+
+From: Catalin Marinas
+
+commit d4322d88f5fdf92729dd40f923013414fbb2184d upstream.
+
+On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc
+configurations defining ARCH_DMA_MINALIGN to 128), the first
+kmalloc_caches[] entry to be initialised after slab_early_init = 0 is
+"kmalloc-128" with index 7. Depending on the debug kernel configuration,
+sizeof(struct kmem_cache) can be larger than 128 resulting in an
+INDEX_NODE of 8.
+
+Commit 8fc9cf420b36 ("slab: make more slab management structure off the
+slab") enables off-slab management objects for sizes starting with
+PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation
+of the "kmalloc-128" cache would try to place the management objects
+off-slab. However, since KMALLOC_MIN_SIZE is already 128 and
+freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size)
+returns NULL (kmalloc_caches[7] not populated yet). This triggers the
+following bug on arm64:
+
+  kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283!
+  Internal error: Oops - BUG: 0 [#1] SMP
+  Modules linked in:
+  CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540
+  Hardware name: Juno (DT)
+  PC is at __kmem_cache_create+0x21c/0x280
+  LR is at __kmem_cache_create+0x210/0x280
+  [...]
+  Call trace:
+    __kmem_cache_create+0x21c/0x280
+    create_boot_cache+0x48/0x80
+    create_kmalloc_cache+0x50/0x88
+    create_kmalloc_caches+0x4c/0xf4
+    kmem_cache_init+0x100/0x118
+    start_kernel+0x214/0x33c
+
+This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab
+management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE.
+
+Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab")
+Signed-off-by: Catalin Marinas
+Reported-by: Geert Uytterhoeven
+Acked-by: Christoph Lameter
+Cc: Pekka Enberg
+Cc: David Rientjes
+Cc: Joonsoo Kim
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/slab.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct
+ 
+ #define CFLGS_OFF_SLAB		(0x80000000UL)
+ #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
++#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
+ 
+ #define BATCHREFILL_LIMIT	16
+ /*
+@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *
+ 	 * it too early on. Always use on-slab management when
+ 	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
+ 	 */
+-	if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
++	if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
+ 	    !(flags & SLAB_NOLEAKTRACE))
+ 		/*
+ 		 * Size is large, assume best to place the slab management obj
+@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *
+ 		/*
+ 		 * This is a possibility for one of the kmalloc_{dma,}_caches.
+ 		 * But since we go off slab only for object size greater than
+-		 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
++		 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
+ 		 * in ascending order,this should not happen at all.
+ 		 * But leave a BUG_ON for some lucky dude.
+ 		 */
diff --git a/queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch b/queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
new file mode 100644
index 00000000000..3ffb87e8c8a
--- /dev/null
+++ b/queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
@@ -0,0 +1,122 @@
+From 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 Mon Sep 17 00:00:00 2001
+From: Michal Hocko
+Date: Fri, 11 Dec 2015 13:40:32 -0800
+Subject: mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress
+
+From: Michal Hocko
+
+commit 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 upstream.
+
+Tetsuo Handa has reported that the system might basically livelock in
+OOM condition without triggering the OOM killer.
+
+The issue is caused by an internal dependency of the direct reclaim on
+vmstat counter updates (via zone_reclaimable) which are performed from
+the workqueue context. If all the current workers get assigned to an
+allocation request, though, they will be looping inside the allocator
+trying to reclaim memory, but zone_reclaimable can see stalled numbers,
+so it will consider a zone reclaimable even though it has been scanned
+way too much. WQ concurrency logic will not consider this situation as
+a congested workqueue, because it relies on the fact that a worker
+would sleep in such a situation. This also means that it doesn't try
+to spawn new workers or invoke the rescuer thread if one is already
+assigned to the queue.
+
+In order to fix this issue we need to do two things.
First we have to
+let the wq concurrency code know that we are in trouble, so we have to do a
+short sleep. In order to prevent the issues handled by 0e093d99763e
+("writeback: do not sleep on the congestion queue if there are no
+congested BDIs or if significant congestion is not being encountered in
+the current zone") we limit the sleep only to worker threads, which are
+the ones of interest anyway.
+
+The second thing to do is to create a dedicated workqueue for vmstat and
+mark it WQ_MEM_RECLAIM to note it participates in the reclaim and to
+have a spare worker thread for it.
+
+Signed-off-by: Michal Hocko
+Reported-by: Tetsuo Handa
+Cc: Tejun Heo
+Cc: Cristopher Lameter
+Cc: Joonsoo Kim
+Cc: Arkadiusz Miskiewicz
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/backing-dev.c | 19 ++++++++++++++++---
+ mm/vmstat.c | 6 ++++--
+ 2 files changed, 20 insertions(+), 5 deletions(-)
+
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
+  * jiffies for either a BDI to exit congestion of the given @sync queue
+  * or a write to complete.
+  *
+- * In the absence of zone congestion, cond_resched() is called to yield
+- * the processor if necessary but otherwise does not sleep.
++ * In the absence of zone congestion, a short sleep or a cond_resched is
++ * performed to yield the processor and to allow other subsystems to make
++ * a forward progress.
+  *
+  * The return value is 0 if the sleep is for the full timeout. Otherwise,
+  * it is the number of jiffies that were still remaining when the function
+@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zon
+ 	 */
+ 	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
+ 	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
+-		cond_resched();
++
++		/*
++		 * Memory allocation/reclaim might be called from a WQ
++		 * context and the current implementation of the WQ
++		 * concurrency control doesn't recognize that a particular
++		 * WQ is congested if the worker thread is looping without
++		 * ever sleeping. Therefore we have to do a short sleep
++		 * here rather than calling cond_resched().
++		 */
++		if (current->flags & PF_WQ_WORKER)
++			schedule_timeout(1);
++		else
++			cond_resched();
+ 
+ 		/* In case we scheduled, work out time remaining */
+ 		ret = timeout - (jiffies - start);
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1357,6 +1357,7 @@ static const struct file_operations proc
+ #endif /* CONFIG_PROC_FS */
+ 
+ #ifdef CONFIG_SMP
++static struct workqueue_struct *vmstat_wq;
+ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+ int sysctl_stat_interval __read_mostly = HZ;
+ static cpumask_var_t cpu_stat_off;
+@@ -1369,7 +1370,7 @@ static void vmstat_update(struct work_st
+ 		 * to occur in the future. Keep on running the
+ 		 * update worker thread.
+		 */
+-		schedule_delayed_work_on(smp_processor_id(),
++		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ 			this_cpu_ptr(&vmstat_work),
+ 			round_jiffies_relative(sysctl_stat_interval));
+ 	} else {
+@@ -1438,7 +1439,7 @@ static void vmstat_shepherd(struct work_
+ 		if (need_update(cpu) &&
+ 			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ 
+-			schedule_delayed_work_on(cpu,
++			queue_delayed_work_on(cpu, vmstat_wq,
+ 				&per_cpu(vmstat_work, cpu), 0);
+ 
+ 	put_online_cpus();
+@@ -1527,6 +1528,7 @@ static int __init setup_vmstat(void)
+ 
+ 	start_shepherd_timer();
+ 	cpu_notifier_register_done();
++	vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ #endif
+ #ifdef CONFIG_PROC_FS
+ 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch b/queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
new file mode 100644
index 00000000000..85b55f42930
--- /dev/null
+++ b/queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
@@ -0,0 +1,38 @@
+From c95a51807b730e4681e2ecbdfd669ca52601959e Mon Sep 17 00:00:00 2001
+From: xuejiufei
+Date: Fri, 5 Feb 2016 15:36:47 -0800
+Subject: ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
+
+From: xuejiufei
+
+commit c95a51807b730e4681e2ecbdfd669ca52601959e upstream.
+
+When the recovery master goes down, dlm_do_local_recovery_cleanup() only
+removes the $RECOVERY lock owned by the dead node, but does not clear the
+refmap bit. This makes the umount thread fall into a dead loop, migrating
+$RECOVERY to the dead node.
+
+Signed-off-by: xuejiufei
+Reviewed-by: Joseph Qi
+Cc: Mark Fasheh
+Cc: Joel Becker
+Cc: Junxiao Bi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ocfs2/dlm/dlmrecovery.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ocfs2/dlm/dlmrecovery.c
++++ b/fs/ocfs2/dlm/dlmrecovery.c
+@@ -2360,6 +2360,8 @@ static void dlm_do_local_recovery_cleanu
+ 					break;
+ 				}
+ 			}
++			dlm_lockres_clear_refmap_bit(dlm, res,
++					dead_node);
+ 			spin_unlock(&res->spinlock);
+ 			continue;
+ 		}
diff --git a/queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch b/queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
new file mode 100644
index 00000000000..5fe0d6a8858
--- /dev/null
+++ b/queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
@@ -0,0 +1,97 @@
+From bef5502de074b6f6fa647b94b73155d675694420 Mon Sep 17 00:00:00 2001
+From: xuejiufei
+Date: Thu, 14 Jan 2016 15:17:38 -0800
+Subject: ocfs2/dlm: ignore cleaning the migration mle that is inuse
+
+From: xuejiufei
+
+commit bef5502de074b6f6fa647b94b73155d675694420 upstream.
+
+We have found that the migration source triggers a BUG because the
+refcount of the mle is already zero before the put when the target goes
+down during migration. The situation is as follows:
+
+dlm_migrate_lockres
+  dlm_add_migration_mle
+  dlm_mark_lockres_migrating
+    dlm_get_mle_inuse
+    <<<<<< Now the refcount of the mle is 2.
+    dlm_send_one_lockres and wait for the target to become the
+    new master.
+    <<<<<< o2hb detects the target down and cleans the migration
+    mle. Now the refcount is 1.
+
+dlm_migrate_lockres is then woken, and puts the mle twice when it finds
+that the target has gone down, which triggers the BUG with the following
+message:
+
+  "ERROR: bad mle: ".
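+
+The same situation as a refcount timeline (an annotated restatement of
+the diagram above):
+
+  dlm_add_migration_mle()       /* refcount 1 */
+  dlm_get_mle_inuse()           /* refcount 2 */
+  <target dies; o2hb cleans the migration mle>  /* refcount 1 */
+  dlm_put_mle()                 /* refcount 0 */
+  dlm_put_mle_inuse()           /* put on refcount 0 -> "ERROR: bad mle" */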
+
+Signed-off-by: Jiufei Xue
+Reviewed-by: Joseph Qi
+Cc: Mark Fasheh
+Cc: Joel Becker
+Cc: Junxiao Bi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++-----------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+--- a/fs/ocfs2/dlm/dlmmaster.c
++++ b/fs/ocfs2/dlm/dlmmaster.c
+@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dl
+ 	spin_lock(&dlm->master_lock);
+ 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+ 				    namelen, target, dlm->node_num);
++	/* get an extra reference on the mle.
++	 * otherwise the assert_master from the new
++	 * master will destroy this.
++	 */
++	dlm_get_mle_inuse(mle);
+ 	spin_unlock(&dlm->master_lock);
+ 	spin_unlock(&dlm->spinlock);
+ 
+@@ -2554,6 +2559,7 @@ fail:
+ 		if (mle_added) {
+ 			dlm_mle_detach_hb_events(dlm, mle);
+ 			dlm_put_mle(mle);
++			dlm_put_mle_inuse(mle);
+ 		} else if (mle) {
+ 			kmem_cache_free(dlm_mle_cache, mle);
+ 			mle = NULL;
+@@ -2571,17 +2577,6 @@ fail:
+ 	 * ensure that all assert_master work is flushed. */
+ 	flush_workqueue(dlm->dlm_worker);
+ 
+-	/* get an extra reference on the mle.
+-	 * otherwise the assert_master from the new
+-	 * master will destroy this.
+-	 * also, make sure that all callers of dlm_get_mle
+-	 * take both dlm->spinlock and dlm->master_lock */
+-	spin_lock(&dlm->spinlock);
+-	spin_lock(&dlm->master_lock);
+-	dlm_get_mle_inuse(mle);
+-	spin_unlock(&dlm->master_lock);
+-	spin_unlock(&dlm->spinlock);
+-
+ 	/* notify new node and send all lock state */
+ 	/* call send_one_lockres with migration flag.
+ 	 * this serves as notice to the target node that a
+@@ -3310,6 +3305,15 @@ top:
+ 				    mle->new_master != dead_node)
+ 					continue;
+ 
++				if (mle->new_master == dead_node && mle->inuse) {
++					mlog(ML_NOTICE, "%s: target %u died during "
++							"migration from %u, the MLE is "
++							"still keep used, ignore it!\n",
++							dlm->name, dead_node,
++							mle->master);
++					continue;
++				}
++
+ 				/* If we have reached this point, this mle needs to be
+ 				 * removed from the list and freed. */
+ 				dlm_clean_migration_mle(dlm, mle);
diff --git a/queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch b/queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch
new file mode 100644
index 00000000000..bf11d9f8d74
--- /dev/null
+++ b/queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch
@@ -0,0 +1,98 @@
+From 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 Mon Sep 17 00:00:00 2001
+From: Joseph Qi
+Date: Tue, 29 Dec 2015 14:54:06 -0800
+Subject: ocfs2: fix BUG when calculate new backup super
+
+From: Joseph Qi
+
+commit 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 upstream.
+
+When resizing, it first extends the last gd. Once a backup super should
+be placed in the gd, it calculates the new backup super and updates the
+corresponding value.
+
+But it currently doesn't consider the situation where the backup super
+has already been done. In this case, it still sets the bit in the gd
+bitmap and then decreases bg_free_bits_count, which leads to a corrupted
+gd and triggers the BUG in ocfs2_block_group_set_bits:
+
+  BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+So check whether the backup super is done and then do the updates.
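+
+The added guard, in isolation (from the hunk below): a backup super whose
+cluster falls inside the group's old size was written by an earlier
+resize and must not be set and counted again:
+
+  lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+  lgd_cluster += old_bg_clusters;
+  if (lgd_cluster >= cluster)
+          continue;       /* backup super already done */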
+
+Signed-off-by: Joseph Qi
+Reviewed-by: Jiufei Xue
+Reviewed-by: Yiwen Jiang
+Cc: Mark Fasheh
+Cc: Joel Becker
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ocfs2/resize.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/ocfs2/resize.c
++++ b/fs/ocfs2/resize.c
+@@ -54,11 +54,12 @@
+ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+ 				       struct ocfs2_group_desc *gd,
+ 				       u16 cl_cpg,
++				       u16 old_bg_clusters,
+ 				       int set)
+ {
+ 	int i;
+ 	u16 backups = 0;
+-	u32 cluster;
++	u32 cluster, lgd_cluster;
+ 	u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+ 
+ 	for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(s
+ 		else if (gd_blkno > lgd_blkno)
+ 			break;
+ 
++		/* check if already done backup super */
++		lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
++		lgd_cluster += old_bg_clusters;
++		if (lgd_cluster >= cluster)
++			continue;
++
+ 		if (set)
+ 			ocfs2_set_bit(cluster % cl_cpg,
+ 				      (unsigned long *)gd->bg_bitmap);
+@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_i
+ 	u16 chain, num_bits, backups = 0;
+ 	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+ 	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
++	u16 old_bg_clusters;
+ 
+ 	trace_ocfs2_update_last_group_and_inode(new_clusters,
+ 						first_new_cluster);
+@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_i
+ 
+ 	group = (struct ocfs2_group_desc *)group_bh->b_data;
+ 
++	old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
+ 	/* update the group first. */
+ 	num_bits = new_clusters * cl_bpc;
+ 	le16_add_cpu(&group->bg_bits, num_bits);
+@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_i
+ 		     OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+ 		backups = ocfs2_calc_new_backup_super(bm_inode,
+ 						     group,
+-						     cl_cpg, 1);
++						     cl_cpg, old_bg_clusters, 1);
+ 		le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+ 	}
+ 
+@@ -163,7 +172,7 @@ out_rollback:
+ 	if (ret < 0) {
+ 		ocfs2_calc_new_backup_super(bm_inode,
+ 					    group,
+-					    cl_cpg, 0);
++					    cl_cpg, old_bg_clusters, 0);
+ 		le16_add_cpu(&group->bg_free_bits_count, backups);
+ 		le16_add_cpu(&group->bg_bits, -1 * num_bits);
+ 		le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
diff --git a/queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch b/queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch
new file mode 100644
index 00000000000..6d8bff64432
--- /dev/null
+++ b/queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch
@@ -0,0 +1,44 @@
+From 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi
+Date: Fri, 11 Dec 2015 13:41:03 -0800
+Subject: ocfs2: fix SGID not inherited issue
+
+From: Junxiao Bi
+
+commit 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 upstream.
+
+Commit 8f1eb48758aa ("ocfs2: fix umask ignored issue") introduced an
+issue: the SGID of a sub dir was not inherited from its parent dir. This
+is because SGID is set into "inode->i_mode" in ocfs2_get_init_inode(), but
+is later overwritten by "mode", which doesn't have SGID set.
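+
+The fix, in one line (see the hunk below): apply the umask/ACL adjustment
+to inode->i_mode itself, which already carries the inherited S_ISGID,
+rather than to the local "mode":
+
+  status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);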
+
+Fixes: 8f1eb48758aa ("ocfs2: fix umask ignored issue")
+Signed-off-by: Junxiao Bi
+Cc: Mark Fasheh
+Cc: Joel Becker
+Acked-by: Srinivas Eeda
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/ocfs2/namei.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -369,13 +369,11 @@ static int ocfs2_mknod(struct inode *dir
+ 		goto leave;
+ 	}
+ 
+-	status = posix_acl_create(dir, &mode, &default_acl, &acl);
++	status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ 	if (status) {
+ 		mlog_errno(status);
+ 		goto leave;
+ 	}
+-	/* update inode->i_mode after mask with "umask". */
+-	inode->i_mode = mode;
+ 
+ 	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+ 							    S_ISDIR(mode),
diff --git a/queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch b/queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch
new file mode 100644
index 00000000000..588ffda4a55
--- /dev/null
+++ b/queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch
@@ -0,0 +1,53 @@
+From 54708d2858e79a2bdda10bf8a20c80eb96c20613 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov
+Date: Fri, 6 Nov 2015 16:30:06 -0800
+Subject: proc: actually make proc_fd_permission() thread-friendly
+
+From: Oleg Nesterov
+
+commit 54708d2858e79a2bdda10bf8a20c80eb96c20613 upstream.
+
+The commit 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+fixed the access to /proc/self/fd from sub-threads, but introduced another
+problem: a sub-thread can't access /proc/<tid>/fd/ or /proc/thread-self/fd
+if generic_permission() fails.
+
+Change proc_fd_permission() to check same_thread_group(pid_task(), current).
+
+Fixes: 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+Reported-by: "Jin, Yihua"
+Signed-off-by: Oleg Nesterov
+Cc: "Eric W. Biederman"
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/proc/fd.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/proc/fd.c
++++ b/fs/proc/fd.c
+@@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(stru
+  */
+ int proc_fd_permission(struct inode *inode, int mask)
+ {
+-	int rv = generic_permission(inode, mask);
++	struct task_struct *p;
++	int rv;
++
++	rv = generic_permission(inode, mask);
+ 	if (rv == 0)
+-		return 0;
+-	if (task_tgid(current) == proc_pid(inode))
++		return rv;
++
++	rcu_read_lock();
++	p = pid_task(proc_pid(inode), PIDTYPE_PID);
++	if (p && same_thread_group(p, current))
+ 		rv = 0;
++	rcu_read_unlock();
++
+ 	return rv;
+ }
+
diff --git a/queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch b/queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
new file mode 100644
index 00000000000..97c125fb903
--- /dev/null
+++ b/queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
@@ -0,0 +1,40 @@
+From 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b Mon Sep 17 00:00:00 2001
+From: Colin Ian King
+Date: Fri, 18 Dec 2015 14:22:01 -0800
+Subject: proc: fix -ESRCH error when writing to /proc/$pid/coredump_filter
+
+From: Colin Ian King
+
+commit 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b upstream.
+
+Writing to /proc/$pid/coredump_filter always returns -ESRCH because commit
+774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()") removed
+the setting of ret after the get_proc_task call and incorrectly left it as
+-ESRCH. Instead, return 0 when successful.
+
+Example breakage:
+
+  echo 0 > /proc/self/coredump_filter
+  bash: echo: write error: No such process
+
+Fixes: 774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()")
+Signed-off-by: Colin Ian King
+Acked-by: Kees Cook
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/proc/base.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -2484,6 +2484,7 @@ static ssize_t proc_coredump_filter_writ
+ 	mm = get_task_mm(task);
+ 	if (!mm)
+ 		goto out_no_mm;
++	ret = 0;
+ 
+ 	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+ 		if (val & mask)
diff --git a/queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch b/queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch
new file mode 100644
index 00000000000..694732e8e39
--- /dev/null
+++ b/queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch
@@ -0,0 +1,40 @@
+From 92792e48e2ae6051af30468a87994b5432da2f06 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann
+Date: Fri, 20 Nov 2015 18:26:07 +0100
+Subject: remoteproc: avoid stack overflow in debugfs file
+
+From: Arnd Bergmann
+
+commit 92792e48e2ae6051af30468a87994b5432da2f06 upstream.
+
+Recent gcc versions warn about reading from a negative offset of
+an on-stack array:
+
+drivers/remoteproc/remoteproc_debugfs.c: In function 'rproc_recovery_write':
+drivers/remoteproc/remoteproc_debugfs.c:167:9: warning: 'buf[4294967295u]' may be used uninitialized in this function [-Wmaybe-uninitialized]
+
+I don't see anything in sys_write() that prevents us from
+being called with a zero 'count' argument, so we should
+add an extra check in rproc_recovery_write() to prevent the
+access and avoid the warning.
+
+Signed-off-by: Arnd Bergmann
+Fixes: 2e37abb89a2e ("remoteproc: create a 'recovery' debugfs entry")
+Signed-off-by: Ohad Ben-Cohen
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/remoteproc/remoteproc_debugfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/remoteproc/remoteproc_debugfs.c
++++ b/drivers/remoteproc/remoteproc_debugfs.c
+@@ -156,7 +156,7 @@ rproc_recovery_write(struct file *filp,
+ 	char buf[10];
+ 	int ret;
+ 
+-	if (count > sizeof(buf))
++	if (count < 1 || count > sizeof(buf))
+ 		return count;
+ 
+ 	ret = copy_from_user(buf, user_buf, count);
diff --git a/queue-4.3/series b/queue-4.3/series
index 69bb5ea7ff9..604aed1b32f 100644
--- a/queue-4.3/series
+++ b/queue-4.3/series
@@ -16,3 +16,23 @@ nfsv4.1-pnfs-fixup-an-lo-plh_block_lgets-imbalance-in-layoutreturn.patch
 ocfs2-nfs-hangs-in-__ocfs2_cluster_lock-due-to-race-with-ocfs2_unblock_lock.patch
 hid-usbhid-fix-recursive-deadlock.patch
 alsa-hda-implement-loopback-control-switch-for-realtek-and-other-codecs.patch
+proc-actually-make-proc_fd_permission-thread-friendly.patch
+remoteproc-avoid-stack-overflow-in-debugfs-file.patch
+proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
+mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
+mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
+lib-hexdump.c-truncate-output-in-case-of-overflow.patch
+fs-seqfile-always-allow-oom-killer.patch
+memcg-fix-thresholds-for-32b-architectures.patch
+mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
+mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
+mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
+fat-fix-fake_offset-handling-on-error-path.patch
+mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
+kernel-signal.c-unexport-sigsuspend.patch
+mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
+ocfs2-fix-sgid-not-inherited-issue.patch
+ocfs2-fix-bug-when-calculate-new-backup-super.patch
+ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
+ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
+sh64-fix-__nr_fgetxattr.patch
diff --git a/queue-4.3/sh64-fix-__nr_fgetxattr.patch b/queue-4.3/sh64-fix-__nr_fgetxattr.patch
new file mode 100644
index 00000000000..b2d552c69df
--- /dev/null
+++ b/queue-4.3/sh64-fix-__nr_fgetxattr.patch
@@ -0,0 +1,37 @@
+From 2d33fa1059da4c8e816627a688d950b613ec0474 Mon Sep 17 00:00:00 2001
+From: "Dmitry V. Levin"
+Date: Fri, 11 Dec 2015 13:41:06 -0800
+Subject: sh64: fix __NR_fgetxattr
+
+From: Dmitry V. Levin
+
+commit 2d33fa1059da4c8e816627a688d950b613ec0474 upstream.
+
+According to arch/sh/kernel/syscalls_64.S and common sense, __NR_fgetxattr
+has to be defined to 259, but it isn't. Instead, it's defined to 269,
+which is of course used by another syscall, __NR_sched_setaffinity in this
+case.
+
+This bug was found by the strace test suite.
+
+Signed-off-by: Dmitry V. Levin
+Acked-by: Geert Uytterhoeven
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/sh/include/uapi/asm/unistd_64.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/sh/include/uapi/asm/unistd_64.h
++++ b/arch/sh/include/uapi/asm/unistd_64.h
+@@ -278,7 +278,7 @@
+ #define __NR_fsetxattr		256
+ #define __NR_getxattr		257
+ #define __NR_lgetxattr		258
+-#define __NR_fgetxattr		269
++#define __NR_fgetxattr		259
+ #define __NR_listxattr		260
+ #define __NR_llistxattr		261
+ #define __NR_flistxattr		262
-- 
2.47.3