4.3-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 12 Feb 2016 21:00:34 +0000 (13:00 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 12 Feb 2016 21:00:34 +0000 (13:00 -0800)
added patches:
fat-fix-fake_offset-handling-on-error-path.patch
fs-seqfile-always-allow-oom-killer.patch
kernel-signal.c-unexport-sigsuspend.patch
lib-hexdump.c-truncate-output-in-case-of-overflow.patch
memcg-fix-thresholds-for-32b-architectures.patch
mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
ocfs2-fix-bug-when-calculate-new-backup-super.patch
ocfs2-fix-sgid-not-inherited-issue.patch
proc-actually-make-proc_fd_permission-thread-friendly.patch
proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
remoteproc-avoid-stack-overflow-in-debugfs-file.patch
sh64-fix-__nr_fgetxattr.patch

21 files changed:
queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch [new file with mode: 0644]
queue-4.3/fs-seqfile-always-allow-oom-killer.patch [new file with mode: 0644]
queue-4.3/kernel-signal.c-unexport-sigsuspend.patch [new file with mode: 0644]
queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch [new file with mode: 0644]
queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch [new file with mode: 0644]
queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch [new file with mode: 0644]
queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch [new file with mode: 0644]
queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch [new file with mode: 0644]
queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch [new file with mode: 0644]
queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch [new file with mode: 0644]
queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch [new file with mode: 0644]
queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch [new file with mode: 0644]
queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch [new file with mode: 0644]
queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch [new file with mode: 0644]
queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch [new file with mode: 0644]
queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch [new file with mode: 0644]
queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch [new file with mode: 0644]
queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch [new file with mode: 0644]
queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch [new file with mode: 0644]
queue-4.3/series
queue-4.3/sh64-fix-__nr_fgetxattr.patch [new file with mode: 0644]

diff --git a/queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch b/queue-4.3/fat-fix-fake_offset-handling-on-error-path.patch
new file mode 100644 (file)
index 0000000..155daff
--- /dev/null
@@ -0,0 +1,80 @@
+From 928a477102c4fc6739883415b66987207e3502f4 Mon Sep 17 00:00:00 2001
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Date: Fri, 20 Nov 2015 15:57:15 -0800
+Subject: fat: fix fake_offset handling on error path
+
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+
+commit 928a477102c4fc6739883415b66987207e3502f4 upstream.
+
+For the root directory, .  and ..  are faked (using dir_emit_dots()) and
+ctx->pos is reset from 2 to 0.
+
+A corrupted root directory could cause fat_get_entry() to fail, but
+->iterate() (fat_readdir()) reports progress to the VFS (with ctx->pos
+rewound to 0), so any following calls to ->iterate() continue to return
+the same entries again and again.
+
+The result is that userspace will never see the end of the directory,
+causing e.g.  'ls' to hang in a getdents() loop.
+
+[hirofumi@mail.parknet.co.jp: cleanup and make sure to correct fake_offset]
+Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Signed-off-by: Richard Weinberger <richard.weinberger@gmail.com>
+Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fat/dir.c |   16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/fs/fat/dir.c
++++ b/fs/fat/dir.c
+@@ -610,9 +610,9 @@ parse_record:
+               int status = fat_parse_long(inode, &cpos, &bh, &de,
+                                           &unicode, &nr_slots);
+               if (status < 0) {
+-                      ctx->pos = cpos;
++                      bh = NULL;
+                       ret = status;
+-                      goto out;
++                      goto end_of_dir;
+               } else if (status == PARSE_INVALID)
+                       goto record_end;
+               else if (status == PARSE_NOT_LONGNAME)
+@@ -654,8 +654,9 @@ parse_record:
+       fill_len = short_len;
+ start_filldir:
+-      if (!fake_offset)
+-              ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++      ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++      if (fake_offset && ctx->pos < 2)
++              ctx->pos = 2;
+       if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+               if (!dir_emit_dot(file, ctx))
+@@ -681,14 +682,19 @@ record_end:
+       fake_offset = 0;
+       ctx->pos = cpos;
+       goto get_new;
++
+ end_of_dir:
+-      ctx->pos = cpos;
++      if (fake_offset && cpos < 2)
++              ctx->pos = 2;
++      else
++              ctx->pos = cpos;
+ fill_failed:
+       brelse(bh);
+       if (unicode)
+               __putname(unicode);
+ out:
+       mutex_unlock(&sbi->s_lock);
++
+       return ret;
+ }
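
The getdents() hang described in the commit message above is worth seeing concretely. Below is a minimal, hypothetical userspace sketch (not part of the patch) of the getdents64() loop that tools like 'ls' effectively run; it terminates only when the kernel reports 0 remaining bytes, which a corrupted FAT root directory with a rewound ctx->pos would never do before this fix.

/* hypothetical demo: enumerate a directory the way 'ls' does */
#define _GNU_SOURCE
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY | O_DIRECTORY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		long n = syscall(SYS_getdents64, fd, buf, sizeof(buf));

		if (n < 0) {
			perror("getdents64");
			return 1;
		}
		if (n == 0)	/* end of directory: must be reachable */
			break;
		for (long off = 0; off < n; ) {
			struct dirent64 *d = (struct dirent64 *)(buf + off);

			puts(d->d_name);
			off += d->d_reclen;
		}
	}
	close(fd);
	return 0;
}
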
diff --git a/queue-4.3/fs-seqfile-always-allow-oom-killer.patch b/queue-4.3/fs-seqfile-always-allow-oom-killer.patch
new file mode 100644 (file)
index 0000000..c4919c4
--- /dev/null
@@ -0,0 +1,64 @@
+From 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 Mon Sep 17 00:00:00 2001
+From: Greg Thelen <gthelen@google.com>
+Date: Fri, 6 Nov 2015 16:32:42 -0800
+Subject: fs, seqfile: always allow oom killer
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 upstream.
+
+Since 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill
+processes") seq_buf_alloc() avoids calling the oom killer for PAGE_SIZE or
+smaller allocations; but larger allocations can use the oom killer via
+vmalloc().  Thus reads of small files can return ENOMEM, but larger files
+use the oom killer to avoid ENOMEM.
+
+The effect of this bug is that reads from /proc and other virtual
+filesystems can return ENOMEM instead of the preferred behavior - oom
+killing something (possibly the calling process).  I don't know of anyone
+except Google who has noticed the issue.
+
+I suspect the fix is more needed in smaller systems where there isn't any
+reclaimable memory.  But these seem like the kinds of systems which
+probably don't use the oom killer for production situations.
+
+Memory overcommit requires use of the oom killer to select a victim
+regardless of file size.
+
+Enable oom killer for small seq_buf_alloc() allocations.
+
+Fixes: 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes")
+Signed-off-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/seq_file.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/seq_file.c
++++ b/fs/seq_file.c
+@@ -25,12 +25,17 @@ static void seq_set_overflow(struct seq_
+ static void *seq_buf_alloc(unsigned long size)
+ {
+       void *buf;
++      gfp_t gfp = GFP_KERNEL;
+       /*
+-       * __GFP_NORETRY to avoid oom-killings with high-order allocations -
+-       * it's better to fall back to vmalloc() than to kill things.
++       * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
++       * it's better to fall back to vmalloc() than to kill things.  For small
++       * allocations, just use GFP_KERNEL which will oom kill, thus no need
++       * for vmalloc fallback.
+        */
+-      buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
++      if (size > PAGE_SIZE)
++              gfp |= __GFP_NORETRY | __GFP_NOWARN;
++      buf = kmalloc(size, gfp);
+       if (!buf && size > PAGE_SIZE)
+               buf = vmalloc(size);
+       return buf;
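
The fix leaves seq_buf_alloc() handing out memory from either kmalloc() or vmalloc() depending on size, so the matching free path must detect which allocator produced the buffer. A hedged kernel-style sketch of the whole pairing (illustrative helper names; the in-tree free side is what kvfree() does):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *buf_alloc(unsigned long size)
{
	gfp_t gfp = GFP_KERNEL;
	void *buf;

	/* only high-order requests opt out of the oom killer and
	 * may fall back to vmalloc(); small ones keep GFP_KERNEL */
	if (size > PAGE_SIZE)
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	buf = kmalloc(size, gfp);
	if (!buf && size > PAGE_SIZE)
		buf = vmalloc(size);
	return buf;
}

static void buf_free(const void *buf)
{
	/* the buffer may have come from either allocator */
	if (is_vmalloc_addr(buf))
		vfree(buf);
	else
		kfree(buf);
}
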
diff --git a/queue-4.3/kernel-signal.c-unexport-sigsuspend.patch b/queue-4.3/kernel-signal.c-unexport-sigsuspend.patch
new file mode 100644 (file)
index 0000000..82422de
--- /dev/null
@@ -0,0 +1,64 @@
+From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001
+From: Richard Weinberger <richard@nod.at>
+Date: Fri, 20 Nov 2015 15:57:21 -0800
+Subject: kernel/signal.c: unexport sigsuspend()
+
+From: Richard Weinberger <richard@nod.at>
+
+commit 9d8a765211335cfdad464b90fb19f546af5706ae upstream.
+
+sigsuspend() is used nowhere except in signal.c itself, so we can mark it
+static and stop polluting the global namespace.
+
+But this patch is more than a boring cleanup patch, it fixes a real issue
+on UserModeLinux.  UML has a special console driver to display ttys using
+xterm, or other terminal emulators, on the host side.  Vegard reported
+that sometimes UML is unable to spawn a xterm and he's facing the
+following warning:
+
+  WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0()
+
+It turned out that this warning makes absolutely no sense as the UML
+xterm code calls sigsuspend() on the host side, at least it tries.  But
+as the kernel itself offers a sigsuspend() symbol, the linker chose this
+one instead of the glibc wrapper.  Interestingly, this code had always
+worked, but it blocked signals on the wrong side.  Some recent
+kernel change made the WARN_ON() trigger and uncovered the bug.
+
+It is a wonderful example of how much works by chance on computers. :-)
+
+Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()")
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Acked-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/signal.h |    1 -
+ kernel/signal.c        |    2 +-
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+--- a/include/linux/signal.h
++++ b/include/linux/signal.h
+@@ -239,7 +239,6 @@ extern int sigprocmask(int, sigset_t *,
+ extern void set_current_blocked(sigset_t *);
+ extern void __set_current_blocked(const sigset_t *);
+ extern int show_unhandled_signals;
+-extern int sigsuspend(sigset_t *);
+ struct sigaction {
+ #ifndef __ARCH_HAS_IRIX_SIGACTION
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -3552,7 +3552,7 @@ SYSCALL_DEFINE0(pause)
+ #endif
+-int sigsuspend(sigset_t *set)
++static int sigsuspend(sigset_t *set)
+ {
+       current->saved_sigmask = current->blocked;
+       set_current_blocked(set);
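
For reference, this is the glibc sigsuspend() wrapper that UML's host-side xterm code intends to reach, and that the exported kernel symbol could shadow at link time. A minimal userspace sketch (hypothetical, not from the patch) of the classic block/fork/sigsuspend pattern:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_chld(int sig)
{
	(void)sig;	/* empty handler: just interrupts sigsuspend() */
}

int main(void)
{
	sigset_t block, old;

	signal(SIGCHLD, on_chld);
	sigemptyset(&block);
	sigaddset(&block, SIGCHLD);
	/* block SIGCHLD so the child cannot race us before the wait */
	sigprocmask(SIG_BLOCK, &block, &old);

	if (fork() == 0)
		_exit(0);	/* child exits, raising SIGCHLD */

	/* atomically restore the old mask and sleep until a signal
	 * arrives; returns -1 with errno == EINTR after the handler */
	sigsuspend(&old);
	puts("SIGCHLD delivered");
	return 0;
}
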
diff --git a/queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch b/queue-4.3/lib-hexdump.c-truncate-output-in-case-of-overflow.patch
new file mode 100644 (file)
index 0000000..6a09620
--- /dev/null
@@ -0,0 +1,51 @@
+From 9f029f540c2f7e010e4922d44ba0dfd05da79f88 Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Date: Fri, 6 Nov 2015 16:31:31 -0800
+Subject: lib/hexdump.c: truncate output in case of overflow
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+commit 9f029f540c2f7e010e4922d44ba0dfd05da79f88 upstream.
+
+There is a classical off-by-one error in the case when we try to place,
+for example, 1+1 bytes as hex in a buffer of size 6.  The expected result
+is truncated output, but in reality we get 6 bytes filled, followed by a
+terminating NUL.
+
+Change the logic of how we fill the output when dumping bytes into
+limited space.  This follows the snprintf() behaviour by truncating
+output even on half bytes.
+
+Fixes: 114fc1afb2de (hexdump: make it return number of bytes placed in buffer)
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Reported-by: Aaro Koskinen <aaro.koskinen@nokia.com>
+Tested-by: Aaro Koskinen <aaro.koskinen@nokia.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ lib/hexdump.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/lib/hexdump.c
++++ b/lib/hexdump.c
+@@ -169,11 +169,15 @@ int hex_dump_to_buffer(const void *buf,
+               }
+       } else {
+               for (j = 0; j < len; j++) {
+-                      if (linebuflen < lx + 3)
++                      if (linebuflen < lx + 2)
+                               goto overflow2;
+                       ch = ptr[j];
+                       linebuf[lx++] = hex_asc_hi(ch);
++                      if (linebuflen < lx + 2)
++                              goto overflow2;
+                       linebuf[lx++] = hex_asc_lo(ch);
++                      if (linebuflen < lx + 2)
++                              goto overflow2;
+                       linebuf[lx++] = ' ';
+               }
+               if (j)
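
The truncation rule the patch adopts can be illustrated outside the kernel. A hypothetical userspace re-creation of the fixed loop (illustrative names, not the kernel API): before emitting each character, check that the character plus the trailing NUL still fits, so output is cut short snprintf-style even in the middle of a byte.

#include <stdio.h>

/* hypothetical demo of the patched check-before-write loop */
static unsigned long hex_to_buf(const unsigned char *src, unsigned long len,
				char *dst, unsigned long dstlen)
{
	static const char hex[] = "0123456789abcdef";
	unsigned long lx = 0, j;

	if (dstlen == 0)
		return 0;
	for (j = 0; j < len; j++) {
		if (dstlen < lx + 2)	/* room for hi nibble + NUL? */
			break;
		dst[lx++] = hex[src[j] >> 4];
		if (dstlen < lx + 2)	/* room for lo nibble + NUL? */
			break;
		dst[lx++] = hex[src[j] & 0x0f];
		if (dstlen < lx + 2)	/* room for the space + NUL? */
			break;
		dst[lx++] = ' ';
	}
	dst[lx] = '\0';
	return lx;
}

int main(void)
{
	unsigned char data[2] = { 0xde, 0xad };
	char buf[6];

	/* "de ad" needs 5 chars + NUL: a 6-byte buffer just fits,
	 * anything smaller is truncated instead of overrun */
	hex_to_buf(data, sizeof(data), buf, sizeof(buf));
	printf("%s\n", buf);
	return 0;
}
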
diff --git a/queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch b/queue-4.3/memcg-fix-thresholds-for-32b-architectures.patch
new file mode 100644 (file)
index 0000000..4e05dc0
--- /dev/null
@@ -0,0 +1,105 @@
+From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Thu, 5 Nov 2015 18:50:29 -0800
+Subject: memcg: fix thresholds for 32b architectures.
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit c12176d3368b9b36ae484d323d41e94be26f9b65 upstream.
+
+Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a
+regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page
+counters") where thresholds were silently converted to use page units
+rather than bytes when interpreting the user input.
+
+The fix is not complete, though, as properly pointed out by Ben Hutchings
+during stable backport review.  The page count is converted to bytes but
+unsigned long is used to hold the value which would be obviously not
+sufficient for 32b systems with more than 4G thresholds.  The same applies
+to usage as taken from mem_cgroup_usage which might overflow.
+
+Let's remove this bytes vs.  pages internal tracking difference and
+handle thresholds in page units internally.  Change mem_cgroup_usage() to
+return the value in page units and revert 424cdc141380 because this should
+be sufficient for consistent handling.  mem_cgroup_read_u64, the
+only user of mem_cgroup_usage outside of the threshold handling code, is
+converted to give the proper result in bytes.  It is doing that already
+for page_counter output, so this is more consistent as well.
+
+The value presented to the userspace is still in bytes units.
+
+Fixes: 424cdc141380 ("memcg: convert threshold to bytes")
+Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Ben Hutchings <ben@decadent.org.uk>
+Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+From: Michal Hocko <mhocko@kernel.org>
+Subject: memcg: fix thresholds for 32b architectures.
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+From: Andrew Morton <akpm@linux-foundation.org>
+Subject: memcg: fix thresholds for 32b architectures.
+
+don't attempt to inline mem_cgroup_usage()
+
+The compiler ignores the inline anyway.  And __always_inlining it adds 600
+bytes of goop to the .o file.
+
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+---
+ mm/memcontrol.c |   11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2836,9 +2836,9 @@ static unsigned long tree_stat(struct me
+       return val;
+ }
+-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
++static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+ {
+-      u64 val;
++      unsigned long val;
+       if (mem_cgroup_is_root(memcg)) {
+               val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+@@ -2851,7 +2851,7 @@ static inline u64 mem_cgroup_usage(struc
+               else
+                       val = page_counter_read(&memcg->memsw);
+       }
+-      return val << PAGE_SHIFT;
++      return val;
+ }
+ enum {
+@@ -2885,9 +2885,9 @@ static u64 mem_cgroup_read_u64(struct cg
+       switch (MEMFILE_ATTR(cft->private)) {
+       case RES_USAGE:
+               if (counter == &memcg->memory)
+-                      return mem_cgroup_usage(memcg, false);
++                      return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
+               if (counter == &memcg->memsw)
+-                      return mem_cgroup_usage(memcg, true);
++                      return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
+               return (u64)page_counter_read(counter) * PAGE_SIZE;
+       case RES_LIMIT:
+               return (u64)counter->limit * PAGE_SIZE;
+@@ -3387,7 +3387,6 @@ static int __mem_cgroup_usage_register_e
+       ret = page_counter_memparse(args, "-1", &threshold);
+       if (ret)
+               return ret;
+-      threshold <<= PAGE_SHIFT;
+       mutex_lock(&memcg->thresholds_lock);
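
The 32-bit overflow being fixed is easy to reproduce in isolation. A standalone hypothetical demo (not kernel code): converting a page count to bytes inside a 32-bit quantity wraps above 4G, while keeping page units internally and widening to u64 only at the read boundary, as the patch does, preserves the value.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint32_t pages = 2 * 1024 * 1024;	/* 2M pages = 8 GiB */

	/* the 32-bit byte count wraps: 8 GiB mod 4 GiB == 0 */
	uint32_t bytes32 = pages * (uint32_t)PAGE_SIZE;
	/* widening before the multiply keeps the full value */
	uint64_t bytes64 = (uint64_t)pages * PAGE_SIZE;

	printf("32-bit bytes: %u\n", bytes32);
	printf("64-bit bytes: %llu\n", (unsigned long long)bytes64);
	return 0;
}
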
diff --git a/queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch b/queue-4.3/mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
new file mode 100644 (file)
index 0000000..79a23e5
--- /dev/null
@@ -0,0 +1,61 @@
+From 0d777df5d8953293be090d9ab5a355db893e8357 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Fri, 11 Dec 2015 13:40:49 -0800
+Subject: mm: hugetlb: call huge_pte_alloc() only if ptep is null
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit 0d777df5d8953293be090d9ab5a355db893e8357 upstream.
+
+Currently at the beginning of hugetlb_fault(), we call huge_pte_offset()
+and check whether the obtained *ptep is a migration/hwpoison entry or
+not.  And if not, then we get to call huge_pte_alloc().  This is racy
+because the *ptep could turn into migration/hwpoison entry after the
+huge_pte_offset() check.  This race results in BUG_ON in
+huge_pte_alloc().
+
+We don't have to call huge_pte_alloc() when huge_pte_offset()
+returns non-NULL, so let's fix this bug by moving the code into the
+else block.
+
+Note that the *ptep could turn into a migration/hwpoison entry after
+this block, but that's not a problem because we have another
+!pte_present check later (we never go into hugetlb_no_page() in that
+case.)
+
+Fixes: 290408d4a250 ("hugetlb: hugepage migration core")
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3590,12 +3590,12 @@ int hugetlb_fault(struct mm_struct *mm,
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON_LARGE |
+                               VM_FAULT_SET_HINDEX(hstate_index(h));
++      } else {
++              ptep = huge_pte_alloc(mm, address, huge_page_size(h));
++              if (!ptep)
++                      return VM_FAULT_OOM;
+       }
+-      ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+-      if (!ptep)
+-              return VM_FAULT_OOM;
+-
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, vma, address);
diff --git a/queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch b/queue-4.3/mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
new file mode 100644 (file)
index 0000000..fbab199
--- /dev/null
@@ -0,0 +1,59 @@
+From a88c769548047b21f76fd71e04b6a3300ff17160 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Fri, 11 Dec 2015 13:40:24 -0800
+Subject: mm: hugetlb: fix hugepage memory leak caused by wrong reserve count
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit a88c769548047b21f76fd71e04b6a3300ff17160 upstream.
+
+When dequeue_huge_page_vma() in alloc_huge_page() fails, we fall back on
+alloc_buddy_huge_page() to directly create a hugepage from the buddy
+allocator.
+
+In that case, however, if alloc_buddy_huge_page() succeeds we don't
+decrement h->resv_huge_pages, which means that successful
+hugetlb_fault() returns without releasing the reserve count.  As a
+result, subsequent hugetlb_fault() might fail even though there are
+still free hugepages.
+
+This patch simply adds decrementing code on that code path.
+
+I reproduced this problem when testing v4.3 kernel in the following situation:
+ - the test machine/VM is a NUMA system,
+ - hugepage overcommitting is enabled,
+ - most of hugepages are allocated and there's only one free hugepage
+   which is on node 0 (for example),
+ - another program, which calls set_mempolicy(MPOL_BIND) to bind itself to
+   node 1, tries to allocate a hugepage,
+ - the allocation should fail but the reserve count is still held.
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1790,7 +1790,10 @@ struct page *alloc_huge_page(struct vm_a
+               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               if (!page)
+                       goto out_uncharge_cgroup;
+-
++              if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
++                      SetPagePrivate(page);
++                      h->resv_huge_pages--;
++              }
+               spin_lock(&hugetlb_lock);
+               list_move(&page->lru, &h->hugepage_activelist);
+               /* Fall through */
diff --git a/queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch b/queue-4.3/mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
new file mode 100644 (file)
index 0000000..e4a9a08
--- /dev/null
@@ -0,0 +1,92 @@
+From dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 11 Dec 2015 13:40:52 -0800
+Subject: mm/hugetlb.c: fix resv map memory leak for placeholder entries
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 upstream.
+
+Dmitry Vyukov reported the following memory leak
+
+unreferenced object 0xffff88002eaafd88 (size 32):
+  comm "a.out", pid 5063, jiffies 4295774645 (age 15.810s)
+  hex dump (first 32 bytes):
+    28 e9 4e 63 00 88 ff ff 28 e9 4e 63 00 88 ff ff  (.Nc....(.Nc....
+    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+  backtrace:
+     kmalloc include/linux/slab.h:458
+     region_chg+0x2d4/0x6b0 mm/hugetlb.c:398
+     __vma_reservation_common+0x2c3/0x390 mm/hugetlb.c:1791
+     vma_needs_reservation mm/hugetlb.c:1813
+     alloc_huge_page+0x19e/0xc70 mm/hugetlb.c:1845
+     hugetlb_no_page mm/hugetlb.c:3543
+     hugetlb_fault+0x7a1/0x1250 mm/hugetlb.c:3717
+     follow_hugetlb_page+0x339/0xc70 mm/hugetlb.c:3880
+     __get_user_pages+0x542/0xf30 mm/gup.c:497
+     populate_vma_page_range+0xde/0x110 mm/gup.c:919
+     __mm_populate+0x1c7/0x310 mm/gup.c:969
+     do_mlock+0x291/0x360 mm/mlock.c:637
+     SYSC_mlock2 mm/mlock.c:658
+     SyS_mlock2+0x4b/0x70 mm/mlock.c:648
+
+Dmitry identified a potential memory leak in the routine region_chg,
+where a region descriptor is not free'ed on an error path.
+
+However, the root cause for the above memory leak resides in region_del.
+In this specific case, a "placeholder" entry is created in region_chg.
+The associated page allocation fails, and the placeholder entry is left
+in the reserve map.  This is "by design" as the entry should be deleted
+when the map is released.  The bug is in the region_del routine which is
+used to delete entries within a specific range (and when the map is
+released).  region_del did not handle the case where a placeholder entry
+exactly matched the start of the range to be deleted.  In this
+case, the entry would not be deleted and leaked.  The fix is to take
+these special placeholder entries into account in region_del.
+
+The region_chg error path leak is also fixed.
+
+Fixes: feba16e25a57 ("mm/hugetlb: add region_del() to delete a specific range of entries")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -372,8 +372,10 @@ retry_locked:
+               spin_unlock(&resv->lock);
+               trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+-              if (!trg)
++              if (!trg) {
++                      kfree(nrg);
+                       return -ENOMEM;
++              }
+               spin_lock(&resv->lock);
+               list_add(&trg->link, &resv->region_cache);
+@@ -483,8 +485,16 @@ static long region_del(struct resv_map *
+ retry:
+       spin_lock(&resv->lock);
+       list_for_each_entry_safe(rg, trg, head, link) {
+-              if (rg->to <= f)
++              /*
++               * Skip regions before the range to be deleted.  file_region
++               * ranges are normally of the form [from, to).  However, there
++               * may be a "placeholder" entry in the map which is of the form
++               * (from, to) with from == to.  Check for placeholder entries
++               * at the beginning of the range to be deleted.
++               */
++              if (rg->to <= f && (rg->to != rg->from || rg->to != f))
+                       continue;
++
+               if (rg->from >= t)
+                       break;
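
The subtle part of the fix is the skip test quoted in the hunk above. A standalone sketch with hypothetical types showing why the extra clause matters: a normal region ending at f is safely skipped, but a zero-length placeholder sitting exactly at f must stay in consideration or it leaks when the map is torn down.

#include <stdbool.h>
#include <stdio.h>

struct file_region { long from, to; };	/* hypothetical stand-in */

/* the patched skip test from region_del */
static bool skip_region(const struct file_region *rg, long f)
{
	/* the old test was just rg->to <= f, which also (wrongly)
	 * skipped a placeholder with from == to == f */
	return rg->to <= f && (rg->to != rg->from || rg->to != f);
}

int main(void)
{
	struct file_region normal      = { 0, 4 };	/* [0, 4) */
	struct file_region placeholder = { 4, 4 };	/* zero-length at 4 */

	printf("normal region skipped: %d\n", skip_region(&normal, 4));      /* 1 */
	printf("placeholder skipped:   %d\n", skip_region(&placeholder, 4)); /* 0 */
	return 0;
}
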
diff --git a/queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch b/queue-4.3/mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
new file mode 100644 (file)
index 0000000..f6bcd40
--- /dev/null
@@ -0,0 +1,167 @@
+From 1817889e3b2cc1db8abb595712095129ff9156c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 20 Nov 2015 15:57:13 -0800
+Subject: mm/hugetlbfs: fix bugs in fallocate hole punch of areas with holes
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit 1817889e3b2cc1db8abb595712095129ff9156c1 upstream.
+
+Hugh Dickins pointed out problems with the new hugetlbfs fallocate hole
+punch code.  These problems are in the routine remove_inode_hugepages and
+mostly occur in the case where there are holes in the range of pages to be
+removed.  These holes could be the result of a previous hole punch or
+simply sparse allocation.  The current code could access pages outside the
+specified range.
+
+remove_inode_hugepages handles both hole punch and truncate operations.
+Page index handling was fixed/cleaned up so that the loop index always
+matches the page being processed.  The code now only makes a single pass
+through the range of pages as it was determined page faults could not race
+with truncate.  A cond_resched() was added after removing up to
+PAGEVEC_SIZE pages.
+
+Some totally unnecessary code in hugetlbfs_fallocate() that remained from
+early development was also removed.
+
+Tested with fallocate tests submitted here:
+http://librelist.com/browser//libhugetlbfs/2015/6/25/patch-tests-add-tests-for-fallocate-system-call/
+And, some ftruncate tests under development
+
+Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: "Hillf Danton" <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/hugetlbfs/inode.c |   65 +++++++++++++++++++++++++--------------------------
+ 1 file changed, 32 insertions(+), 33 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -332,12 +332,17 @@ static void remove_huge_page(struct page
+  * truncation is indicated by end of range being LLONG_MAX
+  *    In this case, we first scan the range and release found pages.
+  *    After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+- *    maps and global counts.
++ *    maps and global counts.  Page faults can not race with truncation
++ *    in this routine.  hugetlb_no_page() prevents page faults in the
++ *    truncated range.  It checks i_size before allocation, and again after
++ *    with the page table lock for the page held.  The same lock must be
++ *    acquired to unmap a page.
+  * hole punch is indicated if end is not LLONG_MAX
+  *    In the hole punch case we scan the range and release found pages.
+  *    Only when releasing a page is the associated region/reserv map
+  *    deleted.  The region/reserv map for ranges without associated
+- *    pages are not modified.
++ *    pages are not modified.  Page faults can race with hole punch.
++ *    This is indicated if we find a mapped page.
+  * Note: If the passed end of range value is beyond the end of file, but
+  * not LLONG_MAX this routine still performs a hole punch operation.
+  */
+@@ -361,46 +366,37 @@ static void remove_inode_hugepages(struc
+       next = start;
+       while (next < end) {
+               /*
+-               * Make sure to never grab more pages that we
+-               * might possibly need.
++               * Don't grab more pages than the number left in the range.
+                */
+               if (end - next < lookup_nr)
+                       lookup_nr = end - next;
+               /*
+-               * This pagevec_lookup() may return pages past 'end',
+-               * so we must check for page->index > end.
++               * When no more pages are found, we are done.
+                */
+-              if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
+-                      if (next == start)
+-                              break;
+-                      next = start;
+-                      continue;
+-              }
++              if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
++                      break;
+               for (i = 0; i < pagevec_count(&pvec); ++i) {
+                       struct page *page = pvec.pages[i];
+                       u32 hash;
++                      /*
++                       * The page (index) could be beyond end.  This is
++                       * only possible in the punch hole case as end is
++                       * max page offset in the truncate case.
++                       */
++                      next = page->index;
++                      if (next >= end)
++                              break;
++
+                       hash = hugetlb_fault_mutex_hash(h, current->mm,
+                                                       &pseudo_vma,
+                                                       mapping, next, 0);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       lock_page(page);
+-                      if (page->index >= end) {
+-                              unlock_page(page);
+-                              mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+-                              next = end;     /* we are done */
+-                              break;
+-                      }
+-
+-                      /*
+-                       * If page is mapped, it was faulted in after being
+-                       * unmapped.  Do nothing in this race case.  In the
+-                       * normal case page is not mapped.
+-                       */
+-                      if (!page_mapped(page)) {
++                      if (likely(!page_mapped(page))) {
+                               bool rsv_on_error = !PagePrivate(page);
+                               /*
+                                * We must free the huge page and remove
+@@ -421,17 +417,23 @@ static void remove_inode_hugepages(struc
+                                               hugetlb_fix_reserve_counts(
+                                                       inode, rsv_on_error);
+                               }
++                      } else {
++                              /*
++                               * If page is mapped, it was faulted in after
++                               * being unmapped.  It indicates a race between
++                               * hole punch and page fault.  Do nothing in
++                               * this case.  Getting here in a truncate
++                               * operation is a bug.
++                               */
++                              BUG_ON(truncate_op);
+                       }
+-                      if (page->index > next)
+-                              next = page->index;
+-
+-                      ++next;
+                       unlock_page(page);
+-
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+               }
++              ++next;
+               huge_pagevec_release(&pvec);
++              cond_resched();
+       }
+       if (truncate_op)
+@@ -647,9 +649,6 @@ static long hugetlbfs_fallocate(struct f
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+               i_size_write(inode, offset + len);
+       inode->i_ctime = CURRENT_TIME;
+-      spin_lock(&inode->i_lock);
+-      inode->i_private = NULL;
+-      spin_unlock(&inode->i_lock);
+ out:
+       mutex_unlock(&inode->i_mutex);
+       return error;
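
The restructured loop follows a common kernel batching shape: look up at most a pagevec's worth of pages, derive the next index from the pages actually returned, stop on an empty lookup or an index past end, and cond_resched() between batches. A simplified kernel-style sketch of that shape (illustrative only; it omits the fault-mutex locking and reserve handling specific to hugetlbfs):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void remove_range_sketch(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t next = start;
	int i;

	pagevec_init(&pvec, 0);
	while (next < end) {
		unsigned nr = min_t(pgoff_t, end - next, PAGEVEC_SIZE);

		/* an empty lookup means no pages remain in the range */
		if (!pagevec_lookup(&pvec, mapping, next, nr))
			break;
		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];

			/* trust the page's own index: it may jump over
			 * holes, or land past end in the hole-punch case */
			next = page->index;
			if (next >= end)
				break;
			/* ... per-page work under the page lock ... */
		}
		++next;		/* resume after the last page seen */
		pagevec_release(&pvec);
		cond_resched();
	}
}
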
diff --git a/queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch b/queue-4.3/mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
new file mode 100644 (file)
index 0000000..bc6b68a
--- /dev/null
@@ -0,0 +1,77 @@
+From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
+Date: Thu, 5 Nov 2015 18:47:44 -0800
+Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL
+
+From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
+
+commit 426fb5e72d92b868912e47a1e3ca2df6eabc3872 upstream.
+
+It was confirmed that a local unprivileged user can consume all memory
+reserves and hang up that system by using the time lag between when the
+OOM killer sets TIF_MEMDIE on an OOM victim and when it sends SIGKILL to
+that victim, for the printk() inside the for_each_process() loop in
+oom_kill_process() can consume many seconds when there are many thread
+groups sharing the same memory.
+
+Before starting oom-depleter process:
+
+    Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB
+    Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB
+
+As of invoking the OOM killer:
+
+    Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB
+    Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB
+
+Between the thread group leader got TIF_MEMDIE and receives SIGKILL:
+
+    Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+    Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+
+The oom-depleter's thread group leader, which got TIF_MEMDIE, started
+memset() in user space after the OOM killer set TIF_MEMDIE, and it was
+free to abuse the ALLOC_NO_WATERMARKS access granted by TIF_MEMDIE
+until SIGKILL was delivered.  If SIGKILL is delivered before TIF_MEMDIE is
+set, the oom-depleter can terminate without touching memory reserves.
+
+Although the possibility of hitting this time lag is very small for 3.19
+and earlier kernels because TIF_MEMDIE is set immediately before sending
+SIGKILL, preemption or long interrupts (an extreme example is SysRq-t) can
+step between and allow memory allocations which are not needed for
+terminating the OOM victim.
+
+Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock")
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/oom_kill.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control
+       /* mm cannot safely be dereferenced after task_unlock(victim) */
+       mm = victim->mm;
++      /*
++       * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
++       * the OOM victim from depleting the memory reserves from the user
++       * space under its control.
++       */
++      do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+       mark_oom_victim(victim);
+       pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+               task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+@@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control
+               }
+       rcu_read_unlock();
+-      do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+       put_task_struct(victim);
+ }
+ #undef K
diff --git a/queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch b/queue-4.3/mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
new file mode 100644 (file)
index 0000000..4c02002
--- /dev/null
@@ -0,0 +1,87 @@
+From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001
+From: Catalin Marinas <catalin.marinas@arm.com>
+Date: Thu, 5 Nov 2015 18:45:54 -0800
+Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit d4322d88f5fdf92729dd40f923013414fbb2184d upstream.
+
+On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc
+configurations defining ARCH_DMA_MINALIGN to 128), the first
+kmalloc_caches[] entry to be initialised after slab_early_init = 0 is
+"kmalloc-128" with index 7.  Depending on the debug kernel configuration,
+sizeof(struct kmem_cache) can be larger than 128 resulting in an
+INDEX_NODE of 8.
+
+Commit 8fc9cf420b36 ("slab: make more slab management structure off the
+slab") enables off-slab management objects for sizes starting with
+PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation
+of the "kmalloc-128" cache would try to place the management objects
+off-slab.  However, since KMALLOC_MIN_SIZE is already 128 and
+freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size)
+returns NULL (kmalloc_caches[7] not populated yet).  This triggers the
+following bug on arm64:
+
+  kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283!
+  Internal error: Oops - BUG: 0 [#1] SMP
+  Modules linked in:
+  CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540
+  Hardware name: Juno (DT)
+  PC is at __kmem_cache_create+0x21c/0x280
+  LR is at __kmem_cache_create+0x210/0x280
+  [...]
+  Call trace:
+    __kmem_cache_create+0x21c/0x280
+    create_boot_cache+0x48/0x80
+    create_kmalloc_cache+0x50/0x88
+    create_kmalloc_caches+0x4c/0xf4
+    kmem_cache_init+0x100/0x118
+    start_kernel+0x214/0x33c
+
+This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab
+management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE.
+
+Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab")
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Acked-by: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct
+ #define CFLGS_OFF_SLAB                (0x80000000UL)
+ #define       OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
++#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
+ #define BATCHREFILL_LIMIT     16
+ /*
+@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *
+        * it too early on. Always use on-slab management when
+        * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
+        */
+-      if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
++      if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
+           !(flags & SLAB_NOLEAKTRACE))
+               /*
+                * Size is large, assume best to place the slab management obj
+@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *
+               /*
+                * This is a possibility for one of the kmalloc_{dma,}_caches.
+                * But since we go off slab only for object size greater than
+-               * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
++               * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
+                * in ascending order,this should not happen at all.
+                * But leave a BUG_ON for some lucky dude.
+                */
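
To see what the new threshold evaluates to, here is a standalone hypothetical demo (kernel macros re-created locally): with 4K pages, PAGE_SIZE >> 5 is 128, and on an arm64 configuration with a KMALLOC_MIN_SIZE of 128 the threshold becomes 129, so the kmalloc-128 cache itself stays on-slab and the boot-time NULL lookup can no longer happen.

#include <stdio.h>

#define PAGE_SIZE        4096UL
#define KMALLOC_MIN_SIZE 128UL	/* ARCH_DMA_MINALIGN on arm64 */

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define OFF_SLAB_MIN_SIZE MAX(PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)

int main(void)
{
	printf("off-slab threshold: %lu bytes\n", OFF_SLAB_MIN_SIZE); /* 129 */
	printf("kmalloc-128 off-slab? %s\n",
	       128 >= OFF_SLAB_MIN_SIZE ? "yes" : "no");              /* no */
	return 0;
}
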
diff --git a/queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch b/queue-4.3/mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
new file mode 100644 (file)
index 0000000..3ffb87e
--- /dev/null
@@ -0,0 +1,122 @@
+From 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 11 Dec 2015 13:40:32 -0800
+Subject: mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 upstream.
+
+Tetsuo Handa has reported that the system might basically livelock in
+OOM condition without triggering the OOM killer.
+
+The issue is caused by internal dependency of the direct reclaim on
+vmstat counter updates (via zone_reclaimable) which are performed from
+the workqueue context.  If all the current workers get assigned to an
+allocation request, though, they will be looping inside the allocator
+trying to reclaim memory but zone_reclaimable can see stalled numbers so
+it will consider a zone reclaimable even though it has been scanned way
+too much.  WQ concurrency logic will not consider this situation a
+congested workqueue, because it assumes that a worker would sleep in
+such a situation.  This also means that it doesn't try to spawn new
+workers or invoke the rescuer thread if one is assigned to the
+queue.
+
+In order to fix this issue we need to do two things.  First we have to
+let the wq concurrency code know that we are in trouble, so we have to do
+a short sleep.  In order to prevent the issues handled by 0e093d99763e
+("writeback: do not sleep on the congestion queue if there are no
+congested BDIs or if significant congestion is not being encountered in
+the current zone") we limit the sleep to worker threads, which are
+the ones of interest anyway.
+
+The second thing to do is to create a dedicated workqueue for vmstat and
+mark it WQ_MEM_RECLAIM to note it participates in the reclaim and to
+have a spare worker thread for it.
+
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Cristopher Lameter <clameter@sgi.com>
+Cc: Joonsoo Kim <js1304@gmail.com>
+Cc: Arkadiusz Miskiewicz <arekm@maven.pl>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/backing-dev.c |   19 ++++++++++++++++---
+ mm/vmstat.c      |    6 ++++--
+ 2 files changed, 20 insertions(+), 5 deletions(-)
+
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
+  * jiffies for either a BDI to exit congestion of the given @sync queue
+  * or a write to complete.
+  *
+- * In the absence of zone congestion, cond_resched() is called to yield
+- * the processor if necessary but otherwise does not sleep.
++ * In the absence of zone congestion, a short sleep or a cond_resched is
++ * performed to yield the processor and to allow other subsystems to make
++ * a forward progress.
+  *
+  * The return value is 0 if the sleep is for the full timeout. Otherwise,
+  * it is the number of jiffies that were still remaining when the function
+@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zon
+        */
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
+           !test_bit(ZONE_CONGESTED, &zone->flags)) {
+-              cond_resched();
++
++              /*
++               * Memory allocation/reclaim might be called from a WQ
++               * context and the current implementation of the WQ
++               * concurrency control doesn't recognize that a particular
++               * WQ is congested if the worker thread is looping without
++               * ever sleeping. Therefore we have to do a short sleep
++               * here rather than calling cond_resched().
++               */
++              if (current->flags & PF_WQ_WORKER)
++                      schedule_timeout(1);
++              else
++                      cond_resched();
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1357,6 +1357,7 @@ static const struct file_operations proc
+ #endif /* CONFIG_PROC_FS */
+ #ifdef CONFIG_SMP
++static struct workqueue_struct *vmstat_wq;
+ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+ int sysctl_stat_interval __read_mostly = HZ;
+ static cpumask_var_t cpu_stat_off;
+@@ -1369,7 +1370,7 @@ static void vmstat_update(struct work_st
+                * to occur in the future. Keep on running the
+                * update worker thread.
+                */
+-              schedule_delayed_work_on(smp_processor_id(),
++              queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+                       this_cpu_ptr(&vmstat_work),
+                       round_jiffies_relative(sysctl_stat_interval));
+       } else {
+@@ -1438,7 +1439,7 @@ static void vmstat_shepherd(struct work_
+               if (need_update(cpu) &&
+                       cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+-                      schedule_delayed_work_on(cpu,
++                      queue_delayed_work_on(cpu, vmstat_wq,
+                               &per_cpu(vmstat_work, cpu), 0);
+       put_online_cpus();
+@@ -1527,6 +1528,7 @@ static int __init setup_vmstat(void)
+       start_shepherd_timer();
+       cpu_notifier_register_done();
++      vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ #endif
+ #ifdef CONFIG_PROC_FS
+       proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
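
The second half of the fix uses a standard pattern: work that must make progress during memory reclaim gets its own WQ_MEM_RECLAIM workqueue, which guarantees a rescuer thread even when every regular worker is stuck in the allocator. A hedged module-style sketch of the pattern (illustrative names, not the patch itself):

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void my_work_fn(struct work_struct *work)
{
	/* periodic work that must run even under memory pressure;
	 * requeue on the dedicated queue, not the system one */
	queue_delayed_work(my_wq, &my_dwork, round_jiffies_relative(HZ));
}

static int __init my_init(void)
{
	/* WQ_MEM_RECLAIM reserves a rescuer thread for this queue */
	my_wq = alloc_workqueue("my_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
	if (!my_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&my_dwork, my_work_fn);
	queue_delayed_work(my_wq, &my_dwork, 0);
	return 0;
}

static void __exit my_exit(void)
{
	cancel_delayed_work_sync(&my_dwork);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
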
diff --git a/queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch b/queue-4.3/ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
new file mode 100644 (file)
index 0000000..85b55f4
--- /dev/null
@@ -0,0 +1,38 @@
+From c95a51807b730e4681e2ecbdfd669ca52601959e Mon Sep 17 00:00:00 2001
+From: xuejiufei <xuejiufei@huawei.com>
+Date: Fri, 5 Feb 2016 15:36:47 -0800
+Subject: ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
+
+From: xuejiufei <xuejiufei@huawei.com>
+
+commit c95a51807b730e4681e2ecbdfd669ca52601959e upstream.
+
+When the recovery master goes down, dlm_do_local_recovery_cleanup() only
+removes the $RECOVERY lock owned by the dead node, but does not clear the
+refmap bit.  This makes the umount thread fall into a dead loop migrating
+$RECOVERY to the dead node.
+
+Signed-off-by: xuejiufei <xuejiufei@huawei.com>
+Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/dlm/dlmrecovery.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ocfs2/dlm/dlmrecovery.c
++++ b/fs/ocfs2/dlm/dlmrecovery.c
+@@ -2360,6 +2360,8 @@ static void dlm_do_local_recovery_cleanu
+                                               break;
+                                       }
+                               }
++                              dlm_lockres_clear_refmap_bit(dlm, res,
++                                              dead_node);
+                               spin_unlock(&res->spinlock);
+                               continue;
+                       }
diff --git a/queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch b/queue-4.3/ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
new file mode 100644 (file)
index 0000000..5fe0d6a
--- /dev/null
@@ -0,0 +1,97 @@
+From bef5502de074b6f6fa647b94b73155d675694420 Mon Sep 17 00:00:00 2001
+From: xuejiufei <xuejiufei@huawei.com>
+Date: Thu, 14 Jan 2016 15:17:38 -0800
+Subject: ocfs2/dlm: ignore cleaning the migration mle that is inuse
+
+From: xuejiufei <xuejiufei@huawei.com>
+
+commit bef5502de074b6f6fa647b94b73155d675694420 upstream.
+
+We have found that the migration source will trigger a BUG because the
+refcount of the mle is already zero before the put when the target goes
+down during migration.  The situation is as follows:
+
+dlm_migrate_lockres
+  dlm_add_migration_mle
+  dlm_mark_lockres_migrating
+  dlm_get_mle_inuse
+  <<<<<< Now the refcount of the mle is 2.
+  dlm_send_one_lockres and wait for the target to become the
+  new master.
+  <<<<<< o2hb detects the target is down and cleans the migration
+  mle. Now the refcount is 1.
+
+dlm_migrate_lockres is then woken, and puts the mle twice when it finds
+that the target has gone down, which triggers the BUG with the following
+message:
+
+  "ERROR: bad mle: ".
+
+Signed-off-by: Jiufei Xue <xuejiufei@huawei.com>
+Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/dlm/dlmmaster.c |   26 +++++++++++++++-----------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+--- a/fs/ocfs2/dlm/dlmmaster.c
++++ b/fs/ocfs2/dlm/dlmmaster.c
+@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dl
+       spin_lock(&dlm->master_lock);
+       ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+                                   namelen, target, dlm->node_num);
++      /* get an extra reference on the mle.
++       * otherwise the assert_master from the new
++       * master will destroy this.
++       */
++      dlm_get_mle_inuse(mle);
+       spin_unlock(&dlm->master_lock);
+       spin_unlock(&dlm->spinlock);
+@@ -2554,6 +2559,7 @@ fail:
+               if (mle_added) {
+                       dlm_mle_detach_hb_events(dlm, mle);
+                       dlm_put_mle(mle);
++                      dlm_put_mle_inuse(mle);
+               } else if (mle) {
+                       kmem_cache_free(dlm_mle_cache, mle);
+                       mle = NULL;
+@@ -2571,17 +2577,6 @@ fail:
+        * ensure that all assert_master work is flushed. */
+       flush_workqueue(dlm->dlm_worker);
+-      /* get an extra reference on the mle.
+-       * otherwise the assert_master from the new
+-       * master will destroy this.
+-       * also, make sure that all callers of dlm_get_mle
+-       * take both dlm->spinlock and dlm->master_lock */
+-      spin_lock(&dlm->spinlock);
+-      spin_lock(&dlm->master_lock);
+-      dlm_get_mle_inuse(mle);
+-      spin_unlock(&dlm->master_lock);
+-      spin_unlock(&dlm->spinlock);
+-
+       /* notify new node and send all lock state */
+       /* call send_one_lockres with migration flag.
+        * this serves as notice to the target node that a
+@@ -3310,6 +3305,15 @@ top:
+                           mle->new_master != dead_node)
+                               continue;
++                      if (mle->new_master == dead_node && mle->inuse) {
++                              mlog(ML_NOTICE, "%s: target %u died during "
++                                              "migration from %u, the MLE is "
++                                              "still keep used, ignore it!\n",
++                                              dlm->name, dead_node,
++                                              mle->master);
++                              continue;
++                      }
++
+                       /* If we have reached this point, this mle needs to be
+                        * removed from the list and freed. */
+                       dlm_clean_migration_mle(dlm, mle);
diff --git a/queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch b/queue-4.3/ocfs2-fix-bug-when-calculate-new-backup-super.patch
new file mode 100644 (file)
index 0000000..bf11d9f
--- /dev/null
@@ -0,0 +1,98 @@
+From 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 Mon Sep 17 00:00:00 2001
+From: Joseph Qi <joseph.qi@huawei.com>
+Date: Tue, 29 Dec 2015 14:54:06 -0800
+Subject: ocfs2: fix BUG when calculate new backup super
+
+From: Joseph Qi <joseph.qi@huawei.com>
+
+commit 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 upstream.
+
+When resizing, ocfs2 first extends the last group descriptor (gd).
+Once a backup super should be placed in that gd, it calculates the new
+backup super location and updates the corresponding values.
+
+But it currently doesn't consider the situation where that backup super
+has already been set up by an earlier resize.  In this case it still
+sets the bit in the gd bitmap and then decreases bg_free_bits_count
+again, which leads to a corrupted gd and triggers the BUG in
+ocfs2_block_group_set_bits:
+
+    BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+So check whether the backup super is already done and only then do the
+updates.
+
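+A standalone sketch of the skip test (made-up cluster numbers, not the
+on-disk format): a backup super that falls inside the old extent of the
+group was handled by an earlier resize and must not be counted again.
+
+  /* backup-super-skip.c */
+  #include <stdio.h>
+
+  int main(void)
+  {
+      unsigned lgd_cluster = 1000;    /* first cluster of the last gd */
+      unsigned old_bg_clusters = 200; /* group size before the resize */
+      unsigned backups[] = { 1100, 1500 }; /* backup super clusters   */
+
+      for (int i = 0; i < 2; i++) {
+          /* mirrors the patched kernel: lgd_cluster has
+             old_bg_clusters added, then the >= cluster test skips */
+          if (backups[i] <= lgd_cluster + old_bg_clusters) {
+              printf("cluster %u: already done, skip\n", backups[i]);
+              continue;
+          }
+          printf("cluster %u: new backup, set bit\n", backups[i]);
+      }
+      return 0;
+  }
+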
+Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
+Reviewed-by: Jiufei Xue <xuejiufei@huawei.com>
+Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/resize.c |   15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/ocfs2/resize.c
++++ b/fs/ocfs2/resize.c
+@@ -54,11 +54,12 @@
+ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+                                      struct ocfs2_group_desc *gd,
+                                      u16 cl_cpg,
++                                     u16 old_bg_clusters,
+                                      int set)
+ {
+       int i;
+       u16 backups = 0;
+-      u32 cluster;
++      u32 cluster, lgd_cluster;
+       u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+       for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(s
+               else if (gd_blkno > lgd_blkno)
+                       break;
++              /* check if already done backup super */
++              lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
++              lgd_cluster += old_bg_clusters;
++              if (lgd_cluster >= cluster)
++                      continue;
++
+               if (set)
+                       ocfs2_set_bit(cluster % cl_cpg,
+                                     (unsigned long *)gd->bg_bitmap);
+@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_i
+       u16 chain, num_bits, backups = 0;
+       u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+       u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
++      u16 old_bg_clusters;
+       trace_ocfs2_update_last_group_and_inode(new_clusters,
+                                               first_new_cluster);
+@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_i
+       group = (struct ocfs2_group_desc *)group_bh->b_data;
++      old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
+       /* update the group first. */
+       num_bits = new_clusters * cl_bpc;
+       le16_add_cpu(&group->bg_bits, num_bits);
+@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_i
+                                    OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+               backups = ocfs2_calc_new_backup_super(bm_inode,
+                                                    group,
+-                                                   cl_cpg, 1);
++                                                   cl_cpg, old_bg_clusters, 1);
+               le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+       }
+@@ -163,7 +172,7 @@ out_rollback:
+       if (ret < 0) {
+               ocfs2_calc_new_backup_super(bm_inode,
+                                           group,
+-                                          cl_cpg, 0);
++                                          cl_cpg, old_bg_clusters, 0);
+               le16_add_cpu(&group->bg_free_bits_count, backups);
+               le16_add_cpu(&group->bg_bits, -1 * num_bits);
+               le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
diff --git a/queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch b/queue-4.3/ocfs2-fix-sgid-not-inherited-issue.patch
new file mode 100644 (file)
index 0000000..6d8bff6
--- /dev/null
@@ -0,0 +1,44 @@
+From 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Fri, 11 Dec 2015 13:41:03 -0800
+Subject: ocfs2: fix SGID not inherited issue
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 upstream.
+
+Commit 8f1eb48758aa ("ocfs2: fix umask ignored issue") introduced an
+issue: the SGID bit of a sub directory was not inherited from its
+parent directory.  This is because SGID is set in "inode->i_mode" in
+ocfs2_get_init_inode(), but is later overwritten by "mode", which does
+not have SGID set.
+
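+A minimal userspace illustration of the ordering bug (plain POSIX mode
+bits, not the ocfs2 code paths): once the inherited bit lives in one
+variable, assigning the other variable over it throws the bit away.
+
+  /* sgid-inherit.c */
+  #include <stdio.h>
+  #include <sys/stat.h>
+
+  int main(void)
+  {
+      mode_t dir_mode = S_IFDIR | S_ISGID | 0755; /* parent has SGID */
+      mode_t mode     = S_IFDIR | 0777;           /* caller's mode   */
+      mode_t i_mode   = mode & ~(mode_t)022;      /* umask applied   */
+
+      if (dir_mode & S_ISGID)     /* inheritance, as done on         */
+          i_mode |= S_ISGID;      /* inode->i_mode at inode init     */
+
+      i_mode = mode;  /* the bug: the later overwrite drops the bit  */
+      printf("SGID %s\n", (i_mode & S_ISGID) ? "kept" : "lost");
+      return 0;
+  }
+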
+Fixes: 8f1eb48758aa ("ocfs2: fix umask ignored issue")
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Acked-by: Srinivas Eeda <srinivas.eeda@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/namei.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -369,13 +369,11 @@ static int ocfs2_mknod(struct inode *dir
+               goto leave;
+       }
+-      status = posix_acl_create(dir, &mode, &default_acl, &acl);
++      status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+       if (status) {
+               mlog_errno(status);
+               goto leave;
+       }
+-      /* update inode->i_mode after mask with "umask". */
+-      inode->i_mode = mode;
+       handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+                                                           S_ISDIR(mode),
diff --git a/queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch b/queue-4.3/proc-actually-make-proc_fd_permission-thread-friendly.patch
new file mode 100644 (file)
index 0000000..588ffda
--- /dev/null
@@ -0,0 +1,53 @@
+From 54708d2858e79a2bdda10bf8a20c80eb96c20613 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Fri, 6 Nov 2015 16:30:06 -0800
+Subject: proc: actually make proc_fd_permission() thread-friendly
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 54708d2858e79a2bdda10bf8a20c80eb96c20613 upstream.
+
+Commit 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+fixed access to /proc/self/fd from sub-threads, but introduced another
+problem: a sub-thread can't access /proc/<tid>/fd/ or
+/proc/thread-self/fd if generic_permission() fails.
+
+Change proc_fd_permission() to check same_thread_group(pid_task(), current).
+
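+A toy model of the two checks (plain structs standing in for
+task_struct and struct pid; not kernel code): the old test compares the
+caller's tgid with the pid named by the inode, which can never match
+for another thread's fd directory, while the new test compares thread
+groups.
+
+  /* thread-group-check.c */
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  struct task { int pid; int tgid; };
+
+  /* same_thread_group() boils down to comparing tgids */
+  static bool same_thread_group(const struct task *a,
+                                const struct task *b)
+  {
+      return a->tgid == b->tgid;
+  }
+
+  int main(void)
+  {
+      struct task owner = { .pid = 101, .tgid = 100 }; /* /proc/101/fd */
+      struct task curr  = { .pid = 102, .tgid = 100 }; /* sub-thread   */
+
+      /* old: task_tgid(current) == proc_pid(inode) -> 100 == 101    */
+      printf("old check: %s\n",
+             curr.tgid == owner.pid ? "allowed" : "denied");
+
+      /* new: same_thread_group(pid_task(proc_pid(inode)), current)  */
+      printf("new check: %s\n",
+             same_thread_group(&owner, &curr) ? "allowed" : "denied");
+      return 0;
+  }
+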
+Fixes: 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+Reported-by: "Jin, Yihua" <yihua.jin@intel.com>
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Cc: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/fd.c |   14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/proc/fd.c
++++ b/fs/proc/fd.c
+@@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(stru
+  */
+ int proc_fd_permission(struct inode *inode, int mask)
+ {
+-      int rv = generic_permission(inode, mask);
++      struct task_struct *p;
++      int rv;
++
++      rv = generic_permission(inode, mask);
+       if (rv == 0)
+-              return 0;
+-      if (task_tgid(current) == proc_pid(inode))
++              return rv;
++
++      rcu_read_lock();
++      p = pid_task(proc_pid(inode), PIDTYPE_PID);
++      if (p && same_thread_group(p, current))
+               rv = 0;
++      rcu_read_unlock();
++
+       return rv;
+ }
diff --git a/queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch b/queue-4.3/proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
new file mode 100644 (file)
index 0000000..97c125f
--- /dev/null
@@ -0,0 +1,40 @@
+From 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b Mon Sep 17 00:00:00 2001
+From: Colin Ian King <colin.king@canonical.com>
+Date: Fri, 18 Dec 2015 14:22:01 -0800
+Subject: proc: fix -ESRCH error when writing to /proc/$pid/coredump_filter
+
+From: Colin Ian King <colin.king@canonical.com>
+
+commit 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b upstream.
+
+Writing to /proc/$pid/coredump_filter always returns -ESRCH because commit
+774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()") removed
+the setting of ret after the get_proc_task call and incorrectly left it as
+-ESRCH.  Instead, return 0 when successful.
+
+Example breakage:
+
+  echo 0 > /proc/self/coredump_filter
+  bash: echo: write error: No such process
+
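+The pattern behind the bug, reduced to a standalone sketch (invented
+helper parameters; the real function is proc_coredump_filter_write):
+ret is initialized to an error for the early exits, so the success
+path must reset it before falling through to the common return.
+
+  /* ret-path.c */
+  #include <errno.h>
+  #include <stdio.h>
+
+  static long buggy_write(int have_task, int have_mm)
+  {
+      long ret = -ESRCH;        /* pessimistic default for exits */
+
+      if (!have_task)
+          goto out;
+      if (!have_mm)
+          goto out;
+      /* the fix adds "ret = 0;" here; without it the success path
+         below still returns the stale -ESRCH                      */
+      /* ... update the filter bits ... */
+  out:
+      return ret;
+  }
+
+  int main(void)
+  {
+      printf("successful write returned %ld (expected 0)\n",
+             buggy_write(1, 1));
+      return 0;
+  }
+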
+Fixes: 774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()")
+Signed-off-by: Colin Ian King <colin.king@canonical.com>
+Acked-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -2484,6 +2484,7 @@ static ssize_t proc_coredump_filter_writ
+       mm = get_task_mm(task);
+       if (!mm)
+               goto out_no_mm;
++      ret = 0;
+       for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+               if (val & mask)
diff --git a/queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch b/queue-4.3/remoteproc-avoid-stack-overflow-in-debugfs-file.patch
new file mode 100644 (file)
index 0000000..694732e
--- /dev/null
@@ -0,0 +1,40 @@
+From 92792e48e2ae6051af30468a87994b5432da2f06 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Fri, 20 Nov 2015 18:26:07 +0100
+Subject: remoteproc: avoid stack overflow in debugfs file
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 92792e48e2ae6051af30468a87994b5432da2f06 upstream.
+
+Recent gcc versions warn about reading from a negative offset of
+an on-stack array:
+
+drivers/remoteproc/remoteproc_debugfs.c: In function 'rproc_recovery_write':
+drivers/remoteproc/remoteproc_debugfs.c:167:9: warning: 'buf[4294967295u]' may be used uninitialized in this function [-Wmaybe-uninitialized]
+
+I don't see anything in sys_write() that prevents us from
+being called with a zero 'count' argument, so we should
+add an extra check in rproc_recovery_write() to prevent the
+access and avoid the warning.
+
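+Reduced to a userspace sketch (memcpy standing in for copy_from_user),
+the problem is the buf[count - 1] access used to strip a trailing
+newline: with count == 0 the index wraps around and reads far off the
+stack buffer.
+
+  /* zero-count-write.c */
+  #include <stdio.h>
+  #include <string.h>
+
+  static long recovery_write(const char *user_buf, size_t count)
+  {
+      char buf[10];
+
+      if (count < 1 || count > sizeof(buf)) /* the added lower bound */
+          return count;
+
+      memcpy(buf, user_buf, count);         /* copy_from_user()      */
+      if (buf[count - 1] == '\n')           /* safe now: count >= 1  */
+          buf[count - 1] = '\0';
+      return count;
+  }
+
+  int main(void)
+  {
+      printf("%ld\n", recovery_write("", 0));          /* rejected  */
+      printf("%ld\n", recovery_write("enabled\n", 8)); /* accepted  */
+      return 0;
+  }
+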
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Fixes: 2e37abb89a2e ("remoteproc: create a 'recovery' debugfs entry")
+Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/remoteproc/remoteproc_debugfs.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/remoteproc/remoteproc_debugfs.c
++++ b/drivers/remoteproc/remoteproc_debugfs.c
+@@ -156,7 +156,7 @@ rproc_recovery_write(struct file *filp,
+       char buf[10];
+       int ret;
+-      if (count > sizeof(buf))
++      if (count < 1 || count > sizeof(buf))
+               return count;
+       ret = copy_from_user(buf, user_buf, count);
diff --git a/queue-4.3/series
index 69bb5ea7ff9a46df20d7cf1edaa0817bfee7efc4..604aed1b32f03ef2c4e54790787910554eda8052 100644 (file)
@@ -16,3 +16,23 @@ nfsv4.1-pnfs-fixup-an-lo-plh_block_lgets-imbalance-in-layoutreturn.patch
 ocfs2-nfs-hangs-in-__ocfs2_cluster_lock-due-to-race-with-ocfs2_unblock_lock.patch
 hid-usbhid-fix-recursive-deadlock.patch
 alsa-hda-implement-loopback-control-switch-for-realtek-and-other-codecs.patch
+proc-actually-make-proc_fd_permission-thread-friendly.patch
+remoteproc-avoid-stack-overflow-in-debugfs-file.patch
+proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
+mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
+mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
+lib-hexdump.c-truncate-output-in-case-of-overflow.patch
+fs-seqfile-always-allow-oom-killer.patch
+memcg-fix-thresholds-for-32b-architectures.patch
+mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
+mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
+mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
+fat-fix-fake_offset-handling-on-error-path.patch
+mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
+kernel-signal.c-unexport-sigsuspend.patch
+mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
+ocfs2-fix-sgid-not-inherited-issue.patch
+ocfs2-fix-bug-when-calculate-new-backup-super.patch
+ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
+ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
+sh64-fix-__nr_fgetxattr.patch
diff --git a/queue-4.3/sh64-fix-__nr_fgetxattr.patch b/queue-4.3/sh64-fix-__nr_fgetxattr.patch
new file mode 100644 (file)
index 0000000..b2d552c
--- /dev/null
@@ -0,0 +1,37 @@
+From 2d33fa1059da4c8e816627a688d950b613ec0474 Mon Sep 17 00:00:00 2001
+From: "Dmitry V. Levin" <ldv@altlinux.org>
+Date: Fri, 11 Dec 2015 13:41:06 -0800
+Subject: sh64: fix __NR_fgetxattr
+
+From: Dmitry V. Levin <ldv@altlinux.org>
+
+commit 2d33fa1059da4c8e816627a688d950b613ec0474 upstream.
+
+According to arch/sh/kernel/syscalls_64.S and common sense,
+__NR_fgetxattr has to be defined as 259, but it isn't.  Instead, it's
+defined as 269, which is of course already used by another syscall,
+__NR_sched_setaffinity in this case.
+
+This bug was found by the strace test suite.
+
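+Since the sh64 xattr numbers are consecutive, the typo can be caught at
+compile time; a self-contained sketch (the values are redefined locally
+here, not taken from the uapi header):
+
+  /* unistd-check.c */
+  #define __NR_lgetxattr 258
+  #define __NR_fgetxattr 259   /* was mistyped as 269 */
+  #define __NR_listxattr 260
+
+  _Static_assert(__NR_fgetxattr == __NR_lgetxattr + 1,
+                 "__NR_fgetxattr out of sequence");
+  _Static_assert(__NR_listxattr == __NR_fgetxattr + 1,
+                 "__NR_listxattr out of sequence");
+
+  int main(void) { return 0; }
+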
+Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
+Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/include/uapi/asm/unistd_64.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/sh/include/uapi/asm/unistd_64.h
++++ b/arch/sh/include/uapi/asm/unistd_64.h
+@@ -278,7 +278,7 @@
+ #define __NR_fsetxattr                256
+ #define __NR_getxattr         257
+ #define __NR_lgetxattr                258
+-#define __NR_fgetxattr                269
++#define __NR_fgetxattr                259
+ #define __NR_listxattr                260
+ #define __NR_llistxattr               261
+ #define __NR_flistxattr               262