--- /dev/null
+From 928a477102c4fc6739883415b66987207e3502f4 Mon Sep 17 00:00:00 2001
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Date: Fri, 20 Nov 2015 15:57:15 -0800
+Subject: fat: fix fake_offset handling on error path
+
+From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+
+commit 928a477102c4fc6739883415b66987207e3502f4 upstream.
+
+For the root directory, . and .. are faked (using dir_emit_dots()) and
+ctx->pos is reset from 2 to 0.
+
+A corrupted root directory could cause fat_get_entry() to fail, but
+->iterate() (fat_readdir()) reports progress to the VFS (with ctx->pos
+rewound to 0), so subsequent calls to ->iterate() continue to return
+the same entries again and again.
+
+The result is that userspace will never see the end of the directory,
+causing e.g. 'ls' to hang in a getdents() loop.
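+
+As an illustration only (not part of the fix), here is a minimal sketch
+of the userspace symptom; "dirfd" is assumed to be an open descriptor
+for the corrupted FAT root directory:
+
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  /* before the fix, this loop never terminates on the corrupted dir */
+  static void list_all(int dirfd)
+  {
+          char buf[4096];
+
+          for (;;) {
+                  long n = syscall(SYS_getdents64, dirfd, buf, sizeof(buf));
+                  if (n <= 0)
+                          break;  /* 0 == end of directory, never reached */
+          }
+  }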
+
+[hirofumi@mail.parknet.co.jp: cleanup and make sure to correct fake_offset]
+Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Signed-off-by: Richard Weinberger <richard.weinberger@gmail.com>
+Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fat/dir.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/fs/fat/dir.c
++++ b/fs/fat/dir.c
+@@ -610,9 +610,9 @@ parse_record:
+ int status = fat_parse_long(inode, &cpos, &bh, &de,
+ &unicode, &nr_slots);
+ if (status < 0) {
+- ctx->pos = cpos;
++ bh = NULL;
+ ret = status;
+- goto out;
++ goto end_of_dir;
+ } else if (status == PARSE_INVALID)
+ goto record_end;
+ else if (status == PARSE_NOT_LONGNAME)
+@@ -654,8 +654,9 @@ parse_record:
+ fill_len = short_len;
+
+ start_filldir:
+- if (!fake_offset)
+- ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
++ if (fake_offset && ctx->pos < 2)
++ ctx->pos = 2;
+
+ if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+ if (!dir_emit_dot(file, ctx))
+@@ -681,14 +682,19 @@ record_end:
+ fake_offset = 0;
+ ctx->pos = cpos;
+ goto get_new;
++
+ end_of_dir:
+- ctx->pos = cpos;
++ if (fake_offset && cpos < 2)
++ ctx->pos = 2;
++ else
++ ctx->pos = cpos;
+ fill_failed:
+ brelse(bh);
+ if (unicode)
+ __putname(unicode);
+ out:
+ mutex_unlock(&sbi->s_lock);
++
+ return ret;
+ }
+
--- /dev/null
+From 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 Mon Sep 17 00:00:00 2001
+From: Greg Thelen <gthelen@google.com>
+Date: Fri, 6 Nov 2015 16:32:42 -0800
+Subject: fs, seqfile: always allow oom killer
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 0f930902eb8806cff8dcaef9ff9faf3cfa5fd748 upstream.
+
+Since 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill
+processes") seq_buf_alloc() avoids calling the oom killer for PAGE_SIZE or
+smaller allocations; but larger allocations can use the oom killer via
+vmalloc(). Thus reads of small files can return ENOMEM, but larger files
+use the oom killer to avoid ENOMEM.
+
+The effect of this bug is that reads from /proc and other virtual
+filesystems can return ENOMEM instead of the preferred behavior - oom
+killing something (possibly the calling process). I don't know of anyone
+except Google who has noticed the issue.
+
+I suspect the fix is needed more on smaller systems where there isn't
+any reclaimable memory. But those seem like the kinds of systems that
+probably don't use the oom killer in production anyway.
+
+Memory overcommit requires use of the oom killer to select a victim
+regardless of file size.
+
+Enable oom killer for small seq_buf_alloc() allocations.
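+
+For illustration, a hedged userspace sketch of the symptom: before this
+fix, a read of a small /proc file could fail with ENOMEM under memory
+pressure instead of invoking the oom killer:
+
+  #include <errno.h>
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          char buf[4096];
+          int fd = open("/proc/self/status", O_RDONLY);
+          ssize_t n = read(fd, buf, sizeof(buf));
+
+          if (n < 0 && errno == ENOMEM)  /* pre-fix failure mode */
+                  perror("read");
+          close(fd);
+          return 0;
+  }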
+
+Fixes: 5cec38ac866b ("fs, seq_file: fallback to vmalloc instead of oom kill processes")
+Signed-off-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/seq_file.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/seq_file.c
++++ b/fs/seq_file.c
+@@ -25,12 +25,17 @@ static void seq_set_overflow(struct seq_
+ static void *seq_buf_alloc(unsigned long size)
+ {
+ void *buf;
++ gfp_t gfp = GFP_KERNEL;
+
+ /*
+- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
+- * it's better to fall back to vmalloc() than to kill things.
++ * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
++ * it's better to fall back to vmalloc() than to kill things. For small
++ * allocations, just use GFP_KERNEL which will oom kill, thus no need
++ * for vmalloc fallback.
+ */
+- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
++ if (size > PAGE_SIZE)
++ gfp |= __GFP_NORETRY | __GFP_NOWARN;
++ buf = kmalloc(size, gfp);
+ if (!buf && size > PAGE_SIZE)
+ buf = vmalloc(size);
+ return buf;
--- /dev/null
+From 9d8a765211335cfdad464b90fb19f546af5706ae Mon Sep 17 00:00:00 2001
+From: Richard Weinberger <richard@nod.at>
+Date: Fri, 20 Nov 2015 15:57:21 -0800
+Subject: kernel/signal.c: unexport sigsuspend()
+
+From: Richard Weinberger <richard@nod.at>
+
+commit 9d8a765211335cfdad464b90fb19f546af5706ae upstream.
+
+sigsuspend() is used nowhere except in signal.c itself, so we can mark it
+static and stop polluting the global namespace.
+
+But this patch is more than a boring cleanup patch; it fixes a real issue
+on User Mode Linux. UML has a special console driver to display ttys using
+xterm, or other terminal emulators, on the host side. Vegard reported
+that sometimes UML is unable to spawn an xterm and that he's facing the
+following warning:
+
+ WARNING: CPU: 0 PID: 908 at include/linux/thread_info.h:128 sigsuspend+0xab/0xc0()
+
+It turned out that this warning makes absolutely no sense, as the UML
+xterm code calls sigsuspend() on the host side, or at least it tries to.
+But since the kernel itself offers a sigsuspend() symbol, the linker chose
+that one instead of the glibc wrapper. Interestingly, this code had
+seemingly worked forever, but it always blocked signals on the wrong side.
+A recent kernel change made the WARN_ON() trigger and uncovered the bug.
+
+It is a wonderful example of how much works by chance on computers. :-)
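+
+A hedged sketch of the host-side pattern in the UML console code (names
+simplified): the call below was meant to reach glibc's sigsuspend(), but
+the kernel's own exported sigsuspend() satisfied the link instead, so the
+host signal mask was never changed. Making the kernel function static
+removes the global symbol, and the glibc wrapper wins again:
+
+  #include <signal.h>
+
+  static void wait_for_signal(void)
+  {
+          sigset_t mask;
+
+          sigemptyset(&mask);
+          sigsuspend(&mask);  /* resolved to kernel/signal.c, not libc */
+  }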
+
+Fixes: 68f3f16d9ad0f1 ("new helper: sigsuspend()")
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
+Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
+Acked-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/signal.h | 1 -
+ kernel/signal.c | 2 +-
+ 2 files changed, 1 insertion(+), 2 deletions(-)
+
+--- a/include/linux/signal.h
++++ b/include/linux/signal.h
+@@ -239,7 +239,6 @@ extern int sigprocmask(int, sigset_t *,
+ extern void set_current_blocked(sigset_t *);
+ extern void __set_current_blocked(const sigset_t *);
+ extern int show_unhandled_signals;
+-extern int sigsuspend(sigset_t *);
+
+ struct sigaction {
+ #ifndef __ARCH_HAS_IRIX_SIGACTION
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -3552,7 +3552,7 @@ SYSCALL_DEFINE0(pause)
+
+ #endif
+
+-int sigsuspend(sigset_t *set)
++static int sigsuspend(sigset_t *set)
+ {
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(set);
--- /dev/null
+From 9f029f540c2f7e010e4922d44ba0dfd05da79f88 Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Date: Fri, 6 Nov 2015 16:31:31 -0800
+Subject: lib/hexdump.c: truncate output in case of overflow
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+commit 9f029f540c2f7e010e4922d44ba0dfd05da79f88 upstream.
+
+There is a classic off-by-one error when we try to place, for example,
+1+1 bytes as hex in a buffer of size 6. The expected result is truncated
+output, but in reality we get 6 bytes filled followed by a terminating
+NUL, i.e. 7 bytes written into a 6-byte buffer.
+
+Change the logic of how we fill the output when dumping bytes into
+limited space. This follows the snprintf() behaviour by truncating the
+output even in the middle of a byte's two hex digits.
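+
+A standalone sketch of the fixed check pattern (simplified from
+hex_dump_to_buffer, assuming groupsize == 1 and no ASCII column): room
+for one more character plus the trailing NUL is verified before every
+single character is written, so the NUL always lands inside the buffer:
+
+  #include <stddef.h>
+
+  static const char hexdig[] = "0123456789abcdef";
+
+  static size_t dump(const unsigned char *p, size_t len,
+                     char *out, size_t outlen)
+  {
+          size_t lx = 0, j;
+
+          if (!outlen)
+                  return 0;
+          for (j = 0; j < len; j++) {
+                  if (outlen < lx + 2)  /* char + NUL must still fit */
+                          break;
+                  out[lx++] = hexdig[p[j] >> 4];
+                  if (outlen < lx + 2)
+                          break;        /* truncate mid-byte, like snprintf */
+                  out[lx++] = hexdig[p[j] & 0x0f];
+                  if (outlen < lx + 2)
+                          break;
+                  out[lx++] = ' ';
+          }
+          out[lx] = '\0';  /* lx <= outlen - 1 is guaranteed here */
+          return lx;
+  }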
+
+Fixes: 114fc1afb2de (hexdump: make it return number of bytes placed in buffer)
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Reported-by: Aaro Koskinen <aaro.koskinen@nokia.com>
+Tested-by: Aaro Koskinen <aaro.koskinen@nokia.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ lib/hexdump.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/lib/hexdump.c
++++ b/lib/hexdump.c
+@@ -169,11 +169,15 @@ int hex_dump_to_buffer(const void *buf,
+ }
+ } else {
+ for (j = 0; j < len; j++) {
+- if (linebuflen < lx + 3)
++ if (linebuflen < lx + 2)
+ goto overflow2;
+ ch = ptr[j];
+ linebuf[lx++] = hex_asc_hi(ch);
++ if (linebuflen < lx + 2)
++ goto overflow2;
+ linebuf[lx++] = hex_asc_lo(ch);
++ if (linebuflen < lx + 2)
++ goto overflow2;
+ linebuf[lx++] = ' ';
+ }
+ if (j)
--- /dev/null
+From c12176d3368b9b36ae484d323d41e94be26f9b65 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Thu, 5 Nov 2015 18:50:29 -0800
+Subject: memcg: fix thresholds for 32b architectures.
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit c12176d3368b9b36ae484d323d41e94be26f9b65 upstream.
+
+Commit 424cdc141380 ("memcg: convert threshold to bytes") has fixed a
+regression introduced by 3e32cb2e0a12 ("mm: memcontrol: lockless page
+counters") where thresholds were silently converted to use page units
+rather than bytes when interpreting the user input.
+
+The fix is not complete, though, as rightly pointed out by Ben Hutchings
+during stable backport review. The page count is converted to bytes, but
+an unsigned long is used to hold the value, which is obviously not
+sufficient for 32-bit systems with thresholds above 4G. The same applies
+to the usage taken from mem_cgroup_usage, which might overflow.
+
+Let's remove this bytes vs. pages internal tracking difference and handle
+thresholds in page units internally. Change mem_cgroup_usage() to return
+the value in page units and revert 424cdc141380, because this is
+sufficient for consistent handling. mem_cgroup_read_u64, as the only user
+of mem_cgroup_usage outside of the threshold handling code, is converted
+to give the proper result in bytes. It already does that for the
+page_counter output, so this is more consistent as well.
+
+The value presented to userspace is still in byte units.
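+
+A worked example of the 32-bit overflow this avoids (illustrative
+numbers, 4 KiB pages assumed):
+
+  unsigned long pages = 5UL << (30 - 12);   /* 5 GiB as 4 KiB pages   */
+  unsigned long bytes = pages << 12;        /* on 32-bit this silently */
+                                            /* truncates to 1 GiB      */
+  unsigned long long ok = (unsigned long long)pages * 4096;
+                                            /* what the fix does       */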
+
+Fixes: 424cdc141380 ("memcg: convert threshold to bytes")
+Fixes: 3e32cb2e0a12 ("mm: memcontrol: lockless page counters")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Ben Hutchings <ben@decadent.org.uk>
+Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+From: Michal Hocko <mhocko@kernel.org>
+Subject: memcg: fix thresholds for 32b architectures.
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+From: Andrew Morton <akpm@linux-foundation.org>
+Subject: memcg: fix thresholds for 32b architectures.
+
+don't attempt to inline mem_cgroup_usage()
+
+The compiler ignores the inline anyway. And __always_inline-ing it adds
+600 bytes of goop to the .o file.
+
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+---
+ mm/memcontrol.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2836,9 +2836,9 @@ static unsigned long tree_stat(struct me
+ return val;
+ }
+
+-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
++static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+ {
+- u64 val;
++ unsigned long val;
+
+ if (mem_cgroup_is_root(memcg)) {
+ val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
+@@ -2851,7 +2851,7 @@ static inline u64 mem_cgroup_usage(struc
+ else
+ val = page_counter_read(&memcg->memsw);
+ }
+- return val << PAGE_SHIFT;
++ return val;
+ }
+
+ enum {
+@@ -2885,9 +2885,9 @@ static u64 mem_cgroup_read_u64(struct cg
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ if (counter == &memcg->memory)
+- return mem_cgroup_usage(memcg, false);
++ return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
+ if (counter == &memcg->memsw)
+- return mem_cgroup_usage(memcg, true);
++ return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
+ case RES_LIMIT:
+ return (u64)counter->limit * PAGE_SIZE;
+@@ -3387,7 +3387,6 @@ static int __mem_cgroup_usage_register_e
+ ret = page_counter_memparse(args, "-1", &threshold);
+ if (ret)
+ return ret;
+- threshold <<= PAGE_SHIFT;
+
+ mutex_lock(&memcg->thresholds_lock);
+
--- /dev/null
+From 0d777df5d8953293be090d9ab5a355db893e8357 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Fri, 11 Dec 2015 13:40:49 -0800
+Subject: mm: hugetlb: call huge_pte_alloc() only if ptep is null
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit 0d777df5d8953293be090d9ab5a355db893e8357 upstream.
+
+Currently, at the beginning of hugetlb_fault(), we call huge_pte_offset()
+and check whether the obtained *ptep is a migration/hwpoison entry or
+not. If it is not, we then call huge_pte_alloc(). This is racy because
+*ptep could turn into a migration/hwpoison entry after the
+huge_pte_offset() check. This race results in a BUG_ON in
+huge_pte_alloc().
+
+We don't have to call huge_pte_alloc() when huge_pte_offset() returns
+non-NULL, so let's fix this bug by moving the allocation into the else
+block.
+
+Note that the *ptep could turn into a migration/hwpoison entry after
+this block, but that's not a problem because we have another
+!pte_present check later (we never go into hugetlb_no_page() in that
+case.)
+
+Fixes: 290408d4a250 ("hugetlb: hugepage migration core")
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -3590,12 +3590,12 @@ int hugetlb_fault(struct mm_struct *mm,
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
++ } else {
++ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
++ if (!ptep)
++ return VM_FAULT_OOM;
+ }
+
+- ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+- if (!ptep)
+- return VM_FAULT_OOM;
+-
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
--- /dev/null
+From a88c769548047b21f76fd71e04b6a3300ff17160 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Fri, 11 Dec 2015 13:40:24 -0800
+Subject: mm: hugetlb: fix hugepage memory leak caused by wrong reserve count
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit a88c769548047b21f76fd71e04b6a3300ff17160 upstream.
+
+When dequeue_huge_page_vma() in alloc_huge_page() fails, we fall back on
+alloc_buddy_huge_page() to directly create a hugepage from the buddy
+allocator.
+
+In that case, however, if alloc_buddy_huge_page() succeeds we don't
+decrement h->resv_huge_pages, which means that a successful
+hugetlb_fault() returns without releasing the reserve count. As a
+result, subsequent hugetlb_fault() calls might fail even though there
+are still free hugepages.
+
+This patch simply adds the decrement on that code path.
+
+I reproduced this problem when testing a v4.3 kernel in the following
+situation:
+ - the test machine/VM is a NUMA system,
+ - hugepage overcommitting is enabled,
+ - most hugepages are already allocated and there's only one free
+   hugepage, which is on node 0 (for example),
+ - another program, which calls set_mempolicy(MPOL_BIND) to bind itself
+   to node 1, tries to allocate a hugepage (a sketch of this step
+   follows below),
+ - the allocation should fail, but the reserve count is still held.
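+
+A hedged sketch of that last allocator step (hypothetical reproducer;
+2 MiB hugepages and libnuma headers assumed):
+
+  #include <numaif.h>
+  #include <sys/mman.h>
+
+  int main(void)
+  {
+          unsigned long nodemask = 1UL << 1;  /* bind to node 1 */
+
+          set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask));
+          char *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+          if (p != MAP_FAILED)
+                  p[0] = 1;  /* fault fails, but pre-fix the reserve stays held */
+          return 0;
+  }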
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1790,7 +1790,10 @@ struct page *alloc_huge_page(struct vm_a
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
+ goto out_uncharge_cgroup;
+-
++ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
++ SetPagePrivate(page);
++ h->resv_huge_pages--;
++ }
+ spin_lock(&hugetlb_lock);
+ list_move(&page->lru, &h->hugepage_activelist);
+ /* Fall through */
--- /dev/null
+From dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 11 Dec 2015 13:40:52 -0800
+Subject: mm/hugetlb.c: fix resv map memory leak for placeholder entries
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit dbe409e4f5e5075bd9ff7f8dd5c627abf3ee38c1 upstream.
+
+Dmitry Vyukov reported the following memory leak
+
+unreferenced object 0xffff88002eaafd88 (size 32):
+ comm "a.out", pid 5063, jiffies 4295774645 (age 15.810s)
+ hex dump (first 32 bytes):
+ 28 e9 4e 63 00 88 ff ff 28 e9 4e 63 00 88 ff ff (.Nc....(.Nc....
+ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+ backtrace:
+ kmalloc include/linux/slab.h:458
+ region_chg+0x2d4/0x6b0 mm/hugetlb.c:398
+ __vma_reservation_common+0x2c3/0x390 mm/hugetlb.c:1791
+ vma_needs_reservation mm/hugetlb.c:1813
+ alloc_huge_page+0x19e/0xc70 mm/hugetlb.c:1845
+ hugetlb_no_page mm/hugetlb.c:3543
+ hugetlb_fault+0x7a1/0x1250 mm/hugetlb.c:3717
+ follow_hugetlb_page+0x339/0xc70 mm/hugetlb.c:3880
+ __get_user_pages+0x542/0xf30 mm/gup.c:497
+ populate_vma_page_range+0xde/0x110 mm/gup.c:919
+ __mm_populate+0x1c7/0x310 mm/gup.c:969
+ do_mlock+0x291/0x360 mm/mlock.c:637
+ SYSC_mlock2 mm/mlock.c:658
+ SyS_mlock2+0x4b/0x70 mm/mlock.c:648
+
+Dmitry identified a potential memory leak in the routine region_chg,
+where a region descriptor is not freed on an error path.
+
+However, the root cause of the above memory leak resides in region_del.
+In this specific case, a "placeholder" entry is created in region_chg.
+The associated page allocation fails, and the placeholder entry is left
+in the reserve map. This is "by design", as the entry should be deleted
+when the map is released. The bug is in the region_del routine, which is
+used to delete entries within a specific range (and when the map is
+released). region_del did not handle the case where a placeholder entry
+exactly matched the start of the range to be deleted. In that case, the
+entry would not be deleted and would be leaked. The fix is to take these
+special placeholder entries into account in region_del.
+
+The region_chg error path leak is also fixed.
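+
+A self-contained illustration of the fixed skip test (plain C, not
+kernel code): deleting the range [4, 8) from a map that holds a
+placeholder entry (4, 4):
+
+  #include <stdio.h>
+
+  struct file_region { long from, to; };
+
+  /* patched predicate: skip regions entirely before f, except a
+   * placeholder (from == to) sitting exactly at f */
+  static int skip_region(const struct file_region *rg, long f)
+  {
+          return rg->to <= f && (rg->to != rg->from || rg->to != f);
+  }
+
+  int main(void)
+  {
+          struct file_region before = { 0, 4 }, placeholder = { 4, 4 };
+
+          printf("%d\n", skip_region(&before, 4));       /* 1: skipped */
+          printf("%d\n", skip_region(&placeholder, 4));  /* 0: deleted */
+          return 0;
+  }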
+
+Fixes: feba16e25a57 ("mm/hugetlb: add region_del() to delete a specific range of entries")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -372,8 +372,10 @@ retry_locked:
+ spin_unlock(&resv->lock);
+
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+- if (!trg)
++ if (!trg) {
++ kfree(nrg);
+ return -ENOMEM;
++ }
+
+ spin_lock(&resv->lock);
+ list_add(&trg->link, &resv->region_cache);
+@@ -483,8 +485,16 @@ static long region_del(struct resv_map *
+ retry:
+ spin_lock(&resv->lock);
+ list_for_each_entry_safe(rg, trg, head, link) {
+- if (rg->to <= f)
++ /*
++ * Skip regions before the range to be deleted. file_region
++ * ranges are normally of the form [from, to). However, there
++ * may be a "placeholder" entry in the map which is of the form
++ * (from, to) with from == to. Check for placeholder entries
++ * at the beginning of the range to be deleted.
++ */
++ if (rg->to <= f && (rg->to != rg->from || rg->to != f))
+ continue;
++
+ if (rg->from >= t)
+ break;
+
--- /dev/null
+From 1817889e3b2cc1db8abb595712095129ff9156c1 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Fri, 20 Nov 2015 15:57:13 -0800
+Subject: mm/hugetlbfs: fix bugs in fallocate hole punch of areas with holes
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit 1817889e3b2cc1db8abb595712095129ff9156c1 upstream.
+
+Hugh Dickins pointed out problems with the new hugetlbfs fallocate hole
+punch code. These problems are in the routine remove_inode_hugepages and
+mostly occur in the case where there are holes in the range of pages to be
+removed. These holes could be the result of a previous hole punch or
+simply sparse allocation. The current code could access pages outside the
+specified range.
+
+remove_inode_hugepages handles both hole punch and truncate operations.
+Page index handling was fixed/cleaned up so that the loop index always
+matches the page being processed. The code now makes only a single pass
+through the range of pages, since it was determined that page faults
+cannot race with truncate. A cond_resched() was added after removing up
+to PAGEVEC_SIZE pages.
+
+Some totally unnecessary code in hugetlbfs_fallocate() that remained from
+early development was also removed.
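+
+For context, the code path under test is reached via a fallocate() hole
+punch on a hugetlbfs file, roughly like this (hedged userspace sketch):
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <sys/types.h>
+
+  static int punch_hole(int fd, off_t offset, off_t len)
+  {
+          return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                           offset, len);
+  }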
+
+Tested with fallocate tests submitted here:
+http://librelist.com/browser//libhugetlbfs/2015/6/25/patch-tests-add-tests-for-fallocate-system-call/
+Also tested with some ftruncate tests under development.
+
+Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: "Hillf Danton" <hillf.zj@alibaba-inc.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/hugetlbfs/inode.c | 65 +++++++++++++++++++++++++--------------------------
+ 1 file changed, 32 insertions(+), 33 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -332,12 +332,17 @@ static void remove_huge_page(struct page
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+- * maps and global counts.
++ * maps and global counts. Page faults can not race with truncation
++ * in this routine. hugetlb_no_page() prevents page faults in the
++ * truncated range. It checks i_size before allocation, and again after
++ * with the page table lock for the page held. The same lock must be
++ * acquired to unmap a page.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+- * pages are not modified.
++ * pages are not modified. Page faults can race with hole punch.
++ * This is indicated if we find a mapped page.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+@@ -361,46 +366,37 @@ static void remove_inode_hugepages(struc
+ next = start;
+ while (next < end) {
+ /*
+- * Make sure to never grab more pages that we
+- * might possibly need.
++ * Don't grab more pages than the number left in the range.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+- * This pagevec_lookup() may return pages past 'end',
+- * so we must check for page->index > end.
++ * When no more pages are found, we are done.
+ */
+- if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
+- if (next == start)
+- break;
+- next = start;
+- continue;
+- }
++ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
++ break;
+
+ for (i = 0; i < pagevec_count(&pvec); ++i) {
+ struct page *page = pvec.pages[i];
+ u32 hash;
+
++ /*
++ * The page (index) could be beyond end. This is
++ * only possible in the punch hole case as end is
++ * max page offset in the truncate case.
++ */
++ next = page->index;
++ if (next >= end)
++ break;
++
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ lock_page(page);
+- if (page->index >= end) {
+- unlock_page(page);
+- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+- next = end; /* we are done */
+- break;
+- }
+-
+- /*
+- * If page is mapped, it was faulted in after being
+- * unmapped. Do nothing in this race case. In the
+- * normal case page is not mapped.
+- */
+- if (!page_mapped(page)) {
++ if (likely(!page_mapped(page))) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+@@ -421,17 +417,23 @@ static void remove_inode_hugepages(struc
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
++ } else {
++ /*
++ * If page is mapped, it was faulted in after
++ * being unmapped. It indicates a race between
++ * hole punch and page fault. Do nothing in
++ * this case. Getting here in a truncate
++ * operation is a bug.
++ */
++ BUG_ON(truncate_op);
+ }
+
+- if (page->index > next)
+- next = page->index;
+-
+- ++next;
+ unlock_page(page);
+-
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ }
++ ++next;
+ huge_pagevec_release(&pvec);
++ cond_resched();
+ }
+
+ if (truncate_op)
+@@ -647,9 +649,6 @@ static long hugetlbfs_fallocate(struct f
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+- spin_lock(&inode->i_lock);
+- inode->i_private = NULL;
+- spin_unlock(&inode->i_lock);
+ out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
--- /dev/null
+From 426fb5e72d92b868912e47a1e3ca2df6eabc3872 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
+Date: Thu, 5 Nov 2015 18:47:44 -0800
+Subject: mm/oom_kill.c: reverse the order of setting TIF_MEMDIE and sending SIGKILL
+
+From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
+
+commit 426fb5e72d92b868912e47a1e3ca2df6eabc3872 upstream.
+
+It was confirmed that a local unprivileged user can consume all memory
+reserves and hang up the system by exploiting the time lag between when
+the OOM killer sets TIF_MEMDIE on an OOM victim and when it sends SIGKILL
+to that victim, because the printk() inside the for_each_process() loop
+in oom_kill_process() can take many seconds when there are many thread
+groups sharing the same memory.
+
+Before starting oom-depleter process:
+
+ Node 0 DMA: 3*4kB (UM) 6*8kB (U) 4*16kB (UEM) 0*32kB 0*64kB 1*128kB (M) 2*256kB (EM) 2*512kB (UE) 2*1024kB (EM) 1*2048kB (E) 1*4096kB (M) = 9980kB
+ Node 0 DMA32: 31*4kB (UEM) 27*8kB (UE) 32*16kB (UE) 13*32kB (UE) 14*64kB (UM) 7*128kB (UM) 8*256kB (UM) 8*512kB (UM) 3*1024kB (U) 4*2048kB (UM) 362*4096kB (UM) = 1503220kB
+
+As of invoking the OOM killer:
+
+ Node 0 DMA: 11*4kB (UE) 8*8kB (UEM) 6*16kB (UE) 2*32kB (EM) 0*64kB 1*128kB (U) 3*256kB (UEM) 2*512kB (UE) 3*1024kB (UEM) 1*2048kB (U) 0*4096kB = 7308kB
+ Node 0 DMA32: 1049*4kB (UEM) 507*8kB (UE) 151*16kB (UE) 53*32kB (UEM) 83*64kB (UEM) 52*128kB (EM) 25*256kB (UEM) 11*512kB (M) 6*1024kB (UM) 1*2048kB (M) 0*4096kB = 44556kB
+
+Between the thread group leader got TIF_MEMDIE and receives SIGKILL:
+
+ Node 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+ Node 0 DMA32: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 0kB
+
+The oom-depleter's thread group leader, which got TIF_MEMDIE, started a
+memset() in user space after the OOM killer set TIF_MEMDIE, and it was
+free to abuse the ALLOC_NO_WATERMARKS granted by TIF_MEMDIE for that
+memset() until SIGKILL was delivered. If SIGKILL is delivered before
+TIF_MEMDIE is set, the oom-depleter can terminate without touching the
+memory reserves.
+
+Although the possibility of hitting this time lag is very small for 3.19
+and earlier kernels, because TIF_MEMDIE is set immediately before sending
+SIGKILL, preemption or long interrupts (an extreme example is SysRq-t)
+can step in between and allow memory allocations which are not needed for
+terminating the OOM victim.
+
+Fixes: 83363b917a29 ("oom: make sure that TIF_MEMDIE is set under task_lock")
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/oom_kill.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -554,6 +554,12 @@ void oom_kill_process(struct oom_control
+
+ /* mm cannot safely be dereferenced after task_unlock(victim) */
+ mm = victim->mm;
++ /*
++ * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
++ * the OOM victim from depleting the memory reserves from the user
++ * space under its control.
++ */
++ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ mark_oom_victim(victim);
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+@@ -585,7 +591,6 @@ void oom_kill_process(struct oom_control
+ }
+ rcu_read_unlock();
+
+- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ put_task_struct(victim);
+ }
+ #undef K
--- /dev/null
+From d4322d88f5fdf92729dd40f923013414fbb2184d Mon Sep 17 00:00:00 2001
+From: Catalin Marinas <catalin.marinas@arm.com>
+Date: Thu, 5 Nov 2015 18:45:54 -0800
+Subject: mm: slab: only move management objects off-slab for sizes larger than KMALLOC_MIN_SIZE
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit d4322d88f5fdf92729dd40f923013414fbb2184d upstream.
+
+On systems with a KMALLOC_MIN_SIZE of 128 (arm64, some mips and powerpc
+configurations defining ARCH_DMA_MINALIGN to 128), the first
+kmalloc_caches[] entry to be initialised after slab_early_init = 0 is
+"kmalloc-128" with index 7. Depending on the debug kernel configuration,
+sizeof(struct kmem_cache) can be larger than 128 resulting in an
+INDEX_NODE of 8.
+
+Commit 8fc9cf420b36 ("slab: make more slab management structure off the
+slab") enables off-slab management objects for sizes starting with
+PAGE_SIZE >> 5 (128 bytes for a 4KB page configuration) and the creation
+of the "kmalloc-128" cache would try to place the management objects
+off-slab. However, since KMALLOC_MIN_SIZE is already 128 and
+freelist_size == 32 in __kmem_cache_create(), kmalloc_slab(freelist_size)
+returns NULL (kmalloc_caches[7] not populated yet). This triggers the
+following bug on arm64:
+
+ kernel BUG at /work/Linux/linux-2.6-aarch64/mm/slab.c:2283!
+ Internal error: Oops - BUG: 0 [#1] SMP
+ Modules linked in:
+ CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc4+ #540
+ Hardware name: Juno (DT)
+ PC is at __kmem_cache_create+0x21c/0x280
+ LR is at __kmem_cache_create+0x210/0x280
+ [...]
+ Call trace:
+ __kmem_cache_create+0x21c/0x280
+ create_boot_cache+0x48/0x80
+ create_kmalloc_cache+0x50/0x88
+ create_kmalloc_caches+0x4c/0xf4
+ kmem_cache_init+0x100/0x118
+ start_kernel+0x214/0x33c
+
+This patch introduces an OFF_SLAB_MIN_SIZE definition to avoid off-slab
+management objects for sizes equal to or smaller than KMALLOC_MIN_SIZE.
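+
+The arithmetic, spelled out for the arm64 case above (plain C check with
+illustrative values):
+
+  #include <assert.h>
+
+  int main(void)
+  {
+          unsigned long page_size = 4096, kmalloc_min_size = 128;
+          unsigned long old_min = page_size >> 5;               /* 128 */
+          unsigned long new_min = old_min > kmalloc_min_size + 1 ?
+                                  old_min : kmalloc_min_size + 1; /* 129 */
+
+          assert(128 >= old_min);  /* "kmalloc-128" went off-slab before */
+          assert(128 <  new_min);  /* ...and stays on-slab with the fix  */
+          return 0;
+  }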
+
+Fixes: 8fc9cf420b36 ("slab: make more slab management structure off the slab")
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Acked-by: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct
+
+ #define CFLGS_OFF_SLAB (0x80000000UL)
+ #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
++#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
+
+ #define BATCHREFILL_LIMIT 16
+ /*
+@@ -2212,7 +2213,7 @@ __kmem_cache_create (struct kmem_cache *
+ * it too early on. Always use on-slab management when
+ * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
+ */
+- if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
++ if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
+ !(flags & SLAB_NOLEAKTRACE))
+ /*
+ * Size is large, assume best to place the slab management obj
+@@ -2276,7 +2277,7 @@ __kmem_cache_create (struct kmem_cache *
+ /*
+ * This is a possibility for one of the kmalloc_{dma,}_caches.
+ * But since we go off slab only for object size greater than
+- * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
++ * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
+ * in ascending order,this should not happen at all.
+ * But leave a BUG_ON for some lucky dude.
+ */
--- /dev/null
+From 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 11 Dec 2015 13:40:32 -0800
+Subject: mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 upstream.
+
+Tetsuo Handa has reported that the system might basically livelock in an
+OOM condition without triggering the OOM killer.
+
+The issue is caused by an internal dependency of direct reclaim on
+vmstat counter updates (via zone_reclaimable), which are performed from
+the workqueue context. If all the current workers get assigned to an
+allocation request, though, they will loop inside the allocator trying
+to reclaim memory, but zone_reclaimable can see stale numbers, so it
+will consider a zone reclaimable even though it has been scanned far
+too much. The WQ concurrency logic will not consider this situation a
+congested workqueue, because it relies on the worker having to sleep in
+such a situation. This also means that it doesn't try to spawn new
+workers or invoke the rescuer thread if one is assigned to the queue.
+
+In order to fix this issue we need to do two things. First, we have to
+let the WQ concurrency code know that we are in trouble, so we have to
+do a short sleep. To avoid reintroducing the issues handled by
+0e093d99763e ("writeback: do not sleep on the congestion queue if there
+are no congested BDIs or if significant congestion is not being
+encountered in the current zone"), we limit the sleep to worker threads,
+which are the ones of interest anyway.
+
+The second thing to do is to create a dedicated workqueue for vmstat,
+marked WQ_MEM_RECLAIM to note that it participates in reclaim and to
+guarantee a spare worker thread for it.
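+
+The second part follows the usual kernel pattern for workqueues whose
+work items may run as part of memory reclaim (kernel C sketch of the
+pattern, mirroring the hunk below):
+
+  static struct workqueue_struct *vmstat_wq;
+
+  /* WQ_MEM_RECLAIM guarantees a rescuer thread even when no new
+   * worker can be created under memory pressure */
+  vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);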
+
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Cristopher Lameter <clameter@sgi.com>
+Cc: Joonsoo Kim <js1304@gmail.com>
+Cc: Arkadiusz Miskiewicz <arekm@maven.pl>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/backing-dev.c | 19 ++++++++++++++++---
+ mm/vmstat.c | 6 ++++--
+ 2 files changed, 20 insertions(+), 5 deletions(-)
+
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+- * In the absence of zone congestion, cond_resched() is called to yield
+- * the processor if necessary but otherwise does not sleep.
++ * In the absence of zone congestion, a short sleep or a cond_resched is
++ * performed to yield the processor and to allow other subsystems to make
++ * a forward progress.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zon
+ */
+ if (atomic_read(&nr_wb_congested[sync]) == 0 ||
+ !test_bit(ZONE_CONGESTED, &zone->flags)) {
+- cond_resched();
++
++ /*
++ * Memory allocation/reclaim might be called from a WQ
++ * context and the current implementation of the WQ
++ * concurrency control doesn't recognize that a particular
++ * WQ is congested if the worker thread is looping without
++ * ever sleeping. Therefore we have to do a short sleep
++ * here rather than calling cond_resched().
++ */
++ if (current->flags & PF_WQ_WORKER)
++ schedule_timeout(1);
++ else
++ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1357,6 +1357,7 @@ static const struct file_operations proc
+ #endif /* CONFIG_PROC_FS */
+
+ #ifdef CONFIG_SMP
++static struct workqueue_struct *vmstat_wq;
+ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+ int sysctl_stat_interval __read_mostly = HZ;
+ static cpumask_var_t cpu_stat_off;
+@@ -1369,7 +1370,7 @@ static void vmstat_update(struct work_st
+ * to occur in the future. Keep on running the
+ * update worker thread.
+ */
+- schedule_delayed_work_on(smp_processor_id(),
++ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ } else {
+@@ -1438,7 +1439,7 @@ static void vmstat_shepherd(struct work_
+ if (need_update(cpu) &&
+ cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+
+- schedule_delayed_work_on(cpu,
++ queue_delayed_work_on(cpu, vmstat_wq,
+ &per_cpu(vmstat_work, cpu), 0);
+
+ put_online_cpus();
+@@ -1527,6 +1528,7 @@ static int __init setup_vmstat(void)
+
+ start_shepherd_timer();
+ cpu_notifier_register_done();
++ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ #endif
+ #ifdef CONFIG_PROC_FS
+ proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
--- /dev/null
+From c95a51807b730e4681e2ecbdfd669ca52601959e Mon Sep 17 00:00:00 2001
+From: xuejiufei <xuejiufei@huawei.com>
+Date: Fri, 5 Feb 2016 15:36:47 -0800
+Subject: ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
+
+From: xuejiufei <xuejiufei@huawei.com>
+
+commit c95a51807b730e4681e2ecbdfd669ca52601959e upstream.
+
+When the recovery master goes down, dlm_do_local_recovery_cleanup() only
+removes the $RECOVERY lock owned by the dead node, but does not clear the
+refmap bit. This makes the umount thread fall into a dead loop, forever
+migrating $RECOVERY to the dead node.
+
+Signed-off-by: xuejiufei <xuejiufei@huawei.com>
+Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/dlm/dlmrecovery.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ocfs2/dlm/dlmrecovery.c
++++ b/fs/ocfs2/dlm/dlmrecovery.c
+@@ -2360,6 +2360,8 @@ static void dlm_do_local_recovery_cleanu
+ break;
+ }
+ }
++ dlm_lockres_clear_refmap_bit(dlm, res,
++ dead_node);
+ spin_unlock(&res->spinlock);
+ continue;
+ }
--- /dev/null
+From bef5502de074b6f6fa647b94b73155d675694420 Mon Sep 17 00:00:00 2001
+From: xuejiufei <xuejiufei@huawei.com>
+Date: Thu, 14 Jan 2016 15:17:38 -0800
+Subject: ocfs2/dlm: ignore cleaning the migration mle that is inuse
+
+From: xuejiufei <xuejiufei@huawei.com>
+
+commit bef5502de074b6f6fa647b94b73155d675694420 upstream.
+
+We have found that the migration source triggers a BUG because the
+refcount of the mle is already zero before the put, when the target goes
+down during migration. The situation is as follows:
+
+dlm_migrate_lockres
+ dlm_add_migration_mle
+ dlm_mark_lockres_migrating
+ dlm_get_mle_inuse
+ <<<<<< Now the refcount of the mle is 2.
+ dlm_send_one_lockres and wait for the target to become the
+ new master.
+ <<<<<< o2hb detect the target down and clean the migration
+ mle. Now the refcount is 1.
+
+dlm_migrate_lockres is then woken, and puts the mle twice when it finds
+that the target has gone down, which triggers the BUG with the following
+message:
+
+ "ERROR: bad mle: ".
+
+Signed-off-by: Jiufei Xue <xuejiufei@huawei.com>
+Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/dlm/dlmmaster.c | 26 +++++++++++++++-----------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+--- a/fs/ocfs2/dlm/dlmmaster.c
++++ b/fs/ocfs2/dlm/dlmmaster.c
+@@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dl
+ spin_lock(&dlm->master_lock);
+ ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
+ namelen, target, dlm->node_num);
++ /* get an extra reference on the mle.
++ * otherwise the assert_master from the new
++ * master will destroy this.
++ */
++ dlm_get_mle_inuse(mle);
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+
+@@ -2554,6 +2559,7 @@ fail:
+ if (mle_added) {
+ dlm_mle_detach_hb_events(dlm, mle);
+ dlm_put_mle(mle);
++ dlm_put_mle_inuse(mle);
+ } else if (mle) {
+ kmem_cache_free(dlm_mle_cache, mle);
+ mle = NULL;
+@@ -2571,17 +2577,6 @@ fail:
+ * ensure that all assert_master work is flushed. */
+ flush_workqueue(dlm->dlm_worker);
+
+- /* get an extra reference on the mle.
+- * otherwise the assert_master from the new
+- * master will destroy this.
+- * also, make sure that all callers of dlm_get_mle
+- * take both dlm->spinlock and dlm->master_lock */
+- spin_lock(&dlm->spinlock);
+- spin_lock(&dlm->master_lock);
+- dlm_get_mle_inuse(mle);
+- spin_unlock(&dlm->master_lock);
+- spin_unlock(&dlm->spinlock);
+-
+ /* notify new node and send all lock state */
+ /* call send_one_lockres with migration flag.
+ * this serves as notice to the target node that a
+@@ -3310,6 +3305,15 @@ top:
+ mle->new_master != dead_node)
+ continue;
+
++ if (mle->new_master == dead_node && mle->inuse) {
++ mlog(ML_NOTICE, "%s: target %u died during "
++ "migration from %u, the MLE is "
++ "still keep used, ignore it!\n",
++ dlm->name, dead_node,
++ mle->master);
++ continue;
++ }
++
+ /* If we have reached this point, this mle needs to be
+ * removed from the list and freed. */
+ dlm_clean_migration_mle(dlm, mle);
--- /dev/null
+From 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 Mon Sep 17 00:00:00 2001
+From: Joseph Qi <joseph.qi@huawei.com>
+Date: Tue, 29 Dec 2015 14:54:06 -0800
+Subject: ocfs2: fix BUG when calculate new backup super
+
+From: Joseph Qi <joseph.qi@huawei.com>
+
+commit 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 upstream.
+
+When resizing, ocfs2 first extends the last group descriptor (gd). Once
+a backup super should live in that gd, it calculates the new backup
+super location and updates the corresponding values.
+
+But it currently doesn't consider the situation where the backup super
+has already been accounted for. In this case, it still sets the bit in
+the gd bitmap and then decrements bg_free_bits_count, which leads to a
+corrupted gd and triggers the BUG in ocfs2_block_group_set_bits:
+
+  BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+So check whether the backup super is already done before doing the
+updates.
+
+Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
+Reviewed-by: Jiufei Xue <xuejiufei@huawei.com>
+Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/resize.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/fs/ocfs2/resize.c
++++ b/fs/ocfs2/resize.c
+@@ -54,11 +54,12 @@
+ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
+ struct ocfs2_group_desc *gd,
+ u16 cl_cpg,
++ u16 old_bg_clusters,
+ int set)
+ {
+ int i;
+ u16 backups = 0;
+- u32 cluster;
++ u32 cluster, lgd_cluster;
+ u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
+
+ for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(s
+ else if (gd_blkno > lgd_blkno)
+ break;
+
++ /* check if already done backup super */
++ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
++ lgd_cluster += old_bg_clusters;
++ if (lgd_cluster >= cluster)
++ continue;
++
+ if (set)
+ ocfs2_set_bit(cluster % cl_cpg,
+ (unsigned long *)gd->bg_bitmap);
+@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_i
+ u16 chain, num_bits, backups = 0;
+ u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
+ u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
++ u16 old_bg_clusters;
+
+ trace_ocfs2_update_last_group_and_inode(new_clusters,
+ first_new_cluster);
+@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_i
+
+ group = (struct ocfs2_group_desc *)group_bh->b_data;
+
++ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
+ /* update the group first. */
+ num_bits = new_clusters * cl_bpc;
+ le16_add_cpu(&group->bg_bits, num_bits);
+@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_i
+ OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
+ backups = ocfs2_calc_new_backup_super(bm_inode,
+ group,
+- cl_cpg, 1);
++ cl_cpg, old_bg_clusters, 1);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
+ }
+
+@@ -163,7 +172,7 @@ out_rollback:
+ if (ret < 0) {
+ ocfs2_calc_new_backup_super(bm_inode,
+ group,
+- cl_cpg, 0);
++ cl_cpg, old_bg_clusters, 0);
+ le16_add_cpu(&group->bg_free_bits_count, backups);
+ le16_add_cpu(&group->bg_bits, -1 * num_bits);
+ le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
--- /dev/null
+From 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Fri, 11 Dec 2015 13:41:03 -0800
+Subject: ocfs2: fix SGID not inherited issue
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 854ee2e944b4daf795e32562a7d2f9e90ab5a6a8 upstream.
+
+Commit 8f1eb48758aa ("ocfs2: fix umask ignored issue") introduced an
+issue: the SGID bit of a sub directory was not inherited from its parent
+directory. This is because SGID is set in "inode->i_mode" in
+ocfs2_get_init_inode(), but is later overwritten by "mode", which does
+not have SGID set.
+
+Fixes: 8f1eb48758aa ("ocfs2: fix umask ignored issue")
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Acked-by: Srinivas Eeda <srinivas.eeda@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/namei.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -369,13 +369,11 @@ static int ocfs2_mknod(struct inode *dir
+ goto leave;
+ }
+
+- status = posix_acl_create(dir, &mode, &default_acl, &acl);
++ status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+- /* update inode->i_mode after mask with "umask". */
+- inode->i_mode = mode;
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+ S_ISDIR(mode),
--- /dev/null
+From 54708d2858e79a2bdda10bf8a20c80eb96c20613 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Fri, 6 Nov 2015 16:30:06 -0800
+Subject: proc: actually make proc_fd_permission() thread-friendly
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 54708d2858e79a2bdda10bf8a20c80eb96c20613 upstream.
+
+Commit 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+fixed the access to /proc/self/fd from sub-threads, but introduced another
+problem: a sub-thread can't access /proc/<tid>/fd/ or /proc/thread-self/fd
+if generic_permission() fails.
+
+Change proc_fd_permission() to check same_thread_group(pid_task(), current).
+
+Fixes: 96d0df79f264 ("proc: make proc_fd_permission() thread-friendly")
+Reported-by: "Jin, Yihua" <yihua.jin@intel.com>
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Cc: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/fd.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/proc/fd.c
++++ b/fs/proc/fd.c
+@@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(stru
+ */
+ int proc_fd_permission(struct inode *inode, int mask)
+ {
+- int rv = generic_permission(inode, mask);
++ struct task_struct *p;
++ int rv;
++
++ rv = generic_permission(inode, mask);
+ if (rv == 0)
+- return 0;
+- if (task_tgid(current) == proc_pid(inode))
++ return rv;
++
++ rcu_read_lock();
++ p = pid_task(proc_pid(inode), PIDTYPE_PID);
++ if (p && same_thread_group(p, current))
+ rv = 0;
++ rcu_read_unlock();
++
+ return rv;
+ }
+
--- /dev/null
+From 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b Mon Sep 17 00:00:00 2001
+From: Colin Ian King <colin.king@canonical.com>
+Date: Fri, 18 Dec 2015 14:22:01 -0800
+Subject: proc: fix -ESRCH error when writing to /proc/$pid/coredump_filter
+
+From: Colin Ian King <colin.king@canonical.com>
+
+commit 41a0c249cb8706a2efa1ab3d59466b23a27d0c8b upstream.
+
+Writing to /proc/$pid/coredump_filter always returns -ESRCH because commit
+774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()") removed
+the setting of ret after the get_proc_task call and incorrectly left it as
+-ESRCH. Instead, return 0 when successful.
+
+Example breakage:
+
+ echo 0 > /proc/self/coredump_filter
+ bash: echo: write error: No such process
+
+Fixes: 774636e19ed51 ("proc: convert to kstrto*()/kstrto*_from_user()")
+Signed-off-by: Colin Ian King <colin.king@canonical.com>
+Acked-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/proc/base.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -2484,6 +2484,7 @@ static ssize_t proc_coredump_filter_writ
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_no_mm;
++ ret = 0;
+
+ for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
+ if (val & mask)
--- /dev/null
+From 92792e48e2ae6051af30468a87994b5432da2f06 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Fri, 20 Nov 2015 18:26:07 +0100
+Subject: remoteproc: avoid stack overflow in debugfs file
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 92792e48e2ae6051af30468a87994b5432da2f06 upstream.
+
+Recent gcc versions warn about reading from a negative offset of
+an on-stack array:
+
+drivers/remoteproc/remoteproc_debugfs.c: In function 'rproc_recovery_write':
+drivers/remoteproc/remoteproc_debugfs.c:167:9: warning: 'buf[4294967295u]' may be used uninitialized in this function [-Wmaybe-uninitialized]
+
+I don't see anything in sys_write() that prevents us from
+being called with a zero 'count' argument, so we should
+add an extra check in rproc_recovery_write() to prevent the
+access and avoid the warning.
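+
+For illustration, the zero-length write that reaches the handler can be
+produced from userspace roughly like this (hedged sketch; the debugfs
+mount point and device name may vary):
+
+  #include <fcntl.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          int fd = open("/sys/kernel/debug/remoteproc/remoteproc0/recovery",
+                        O_WRONLY);
+
+          if (fd >= 0)
+                  write(fd, "", 0);  /* count == 0 reaches ->write() */
+          return 0;
+  }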
+
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Fixes: 2e37abb89a2e ("remoteproc: create a 'recovery' debugfs entry")
+Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/remoteproc/remoteproc_debugfs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/remoteproc/remoteproc_debugfs.c
++++ b/drivers/remoteproc/remoteproc_debugfs.c
+@@ -156,7 +156,7 @@ rproc_recovery_write(struct file *filp,
+ char buf[10];
+ int ret;
+
+- if (count > sizeof(buf))
++ if (count < 1 || count > sizeof(buf))
+ return count;
+
+ ret = copy_from_user(buf, user_buf, count);
ocfs2-nfs-hangs-in-__ocfs2_cluster_lock-due-to-race-with-ocfs2_unblock_lock.patch
hid-usbhid-fix-recursive-deadlock.patch
alsa-hda-implement-loopback-control-switch-for-realtek-and-other-codecs.patch
+proc-actually-make-proc_fd_permission-thread-friendly.patch
+remoteproc-avoid-stack-overflow-in-debugfs-file.patch
+proc-fix-esrch-error-when-writing-to-proc-pid-coredump_filter.patch
+mm-slab-only-move-management-objects-off-slab-for-sizes-larger-than-kmalloc_min_size.patch
+mm-oom_kill.c-reverse-the-order-of-setting-tif_memdie-and-sending-sigkill.patch
+lib-hexdump.c-truncate-output-in-case-of-overflow.patch
+fs-seqfile-always-allow-oom-killer.patch
+memcg-fix-thresholds-for-32b-architectures.patch
+mm-hugetlb-fix-hugepage-memory-leak-caused-by-wrong-reserve-count.patch
+mm-vmstat-allow-wq-concurrency-to-discover-memory-reclaim-doesn-t-make-any-progress.patch
+mm-hugetlbfs-fix-bugs-in-fallocate-hole-punch-of-areas-with-holes.patch
+fat-fix-fake_offset-handling-on-error-path.patch
+mm-hugetlb-call-huge_pte_alloc-only-if-ptep-is-null.patch
+kernel-signal.c-unexport-sigsuspend.patch
+mm-hugetlb.c-fix-resv-map-memory-leak-for-placeholder-entries.patch
+ocfs2-fix-sgid-not-inherited-issue.patch
+ocfs2-fix-bug-when-calculate-new-backup-super.patch
+ocfs2-dlm-ignore-cleaning-the-migration-mle-that-is-inuse.patch
+ocfs2-dlm-clear-refmap-bit-of-recovery-lock-while-doing-local-recovery-cleanup.patch
+sh64-fix-__nr_fgetxattr.patch
--- /dev/null
+From 2d33fa1059da4c8e816627a688d950b613ec0474 Mon Sep 17 00:00:00 2001
+From: "Dmitry V. Levin" <ldv@altlinux.org>
+Date: Fri, 11 Dec 2015 13:41:06 -0800
+Subject: sh64: fix __NR_fgetxattr
+
+From: Dmitry V. Levin <ldv@altlinux.org>
+
+commit 2d33fa1059da4c8e816627a688d950b613ec0474 upstream.
+
+According to arch/sh/kernel/syscalls_64.S and common sense, __NR_fgetxattr
+has to be defined as 259, but it isn't. Instead, it's defined as 269,
+which is of course already used by another syscall, __NR_sched_setaffinity
+in this case.
+
+This bug was found by the strace test suite.
+
+Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
+Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/include/uapi/asm/unistd_64.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/sh/include/uapi/asm/unistd_64.h
++++ b/arch/sh/include/uapi/asm/unistd_64.h
+@@ -278,7 +278,7 @@
+ #define __NR_fsetxattr 256
+ #define __NR_getxattr 257
+ #define __NR_lgetxattr 258
+-#define __NR_fgetxattr 269
++#define __NR_fgetxattr 259
+ #define __NR_listxattr 260
+ #define __NR_llistxattr 261
+ #define __NR_flistxattr 262