--- /dev/null
+From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Thu, 23 Jan 2014 15:53:14 -0800
+Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit 54b9dd14d09f24927285359a227aa363ce46089e upstream.
+
+After the thp split in hwpoison_user_mappings(), we hold the page lock
+on the raw error page only around try_to_unmap(), hence we are in
+danger of a race condition.
+
+I found in the RHEL7 MCE-relay testing that we have "bad page" error
+when a memory error happens on a thp tail page used by qemu-kvm:
+
+ Triggering MCE exception on CPU 10
+ mce: [Hardware Error]: Machine check events logged
+ MCE exception done on CPU 10
+ MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption
+ MCE 0x38c535: dirty LRU page recovery: Recovered
+ qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000]
+ BUG: Bad page state in process qemu-kvm pfn:38c400
+ page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00
+ page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked)
+ Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ...
+ CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1
+ Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011
+ Call Trace:
+ dump_stack+0x19/0x1b
+ bad_page.part.59+0xcf/0xe8
+ free_pages_prepare+0x148/0x160
+ free_hot_cold_page+0x31/0x140
+ free_hot_cold_page_list+0x46/0xa0
+ release_pages+0x1c1/0x200
+ free_pages_and_swap_cache+0xad/0xd0
+ tlb_flush_mmu.part.46+0x4c/0x90
+ tlb_finish_mmu+0x55/0x60
+ exit_mmap+0xcb/0x170
+ mmput+0x67/0xf0
+ vhost_dev_cleanup+0x231/0x260 [vhost_net]
+ vhost_net_release+0x3f/0x90 [vhost_net]
+ __fput+0xe9/0x270
+ ____fput+0xe/0x10
+ task_work_run+0xc4/0xe0
+ do_exit+0x2bb/0xa40
+ do_group_exit+0x3f/0xa0
+ get_signal_to_deliver+0x1d0/0x6e0
+ do_signal+0x48/0x5e0
+ do_notify_resume+0x71/0xc0
+ retint_signal+0x48/0x8c
+
+The reason for this bug is that a page fault happens before the head
+page is unlocked at the end of memory_failure(). This strange page
+fault tries to access address 0x20 and I'm not sure why qemu-kvm does
+this, but as a result the SIGSEGV makes qemu-kvm exit and on the way we
+catch the bad page bug/warning because we try to free a page that is
+still locked (the former head page).
+
+To fix this, this patch shifts the page lock from the head page to the
+tail page just after the thp split. SIGSEGV still happens, but it
+affects only the VM that hit the error, not the whole system.
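+
+An illustrative sketch of the locking flow after this change (not
+literal kernel code; hpagep is the out-parameter added by the hunks
+below):
+
+	lock_page(hpage);	/* memory_failure() locks the head page */
+	hwpoison_user_mappings(p, pfn, trapno, flags, &hpage);
+		/* inside, after split_huge_page(): */
+		lock_page(p);		/* lock the raw error page ...       */
+		unlock_page(hpage);	/* ... and drop the former head page */
+		*hpagep = p;		/* caller's hpage now tracks it      */
+	...
+	unlock_page(hpage);	/* the final unlock in memory_failure()
+				 * now releases the page actually held */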
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c | 21 +++++++++++----------
+ 1 file changed, 11 insertions(+), 10 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -854,14 +854,14 @@ static int page_action(struct page_state
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ */
+ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+- int trapno, int flags)
++ int trapno, int flags, struct page **hpagep)
+ {
+ enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ struct address_space *mapping;
+ LIST_HEAD(tokill);
+ int ret;
+ int kill = 1, forcekill;
+- struct page *hpage = compound_head(p);
++ struct page *hpage = *hpagep;
+ struct page *ppage;
+
+ if (PageReserved(p) || PageSlab(p))
+@@ -940,11 +940,14 @@ static int hwpoison_user_mappings(struct
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+- * to it.
++ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ put_page(hpage);
+ get_page(p);
++ lock_page(p);
++ unlock_page(hpage);
++ *hpagep = p;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+@@ -962,17 +965,11 @@ static int hwpoison_user_mappings(struct
+ if (kill)
+ collect_procs(ppage, &tokill);
+
+- if (hpage != ppage)
+- lock_page(ppage);
+-
+ ret = try_to_unmap(ppage, ttu);
+ if (ret != SWAP_SUCCESS)
+ printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(ppage));
+
+- if (hpage != ppage)
+- unlock_page(ppage);
+-
+ /*
+ * Now that the dirty bit has been propagated to the
+ * struct page and all unmaps done we can decide if
+@@ -1189,8 +1186,12 @@ int memory_failure(unsigned long pfn, in
+ /*
+ * Now take care of user space mappings.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
++ *
++ * When the raw error page is thp tail page, hpage points to the raw
++ * page after thp split.
+ */
+- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
++ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
++ != SWAP_SUCCESS) {
+ printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+ res = -EBUSY;
+ goto out;
--- /dev/null
+From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 29 Jan 2014 14:05:41 -0800
+Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 upstream.
+
+The VM is currently heavily tuned to avoid swapping. Whether that is
+good or bad is a separate discussion, but as long as the VM won't swap
+to make room for dirty cache, we cannot consider anonymous pages when
+calculating the amount of dirtyable memory, the baseline to which
+dirty_background_ratio and dirty_ratio are applied.
+
+A simple workload that fills a significant share of memory (40+%,
+depending on memory layout, storage speeds, etc.) with anon/tmpfs pages
+and uses the remainder for a streaming writer demonstrates this
+problem. In that case, the actual cache pages are a small fraction of
+what is considered dirtyable overall, which results in a relatively
+large portion of the cache pages being dirtied. As kswapd starts
+rotating these, random tasks enter direct reclaim and stall on IO.
+
+Only consider free pages and file pages dirtyable.
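+
+A rough worked example (illustrative numbers, not taken from the
+report): an 8G machine with ~5G of anon/tmpfs, ~2.5G of file cache,
+~0.5G free and dirty_ratio=20:
+
+	counting anon:    dirtyable ~= 0.5 + 2.5 + 5 = 8G,  limit ~= 1.6G
+	free + file only: dirtyable ~= 0.5 + 2.5     = 3G,  limit ~= 0.6G
+
+With the old accounting most of the 2.5G of file cache may be dirty at
+once; with the fix the dirty limit stays proportional to the cache that
+can actually be written back.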
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Tested-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/vmstat.h | 3 ---
+ mm/page-writeback.c | 6 ++++--
+ mm/vmscan.c | 49 ++++++++++++++-----------------------------------
+ 3 files changed, 18 insertions(+), 40 deletions(-)
+
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -142,9 +142,6 @@ static inline unsigned long zone_page_st
+ return x;
+ }
+
+-extern unsigned long global_reclaimable_pages(void);
+-extern unsigned long zone_reclaimable_pages(struct zone *zone);
+-
+ #ifdef CONFIG_NUMA
+ /*
+ * Determine the per node value of a stat item. This function
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -202,7 +202,8 @@ static unsigned long zone_dirtyable_memo
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+- nr_pages += zone_reclaimable_pages(zone);
++ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
++ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+ }
+@@ -255,7 +256,8 @@ static unsigned long global_dirtyable_me
+ x = global_page_state(NR_FREE_PAGES);
+ x -= min(x, dirty_balance_reserve);
+
+- x += global_reclaimable_pages();
++ x += global_page_state(NR_INACTIVE_FILE);
++ x += global_page_state(NR_ACTIVE_FILE);
+
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2117,6 +2117,20 @@ static bool shrink_zones(struct zonelist
+ return aborted_reclaim;
+ }
+
++static unsigned long zone_reclaimable_pages(struct zone *zone)
++{
++ int nr;
++
++ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
++ zone_page_state(zone, NR_INACTIVE_FILE);
++
++ if (get_nr_swap_pages() > 0)
++ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
++ zone_page_state(zone, NR_INACTIVE_ANON);
++
++ return nr;
++}
++
+ static bool zone_reclaimable(struct zone *zone)
+ {
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+@@ -3075,41 +3089,6 @@ void wakeup_kswapd(struct zone *zone, in
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+-/*
+- * The reclaimable count would be mostly accurate.
+- * The less reclaimable pages may be
+- * - mlocked pages, which will be moved to unevictable list when encountered
+- * - mapped pages, which may require several travels to be reclaimed
+- * - dirty pages, which is not "instantly" reclaimable
+- */
+-unsigned long global_reclaimable_pages(void)
+-{
+- int nr;
+-
+- nr = global_page_state(NR_ACTIVE_FILE) +
+- global_page_state(NR_INACTIVE_FILE);
+-
+- if (get_nr_swap_pages() > 0)
+- nr += global_page_state(NR_ACTIVE_ANON) +
+- global_page_state(NR_INACTIVE_ANON);
+-
+- return nr;
+-}
+-
+-unsigned long zone_reclaimable_pages(struct zone *zone)
+-{
+- int nr;
+-
+- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+- zone_page_state(zone, NR_INACTIVE_FILE);
+-
+- if (get_nr_swap_pages() > 0)
+- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+- zone_page_state(zone, NR_INACTIVE_ANON);
+-
+- return nr;
+-}
+-
+ #ifdef CONFIG_HIBERNATION
+ /*
+ * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
--- /dev/null
+From a804552b9a15c931cfc2a92a2e0aed1add8b580a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 29 Jan 2014 14:05:39 -0800
+Subject: mm/page-writeback.c: fix dirty_balance_reserve subtraction from dirtyable memory
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit a804552b9a15c931cfc2a92a2e0aed1add8b580a upstream.
+
+Tejun reported stuttering and latency spikes on a system where random
+tasks would enter direct reclaim and get stuck on dirty pages. Around
+50% of memory was occupied by tmpfs backed by an SSD, and another disk
+(rotating) was reading and writing at max speed to shrink a partition.
+
+: The problem was pretty ridiculous. It's a 8gig machine w/ one ssd and 10k
+: rpm harddrive and I could reliably reproduce constant stuttering every
+: several seconds for as long as buffered IO was going on on the hard drive
+: either with tmpfs occupying somewhere above 4gig or a test program which
+: allocates about the same amount of anon memory. Although swap usage was
+: zero, turning off swap also made the problem go away too.
+:
+: The trigger conditions seem quite plausible - high anon memory usage w/
+: heavy buffered IO and swap configured - and it's highly likely that this
+: is happening in the wild too. (this can happen with copying large files
+: to usb sticks too, right?)
+
+This patch (of 2):
+
+The dirty_balance_reserve is an approximation of the fraction of free
+pages that the page allocator does not make available for page cache
+allocations. As a result, it has to be taken into account when
+calculating the amount of "dirtyable memory", the baseline to which
+dirty_background_ratio and dirty_ratio are applied.
+
+However, currently the reserve is subtracted from the sum of free and
+reclaimable pages, which is nonsensical and leads to erroneous results
+when the system is dominated by unreclaimable pages and the
+dirty_balance_reserve is bigger than free+reclaimable. In that case, at
+least the already allocated cache should be considered dirtyable.
+
+Fix the calculation by subtracting the reserve from the amount of free
+pages, then adding the reclaimable pages on top.
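+
+In sketch form (an illustration of the change in the arithmetic; the
+hunks below are authoritative):
+
+	/* before: when the reserve exceeds the free pages, the clamped
+	 * subtraction also eats into the reclaimable (cache) pages */
+	x = free + reclaimable;
+	x -= min(x, dirty_balance_reserve);
+
+	/* after: the reserve is carved out of the free pages only, and
+	 * the already allocated cache always counts as dirtyable */
+	x = free;
+	x -= min(x, dirty_balance_reserve);
+	x += reclaimable;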
+
+[akpm@linux-foundation.org: fix CONFIG_HIGHMEM build]
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Tested-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page-writeback.c | 55 ++++++++++++++++++++++------------------------------
+ 1 file changed, 24 insertions(+), 31 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -188,6 +188,25 @@ static unsigned long writeout_period_tim
+ * global dirtyable memory first.
+ */
+
++/**
++ * zone_dirtyable_memory - number of dirtyable pages in a zone
++ * @zone: the zone
++ *
++ * Returns the zone's number of pages potentially available for dirty
++ * page cache. This is the base value for the per-zone dirty limits.
++ */
++static unsigned long zone_dirtyable_memory(struct zone *zone)
++{
++ unsigned long nr_pages;
++
++ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
++ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
++
++ nr_pages += zone_reclaimable_pages(zone);
++
++ return nr_pages;
++}
++
+ static unsigned long highmem_dirtyable_memory(unsigned long total)
+ {
+ #ifdef CONFIG_HIGHMEM
+@@ -195,11 +214,9 @@ static unsigned long highmem_dirtyable_m
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+- struct zone *z =
+- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
++ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+- x += zone_page_state(z, NR_FREE_PAGES) +
+- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
++ x += zone_dirtyable_memory(z);
+ }
+ /*
+ * Unreclaimable memory (kernel memory or anonymous memory
+@@ -235,9 +252,11 @@ static unsigned long global_dirtyable_me
+ {
+ unsigned long x;
+
+- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
++ x = global_page_state(NR_FREE_PAGES);
+ x -= min(x, dirty_balance_reserve);
+
++ x += global_reclaimable_pages();
++
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
+
+@@ -289,32 +308,6 @@ void global_dirty_limits(unsigned long *
+ }
+
+ /**
+- * zone_dirtyable_memory - number of dirtyable pages in a zone
+- * @zone: the zone
+- *
+- * Returns the zone's number of pages potentially available for dirty
+- * page cache. This is the base value for the per-zone dirty limits.
+- */
+-static unsigned long zone_dirtyable_memory(struct zone *zone)
+-{
+- /*
+- * The effective global number of dirtyable pages may exclude
+- * highmem as a big-picture measure to keep the ratio between
+- * dirty memory and lowmem reasonable.
+- *
+- * But this function is purely about the individual zone and a
+- * highmem zone can hold its share of dirty pages, so we don't
+- * care about vm_highmem_is_dirtyable here.
+- */
+- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
+- zone_reclaimable_pages(zone);
+-
+- /* don't allow this to underflow */
+- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+- return nr_pages;
+-}
+-
+-/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
fuse-fix-pipe_buf_operations.patch
audit-reset-audit-backlog-wait-time-after-error-recovery.patch
audit-correct-a-type-mismatch-in-audit_syscall_exit.patch
+mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch
+mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch
+mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch