--- /dev/null
+From 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 6 Aug 2014 16:08:03 -0700
+Subject: memcg, vmscan: Fix forced scan of anonymous pages
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e upstream.
+
+When memory cgroups are enabled, the code that decides to force scanning
+of anonymous pages in get_scan_count() compares global values (free,
+high_watermark) to a value that is restricted to a memory cgroup (file).
+This makes the code over-eager to force an anon scan.
+
+For instance, it will force an anon scan when scanning a memcg that is
+mainly populated by anonymous pages, even when there are plenty of file
+pages to get rid of in other memcgs, and even when swappiness == 0. It
+breaks the user's expectation about swappiness and hurts performance.
+
+This patch makes sure that a forced anon scan only happens when there are
+not enough file pages for the whole zone, not just in one random memcg.
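+
+In short, the forced-scan check now compares zone-wide numbers with
+zone-wide numbers: the lruvec sizes from get_lru_size() are memcg-scoped,
+while the watermark and free counts are per-zone.  The resulting test is
+roughly:
+
+	zonefree = zone_page_state(zone, NR_FREE_PAGES);
+	zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+		   zone_page_state(zone, NR_INACTIVE_FILE);
+
+	if (zonefile + zonefree <= high_wmark_pages(zone))
+		scan_balance = SCAN_ANON;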
+
+[hannes@cmpxchg.org: cleanups]
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c | 23 +++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1847,7 +1847,7 @@ static void get_scan_count(struct lruvec
+ struct zone *zone = lruvec_zone(lruvec);
+ unsigned long anon_prio, file_prio;
+ enum scan_balance scan_balance;
+- unsigned long anon, file, free;
++ unsigned long anon, file;
+ bool force_scan = false;
+ unsigned long ap, fp;
+ enum lru_list lru;
+@@ -1895,11 +1895,6 @@ static void get_scan_count(struct lruvec
+ goto out;
+ }
+
+- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+- get_lru_size(lruvec, LRU_INACTIVE_ANON);
+- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+-
+ /*
+ * If it's foreseeable that reclaiming the file cache won't be
+ * enough to get the zone back into a desirable shape, we have
+@@ -1907,8 +1902,14 @@ static void get_scan_count(struct lruvec
+ * thrashing - remaining file pages alone.
+ */
+ if (global_reclaim(sc)) {
+- free = zone_page_state(zone, NR_FREE_PAGES);
+- if (unlikely(file + free <= high_wmark_pages(zone))) {
++ unsigned long zonefile;
++ unsigned long zonefree;
++
++ zonefree = zone_page_state(zone, NR_FREE_PAGES);
++ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
++ zone_page_state(zone, NR_INACTIVE_FILE);
++
++ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+ scan_balance = SCAN_ANON;
+ goto out;
+ }
+@@ -1943,6 +1944,12 @@ static void get_scan_count(struct lruvec
+ *
+ * anon in [0], file in [1]
+ */
++
++ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
++ get_lru_size(lruvec, LRU_INACTIVE_ANON);
++ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
++ get_lru_size(lruvec, LRU_INACTIVE_FILE);
++
+ spin_lock_irq(&zone->lru_lock);
+ if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
+ reclaim_stat->recent_scanned[0] /= 2;
--- /dev/null
+From 888cf2db475a256fb0cda042140f73d7881f81fe Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 4 Jun 2014 16:10:34 -0700
+Subject: mm: avoid unnecessary atomic operations during end_page_writeback()
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 888cf2db475a256fb0cda042140f73d7881f81fe upstream.
+
+If a page is marked for immediate reclaim then it is moved to the tail of
+the LRU list. This occurs when the system is under enough memory pressure
+for pages under writeback to reach the end of the LRU, but we test for
+this using an atomic operation on every writeback completion. This patch
+uses an optimistic non-atomic test first. It'll miss some pages in rare
+cases but the consequences are not severe enough to warrant such a penalty.
+
+While the function does not dominate profiles during a simple dd test,
+its cost is reduced.
+
+73048 0.7428 vmlinux-3.15.0-rc5-mmotm-20140513 end_page_writeback
+23740 0.2409 vmlinux-3.15.0-rc5-lessatomic end_page_writeback
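+
+For reference, the page flag helpers involved expand (via the generic
+page-flags macros, roughly) to:
+
+	PageReclaim(page)          /* test_bit(PG_reclaim, ...), plain load */
+	ClearPageReclaim(page)     /* clear_bit(PG_reclaim, ...), atomic    */
+	TestClearPageReclaim(page) /* test_and_clear_bit(PG_reclaim, ...),
+	                              atomic read-modify-write              */
+
+so the common case (PG_reclaim not set) now avoids the locked operation
+entirely, and only pages actually marked for immediate reclaim pay for an
+atomic clear.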
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/filemap.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -644,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
+ */
+ void end_page_writeback(struct page *page)
+ {
+- if (TestClearPageReclaim(page))
++ /*
++ * TestClearPageReclaim could be used here but it is an atomic
++ * operation and overkill in this particular case. Failing to
++ * shuffle a page marked for immediate reclaim is too mild to
++ * justify taking an atomic operation penalty at the end of
++	 * every page writeback.
++ */
++ if (PageReclaim(page)) {
++ ClearPageReclaim(page);
+ rotate_reclaimable_page(page);
++ }
+
+ if (!test_clear_page_writeback(page))
+ BUG();
--- /dev/null
+From 21bda264f4243f61dfcc485174055f12ad0530b4 Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 6 Aug 2014 16:06:56 -0700
+Subject: mm: make copy_pte_range static again
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit 21bda264f4243f61dfcc485174055f12ad0530b4 upstream.
+
+Commit 71e3aac0724f ("thp: transparent hugepage core") added the
+copy_pte_range() prototype to huge_mm.h. I'm not sure why (or if) this
+function has ever been used outside of memory.c, but it currently isn't.
+This patch makes copy_pte_range() static again.
+
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/huge_mm.h | 4 ----
+ mm/memory.c | 2 +-
+ 2 files changed, 1 insertion(+), 5 deletions(-)
+
+--- a/include/linux/huge_mm.h
++++ b/include/linux/huge_mm.h
+@@ -93,10 +93,6 @@ extern bool is_vma_temporary_stack(struc
+ #endif /* CONFIG_DEBUG_VM */
+
+ extern unsigned long transparent_hugepage_flags;
+-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- pmd_t *dst_pmd, pmd_t *src_pmd,
+- struct vm_area_struct *vma,
+- unsigned long addr, unsigned long end);
+ extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+ static inline int split_huge_page(struct page *page)
+ {
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -878,7 +878,7 @@ out_set_pte:
+ return 0;
+ }
+
+-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
++static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+ {
--- /dev/null
+From c0d73261f5c1355a35b8b40e871d31578ce0c044 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Wed, 6 Aug 2014 16:05:08 -0700
+Subject: mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault()
+
+From: Hugh Dickins <hughd@google.com>
+
+commit c0d73261f5c1355a35b8b40e871d31578ce0c044 upstream.
+
+Use ACCESS_ONCE() in handle_pte_fault() when getting the entry or
+orig_pte upon which all subsequent decisions and pte_same() tests will
+be made.
+
+I have no evidence that its lack is responsible for the mm/filemap.c:202
+BUG_ON(page_mapped(page)) in __delete_from_page_cache() found by
+trinity, and I am not optimistic that it will fix it. But I have found
+no other explanation, and ACCESS_ONCE() here will surely not hurt.
+
+If gcc does re-access the pte before passing it down, then that would be
+disastrous for correct page fault handling, and certainly could explain
+the page_mapped() BUGs seen (concurrent fault causing page to be mapped
+in a second time on top of itself: mapcount 2 for a single pte).
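+
+For reference, ACCESS_ONCE() at this point in the tree is essentially the
+usual volatile cast, which forces a single load and keeps the compiler
+from re-reading *pte later:
+
+	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))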
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Sasha Levin <sasha.levin@oracle.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Konstantin Khlebnikov <koct9i@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3646,7 +3646,7 @@ static int handle_pte_fault(struct mm_st
+ pte_t entry;
+ spinlock_t *ptl;
+
+- entry = *pte;
++ entry = ACCESS_ONCE(*pte);
+ if (!pte_present(entry)) {
+ if (pte_none(entry)) {
+ if (vma->vm_ops) {
--- /dev/null
+From 24b7e5819ad5cbef2b7c7376510862aa8319d240 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 6 Aug 2014 16:07:11 -0700
+Subject: mm: pagemap: avoid unnecessary overhead when tracepoints are deactivated
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 24b7e5819ad5cbef2b7c7376510862aa8319d240 upstream.
+
+This was formerly the series "Improve sequential read throughput" which
+noted some major differences in performance of tiobench since 3.0.
+While there are a number of factors, two that dominated were the
+introduction of the fair zone allocation policy and changes to CFQ.
+
+The behaviour of the fair zone allocation policy makes more sense than
+tiobench does as a benchmark, and the CFQ defaults were not changed due
+to insufficient benchmarking.
+
+This series is what's left. It's one functional fix to the fair zone
+allocation policy when used on NUMA machines and a reduction of overhead
+in general. tiobench was used for the comparison despite its flaws as
+an IO benchmark, as in this case we are primarily interested in the
+overhead of the page allocator and of page reclaim activity.
+
+On UMA, it makes little difference to overhead
+
+ 3.16.0-rc3 3.16.0-rc3
+ vanilla lowercost-v5
+User 383.61 386.77
+System 403.83 401.74
+Elapsed 5411.50 5413.11
+
+On a 4-socket NUMA machine it's a bit more noticeable
+
+ 3.16.0-rc3 3.16.0-rc3
+ vanilla lowercost-v5
+User 746.94 802.00
+System 65336.22 40852.33
+Elapsed 27553.52 27368.46
+
+This patch (of 6):
+
+The LRU insertion and activate tracepoints take a PFN as a parameter,
+forcing the overhead onto the caller.  Move the overhead to the tracepoint
+fast-assign method to ensure the cost is only incurred when the
+tracepoint is active.
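+
+Because the TP_fast_assign() body only runs when the tracepoint is
+enabled, the page_to_pfn() and trace_pagemap_flags() work now costs
+nothing on the normal path, e.g. for mm_lru_insertion:
+
+	TP_fast_assign(
+		__entry->page	= page;
+		__entry->pfn	= page_to_pfn(page);
+		__entry->lru	= lru;
+		__entry->flags	= trace_pagemap_flags(page);
+	),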
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/trace/events/pagemap.h | 16 +++++++---------
+ mm/swap.c | 4 ++--
+ 2 files changed, 9 insertions(+), 11 deletions(-)
+
+--- a/include/trace/events/pagemap.h
++++ b/include/trace/events/pagemap.h
+@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TP_PROTO(
+ struct page *page,
+- unsigned long pfn,
+- int lru,
+- unsigned long flags
++ int lru
+ ),
+
+- TP_ARGS(page, pfn, lru, flags),
++ TP_ARGS(page, lru),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page )
+@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TP_fast_assign(
+ __entry->page = page;
+- __entry->pfn = pfn;
++ __entry->pfn = page_to_pfn(page);
+ __entry->lru = lru;
+- __entry->flags = flags;
++ __entry->flags = trace_pagemap_flags(page);
+ ),
+
+ /* Flag format is based on page-types.c formatting for pagemap */
+@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion,
+
+ TRACE_EVENT(mm_lru_activate,
+
+- TP_PROTO(struct page *page, unsigned long pfn),
++ TP_PROTO(struct page *page),
+
+- TP_ARGS(page, pfn),
++ TP_ARGS(page),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page )
+@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate,
+
+ TP_fast_assign(
+ __entry->page = page;
+- __entry->pfn = pfn;
++ __entry->pfn = page_to_pfn(page);
+ ),
+
+ /* Flag format is based on page-types.c formatting for pagemap */
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -469,7 +469,7 @@ static void __activate_page(struct page
+ SetPageActive(page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(page, lruvec, lru);
+- trace_mm_lru_activate(page, page_to_pfn(page));
++ trace_mm_lru_activate(page);
+
+ __count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 1);
+@@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, lru);
+ update_page_reclaim_stat(lruvec, file, active);
+- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
++ trace_mm_lru_insertion(page, lru);
+ }
+
+ /*
--- /dev/null
+From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 6 Aug 2014 16:07:14 -0700
+Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream.
+
+The arrangement of struct zone has changed over time and now it has
+reached the point where there is some inappropriate sharing going on.
+On x86-64 for example
+
+o The zone->node field is shared with the zone lock and zone->node is
+ accessed frequently from the page allocator due to the fair zone
+ allocation policy.
+
+o span_seqlock is almost never used but shares a cache line with free_area
+
+o Some zone statistics share a cache line with the LRU lock so
+ reclaim-intensive and allocator-intensive workloads can bounce the cache
+ line on a stat update
+
+This patch rearranges struct zone to put read-only and read-mostly
+fields together and then splits the page allocator intensive fields, the
+zone statistics and the page reclaim intensive fields into their own
+cache lines. Note that the type of lowmem_reserve changes due to the
+watermark calculations being signed and avoiding a signed/unsigned
+conversion there.
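+
+For reference, the ZONE_PADDING() markers used to split the structure are
+roughly the usual mmzone.h helper, which starts the following member on a
+fresh cache line only on SMP builds:
+
+	#if defined(CONFIG_SMP)
+	struct zone_padding {
+		char x[0];
+	} ____cacheline_internodealigned_in_smp;
+	#define ZONE_PADDING(name)	struct zone_padding name;
+	#else
+	#define ZONE_PADDING(name)
+	#endif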
+
+On the test configuration I used, the overall size of struct zone shrank
+by one cache line. On smaller machines, this is not likely to be
+noticeable. However, on a 4-node NUMA machine running tiobench the
+system CPU overhead is reduced by this patch.
+
+ 3.16.0-rc3 3.16.0-rc3
+ vanillarearrange-v5r9
+User 746.94 759.78
+System 65336.22 58350.98
+Elapsed 27553.52 27282.02
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mmzone.h | 205 +++++++++++++++++++++++++------------------------
+ mm/page_alloc.c | 7 -
+ mm/vmstat.c | 4
+ 3 files changed, 110 insertions(+), 106 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -321,19 +321,12 @@ enum zone_type {
+ #ifndef __GENERATING_BOUNDS_H
+
+ struct zone {
+- /* Fields commonly accessed by the page allocator */
++ /* Read-mostly fields */
+
+ /* zone watermarks, access with *_wmark_pages(zone) macros */
+ unsigned long watermark[NR_WMARK];
+
+ /*
+- * When free pages are below this point, additional steps are taken
+- * when reading the number of free pages to avoid per-cpu counter
+- * drift allowing watermarks to be breached
+- */
+- unsigned long percpu_drift_mark;
+-
+- /*
+ * We don't know if the memory that we're going to allocate will be freeable
+ * or/and it will be released eventually, so to avoid totally wasting several
+ * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+@@ -341,41 +334,26 @@ struct zone {
+ * on the higher zones). This array is recalculated at runtime if the
+ * sysctl_lowmem_reserve_ratio sysctl changes.
+ */
+- unsigned long lowmem_reserve[MAX_NR_ZONES];
+-
+- /*
+- * This is a per-zone reserve of pages that should not be
+- * considered dirtyable memory.
+- */
+- unsigned long dirty_balance_reserve;
++ long lowmem_reserve[MAX_NR_ZONES];
+
+ #ifdef CONFIG_NUMA
+ int node;
++#endif
++
+ /*
+- * zone reclaim becomes active if more unmapped pages exist.
++ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
++ * this zone's LRU. Maintained by the pageout code.
+ */
+- unsigned long min_unmapped_pages;
+- unsigned long min_slab_pages;
+-#endif
++ unsigned int inactive_ratio;
++
++ struct pglist_data *zone_pgdat;
+ struct per_cpu_pageset __percpu *pageset;
++
+ /*
+- * free areas of different sizes
++ * This is a per-zone reserve of pages that should not be
++ * considered dirtyable memory.
+ */
+- spinlock_t lock;
+-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+- /* Set to true when the PG_migrate_skip bits should be cleared */
+- bool compact_blockskip_flush;
+-
+- /* pfn where compaction free scanner should start */
+- unsigned long compact_cached_free_pfn;
+- /* pfn where async and sync compaction migration scanner should start */
+- unsigned long compact_cached_migrate_pfn[2];
+-#endif
+-#ifdef CONFIG_MEMORY_HOTPLUG
+- /* see spanned/present_pages for more description */
+- seqlock_t span_seqlock;
+-#endif
+- struct free_area free_area[MAX_ORDER];
++ unsigned long dirty_balance_reserve;
+
+ #ifndef CONFIG_SPARSEMEM
+ /*
+@@ -385,71 +363,14 @@ struct zone {
+ unsigned long *pageblock_flags;
+ #endif /* CONFIG_SPARSEMEM */
+
+-#ifdef CONFIG_COMPACTION
+- /*
+- * On compaction failure, 1<<compact_defer_shift compactions
+- * are skipped before trying again. The number attempted since
+- * last failure is tracked with compact_considered.
+- */
+- unsigned int compact_considered;
+- unsigned int compact_defer_shift;
+- int compact_order_failed;
+-#endif
+-
+- ZONE_PADDING(_pad1_)
+-
+- /* Fields commonly accessed by the page reclaim scanner */
+- spinlock_t lru_lock;
+- struct lruvec lruvec;
+-
+- unsigned long pages_scanned; /* since last reclaim */
+- unsigned long flags; /* zone flags, see below */
+-
+- /* Zone statistics */
+- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+-
+- /*
+- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+- * this zone's LRU. Maintained by the pageout code.
+- */
+- unsigned int inactive_ratio;
+-
+-
+- ZONE_PADDING(_pad2_)
+- /* Rarely used or read-mostly fields */
+-
++#ifdef CONFIG_NUMA
+ /*
+- * wait_table -- the array holding the hash table
+- * wait_table_hash_nr_entries -- the size of the hash table array
+- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+- *
+- * The purpose of all these is to keep track of the people
+- * waiting for a page to become available and make them
+- * runnable again when possible. The trouble is that this
+- * consumes a lot of space, especially when so few things
+- * wait on pages at a given time. So instead of using
+- * per-page waitqueues, we use a waitqueue hash table.
+- *
+- * The bucket discipline is to sleep on the same queue when
+- * colliding and wake all in that wait queue when removing.
+- * When something wakes, it must check to be sure its page is
+- * truly available, a la thundering herd. The cost of a
+- * collision is great, but given the expected load of the
+- * table, they should be so rare as to be outweighed by the
+- * benefits from the saved space.
+- *
+- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+- * primary users of these fields, and in mm/page_alloc.c
+- * free_area_init_core() performs the initialization of them.
++ * zone reclaim becomes active if more unmapped pages exist.
+ */
+- wait_queue_head_t * wait_table;
+- unsigned long wait_table_hash_nr_entries;
+- unsigned long wait_table_bits;
++ unsigned long min_unmapped_pages;
++ unsigned long min_slab_pages;
++#endif /* CONFIG_NUMA */
+
+- /*
+- * Discontig memory support fields.
+- */
+- struct pglist_data *zone_pgdat;
+ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
+ unsigned long zone_start_pfn;
+
+@@ -495,9 +416,11 @@ struct zone {
+ * adjust_managed_page_count() should be used instead of directly
+ * touching zone->managed_pages and totalram_pages.
+ */
++ unsigned long managed_pages;
+ unsigned long spanned_pages;
+ unsigned long present_pages;
+- unsigned long managed_pages;
++
++ const char *name;
+
+ /*
+ * Number of MIGRATE_RESEVE page block. To maintain for just
+@@ -505,10 +428,92 @@ struct zone {
+ */
+ int nr_migrate_reserve_block;
+
++#ifdef CONFIG_MEMORY_HOTPLUG
++ /* see spanned/present_pages for more description */
++ seqlock_t span_seqlock;
++#endif
++
+ /*
+- * rarely used fields:
++ * wait_table -- the array holding the hash table
++ * wait_table_hash_nr_entries -- the size of the hash table array
++ * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
++ *
++ * The purpose of all these is to keep track of the people
++ * waiting for a page to become available and make them
++ * runnable again when possible. The trouble is that this
++ * consumes a lot of space, especially when so few things
++ * wait on pages at a given time. So instead of using
++ * per-page waitqueues, we use a waitqueue hash table.
++ *
++ * The bucket discipline is to sleep on the same queue when
++ * colliding and wake all in that wait queue when removing.
++ * When something wakes, it must check to be sure its page is
++ * truly available, a la thundering herd. The cost of a
++ * collision is great, but given the expected load of the
++ * table, they should be so rare as to be outweighed by the
++ * benefits from the saved space.
++ *
++ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
++ * primary users of these fields, and in mm/page_alloc.c
++ * free_area_init_core() performs the initialization of them.
+ */
+- const char *name;
++ wait_queue_head_t *wait_table;
++ unsigned long wait_table_hash_nr_entries;
++ unsigned long wait_table_bits;
++
++ ZONE_PADDING(_pad1_)
++
++ /* Write-intensive fields used from the page allocator */
++ spinlock_t lock;
++
++ /* free areas of different sizes */
++ struct free_area free_area[MAX_ORDER];
++
++ /* zone flags, see below */
++ unsigned long flags;
++
++ ZONE_PADDING(_pad2_)
++
++ /* Write-intensive fields used by page reclaim */
++
++ /* Fields commonly accessed by the page reclaim scanner */
++ spinlock_t lru_lock;
++ unsigned long pages_scanned; /* since last reclaim */
++ struct lruvec lruvec;
++
++ /*
++ * When free pages are below this point, additional steps are taken
++ * when reading the number of free pages to avoid per-cpu counter
++ * drift allowing watermarks to be breached
++ */
++ unsigned long percpu_drift_mark;
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++ /* pfn where compaction free scanner should start */
++ unsigned long compact_cached_free_pfn;
++ /* pfn where async and sync compaction migration scanner should start */
++ unsigned long compact_cached_migrate_pfn[2];
++#endif
++
++#ifdef CONFIG_COMPACTION
++ /*
++ * On compaction failure, 1<<compact_defer_shift compactions
++ * are skipped before trying again. The number attempted since
++ * last failure is tracked with compact_considered.
++ */
++ unsigned int compact_considered;
++ unsigned int compact_defer_shift;
++ int compact_order_failed;
++#endif
++
++#if defined CONFIG_COMPACTION || defined CONFIG_CMA
++ /* Set to true when the PG_migrate_skip bits should be cleared */
++ bool compact_blockskip_flush;
++#endif
++
++ ZONE_PADDING(_pad3_)
++ /* Zone statistics */
++ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+ } ____cacheline_internodealigned_in_smp;
+
+ typedef enum {
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1710,7 +1710,6 @@ static bool __zone_watermark_ok(struct z
+ {
+ /* free_pages my go negative - that's OK */
+ long min = mark;
+- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
+ int o;
+ long free_cma = 0;
+
+@@ -1725,7 +1724,7 @@ static bool __zone_watermark_ok(struct z
+ free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ #endif
+
+- if (free_pages - free_cma <= min + lowmem_reserve)
++ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+ return false;
+ for (o = 0; o < order; o++) {
+ /* At the next order, this order's pages become unavailable */
+@@ -3257,7 +3256,7 @@ void show_free_areas(unsigned int filter
+ );
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+- printk(" %lu", zone->lowmem_reserve[i]);
++ printk(" %ld", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
+
+@@ -5585,7 +5584,7 @@ static void calculate_totalreserve_pages
+ for_each_online_pgdat(pgdat) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+- unsigned long max = 0;
++ long max = 0;
+
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (j = i; j < MAX_NR_ZONES; j++) {
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1065,10 +1065,10 @@ static void zoneinfo_show_print(struct s
+ zone_page_state(zone, i));
+
+ seq_printf(m,
+- "\n protection: (%lu",
++ "\n protection: (%ld",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
++ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
--- /dev/null
+From 14a4e2141e24304fff2c697be6382ffb83888185 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Wed, 6 Aug 2014 16:07:29 -0700
+Subject: mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode
+
+From: David Rientjes <rientjes@google.com>
+
+commit 14a4e2141e24304fff2c697be6382ffb83888185 upstream.
+
+Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target
+node") improved the previous khugepaged logic, which allocated a
+transparent hugepage from the node of the first page being collapsed.
+
+However, it is still possible to collapse pages to remote memory which
+may suffer from additional access latency. With the current policy, it
+is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed
+remotely if the majority are allocated from that node.
+
+When zone_reclaim_mode is enabled, it means the VM should make every
+attempt to allocate locally to prevent NUMA performance degradation. In
+this case, we do not want to collapse hugepages to remote nodes that
+would suffer from increased access latency. Thus, when
+zone_reclaim_mode is enabled, only allow collapsing to nodes with
+RECLAIM_DISTANCE or less.
+
+There is no functional change for systems that disable
+zone_reclaim_mode.
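+
+As a rough worked example (assuming the usual SLIT conventions, where the
+local node reports LOCAL_DISTANCE == 10, and the default
+RECLAIM_DISTANCE == 30): on a box where a far, multi-hop node reports
+node_distance() == 31, the check
+
+	if (node_distance(nid, i) > RECLAIM_DISTANCE)
+		return true;
+
+aborts the scan of that pmd, while directly connected nodes (distance 20
+or 21) remain acceptable collapse targets.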
+
+Signed-off-by: David Rientjes <rientjes@google.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 26 ++++++++++++++++++++++++++
+ 1 file changed, 26 insertions(+)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2273,6 +2273,30 @@ static void khugepaged_alloc_sleep(void)
+
+ static int khugepaged_node_load[MAX_NUMNODES];
+
++static bool khugepaged_scan_abort(int nid)
++{
++ int i;
++
++ /*
++ * If zone_reclaim_mode is disabled, then no extra effort is made to
++ * allocate memory locally.
++ */
++ if (!zone_reclaim_mode)
++ return false;
++
++ /* If there is a count for this node already, it must be acceptable */
++ if (khugepaged_node_load[nid])
++ return false;
++
++ for (i = 0; i < MAX_NUMNODES; i++) {
++ if (!khugepaged_node_load[i])
++ continue;
++ if (node_distance(nid, i) > RECLAIM_DISTANCE)
++ return true;
++ }
++ return false;
++}
++
+ #ifdef CONFIG_NUMA
+ static int khugepaged_find_target_node(void)
+ {
+@@ -2589,6 +2613,8 @@ static int khugepaged_scan_pmd(struct mm
+ * hit record.
+ */
+ node = page_to_nid(page);
++ if (khugepaged_scan_abort(node))
++ goto out_unmap;
+ khugepaged_node_load[node]++;
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
mm-do-not-use-unnecessary-atomic-operations-when-adding-pages-to-the-lru.patch
fs-buffer-do-not-use-unnecessary-atomic-operations-when-discarding-buffers.patch
mm-non-atomically-mark-page-accessed-during-page-cache-allocation-where-possible.patch
+mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch
+shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch
+mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch
+mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch
+mm-make-copy_pte_range-static-again.patch
+vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch
+memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch
+mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch
+mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch
--- /dev/null
+From 66d2f4d28cd030220e7ea2a628993fcabcb956d1 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Wed, 2 Jul 2014 15:22:38 -0700
+Subject: shmem: fix init_page_accessed use to stop !PageLRU bug
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 66d2f4d28cd030220e7ea2a628993fcabcb956d1 upstream.
+
+Under shmem swapping load, I sometimes hit the VM_BUG_ON_PAGE(!PageLRU)
+in isolate_lru_pages() at mm/vmscan.c:1281!
+
+Commit 2457aec63745 ("mm: non-atomically mark page accessed during page
+cache allocation where possible") looks like interrupted work-in-progress.
+
+mm/filemap.c's call to init_page_accessed() is fine, but not mm/shmem.c's
+- shmem_write_begin() is clearly wrong to use it after shmem_getpage(),
+when the page is always visible in radix_tree, and often already on LRU.
+
+Revert change to shmem_write_begin(), and use init_page_accessed() or
+mark_page_accessed() appropriately for SGP_WRITE in shmem_getpage_gfp().
+
+SGP_WRITE also covers shmem_symlink(), which did not mark_page_accessed()
+before; but since many other filesystems use [__]page_symlink(), which did
+and does mark the page accessed, consider this as rectifying an oversight.
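+
+For reference, init_page_accessed() from 2457aec63745 is (roughly) just
+the pre-LRU variant that sets PG_referenced with a non-atomic op:
+
+	void init_page_accessed(struct page *page)
+	{
+		if (!PageReferenced(page))
+			__SetPageReferenced(page);
+	}
+
+which is only safe while the page is not yet visible on the LRU;
+mark_page_accessed() is the right helper once it is.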
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Prabhakar Lad <prabhakar.csengg@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/shmem.c | 15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -1035,6 +1035,9 @@ repeat:
+ goto failed;
+ }
+
++ if (page && sgp == SGP_WRITE)
++ mark_page_accessed(page);
++
+ /* fallocated page? */
+ if (page && !PageUptodate(page)) {
+ if (sgp != SGP_READ)
+@@ -1116,6 +1119,9 @@ repeat:
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
++ if (sgp == SGP_WRITE)
++ mark_page_accessed(page);
++
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+@@ -1142,6 +1148,9 @@ repeat:
+
+ __SetPageSwapBacked(page);
+ __set_page_locked(page);
++ if (sgp == SGP_WRITE)
++ init_page_accessed(page);
++
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (error)
+@@ -1438,13 +1447,9 @@ shmem_write_begin(struct file *file, str
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+ {
+- int ret;
+ struct inode *inode = mapping->host;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+- ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+- if (ret == 0 && *pagep)
+- init_page_accessed(*pagep);
+- return ret;
++ return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+ }
+
+ static int
--- /dev/null
+From 474750aba88817c53f39424e5567b8e4acc4b39b Mon Sep 17 00:00:00 2001
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Date: Wed, 6 Aug 2014 16:05:06 -0700
+Subject: vmalloc: use rcu list iterator to reduce vmap_area_lock contention
+
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+
+commit 474750aba88817c53f39424e5567b8e4acc4b39b upstream.
+
+Richard Yao reported a month ago that his system has trouble with
+vmap_area_lock contention during performance analysis via /proc/meminfo.
+Andrew asked why his analysis reads /proc/meminfo so heavily, but he
+didn't answer it.
+
+ https://lkml.org/lkml/2014/4/10/416
+
+Although I'm not sure whether this is the right usage or not, there is a
+solution that reduces vmap_area_lock contention with no side effect:
+just use the RCU list iterator in get_vmalloc_info().
+
+RCU can be used in this function because the full RCU protocol has been
+respected by writers since Nick Piggin's commit db64fe02258f1 ("mm:
+rewrite vmap layer") back in linux-2.6.28.
+
+Specifically:
+ insertions use list_add_rcu(),
+ deletions use list_del_rcu() and kfree_rcu().
+
+Note the rb tree is not used from an RCU reader (it would not be safe);
+only the vmap_area_list has full RCU protection.
+
+Note that __purge_vmap_area_lazy() already uses this rcu protection.
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ if (va->flags & VM_LAZY_FREE) {
+ if (va->va_start < *start)
+ *start = va->va_start;
+ if (va->va_end > *end)
+ *end = va->va_end;
+ nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ list_add_tail(&va->purge_list, &valist);
+ va->flags |= VM_LAZY_FREEING;
+ va->flags &= ~VM_LAZY_FREE;
+ }
+ }
+ rcu_read_unlock();
+
+Peter:
+
+: While rcu list traversal over the vmap_area_list is safe, this may
+: arrive at different results than the spinlocked version. The rcu list
+: traversal version will not be a 'snapshot' of a single, valid instant
+: of the entire vmap_area_list, but rather a potential amalgam of
+: different list states.
+
+Joonsoo:
+
+: Yes, you are right, but I don't think that we should be strict here.
+: Meminfo is already not a 'snapshot' at specific time. While we try to get
+: certain stats, the other stats can change. And, although we may arrive at
+: different results than the spinlocked version, the difference would not be
+: large and would not make serious side-effect.
+
+[edumazet@google.com: add more commit description]
+Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Reported-by: Richard Yao <ryao@gentoo.org>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Cc: Peter Hurley <peter@hurleysoftware.com>
+Cc: Zhang Yanfei <zhangyanfei.yes@gmail.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Andi Kleen <andi@firstfloor.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmalloc.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -2681,14 +2681,14 @@ void get_vmalloc_info(struct vmalloc_inf
+
+ prev_end = VMALLOC_START;
+
+- spin_lock(&vmap_area_lock);
++ rcu_read_lock();
+
+ if (list_empty(&vmap_area_list)) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ goto out;
+ }
+
+- list_for_each_entry(va, &vmap_area_list, list) {
++ list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ unsigned long addr = va->va_start;
+
+ /*
+@@ -2715,7 +2715,7 @@ void get_vmalloc_info(struct vmalloc_inf
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+ out:
+- spin_unlock(&vmap_area_lock);
++ rcu_read_unlock();
+ }
+ #endif
+