From: Greg Kroah-Hartman
Date: Wed, 28 Jan 2015 01:13:19 +0000 (-0800)
Subject: 3.14-stable patches
X-Git-Tag: v3.10.67~7
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=04ff371c3f81f73602b55dd9b413d0439783b161;p=thirdparty%2Fkernel%2Fstable-queue.git

3.14-stable patches

added patches:
	mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
	mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
	mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
	mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
---
diff --git a/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch b/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
new file mode 100644
index 00000000000..91e463a5d4f
--- /dev/null
+++ b/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
@@ -0,0 +1,147 @@
+From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Wed, 6 Aug 2014 16:07:16 -0700
+Subject: mm: move zone->pages_scanned into a vmstat counter
+
+From: Mel Gorman
+
+commit 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd upstream.
+
+zone->pages_scanned is a write-intensive cache line during page reclaim
+and it's also updated during page free. Move the counter into vmstat to
+take advantage of the per-cpu updates and do not update it in the free
+paths unless necessary.
+
+On a small UMA machine running tiobench the difference is marginal. On
+a 4-node machine the overhead is more noticable. Note that automatic
+NUMA balancing was disabled for this test as otherwise the system CPU
+overhead is unpredictable.
+
+          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
+             vanillarearrange-v5   vmstat-v5
+User          746.94      759.78      774.56
+System      65336.22    58350.98    32847.27
+Elapsed     27553.52    27282.02    27415.04
+
+Note that the overhead reduction will vary depending on where exactly
+pages are allocated and freed.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/mmzone.h |    2 +-
+ mm/page_alloc.c        |   12 +++++++++---
+ mm/vmscan.c            |    7 ++++---
+ mm/vmstat.c            |    3 ++-
+ 4 files changed, 16 insertions(+), 8 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -143,6 +143,7 @@ enum zone_stat_item {
+ 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
+ 	NR_DIRTIED,		/* page dirtyings since bootup */
+ 	NR_WRITTEN,		/* page writings since bootup */
++	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
+ #ifdef CONFIG_NUMA
+ 	NUMA_HIT,		/* allocated in intended node */
+ 	NUMA_MISS,		/* allocated in non intended node */
+@@ -478,7 +479,6 @@ struct zone {
+
+ 	/* Fields commonly accessed by the page reclaim scanner */
+ 	spinlock_t		lru_lock;
+-	unsigned long		pages_scanned;	/* since last reclaim */
+ 	struct lruvec		lruvec;
+
+ 	/*
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -678,9 +678,12 @@ static void free_pcppages_bulk(struct zo
+ 	int migratetype = 0;
+ 	int batch_free = 0;
+ 	int to_free = count;
++	unsigned long nr_scanned;
+
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+ 	while (to_free) {
+ 		struct page *page;
+@@ -729,8 +732,11 @@ static void free_one_page(struct zone *z
+ 				unsigned int order,
+ 				int migratetype)
+ {
++	unsigned long nr_scanned;
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+ 	__free_one_page(page, pfn, zone, order, migratetype);
+ 	if (unlikely(!is_migrate_isolate(migratetype)))
+@@ -3251,7 +3257,7 @@ void show_free_areas(unsigned int filter
+ 			K(zone_page_state(zone, NR_BOUNCE)),
+ 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+ 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+-			zone->pages_scanned,
++			K(zone_page_state(zone, NR_PAGES_SCANNED)),
+ 			(!zone_reclaimable(zone) ? "yes" : "no")
+ 			);
+ 		printk("lowmem_reserve[]:");
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pa
+
+ bool zone_reclaimable(struct zone *zone)
+ {
+-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
++	return zone_page_state(zone, NR_PAGES_SCANNED) <
++		zone_reclaimable_pages(zone) * 6;
+ }
+
+ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+@@ -1470,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to
+ 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
+ 	if (global_reclaim(sc)) {
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ 		if (current_is_kswapd())
+ 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ 		else
+@@ -1659,7 +1660,7 @@ static void shrink_active_list(unsigned
+ 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ 				     &nr_scanned, sc, isolate_mode, lru);
+ 	if (global_reclaim(sc))
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+
+ 	reclaim_stat->recent_scanned[file] += nr_taken;
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
+ 	"nr_shmem",
+ 	"nr_dirtied",
+ 	"nr_written",
++	"nr_pages_scanned",
+
+ #ifdef CONFIG_NUMA
+ 	"numa_hit",
+@@ -1055,7 +1056,7 @@ static void zoneinfo_show_print(struct s
+ 		   min_wmark_pages(zone),
+ 		   low_wmark_pages(zone),
+ 		   high_wmark_pages(zone),
+-		   zone->pages_scanned,
++		   zone_page_state(zone, NR_PAGES_SCANNED),
+ 		   zone->spanned_pages,
+ 		   zone->present_pages,
+ 		   zone->managed_pages);
diff --git a/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch b/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
new file mode 100644
index 00000000000..8a3361042f8
--- /dev/null
+++ b/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
@@ -0,0 +1,40 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:23 2014
+From: Mel Gorman
+Date: Thu, 28 Aug 2014 19:35:44 +0100
+Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered
+
+From: Mel Gorman
+
+commit f7b5d647946aae1647bf5cd26c16b3a793c1ac49 upstream.
+
+The purpose of numa_zonelist_order=zone is to preserve lower zones for
+use with 32-bit devices. If locality is preferred then the
+numa_zonelist_order=node policy should be used.
+
+Unfortunately, the fair zone allocation policy overrides this by
+skipping zones on remote nodes until the lower one is found. While this
+makes sense from a page aging and performance perspective, it breaks the
+expected zonelist policy. This patch restores the expected behaviour
+for zone-list ordering.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/page_alloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1980,7 +1980,7 @@ zonelist_scan:
+ 		 */
+ 		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
+-				continue;
++				break;
+ 			if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
+ 				continue;
+ 		}
diff --git a/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch b/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
new file mode 100644
index 00000000000..4bf082778aa
--- /dev/null
+++ b/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
@@ -0,0 +1,237 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:22 2014
+From: Mel Gorman
+Date: Thu, 28 Aug 2014 19:35:45 +0100
+Subject: mm: page_alloc: reduce cost of the fair zone allocation policy
+
+From: Mel Gorman
+
+commit 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 upstream.
+
+The fair zone allocation policy round-robins allocations between zones
+within a node to avoid age inversion problems during reclaim. If the
+first allocation fails, the batch counts are reset and a second attempt
+made before entering the slow path.
+
+One assumption made with this scheme is that batches expire at roughly
+the same time and the resets each time are justified. This assumption
+does not hold when zones reach their low watermark as the batches will
+be consumed at uneven rates. Allocation failure due to watermark
+depletion result in additional zonelist scans for the reset and another
+watermark check before hitting the slowpath.
+
+On UMA, the benefit is negligible -- around 0.25%. On 4-socket NUMA
+machine it's variable due to the variability of measuring overhead with
+the vmstat changes. The system CPU overhead comparison looks like
+
+          3.16.0-rc3  3.16.0-rc3    3.16.0-rc3
+             vanilla   vmstat-v5  lowercost-v5
+User          746.94      774.56        802.00
+System      65336.22    32847.27      40852.33
+Elapsed     27553.52    27415.04      27368.46
+
+However it is worth noting that the overall benchmark still completed
+faster and intuitively it makes sense to take as few passes as possible
+through the zonelists.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/mmzone.h |    6 ++
+ mm/page_alloc.c        |  101 +++++++++++++++++++++++++------------------------
+ 2 files changed, 59 insertions(+), 48 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -529,6 +529,7 @@ typedef enum {
+ 	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+ 					 * many pages under writeback
+ 					 */
++	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
+ } zone_flags_t;
+
+ static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
+@@ -566,6 +567,11 @@ static inline int zone_is_reclaim_locked
+ 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ }
+
++static inline int zone_is_fair_depleted(const struct zone *zone)
++{
++	return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
++}
++
+ static inline int zone_is_oom_locked(const struct zone *zone)
+ {
+ 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1614,6 +1614,9 @@ again:
+ 	}
+
+ 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
++	    !zone_is_fair_depleted(zone))
++		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+
+ 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
+ 	zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1938,6 +1941,18 @@ static inline void init_zone_allows_recl
+ }
+ #endif /* CONFIG_NUMA */
+
++static void reset_alloc_batches(struct zone *preferred_zone)
++{
++	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
++
++	do {
++		mod_zone_page_state(zone, NR_ALLOC_BATCH,
++			high_wmark_pages(zone) - low_wmark_pages(zone) -
++			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
++		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
++	} while (zone++ != preferred_zone);
++}
++
+ /*
+  * get_page_from_freelist goes through the zonelist trying to allocate
+  * a page.
+@@ -1955,8 +1970,12 @@ get_page_from_freelist(gfp_t gfp_mask, n
+ 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
+ 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ 				(gfp_mask & __GFP_WRITE);
++	int nr_fair_skipped = 0;
++	bool zonelist_rescan;
+
+ zonelist_scan:
++	zonelist_rescan = false;
++
+ 	/*
+ 	 * Scan zonelist, looking for a zone with enough free.
+ 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+@@ -1981,8 +2000,10 @@ zonelist_scan:
+ 		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
+ 				break;
+-			if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
++			if (zone_is_fair_depleted(zone)) {
++				nr_fair_skipped++;
+ 				continue;
++			}
+ 		}
+ 		/*
+ 		 * When allocating a page cache page for writing, we
+@@ -2088,13 +2109,7 @@ this_zone_full:
+ 			zlc_mark_zone_full(zonelist, z);
+ 	}
+
+-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+-		/* Disable zlc cache for second zonelist scan */
+-		zlc_active = 0;
+-		goto zonelist_scan;
+-	}
+-
+-	if (page)
++	if (page) {
+ 		/*
+ 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ 		 * necessary to allocate the page. The expectation is
+@@ -2103,8 +2118,37 @@ this_zone_full:
+ 		 * for !PFMEMALLOC purposes.
+ 		 */
+ 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++		return page;
++	}
+
+-	return page;
++	/*
++	 * The first pass makes sure allocations are spread fairly within the
++	 * local node. However, the local node might have free pages left
++	 * after the fairness batches are exhausted, and remote zones haven't
++	 * even been considered yet. Try once more without fairness, and
++	 * include remote zones now, before entering the slowpath and waking
++	 * kswapd: prefer spilling to a remote zone over swapping locally.
++	 */
++	if (alloc_flags & ALLOC_FAIR) {
++		alloc_flags &= ~ALLOC_FAIR;
++		if (nr_fair_skipped) {
++			zonelist_rescan = true;
++			reset_alloc_batches(preferred_zone);
++		}
++		if (nr_online_nodes > 1)
++			zonelist_rescan = true;
++	}
++
++	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
++		/* Disable zlc cache for second zonelist scan */
++		zlc_active = 0;
++		zonelist_rescan = true;
++	}
++
++	if (zonelist_rescan)
++		goto zonelist_scan;
++
++	return NULL;
+ }
+
+ /*
+@@ -2433,28 +2477,6 @@ __alloc_pages_high_priority(gfp_t gfp_ma
+ 	return page;
+ }
+
+-static void reset_alloc_batches(struct zonelist *zonelist,
+-				enum zone_type high_zoneidx,
+-				struct zone *preferred_zone)
+-{
+-	struct zoneref *z;
+-	struct zone *zone;
+-
+-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-		/*
+-		 * Only reset the batches of zones that were actually
+-		 * considered in the fairness pass, we don't want to
+-		 * trash fairness information for zones that are not
+-		 * actually part of this zonelist's round-robin cycle.
+-		 */
+-		if (!zone_local(preferred_zone, zone))
+-			continue;
+-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-			high_wmark_pages(zone) - low_wmark_pages(zone) -
+-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+-	}
+-}
+-
+ static void wake_all_kswapds(unsigned int order,
+ 			     struct zonelist *zonelist,
+ 			     enum zone_type high_zoneidx,
+@@ -2792,29 +2814,12 @@ retry_cpuset:
+ 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ 		alloc_flags |= ALLOC_CMA;
+ #endif
+-retry:
+ 	/* First allocation attempt */
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ 			zonelist, high_zoneidx, alloc_flags,
+ 			preferred_zone, classzone_idx, migratetype);
+ 	if (unlikely(!page)) {
+ 		/*
+-		 * The first pass makes sure allocations are spread
+-		 * fairly within the local node. However, the local
+-		 * node might have free pages left after the fairness
+-		 * batches are exhausted, and remote zones haven't
+-		 * even been considered yet. Try once more without
+-		 * fairness, and include remote zones now, before
+-		 * entering the slowpath and waking kswapd: prefer
+-		 * spilling to a remote zone over swapping locally.
+-		 */
+-		if (alloc_flags & ALLOC_FAIR) {
+-			reset_alloc_batches(zonelist, high_zoneidx,
+-					    preferred_zone);
+-			alloc_flags &= ~ALLOC_FAIR;
+-			goto retry;
+-		}
+-		/*
+ 		 * Runtime PM, block IO and its error handling path
+ 		 * can deadlock because I/O on the device might not
+ 		 * complete.
diff --git a/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch b/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
new file mode 100644
index 00000000000..6e6d5ee4a6c
--- /dev/null
+++ b/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
@@ -0,0 +1,37 @@
+From bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Wed, 6 Aug 2014 16:07:18 -0700
+Subject: mm: vmscan: only update per-cpu thresholds for online CPU
+
+From: Mel Gorman
+
+commit bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 upstream.
+
+When kswapd is awake reclaiming, the per-cpu stat thresholds are lowered
+to get more accurate counts to avoid breaching watermarks. This
+threshold update iterates over all possible CPUs which is unnecessary.
+Only online CPUs need to be updated. If a new CPU is onlined,
+refresh_zone_stat_thresholds() will set the thresholds correctly.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmstat.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_
+ 			continue;
+
+ 		threshold = (*calculate_pressure)(zone);
+-		for_each_possible_cpu(cpu)
++		for_each_online_cpu(cpu)
+ 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ 							= threshold;
+ 	}
diff --git a/queue-3.14/series b/queue-3.14/series
index f12aa9a328a..54104775ee3 100644
--- a/queue-3.14/series
+++ b/queue-3.14/series
@@ -69,3 +69,7 @@ vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch
 memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch
 mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch
 mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch
+mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
+mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
+mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
+mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch