3.14-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 28 Jan 2015 01:13:19 +0000 (17:13 -0800)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 28 Jan 2015 01:13:19 +0000 (17:13 -0800)
added patches:
mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch

queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch [new file with mode: 0644]
queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch [new file with mode: 0644]
queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch [new file with mode: 0644]
queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch [new file with mode: 0644]
queue-3.14/series

diff --git a/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch b/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
new file mode 100644 (file)
index 0000000..91e463a
--- /dev/null
@@ -0,0 +1,147 @@
+From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 6 Aug 2014 16:07:16 -0700
+Subject: mm: move zone->pages_scanned into a vmstat counter
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd upstream.
+
+zone->pages_scanned is a write-intensive cache line during page reclaim
+and it's also updated during page free.  Move the counter into vmstat to
+take advantage of the per-cpu updates and do not update it in the free
+paths unless necessary.
+
+On a small UMA machine running tiobench the difference is marginal.  On
+a 4-node machine the overhead is more noticeable.  Note that automatic
+NUMA balancing was disabled for this test as otherwise the system CPU
+overhead is unpredictable.
+
+           3.16.0-rc3   3.16.0-rc3   3.16.0-rc3
+              vanilla rearrange-v5    vmstat-v5
+User           746.94       759.78       774.56
+System       65336.22     58350.98     32847.27
+Elapsed      27553.52     27282.02     27415.04
+
+Note that the overhead reduction will vary depending on where exactly
+pages are allocated and freed.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mmzone.h |    2 +-
+ mm/page_alloc.c        |   12 +++++++++---
+ mm/vmscan.c            |    7 ++++---
+ mm/vmstat.c            |    3 ++-
+ 4 files changed, 16 insertions(+), 8 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -143,6 +143,7 @@ enum zone_stat_item {
+       NR_SHMEM,               /* shmem pages (included tmpfs/GEM pages) */
+       NR_DIRTIED,             /* page dirtyings since bootup */
+       NR_WRITTEN,             /* page writings since bootup */
++      NR_PAGES_SCANNED,       /* pages scanned since last reclaim */
+ #ifdef CONFIG_NUMA
+       NUMA_HIT,               /* allocated in intended node */
+       NUMA_MISS,              /* allocated in non intended node */
+@@ -478,7 +479,6 @@ struct zone {
+       /* Fields commonly accessed by the page reclaim scanner */
+       spinlock_t              lru_lock;
+-      unsigned long           pages_scanned;     /* since last reclaim */
+       struct lruvec           lruvec;
+       /*
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -678,9 +678,12 @@ static void free_pcppages_bulk(struct zo
+       int migratetype = 0;
+       int batch_free = 0;
+       int to_free = count;
++      unsigned long nr_scanned;
+       spin_lock(&zone->lock);
+-      zone->pages_scanned = 0;
++      nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++      if (nr_scanned)
++              __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+       while (to_free) {
+               struct page *page;
+@@ -729,8 +732,11 @@ static void free_one_page(struct zone *z
+                               unsigned int order,
+                               int migratetype)
+ {
++      unsigned long nr_scanned;
+       spin_lock(&zone->lock);
+-      zone->pages_scanned = 0;
++      nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++      if (nr_scanned)
++              __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+       __free_one_page(page, pfn, zone, order, migratetype);
+       if (unlikely(!is_migrate_isolate(migratetype)))
+@@ -3251,7 +3257,7 @@ void show_free_areas(unsigned int filter
+                       K(zone_page_state(zone, NR_BOUNCE)),
+                       K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+                       K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+-                      zone->pages_scanned,
++                      K(zone_page_state(zone, NR_PAGES_SCANNED)),
+                       (!zone_reclaimable(zone) ? "yes" : "no")
+                       );
+               printk("lowmem_reserve[]:");
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pa
+ bool zone_reclaimable(struct zone *zone)
+ {
+-      return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
++      return zone_page_state(zone, NR_PAGES_SCANNED) <
++              zone_reclaimable_pages(zone) * 6;
+ }
+ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+@@ -1470,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to
+       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       if (global_reclaim(sc)) {
+-              zone->pages_scanned += nr_scanned;
++              __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+               if (current_is_kswapd())
+                       __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+               else
+@@ -1659,7 +1660,7 @@ static void shrink_active_list(unsigned
+       nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+                                    &nr_scanned, sc, isolate_mode, lru);
+       if (global_reclaim(sc))
+-              zone->pages_scanned += nr_scanned;
++              __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+       reclaim_stat->recent_scanned[file] += nr_taken;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
+       "nr_shmem",
+       "nr_dirtied",
+       "nr_written",
++      "nr_pages_scanned",
+ #ifdef CONFIG_NUMA
+       "numa_hit",
+@@ -1055,7 +1056,7 @@ static void zoneinfo_show_print(struct s
+                  min_wmark_pages(zone),
+                  low_wmark_pages(zone),
+                  high_wmark_pages(zone),
+-                 zone->pages_scanned,
++                 zone_page_state(zone, NR_PAGES_SCANNED),
+                  zone->spanned_pages,
+                  zone->present_pages,
+                  zone->managed_pages);
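
A rough userspace analogy for the counting scheme this patch switches to: each thread keeps a small private delta and folds it into the shared counter only when a threshold is reached, so the contended cache line is touched far less often. This is a sketch of the idea only; the names used below are invented for the example, and the real kernel machinery is __mod_zone_page_state() together with the per-cpu stat_threshold handling.

/*
 * Minimal userspace sketch of the per-cpu counting idea, assuming C11
 * atomics and POSIX threads.  The names (local_counter, count_scanned,
 * THRESHOLD, global_pages_scanned) are illustrative stand-ins, not
 * kernel API.  Build with: cc -std=c11 -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_THREADS        4
#define THRESHOLD         32      /* fold into the shared counter in batches */
#define EVENTS_PER_THREAD 100000

static atomic_long global_pages_scanned;      /* the shared, contended counter */

struct local_counter {
	long delta;                           /* per-thread pending updates */
};

/* Record one "page scanned" event; the shared cache line is only
 * touched once every THRESHOLD events. */
static void count_scanned(struct local_counter *lc)
{
	if (++lc->delta >= THRESHOLD) {
		atomic_fetch_add(&global_pages_scanned, lc->delta);
		lc->delta = 0;
	}
}

static void *scanner(void *arg)
{
	struct local_counter lc = { 0 };

	(void)arg;
	for (int i = 0; i < EVENTS_PER_THREAD; i++)
		count_scanned(&lc);
	/* flush the remainder, as the kernel does when per-cpu stats are folded */
	atomic_fetch_add(&global_pages_scanned, lc.delta);
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_THREADS];

	for (int i = 0; i < NR_THREADS; i++)
		pthread_create(&tid[i], NULL, scanner, NULL);
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);

	/* Exact because every thread flushed its remainder above. */
	printf("pages scanned: %ld\n", atomic_load(&global_pages_scanned));
	return 0;
}

The printed total is exact because each thread flushes its remainder at the end; the kernel counter is folded the same way, which is why a read of the vmstat counter can lag behind by at most the per-cpu thresholds.
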
diff --git a/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch b/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
new file mode 100644 (file)
index 0000000..8a33610
--- /dev/null
@@ -0,0 +1,40 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:23 2014
+From: Mel Gorman <mgorman@suse.de>
+Date: Thu, 28 Aug 2014 19:35:44 +0100
+Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit f7b5d647946aae1647bf5cd26c16b3a793c1ac49 upstream.
+
+The purpose of numa_zonelist_order=zone is to preserve lower zones for
+use with 32-bit devices.  If locality is preferred then the
+numa_zonelist_order=node policy should be used.
+
+Unfortunately, the fair zone allocation policy overrides this by
+skipping zones on remote nodes until the lower one is found.  While this
+makes sense from a page aging and performance perspective, it breaks the
+expected zonelist policy.  This patch restores the expected behaviour
+for zonelist ordering.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/page_alloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1980,7 +1980,7 @@ zonelist_scan:
+                */
+               if (alloc_flags & ALLOC_FAIR) {
+                       if (!zone_local(preferred_zone, zone))
+-                              continue;
++                              break;
+                       if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
+                               continue;
+               }
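
The change above, continue becoming break, is easier to see in isolation. The following sketch, plain C with simplified zone/zonelist stand-ins rather than kernel code, models only the ALLOC_FAIR pass: with break, the walk stops at the first zone on a remote node and the caller retries without fairness, so a zone-ordered zonelist still falls back to its lower local zones in the intended order; with the old continue, the fair pass skipped ahead and could consume those lower zones prematurely.

/*
 * Standalone illustration (plain C, not kernel code) of the
 * continue -> break change.  struct zone and the zonelist are
 * simplified stand-ins; only the ALLOC_FAIR control flow is modelled.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct zone {
	const char *name;
	int node;       /* NUMA node the zone belongs to */
	long batch;     /* remaining fair-allocation batch */
	long free;      /* free pages */
};

static bool zone_local(const struct zone *preferred, const struct zone *z)
{
	return preferred->node == z->node;
}

/* Fair pass: round-robin over local zones only, and give up at the
 * first remote zone so the caller can retry without ALLOC_FAIR. */
static struct zone *fair_pass(struct zone *zonelist, size_t n,
			      struct zone *preferred)
{
	for (size_t i = 0; i < n; i++) {
		struct zone *z = &zonelist[i];

		if (!zone_local(preferred, z))
			break;          /* was "continue" before this fix */
		if (z->batch <= 0)
			continue;       /* batch depleted, try the next local zone */
		if (z->free > 0) {
			z->batch--;
			z->free--;
			return z;
		}
	}
	return NULL;    /* caller retries without fairness, preserving zone order */
}

int main(void)
{
	/* A zone-ordered zonelist: Normal zones of every node first,
	 * then the lower DMA32 zone that should be preserved. */
	struct zone zones[] = {
		{ "node0/Normal", 0,  0, 100 },  /* local, batch depleted */
		{ "node1/Normal", 1, 50, 100 },  /* remote */
		{ "node0/DMA32",  0, 10, 100 },  /* local, lower zone */
	};
	struct zone *z = fair_pass(zones, 3, &zones[0]);

	printf("fair pass got: %s\n",
	       z ? z->name : "nothing (retry without ALLOC_FAIR)");
	return 0;
}

With the sample zonelist this prints "nothing (retry without ALLOC_FAIR)": the fair pass gives up at node1/Normal instead of dipping into node0/DMA32.
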
diff --git a/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch b/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
new file mode 100644 (file)
index 0000000..4bf0827
--- /dev/null
@@ -0,0 +1,237 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:22 2014
+From: Mel Gorman <mgorman@suse.de>
+Date: Thu, 28 Aug 2014 19:35:45 +0100
+Subject: mm: page_alloc: reduce cost of the fair zone allocation policy
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 upstream.
+
+The fair zone allocation policy round-robins allocations between zones
+within a node to avoid age inversion problems during reclaim.  If the
+first allocation fails, the batch counts are reset and a second attempt
+made before entering the slow path.
+
+One assumption made with this scheme is that batches expire at roughly
+the same time and the resets each time are justified.  This assumption
+does not hold when zones reach their low watermark as the batches will
+be consumed at uneven rates.  Allocation failure due to watermark
+depletion results in additional zonelist scans for the reset and another
+watermark check before hitting the slowpath.
+
+On UMA, the benefit is negligible -- around 0.25%.  On a 4-socket NUMA
+machine it's variable due to the variability of measuring overhead with
+the vmstat changes.  The system CPU overhead comparison looks like
+
+          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
+             vanilla   vmstat-v5 lowercost-v5
+User          746.94      774.56      802.00
+System      65336.22    32847.27    40852.33
+Elapsed     27553.52    27415.04    27368.46
+
+However, it is worth noting that the overall benchmark still completed
+faster and intuitively it makes sense to take as few passes as possible
+through the zonelists.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mmzone.h |    6 ++
+ mm/page_alloc.c        |  101 +++++++++++++++++++++++++------------------------
+ 2 files changed, 59 insertions(+), 48 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -529,6 +529,7 @@ typedef enum {
+       ZONE_WRITEBACK,                 /* reclaim scanning has recently found
+                                        * many pages under writeback
+                                        */
++      ZONE_FAIR_DEPLETED,             /* fair zone policy batch depleted */
+ } zone_flags_t;
+ static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
+@@ -566,6 +567,11 @@ static inline int zone_is_reclaim_locked
+       return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ }
++static inline int zone_is_fair_depleted(const struct zone *zone)
++{
++      return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
++}
++
+ static inline int zone_is_oom_locked(const struct zone *zone)
+ {
+       return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1614,6 +1614,9 @@ again:
+       }
+       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++      if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
++          !zone_is_fair_depleted(zone))
++              zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+       __count_zone_vm_events(PGALLOC, zone, 1 << order);
+       zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1938,6 +1941,18 @@ static inline void init_zone_allows_recl
+ }
+ #endif        /* CONFIG_NUMA */
++static void reset_alloc_batches(struct zone *preferred_zone)
++{
++      struct zone *zone = preferred_zone->zone_pgdat->node_zones;
++
++      do {
++              mod_zone_page_state(zone, NR_ALLOC_BATCH,
++                      high_wmark_pages(zone) - low_wmark_pages(zone) -
++                      atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
++              zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
++      } while (zone++ != preferred_zone);
++}
++
+ /*
+  * get_page_from_freelist goes through the zonelist trying to allocate
+  * a page.
+@@ -1955,8 +1970,12 @@ get_page_from_freelist(gfp_t gfp_mask, n
+       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
+       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+                               (gfp_mask & __GFP_WRITE);
++      int nr_fair_skipped = 0;
++      bool zonelist_rescan;
+ zonelist_scan:
++      zonelist_rescan = false;
++
+       /*
+        * Scan zonelist, looking for a zone with enough free.
+        * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+@@ -1981,8 +2000,10 @@ zonelist_scan:
+               if (alloc_flags & ALLOC_FAIR) {
+                       if (!zone_local(preferred_zone, zone))
+                               break;
+-                      if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
++                      if (zone_is_fair_depleted(zone)) {
++                              nr_fair_skipped++;
+                               continue;
++                      }
+               }
+               /*
+                * When allocating a page cache page for writing, we
+@@ -2088,13 +2109,7 @@ this_zone_full:
+                       zlc_mark_zone_full(zonelist, z);
+       }
+-      if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+-              /* Disable zlc cache for second zonelist scan */
+-              zlc_active = 0;
+-              goto zonelist_scan;
+-      }
+-
+-      if (page)
++      if (page) {
+               /*
+                * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+                * necessary to allocate the page. The expectation is
+@@ -2103,8 +2118,37 @@ this_zone_full:
+                * for !PFMEMALLOC purposes.
+                */
+               page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++              return page;
++      }
+-      return page;
++      /*
++       * The first pass makes sure allocations are spread fairly within the
++       * local node.  However, the local node might have free pages left
++       * after the fairness batches are exhausted, and remote zones haven't
++       * even been considered yet.  Try once more without fairness, and
++       * include remote zones now, before entering the slowpath and waking
++       * kswapd: prefer spilling to a remote zone over swapping locally.
++       */
++      if (alloc_flags & ALLOC_FAIR) {
++              alloc_flags &= ~ALLOC_FAIR;
++              if (nr_fair_skipped) {
++                      zonelist_rescan = true;
++                      reset_alloc_batches(preferred_zone);
++              }
++              if (nr_online_nodes > 1)
++                      zonelist_rescan = true;
++      }
++
++      if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
++              /* Disable zlc cache for second zonelist scan */
++              zlc_active = 0;
++              zonelist_rescan = true;
++      }
++
++      if (zonelist_rescan)
++              goto zonelist_scan;
++
++      return NULL;
+ }
+ /*
+@@ -2433,28 +2477,6 @@ __alloc_pages_high_priority(gfp_t gfp_ma
+       return page;
+ }
+-static void reset_alloc_batches(struct zonelist *zonelist,
+-                              enum zone_type high_zoneidx,
+-                              struct zone *preferred_zone)
+-{
+-      struct zoneref *z;
+-      struct zone *zone;
+-
+-      for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-              /*
+-               * Only reset the batches of zones that were actually
+-               * considered in the fairness pass, we don't want to
+-               * trash fairness information for zones that are not
+-               * actually part of this zonelist's round-robin cycle.
+-               */
+-              if (!zone_local(preferred_zone, zone))
+-                      continue;
+-              mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-                      high_wmark_pages(zone) - low_wmark_pages(zone) -
+-                      atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+-      }
+-}
+-
+ static void wake_all_kswapds(unsigned int order,
+                            struct zonelist *zonelist,
+                            enum zone_type high_zoneidx,
+@@ -2792,29 +2814,12 @@ retry_cpuset:
+       if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+               alloc_flags |= ALLOC_CMA;
+ #endif
+-retry:
+       /* First allocation attempt */
+       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+                       zonelist, high_zoneidx, alloc_flags,
+                       preferred_zone, classzone_idx, migratetype);
+       if (unlikely(!page)) {
+               /*
+-               * The first pass makes sure allocations are spread
+-               * fairly within the local node.  However, the local
+-               * node might have free pages left after the fairness
+-               * batches are exhausted, and remote zones haven't
+-               * even been considered yet.  Try once more without
+-               * fairness, and include remote zones now, before
+-               * entering the slowpath and waking kswapd: prefer
+-               * spilling to a remote zone over swapping locally.
+-               */
+-              if (alloc_flags & ALLOC_FAIR) {
+-                      reset_alloc_batches(zonelist, high_zoneidx,
+-                                          preferred_zone);
+-                      alloc_flags &= ~ALLOC_FAIR;
+-                      goto retry;
+-              }
+-              /*
+                * Runtime PM, block IO and its error handling path
+                * can deadlock because I/O on the device might not
+                * complete.
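
Condensed into a standalone model, the restructured fast path above does roughly this: the fair pass skips zones already flagged as depleted (a cheap bit test rather than re-reading NR_ALLOC_BATCH), counts how many it skipped, and only pays for a batch reset and a second walk of the same zonelist when that count says it is worthwhile. The sketch below is a simplified userspace approximation, not the kernel implementation; the refill value of 16 and the two-zone setup are arbitrary.

/*
 * Simplified userspace model of the restructured fast path: a cheap
 * "fair_depleted" flag, an nr_fair_skipped count, and a single
 * in-place rescan decision.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct zone {
	const char *name;
	long batch;          /* remaining fair-allocation batch */
	long free;
	bool fair_depleted;  /* cheap flag instead of re-reading the counter */
};

static struct zone *alloc_page(struct zone *zones, size_t n)
{
	bool fair = true;

	for (;;) {
		int nr_fair_skipped = 0;
		bool rescan = false;

		for (size_t i = 0; i < n; i++) {
			struct zone *z = &zones[i];

			if (fair && z->fair_depleted) {
				nr_fair_skipped++;
				continue;
			}
			if (z->free > 0) {
				z->free--;
				if (fair && --z->batch <= 0)
					z->fair_depleted = true;
				return z;
			}
		}

		/* The pass failed.  Drop fairness, and only pay for a batch
		 * reset plus rescan if the fair pass actually skipped zones. */
		if (fair) {
			fair = false;
			if (nr_fair_skipped) {
				for (size_t i = 0; i < n; i++) {
					zones[i].batch = 16;   /* arbitrary refill */
					zones[i].fair_depleted = false;
				}
				rescan = true;
			}
		}
		if (!rescan)
			return NULL;    /* the kernel would enter the slow path here */
	}
}

int main(void)
{
	struct zone zones[] = {
		{ "Normal", 1, 2, false },
		{ "DMA32",  1, 2, false },
	};

	for (int i = 0; i < 5; i++) {
		struct zone *z = alloc_page(zones, 2);
		printf("allocation %d -> %s\n", i, z ? z->name : "slow path");
	}
	return 0;
}

The kernel version also rescans when more than one node is online and when the zlc cache was active during the first pass; both checks are omitted here for brevity.
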
diff --git a/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch b/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
new file mode 100644 (file)
index 0000000..6e6d5ee
--- /dev/null
@@ -0,0 +1,37 @@
+From bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 6 Aug 2014 16:07:18 -0700
+Subject: mm: vmscan: only update per-cpu thresholds for online CPU
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 upstream.
+
+When kswapd is awake reclaiming, the per-cpu stat thresholds are lowered
+to get more accurate counts to avoid breaching watermarks.  This
+threshold update iterates over all possible CPUs which is unnecessary.
+Only online CPUs need to be updated.  If a new CPU is onlined,
+refresh_zone_stat_thresholds() will set the thresholds correctly.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmstat.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_
+                       continue;
+               threshold = (*calculate_pressure)(zone);
+-              for_each_possible_cpu(cpu)
++              for_each_online_cpu(cpu)
+                       per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+                                                       = threshold;
+       }
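
The fix itself is a one-iterator change, but the intent is easy to model: only CPUs currently online receive the tightened threshold, and a CPU that comes online later picks one up at that point (in the kernel, via refresh_zone_stat_thresholds()). The toy program below illustrates that split; its structures and helpers are invented for the example and are not kernel interfaces.

/*
 * Toy illustration (not kernel code) of the online-vs-possible CPU
 * distinction when pushing out per-cpu stat thresholds.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_POSSIBLE_CPUS 8

struct pcpu_stats {
	bool online;
	int  stat_threshold;
};

static struct pcpu_stats pcpu[NR_POSSIBLE_CPUS];

/* The fix: push the pressure threshold only to CPUs that are online. */
static void set_percpu_threshold(int threshold)
{
	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++) {
		if (!pcpu[cpu].online)
			continue;       /* previously updated unconditionally */
		pcpu[cpu].stat_threshold = threshold;
	}
}

/* Analog of CPU hotplug: a CPU gets a fresh threshold when it arrives. */
static void cpu_online(int cpu, int threshold)
{
	pcpu[cpu].online = true;
	pcpu[cpu].stat_threshold = threshold;
}

int main(void)
{
	cpu_online(0, 125);
	cpu_online(1, 125);

	set_percpu_threshold(2);        /* kswapd awake: tighten thresholds */
	cpu_online(2, 2);               /* a late hotplug still gets a value */

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		printf("cpu%d: online=%d threshold=%d\n",
		       cpu, pcpu[cpu].online, pcpu[cpu].stat_threshold);
	return 0;
}
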
diff --git a/queue-3.14/series b/queue-3.14/series
index f12aa9a328a7a5f250e27f941d49dcbe15e13ce2..54104775ee3e31abcc31e21bcf556dcc7d24d6ed 100644 (file)
--- a/queue-3.14/series
@@ -69,3 +69,7 @@ vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch
 memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch
 mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch
 mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch
+mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
+mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
+mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
+mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch