3.0-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 24 Jul 2012 23:02:52 +0000 (16:02 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 24 Jul 2012 23:02:52 +0000 (16:02 -0700)
added patches:
mm-compaction-trivial-clean-up-in-acct_isolated.patch
vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
vmscan-shrinker-nr-updates-race-and-go-wrong.patch

queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch [new file with mode: 0644]
queue-3.0/series
queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch [new file with mode: 0644]
queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch [new file with mode: 0644]
queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch [new file with mode: 0644]
queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch [new file with mode: 0644]

diff --git a/queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch b/queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch
new file mode 100644 (file)
index 0000000..5d0a031
--- /dev/null
@@ -0,0 +1,73 @@
+From b9e84ac1536d35aee03b2601f19694949f0bd506 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan.kim@gmail.com>
+Date: Mon, 31 Oct 2011 17:06:44 -0700
+Subject: mm: compaction: trivial clean up in acct_isolated()
+
+From: Minchan Kim <minchan.kim@gmail.com>
+
+commit b9e84ac1536d35aee03b2601f19694949f0bd506 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch makes later patches
+       easier to apply but has no other impact.
+
+acct_isolated() in compaction uses page_lru_base_type(), which returns only
+the base type of an LRU list, so it never returns LRU_ACTIVE_ANON or
+LRU_ACTIVE_FILE.  In addition, cc->nr_[anon|file] is used only in
+acct_isolated(), so there is no need for those fields in compact_control.
+
+This patch removes the fields from compact_control and clarifies the job of
+acct_isolated(), which counts the number of anon|file pages isolated.
+
+Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |   18 +++++-------------
+ 1 file changed, 5 insertions(+), 13 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -35,10 +35,6 @@ struct compact_control {
+       unsigned long migrate_pfn;      /* isolate_migratepages search base */
+       bool sync;                      /* Synchronous migration */
+-      /* Account for isolated anon and file pages */
+-      unsigned long nr_anon;
+-      unsigned long nr_file;
+-
+       unsigned int order;             /* order a direct compactor needs */
+       int migratetype;                /* MOVABLE, RECLAIMABLE etc */
+       struct zone *zone;
+@@ -223,17 +219,13 @@ static void isolate_freepages(struct zon
+ static void acct_isolated(struct zone *zone, struct compact_control *cc)
+ {
+       struct page *page;
+-      unsigned int count[NR_LRU_LISTS] = { 0, };
++      unsigned int count[2] = { 0, };
+-      list_for_each_entry(page, &cc->migratepages, lru) {
+-              int lru = page_lru_base_type(page);
+-              count[lru]++;
+-      }
++      list_for_each_entry(page, &cc->migratepages, lru)
++              count[!!page_is_file_cache(page)]++;
+-      cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+-      cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+-      __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+-      __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
++      __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
++      __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ }
+ /* Similar to reclaim, but different enough that they don't share logic */
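
Note: the hunk above replaces the per-LRU counting in acct_isolated() with a
two-bucket array indexed by !!page_is_file_cache(page). The following is a
small userspace sketch of that counting pattern, not kernel code: struct
fake_page and is_file_cache() are stand-ins for struct page and
page_is_file_cache().

/*
 * Userspace sketch of the two-bucket counting the patch adopts; struct
 * fake_page and is_file_cache() are stand-ins for kernel-only types/helpers.
 */
#include <stdio.h>

struct fake_page {
        int file;                       /* non-zero if the page is file-backed */
};

static int is_file_cache(const struct fake_page *p)
{
        return p->file;                 /* like the kernel helper, any non-zero value means "file" */
}

int main(void)
{
        struct fake_page pages[] = { {0}, {4}, {1}, {0}, {2} };
        unsigned int count[2] = { 0, };

        /*
         * count[0] holds anon pages, count[1] holds file pages; !! folds any
         * non-zero return into index 1, so no per-LRU-list array is needed.
         */
        for (size_t i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
                count[!!is_file_cache(&pages[i])]++;

        printf("isolated: anon=%u file=%u\n", count[0], count[1]);
        return 0;
}
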
diff --git a/queue-3.0/series b/queue-3.0/series
index cf7cced32c7f03487d3123b19d7414ebe0839eb7..59fda2d916c3526b91a1cdbc16aaab9ad6b43b0e 100644 (file)
--- a/queue-3.0/series
@@ -10,3 +10,8 @@ mm-reduce-the-amount-of-work-done-when-updating-min_free_kbytes.patch
 mm-vmscan-fix-force-scanning-small-targets-without-swap.patch
 vmscan-clear-zone_congested-for-zone-with-good-watermark.patch
 vmscan-add-shrink_slab-tracepoints.patch
+vmscan-shrinker-nr-updates-race-and-go-wrong.patch
+vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
+vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
+vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
+mm-compaction-trivial-clean-up-in-acct_isolated.patch
diff --git a/queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch b/queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
new file mode 100644 (file)
index 0000000..3a88a55
--- /dev/null
@@ -0,0 +1,106 @@
+From e0c23279c9f800c403f37511484d9014ac83adec Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 31 Oct 2011 17:09:33 -0700
+Subject: vmscan: abort reclaim/compaction if compaction can proceed
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit e0c23279c9f800c403f37511484d9014ac83adec upstream.
+
+Stable note: Not tracked in Bugzilla. THP and compaction were found to
+       aggressively reclaim pages and stall systems under various
+       circumstances, which was addressed piecemeal over time.
+
+If compaction can proceed, shrink_zones() stops doing any work, but its
+callers still call shrink_slab(), which raises the priority and potentially
+sleeps.  This is unnecessary and wasteful, so this patch aborts direct
+reclaim/compaction entirely if compaction can proceed.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
+Acked-by: Johannes Weiner <jweiner@redhat.com>
+Cc: Josh Boyer <jwboyer@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2037,14 +2037,19 @@ restart:
+  *
+  * If a zone is deemed to be full of pinned pages then just give it a light
+  * scan then give up on it.
++ *
++ * This function returns true if a zone is being reclaimed for a costly
++ * high-order allocation and compaction is either ready to begin or deferred.
++ * This indicates to the caller that it should retry the allocation or fail.
+  */
+-static void shrink_zones(int priority, struct zonelist *zonelist,
++static bool shrink_zones(int priority, struct zonelist *zonelist,
+                                       struct scan_control *sc)
+ {
+       struct zoneref *z;
+       struct zone *zone;
+       unsigned long nr_soft_reclaimed;
+       unsigned long nr_soft_scanned;
++      bool should_abort_reclaim = false;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
+@@ -2061,19 +2066,20 @@ static void shrink_zones(int priority, s
+                               continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+-                               * If we already have plenty of memory
+-                               * free for compaction, don't free any
+-                               * more.  Even though compaction is
+-                               * invoked for any non-zero order,
+-                               * only frequent costly order
+-                               * reclamation is disruptive enough to
+-                               * become a noticable problem, like
+-                               * transparent huge page allocations.
++                               * If we already have plenty of memory free for
++                               * compaction in this zone, don't free any more.
++                               * Even though compaction is invoked for any
++                               * non-zero order, only frequent costly order
++                               * reclamation is disruptive enough to become a
++                               * noticable problem, like transparent huge page
++                               * allocations.
+                                */
+                               if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+                                       (compaction_suitable(zone, sc->order) ||
+-                                       compaction_deferred(zone)))
++                                       compaction_deferred(zone))) {
++                                      should_abort_reclaim = true;
+                                       continue;
++                              }
+                       }
+                       /*
+                        * This steals pages from memory cgroups over softlimit
+@@ -2092,6 +2098,8 @@ static void shrink_zones(int priority, s
+               shrink_zone(priority, zone, sc);
+       }
++
++      return should_abort_reclaim;
+ }
+ static bool zone_reclaimable(struct zone *zone)
+@@ -2156,7 +2164,9 @@ static unsigned long do_try_to_free_page
+               sc->nr_scanned = 0;
+               if (!priority)
+                       disable_swap_token(sc->mem_cgroup);
+-              shrink_zones(priority, zonelist, sc);
++              if (shrink_zones(priority, zonelist, sc))
++                      break;
++
+               /*
+                * Don't shrink slabs when reclaiming memory from
+                * over limit cgroups
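
Note: the patch above turns shrink_zones() into a function that reports
whether reclaim should be aborted because compaction can already proceed,
and do_try_to_free_pages() breaks out of its priority loop on that signal.
Below is a rough userspace model of that control flow, not kernel code:
scan_zones() and zone_ready_for_compaction() are illustrative stand-ins for
shrink_zones() and the compaction_suitable()/compaction_deferred() checks.

/*
 * Userspace model of the new control flow: the zone walk reports whether
 * compaction could already proceed and the priority loop bails out instead
 * of raising priority and shrinking slabs.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES        3
#define DEF_PRIORITY    12

/* stand-in for the compaction_suitable()/compaction_deferred() test */
static bool zone_ready_for_compaction(int zone)
{
        return zone == 1;       /* pretend zone 1 already has enough free memory */
}

/* returns true if reclaim/compaction should be aborted */
static bool scan_zones(int priority)
{
        bool should_abort_reclaim = false;

        for (int zone = 0; zone < NR_ZONES; zone++) {
                if (zone_ready_for_compaction(zone)) {
                        should_abort_reclaim = true;    /* skip it, remember why */
                        continue;
                }
                printf("priority %d: reclaiming zone %d\n", priority, zone);
        }
        return should_abort_reclaim;
}

int main(void)
{
        for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
                if (scan_zones(priority))
                        break;  /* compaction can proceed, stop reclaiming */
                /* shrink_slab() and writeback throttling would run here */
        }
        return 0;
}
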
diff --git a/queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch b/queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
new file mode 100644 (file)
index 0000000..5b10e6a
--- /dev/null
@@ -0,0 +1,73 @@
+From e0887c19b2daa140f20ca8104bdc5740f39dbb86 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Mon, 31 Oct 2011 17:09:31 -0700
+Subject: vmscan: limit direct reclaim for higher order allocations
+
+From: Rik van Riel <riel@redhat.com>
+
+commit e0887c19b2daa140f20ca8104bdc5740f39dbb86 upstream.
+
+Stable note: Not tracked in Bugzilla. THP and compaction were found to
+       aggressively reclaim pages and stall systems under various
+       circumstances, which was addressed piecemeal over time.  Paragraph
+       3 of this changelog is the motivation for this patch.
+
+When suffering from memory fragmentation due to unfreeable pages, THP page
+faults will repeatedly try to compact memory.  Due to the unfreeable
+pages, compaction fails.
+
+Needless to say, at that point page reclaim also fails to create free
+contiguous 2MB areas.  However, that doesn't stop the current code from
+trying, over and over again, and freeing a minimum of 4MB (2UL <<
+sc->order pages) at every single invocation.
+
+This resulted in my 12GB system having 2-3GB free memory, a corresponding
+amount of used swap and very sluggish response times.
+
+This can be avoided by having the direct reclaim code not reclaim from
+zones that already have plenty of free memory available for compaction.
+
+If compaction still fails due to unmovable memory, doing additional
+reclaim will only hurt the system, not help.
+
+[jweiner@redhat.com: change comment to explain the order check]
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Acked-by: Johannes Weiner <jweiner@redhat.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
+Signed-off-by: Johannes Weiner <jweiner@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2059,6 +2059,22 @@ static void shrink_zones(int priority, s
+                               continue;
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                               continue;       /* Let kswapd poll it */
++                      if (COMPACTION_BUILD) {
++                              /*
++                               * If we already have plenty of memory
++                               * free for compaction, don't free any
++                               * more.  Even though compaction is
++                               * invoked for any non-zero order,
++                               * only frequent costly order
++                               * reclamation is disruptive enough to
++                               * become a noticable problem, like
++                               * transparent huge page allocations.
++                               */
++                              if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
++                                      (compaction_suitable(zone, sc->order) ||
++                                       compaction_deferred(zone)))
++                                      continue;
++                      }
+                       /*
+                        * This steals pages from memory cgroups over softlimit
+                        * and returns the number of reclaimed pages and
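
Note: the check added above only gates costly allocations (order above
PAGE_ALLOC_COSTLY_ORDER, which is 3) and skips zones where compaction is
either suitable or deferred. Below is a minimal userspace sketch of that
decision, with the compaction predicates passed in as plain booleans rather
than the real compaction_suitable()/compaction_deferred() calls.

/*
 * Minimal model of the zone-skipping test added by this patch; "suitable"
 * and "deferred" stand in for the kernel compaction predicates.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* true if direct reclaim should leave the zone alone and let compaction run */
static bool skip_zone_for_compaction(int order, bool suitable, bool deferred)
{
        /*
         * Only costly orders are gated: low-order reclaim is not disruptive
         * enough to matter, as the comment in the hunk explains.
         */
        return order > PAGE_ALLOC_COSTLY_ORDER && (suitable || deferred);
}

int main(void)
{
        /* order-9 THP fault, zone already has enough free memory */
        printf("THP, suitable: skip=%d\n", skip_zone_for_compaction(9, true, false));
        /* order-9 THP fault, compaction recently failed and is deferred */
        printf("THP, deferred: skip=%d\n", skip_zone_for_compaction(9, false, true));
        /* order-1 allocation is never skipped, reclaim proceeds as before */
        printf("order-1:       skip=%d\n", skip_zone_for_compaction(1, true, false));
        return 0;
}
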
diff --git a/queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch b/queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
new file mode 100644 (file)
index 0000000..5bfe0f7
--- /dev/null
@@ -0,0 +1,90 @@
+From 3567b59aa80ac4417002bf58e35dce5c777d4164 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 8 Jul 2011 14:14:36 +1000
+Subject: vmscan: reduce wind up shrinker->nr when shrinker can't do work
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 3567b59aa80ac4417002bf58e35dce5c777d4164 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch reduces excessive
+       reclaim of slab objects, reducing the amount of information that
+       has to be brought back in from disk. The third and fourth paragraphs
+       of this changelog describe the impact.
+
+When a shrinker returns -1 to shrink_slab() to indicate it cannot do
+any work given the current memory reclaim requirements, it adds the
+entire total_scan count to shrinker->nr. The idea behind this is that
+when the shrinker is next called and can do work, it will do the work
+of the previously aborted shrinker call as well.
+
+However, if a filesystem is doing lots of allocation with GFP_NOFS
+set, then we get many, many more aborts from the shrinkers than we
+do successful calls. The result is that shrinker->nr winds up to
+its maximum permissible value (twice the current cache size) and
+then when the next shrinker call that can do work is issued, it
+has enough scan count built up to free the entire cache twice over.
+
+This manifests itself in the cache going from full to empty in a
+matter of seconds, even when only a small part of the cache is
+needed to be emptied to free sufficient memory.
+
+Under metadata intensive workloads on ext4 and XFS, I'm seeing the
+VFS caches increase memory consumption up to 75% of memory (no page
+cache pressure) over a period of 30-60s, and then the shrinker
+empties them down to zero in the space of 2-3s. This cycle repeats
+over and over again, with the shrinker completely trashing the inode
+and dentry caches every minute or so for as long as the workload continues.
+
+This behaviour was made obvious by the shrink_slab tracepoints added
+earlier in the series, and made worse by the patch that corrected
+the concurrent accounting of shrinker->nr.
+
+To avoid this problem, stop repeated small increments of the total
+scan value from winding shrinker->nr up to a value that can cause
+the entire cache to be freed. We still need to allow it to wind up,
+so use the delta as the "large scan" threshold check - if the delta
+is more than a quarter of the entire cache size, then it is a large
+scan and is allowed to cause lots of windup because we clearly
+need to free lots of memory.
+
+If it isn't a large scan then limit the total scan to half the size
+of the cache so that windup never increases to consume the whole
+cache. Reducing the total scan limit further does not allow enough
+wind-up to maintain the current levels of performance, whilst a
+higher threshold does not prevent the windup from freeing the entire
+cache under sustained workloads.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -277,6 +277,21 @@ unsigned long shrink_slab(struct shrink_
+               }
+               /*
++               * We need to avoid excessive windup on filesystem shrinkers
++               * due to large numbers of GFP_NOFS allocations causing the
++               * shrinkers to return -1 all the time. This results in a large
++               * nr being built up so when a shrink that can do some work
++               * comes along it empties the entire cache due to nr >>>
++               * max_pass.  This is bad for sustaining a working set in
++               * memory.
++               *
++               * Hence only allow the shrinker to scan the entire cache when
++               * a large delta change is calculated directly.
++               */
++              if (delta < max_pass / 4)
++                      total_scan = min(total_scan, max_pass / 2);
++
++              /*
+                * Avoid risking looping forever due to too large nr value:
+                * never try to free more than twice the estimate number of
+                * freeable entries.
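
Note: the changelog and hunk above describe the scan-count arithmetic in
shrink_slab(): delta is the work earned by this call, total_scan carries the
deferred shrinker->nr, and a small delta may no longer release the
accumulated windup. The standalone program below models that arithmetic with
made-up numbers to show the max_pass/2 cap kicking in; it is an
illustration, not the kernel code.

/*
 * Userspace model of the shrink_slab() scan arithmetic after this patch.
 * nr models the windup carried in shrinker->nr and max_pass the number of
 * objects in the cache; all values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long nr_pages_scanned = 128;   /* page reclaim work done */
        unsigned long lru_pages = 100000;       /* pages on the LRU lists */
        unsigned long seeks = 2;                /* shrinker->seeks */
        unsigned long max_pass = 50000;         /* objects in the cache */
        unsigned long nr = 90000;               /* accumulated windup */

        unsigned long delta = (4 * nr_pages_scanned) / seeks;
        delta = delta * max_pass / (lru_pages + 1);     /* proportional share */

        unsigned long total_scan = nr + delta;

        /* the new check: a small delta no longer releases the whole windup */
        if (delta < max_pass / 4 && total_scan > max_pass / 2)
                total_scan = max_pass / 2;

        /* pre-existing clamp: never try to free more than twice the cache */
        if (total_scan > max_pass * 2)
                total_scan = max_pass * 2;

        printf("delta=%lu, total_scan=%lu of a %lu object cache\n",
               delta, total_scan, max_pass);
        return 0;
}
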
diff --git a/queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch b/queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch
new file mode 100644 (file)
index 0000000..7ee038f
--- /dev/null
@@ -0,0 +1,114 @@
+From acf92b485cccf028177f46918e045c0c4e80ee10 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 8 Jul 2011 14:14:35 +1000
+Subject: vmscan: shrinker->nr updates race and go wrong
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit acf92b485cccf028177f46918e045c0c4e80ee10 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch reduces excessive
+       reclaim of slab objects reducing the amount of information
+       that has to be brought back in from disk.
+
+shrink_slab() allows shrinkers to be called in parallel, so the
+struct shrinker can be updated concurrently. It does not provide any
+exclusion for such updates, so we can get the shrinker->nr value
+increasing or decreasing incorrectly.
+
+As a result, when a shrinker repeatedly returns a value of -1 (e.g.
+a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire,
+sometimes updating with the scan count that wasn't used, sometimes
+losing it altogether. Worse is when a shrinker does work and that
+update is lost due to racy updates, which means the shrinker will do
+the work again!
+
+Fix this by making the total_scan calculations independent of
+shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to
+other updates via cmpxchg loops.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   45 ++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 32 insertions(+), 13 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_
+               unsigned long total_scan;
+               unsigned long max_pass;
+               int shrink_ret = 0;
++              long nr;
++              long new_nr;
++              /*
++               * copy the current shrinker scan count into a local variable
++               * and zero it so that other concurrent shrinker invocations
++               * don't also do this scanning work.
++               */
++              do {
++                      nr = shrinker->nr;
++              } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
++
++              total_scan = nr;
+               max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+               delta = (4 * nr_pages_scanned) / shrinker->seeks;
+               delta *= max_pass;
+               do_div(delta, lru_pages + 1);
+-              shrinker->nr += delta;
+-              if (shrinker->nr < 0) {
++              total_scan += delta;
++              if (total_scan < 0) {
+                       printk(KERN_ERR "shrink_slab: %pF negative objects to "
+                              "delete nr=%ld\n",
+-                             shrinker->shrink, shrinker->nr);
+-                      shrinker->nr = max_pass;
++                             shrinker->shrink, total_scan);
++                      total_scan = max_pass;
+               }
+               /*
+@@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_
+                * never try to free more than twice the estimate number of
+                * freeable entries.
+                */
+-              if (shrinker->nr > max_pass * 2)
+-                      shrinker->nr = max_pass * 2;
+-
+-              total_scan = shrinker->nr;
+-              shrinker->nr = 0;
++              if (total_scan > max_pass * 2)
++                      total_scan = max_pass * 2;
+-              trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
++              trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                       nr_pages_scanned, lru_pages,
+                                       max_pass, delta, total_scan);
+@@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_
+                       cond_resched();
+               }
+-              shrinker->nr += total_scan;
+-              trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
+-                                       shrinker->nr);
++              /*
++               * move the unused scan count back into the shrinker in a
++               * manner that handles concurrent updates. If we exhausted the
++               * scan, there is no need to do an update.
++               */
++              do {
++                      nr = shrinker->nr;
++                      new_nr = total_scan + nr;
++                      if (total_scan <= 0)
++                              break;
++              } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
++
++              trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+       }
+       up_read(&shrinker_rwsem);
+ out:
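
Note: the patch above makes the shrinker->nr handoff atomic: the deferred
count is taken with a cmpxchg() loop, worked on locally as total_scan, and
the unused remainder is added back with another cmpxchg() loop. Below is a
rough userspace model of that take/return pattern using C11 atomics in place
of the kernel's cmpxchg(); the values are invented for illustration.

/*
 * Userspace model of the cmpxchg()-based handoff of shrinker->nr: take the
 * deferred count atomically, work on a local total_scan, then add the
 * unused remainder back.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long shrinker_nr = 1000;         /* deferred scan count */

/* atomically read shrinker_nr and zero it, returning the value taken */
static long take_deferred(void)
{
        long nr = atomic_load(&shrinker_nr);

        /* a failed compare-exchange reloads nr, so simply retry */
        while (!atomic_compare_exchange_weak(&shrinker_nr, &nr, 0))
                ;
        return nr;
}

/* atomically give the unused remainder back for the next caller */
static void return_unused(long remainder)
{
        long nr;

        if (remainder <= 0)
                return;                 /* scan exhausted, nothing to return */

        nr = atomic_load(&shrinker_nr);
        while (!atomic_compare_exchange_weak(&shrinker_nr, &nr, nr + remainder))
                ;
}

int main(void)
{
        long total_scan = take_deferred() + 300;        /* carried work + new delta */
        long scanned = 800;                             /* objects actually shrunk */

        return_unused(total_scan - scanned);            /* 500 goes back for next time */
        printf("shrinker_nr is now %ld\n", atomic_load(&shrinker_nr));
        return 0;
}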