--- /dev/null
+From b9e84ac1536d35aee03b2601f19694949f0bd506 Mon Sep 17 00:00:00 2001
+From: Minchan Kim <minchan.kim@gmail.com>
+Date: Mon, 31 Oct 2011 17:06:44 -0700
+Subject: mm: compaction: trivial clean up in acct_isolated()
+
+From: Minchan Kim <minchan.kim@gmail.com>
+
+commit b9e84ac1536d35aee03b2601f19694949f0bd506 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch makes later patches
+ easier to apply but has no other impact.
+
+acct_isolated of compaction uses page_lru_base_type, which returns only
+the base type of the LRU list, so it never returns LRU_ACTIVE_ANON or
+LRU_ACTIVE_FILE. In addition, cc->nr_[anon|file] is used only in
+acct_isolated, so the fields are not needed in compact_control.
+
+This patch removes the fields from compact_control and makes the role of
+acct_isolated clear: it counts the number of anon|file pages isolated.
+
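+As a minimal user-space sketch (not kernel code), the two-bucket counting
+idiom the patch switches to looks like the snippet below; fake_page and
+is_file are illustrative stand-ins for struct page and
+page_is_file_cache(), which returns a non-zero flag that !! normalises to
+0 or 1 before it is used as an index:
+
+#include <stdio.h>
+
+struct fake_page { int is_file; };	/* stand-in for struct page */
+
+int main(void)
+{
+	/* the file flag may be any non-zero value, hence the !! */
+	struct fake_page pages[] = { {0}, {4}, {0}, {2} };
+	unsigned int count[2] = { 0, };
+	size_t i;
+
+	for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
+		count[!!pages[i].is_file]++;
+
+	printf("anon: %u, file: %u\n", count[0], count[1]);
+	return 0;
+}
+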
+Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c | 18 +++++-------------
+ 1 file changed, 5 insertions(+), 13 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -35,10 +35,6 @@ struct compact_control {
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
+
+- /* Account for isolated anon and file pages */
+- unsigned long nr_anon;
+- unsigned long nr_file;
+-
+ unsigned int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+@@ -223,17 +219,13 @@ static void isolate_freepages(struct zon
+ static void acct_isolated(struct zone *zone, struct compact_control *cc)
+ {
+ struct page *page;
+- unsigned int count[NR_LRU_LISTS] = { 0, };
++ unsigned int count[2] = { 0, };
+
+- list_for_each_entry(page, &cc->migratepages, lru) {
+- int lru = page_lru_base_type(page);
+- count[lru]++;
+- }
++ list_for_each_entry(page, &cc->migratepages, lru)
++ count[!!page_is_file_cache(page)]++;
+
+- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
+- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
++ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
++ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ }
+
+ /* Similar to reclaim, but different enough that they don't share logic */
mm-vmscan-fix-force-scanning-small-targets-without-swap.patch
vmscan-clear-zone_congested-for-zone-with-good-watermark.patch
vmscan-add-shrink_slab-tracepoints.patch
+vmscan-shrinker-nr-updates-race-and-go-wrong.patch
+vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
+vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
+vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
+mm-compaction-trivial-clean-up-in-acct_isolated.patch
--- /dev/null
+From e0c23279c9f800c403f37511484d9014ac83adec Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 31 Oct 2011 17:09:33 -0700
+Subject: vmscan: abort reclaim/compaction if compaction can proceed
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit e0c23279c9f800c403f37511484d9014ac83adec upstream.
+
+Stable note: Not tracked on Bugzilla. THP and compaction were found to
+ aggressively reclaim pages and stall systems under different
+ situations that were addressed piecemeal over time.
+
+If compaction can proceed, shrink_zones() stops doing any work but its
+callers still call shrink_slab() which raises the priority and potentially
+sleeps. This is unnecessary and wasteful so this patch aborts direct
+reclaim/compaction entirely if compaction can proceed.
+
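+Schematically, the new control flow looks like the sketch below; the
+function and the priority values are illustrative stand-ins for
+do_try_to_free_pages()/shrink_zones(), not the real kernel code:
+
+#include <stdbool.h>
+#include <stdio.h>
+
+/* pretend compaction becomes viable once priority has dropped far enough */
+static bool shrink_zones_sketch(int priority)
+{
+	return priority <= 10;
+}
+
+int main(void)
+{
+	int priority;
+
+	for (priority = 12; priority >= 0; priority--) {
+		if (shrink_zones_sketch(priority)) {
+			/* compaction is ready: stop raising the priority
+			 * and skip the shrink_slab()-style slab pass */
+			printf("priority %d: abort reclaim\n", priority);
+			break;
+		}
+		printf("priority %d: reclaim and shrink slabs\n", priority);
+	}
+	return 0;
+}
+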
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
+Acked-by: Johannes Weiner <jweiner@redhat.com>
+Cc: Josh Boyer <jwboyer@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2037,14 +2037,19 @@ restart:
+ *
+ * If a zone is deemed to be full of pinned pages then just give it a light
+ * scan then give up on it.
++ *
++ * This function returns true if a zone is being reclaimed for a costly
++ * high-order allocation and compaction is either ready to begin or deferred.
++ * This indicates to the caller that it should retry the allocation or fail.
+ */
+-static void shrink_zones(int priority, struct zonelist *zonelist,
++static bool shrink_zones(int priority, struct zonelist *zonelist,
+ struct scan_control *sc)
+ {
+ struct zoneref *z;
+ struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
++ bool should_abort_reclaim = false;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+@@ -2061,19 +2066,20 @@ static void shrink_zones(int priority, s
+ continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+- * If we already have plenty of memory
+- * free for compaction, don't free any
+- * more. Even though compaction is
+- * invoked for any non-zero order,
+- * only frequent costly order
+- * reclamation is disruptive enough to
+- * become a noticable problem, like
+- * transparent huge page allocations.
++ * If we already have plenty of memory free for
++ * compaction in this zone, don't free any more.
++ * Even though compaction is invoked for any
++ * non-zero order, only frequent costly order
++ * reclamation is disruptive enough to become a
++ * noticable problem, like transparent huge page
++ * allocations.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ (compaction_suitable(zone, sc->order) ||
+- compaction_deferred(zone)))
++ compaction_deferred(zone))) {
++ should_abort_reclaim = true;
+ continue;
++ }
+ }
+ /*
+ * This steals pages from memory cgroups over softlimit
+@@ -2092,6 +2098,8 @@ static void shrink_zones(int priority, s
+
+ shrink_zone(priority, zone, sc);
+ }
++
++ return should_abort_reclaim;
+ }
+
+ static bool zone_reclaimable(struct zone *zone)
+@@ -2156,7 +2164,9 @@ static unsigned long do_try_to_free_page
+ sc->nr_scanned = 0;
+ if (!priority)
+ disable_swap_token(sc->mem_cgroup);
+- shrink_zones(priority, zonelist, sc);
++ if (shrink_zones(priority, zonelist, sc))
++ break;
++
+ /*
+ * Don't shrink slabs when reclaiming memory from
+ * over limit cgroups
--- /dev/null
+From e0887c19b2daa140f20ca8104bdc5740f39dbb86 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Mon, 31 Oct 2011 17:09:31 -0700
+Subject: vmscan: limit direct reclaim for higher order allocations
+
+From: Rik van Riel <riel@redhat.com>
+
+commit e0887c19b2daa140f20ca8104bdc5740f39dbb86 upstream.
+
+Stable note: Not tracked on Bugzilla. THP and compaction were found to
+ aggressively reclaim pages and stall systems under different
+ situations that were addressed piecemeal over time. Paragraph
+ 3 of this changelog is the motivation for this patch.
+
+When suffering from memory fragmentation due to unfreeable pages, THP page
+faults will repeatedly try to compact memory. Due to the unfreeable
+pages, compaction fails.
+
+Needless to say, at that point page reclaim also fails to create free
+contiguous 2MB areas. However, that doesn't stop the current code from
+trying, over and over again, and freeing a minimum of 4MB (2UL <<
+sc->order pages) at every single invocation.
+
+This resulted in my 12GB system having 2-3GB free memory, a corresponding
+amount of used swap and very sluggish response times.
+
+This can be avoided by having the direct reclaim code not reclaim from
+zones that already have plenty of free memory available for compaction.
+
+If compaction still fails due to unmovable memory, doing additional
+reclaim will only hurt the system, not help.
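+
+The per-zone check can be sketched in isolation as below;
+PAGE_ALLOC_COSTLY_ORDER is the real kernel threshold (3), while the
+zone_sketch struct and its two booleans are illustrative stand-ins for
+compaction_suitable() and compaction_deferred():
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#define PAGE_ALLOC_COSTLY_ORDER 3
+
+struct zone_sketch {
+	const char *name;
+	bool compaction_suitable;	/* enough free memory to compact */
+	bool compaction_deferred;	/* compaction recently failed here */
+};
+
+/* skip reclaim for costly orders when compaction already has what it
+ * needs, or has been deferred, in this zone */
+static bool skip_zone_reclaim(const struct zone_sketch *z, int order)
+{
+	return order > PAGE_ALLOC_COSTLY_ORDER &&
+	       (z->compaction_suitable || z->compaction_deferred);
+}
+
+int main(void)
+{
+	struct zone_sketch zones[] = {
+		{ "Normal", true,  false },
+		{ "DMA32",  false, false },
+	};
+	int order = 9;	/* a THP-sized allocation */
+	size_t i;
+
+	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
+		printf("%s: %s\n", zones[i].name,
+		       skip_zone_reclaim(&zones[i], order) ?
+		       "skip reclaim" : "reclaim");
+	return 0;
+}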
+
+[jweiner@redhat.com: change comment to explain the order check]
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Acked-by: Johannes Weiner <jweiner@redhat.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
+Signed-off-by: Johannes Weiner <jweiner@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2059,6 +2059,22 @@ static void shrink_zones(int priority, s
+ continue;
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue; /* Let kswapd poll it */
++ if (COMPACTION_BUILD) {
++ /*
++ * If we already have plenty of memory
++ * free for compaction, don't free any
++ * more. Even though compaction is
++ * invoked for any non-zero order,
++ * only frequent costly order
++ * reclamation is disruptive enough to
++ * become a noticable problem, like
++ * transparent huge page allocations.
++ */
++ if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
++ (compaction_suitable(zone, sc->order) ||
++ compaction_deferred(zone)))
++ continue;
++ }
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
--- /dev/null
+From 3567b59aa80ac4417002bf58e35dce5c777d4164 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 8 Jul 2011 14:14:36 +1000
+Subject: vmscan: reduce wind up shrinker->nr when shrinker can't do work
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 3567b59aa80ac4417002bf58e35dce5c777d4164 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch reduces excessive
+ reclaim of slab objects, reducing the amount of information that
+ has to be brought back in from disk. The third and fourth
+ paragraphs of this changelog describe the impact.
+
+When a shrinker returns -1 to shrink_slab() to indicate it cannot do
+any work given the current memory reclaim requirements, it adds the
+entire total_scan count to shrinker->nr. The idea behind this is that
+when the shrinker is next called and can do work, it will do the work
+of the previously aborted shrinker call as well.
+
+However, if a filesystem is doing lots of allocation with GFP_NOFS
+set, then we get many, many more aborts from the shrinkers than we
+do successful calls. The result is that shrinker->nr winds up to
+its maximum permissible value (twice the current cache size) and
+then when the next shrinker call that can do work is issued, it
+has enough scan count built up to free the entire cache twice over.
+
+This manifests itself in the cache going from full to empty in a
+matter of seconds, even when only a small part of the cache is
+needed to be emptied to free sufficient memory.
+
+Under metadata intensive workloads on ext4 and XFS, I'm seeing the
+VFS caches increase memory consumption up to 75% of memory (no page
+cache pressure) over a period of 30-60s, and then the shrinker
+empties them down to zero in the space of 2-3s. This cycle repeats
+over and over again, with the shrinker completely trashing the inode
+and dentry caches every minute or so while the workload continues.
+
+This behaviour was made obvious by the shrink_slab tracepoints added
+earlier in the series, and made worse by the patch that corrected
+the concurrent accounting of shrinker->nr.
+
+To avoid this problem, stop repeated small increments of the total
+scan value from winding shrinker->nr up to a value that can cause
+the entire cache to be freed. We still need to allow it to wind up,
+so use the delta as the "large scan" threshold check - if the delta
+is more than a quarter of the entire cache size, then it is a large
+scan and allowed to cause lots of windup because we are clearly
+needing to free lots of memory.
+
+If it isn't a large scan then limit the total scan to half the size
+of the cache so that windup never increases to consume the whole
+cache. Reducing the total scan limit further does not allow enough
+wind-up to maintain the current levels of performance, whilst a
+higher threshold does not prevent the windup from freeing the entire
+cache under sustained workloads.
+
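+The clamp itself can be sketched standalone as below; max_pass plays the
+role of the cache size reported by the shrinker, and the numbers in main()
+are made up purely to show the effect of a small versus a large delta:
+
+#include <stdio.h>
+
+static unsigned long clamp_total_scan(unsigned long total_scan,
+				      unsigned long delta,
+				      unsigned long max_pass)
+{
+	/* small deltas may not wind the pending scan past half the cache */
+	if (delta < max_pass / 4 && total_scan > max_pass / 2)
+		total_scan = max_pass / 2;
+	/* pre-existing cap: never scan more than twice the cache */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+	return total_scan;
+}
+
+int main(void)
+{
+	unsigned long max_pass = 100000;
+
+	printf("small delta: %lu\n", clamp_total_scan(190000, 1000, max_pass));
+	printf("large delta: %lu\n", clamp_total_scan(190000, 40000, max_pass));
+	return 0;
+}
+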
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -277,6 +277,21 @@ unsigned long shrink_slab(struct shrink_
+ }
+
+ /*
++ * We need to avoid excessive windup on filesystem shrinkers
++ * due to large numbers of GFP_NOFS allocations causing the
++ * shrinkers to return -1 all the time. This results in a large
++ * nr being built up so when a shrink that can do some work
++ * comes along it empties the entire cache due to nr >>>
++ * max_pass. This is bad for sustaining a working set in
++ * memory.
++ *
++ * Hence only allow the shrinker to scan the entire cache when
++ * a large delta change is calculated directly.
++ */
++ if (delta < max_pass / 4)
++ total_scan = min(total_scan, max_pass / 2);
++
++ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
--- /dev/null
+From acf92b485cccf028177f46918e045c0c4e80ee10 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Fri, 8 Jul 2011 14:14:35 +1000
+Subject: vmscan: shrinker->nr updates race and go wrong
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit acf92b485cccf028177f46918e045c0c4e80ee10 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch reduces excessive
+ reclaim of slab objects, reducing the amount of information
+ that has to be brought back in from disk.
+
+shrink_slab() allows shrinkers to be called in parallel so the
+struct shrinker can be updated concurrently. It does not provide any
+exclusio for such updates, so we can get the shrinker->nr value
+increasing or decreasing incorrectly.
+
+As a result, when a shrinker repeatedly returns a value of -1 (e.g.
+a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire,
+sometimes updating with the scan count that wasn't used, sometimes
+losing it altogether. Worse is when a shrinker does work and that
+update is lost due to racy updates, which means the shrinker will do
+the work again!
+
+Fix this by making the total_scan calculations independent of
+shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to
+other updates via cmpxchg loops.
+
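+The claim-and-return pattern can be sketched in user space as below;
+"pending" stands in for shrinker->nr and GCC's
+__sync_val_compare_and_swap() stands in for the kernel's cmpxchg():
+
+#include <stdio.h>
+
+static long pending = 1000;	/* deferred scan count shared by racers */
+
+/* atomically take the whole pending count for this caller */
+static long claim_all(void)
+{
+	long nr;
+
+	do {
+		nr = pending;
+	} while (__sync_val_compare_and_swap(&pending, nr, 0) != nr);
+	return nr;
+}
+
+/* add any unused remainder back, tolerating concurrent updates */
+static void give_back(long unused)
+{
+	long nr, new_nr;
+
+	if (unused <= 0)
+		return;
+	do {
+		nr = pending;
+		new_nr = nr + unused;
+	} while (__sync_val_compare_and_swap(&pending, nr, new_nr) != nr);
+}
+
+int main(void)
+{
+	long nr = claim_all();		/* local copy; shared count is now 0 */
+
+	give_back(nr - 600);		/* pretend 600 objects were scanned */
+	printf("pending after partial scan: %ld\n", pending);
+	return 0;
+}
+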
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 45 ++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 32 insertions(+), 13 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_
+ unsigned long total_scan;
+ unsigned long max_pass;
+ int shrink_ret = 0;
++ long nr;
++ long new_nr;
+
++ /*
++ * copy the current shrinker scan count into a local variable
++ * and zero it so that other concurrent shrinker invocations
++ * don't also do this scanning work.
++ */
++ do {
++ nr = shrinker->nr;
++ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
++
++ total_scan = nr;
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1);
+- shrinker->nr += delta;
+- if (shrinker->nr < 0) {
++ total_scan += delta;
++ if (total_scan < 0) {
+ printk(KERN_ERR "shrink_slab: %pF negative objects to "
+ "delete nr=%ld\n",
+- shrinker->shrink, shrinker->nr);
+- shrinker->nr = max_pass;
++ shrinker->shrink, total_scan);
++ total_scan = max_pass;
+ }
+
+ /*
+@@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+- if (shrinker->nr > max_pass * 2)
+- shrinker->nr = max_pass * 2;
+-
+- total_scan = shrinker->nr;
+- shrinker->nr = 0;
++ if (total_scan > max_pass * 2)
++ total_scan = max_pass * 2;
+
+- trace_mm_shrink_slab_start(shrinker, shrink, total_scan,
++ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
+
+@@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_
+ cond_resched();
+ }
+
+- shrinker->nr += total_scan;
+- trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan,
+- shrinker->nr);
++ /*
++ * move the unused scan count back into the shrinker in a
++ * manner that handles concurrent updates. If we exhausted the
++ * scan, there is no need to do an update.
++ */
++ do {
++ nr = shrinker->nr;
++ new_nr = total_scan + nr;
++ if (total_scan <= 0)
++ break;
++ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
++
++ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+ }
+ up_read(&shrinker_rwsem);
+ out: