From ec792532673a7c0e2961520bc093722e211d333c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 24 Jul 2012 16:02:52 -0700
Subject: [PATCH] 3.0-stable patches

added patches:
    mm-compaction-trivial-clean-up-in-acct_isolated.patch
    vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
    vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
    vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
    vmscan-shrinker-nr-updates-race-and-go-wrong.patch
---
 ...on-trivial-clean-up-in-acct_isolated.patch |  73 +++++++++++
 queue-3.0/series                              |   5 +
 ...compaction-if-compaction-can-proceed.patch | 106 ++++++++++++++++
 ...reclaim-for-higher-order-allocations.patch |  73 +++++++++++
 ...inker-nr-when-shrinker-can-t-do-work.patch |  90 ++++++++++++++
 ...hrinker-nr-updates-race-and-go-wrong.patch | 114 ++++++++++++++++++
 6 files changed, 461 insertions(+)
 create mode 100644 queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch
 create mode 100644 queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch
 create mode 100644 queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch
 create mode 100644 queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch
 create mode 100644 queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch

diff --git a/queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch b/queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch
new file mode 100644
index 00000000000..5d0a031c6b0
--- /dev/null
+++ b/queue-3.0/mm-compaction-trivial-clean-up-in-acct_isolated.patch
@@ -0,0 +1,73 @@
+From b9e84ac1536d35aee03b2601f19694949f0bd506 Mon Sep 17 00:00:00 2001
+From: Minchan Kim
+Date: Mon, 31 Oct 2011 17:06:44 -0700
+Subject: mm: compaction: trivial clean up in acct_isolated()
+
+From: Minchan Kim
+
+commit b9e84ac1536d35aee03b2601f19694949f0bd506 upstream.
+
+Stable note: Not tracked in Bugzilla. This patch makes later patches
+ easier to apply but has no other impact.
+
+acct_isolated() in compaction uses page_lru_base_type(), which returns only
+the base type of an LRU list, so it never returns LRU_ACTIVE_ANON or
+LRU_ACTIVE_FILE. In addition, cc->nr_[anon|file] is used only in
+acct_isolated(), so the fields do not need to live in compact_control.
+
+This patch removes those fields from compact_control and clarifies the job of
+acct_isolated(), which counts the number of anon|file pages isolated.
+ +Signed-off-by: Minchan Kim +Acked-by: Johannes Weiner +Reviewed-by: KAMEZAWA Hiroyuki +Reviewed-by: KOSAKI Motohiro +Acked-by: Mel Gorman +Acked-by: Rik van Riel +Reviewed-by: Michal Hocko +Cc: Andrea Arcangeli +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 18 +++++------------- + 1 file changed, 5 insertions(+), 13 deletions(-) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -35,10 +35,6 @@ struct compact_control { + unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ + +- /* Account for isolated anon and file pages */ +- unsigned long nr_anon; +- unsigned long nr_file; +- + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +@@ -223,17 +219,13 @@ static void isolate_freepages(struct zon + static void acct_isolated(struct zone *zone, struct compact_control *cc) + { + struct page *page; +- unsigned int count[NR_LRU_LISTS] = { 0, }; ++ unsigned int count[2] = { 0, }; + +- list_for_each_entry(page, &cc->migratepages, lru) { +- int lru = page_lru_base_type(page); +- count[lru]++; +- } ++ list_for_each_entry(page, &cc->migratepages, lru) ++ count[!!page_is_file_cache(page)]++; + +- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; +- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; +- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); +- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); ++ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); ++ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); + } + + /* Similar to reclaim, but different enough that they don't share logic */ diff --git a/queue-3.0/series b/queue-3.0/series index cf7cced32c7..59fda2d916c 100644 --- a/queue-3.0/series +++ b/queue-3.0/series @@ -10,3 +10,8 @@ mm-reduce-the-amount-of-work-done-when-updating-min_free_kbytes.patch mm-vmscan-fix-force-scanning-small-targets-without-swap.patch vmscan-clear-zone_congested-for-zone-with-good-watermark.patch vmscan-add-shrink_slab-tracepoints.patch +vmscan-shrinker-nr-updates-race-and-go-wrong.patch +vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch +vmscan-limit-direct-reclaim-for-higher-order-allocations.patch +vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch +mm-compaction-trivial-clean-up-in-acct_isolated.patch diff --git a/queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch b/queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch new file mode 100644 index 00000000000..3a88a55307f --- /dev/null +++ b/queue-3.0/vmscan-abort-reclaim-compaction-if-compaction-can-proceed.patch @@ -0,0 +1,106 @@ +From e0c23279c9f800c403f37511484d9014ac83adec Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 31 Oct 2011 17:09:33 -0700 +Subject: vmscan: abort reclaim/compaction if compaction can proceed + +From: Mel Gorman + +commit e0c23279c9f800c403f37511484d9014ac83adec upstream. + +Stable note: Not tracked on Bugzilla. THP and compaction was found to + aggressively reclaim pages and stall systems under different + situations that was addressed piecemeal over time. + +If compaction can proceed, shrink_zones() stops doing any work but its +callers still call shrink_slab() which raises the priority and potentially +sleeps. 
This is unnecessary and wasteful so this patch aborts direct +reclaim/compaction entirely if compaction can proceed. + +Signed-off-by: Mel Gorman +Acked-by: Rik van Riel +Reviewed-by: Minchan Kim +Acked-by: Johannes Weiner +Cc: Josh Boyer +Cc: Andrea Arcangeli +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 32 +++++++++++++++++++++----------- + 1 file changed, 21 insertions(+), 11 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2037,14 +2037,19 @@ restart: + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. ++ * ++ * This function returns true if a zone is being reclaimed for a costly ++ * high-order allocation and compaction is either ready to begin or deferred. ++ * This indicates to the caller that it should retry the allocation or fail. + */ +-static void shrink_zones(int priority, struct zonelist *zonelist, ++static bool shrink_zones(int priority, struct zonelist *zonelist, + struct scan_control *sc) + { + struct zoneref *z; + struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; ++ bool should_abort_reclaim = false; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { +@@ -2061,19 +2066,20 @@ static void shrink_zones(int priority, s + continue; /* Let kswapd poll it */ + if (COMPACTION_BUILD) { + /* +- * If we already have plenty of memory +- * free for compaction, don't free any +- * more. Even though compaction is +- * invoked for any non-zero order, +- * only frequent costly order +- * reclamation is disruptive enough to +- * become a noticable problem, like +- * transparent huge page allocations. ++ * If we already have plenty of memory free for ++ * compaction in this zone, don't free any more. ++ * Even though compaction is invoked for any ++ * non-zero order, only frequent costly order ++ * reclamation is disruptive enough to become a ++ * noticable problem, like transparent huge page ++ * allocations. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER && + (compaction_suitable(zone, sc->order) || +- compaction_deferred(zone))) ++ compaction_deferred(zone))) { ++ should_abort_reclaim = true; + continue; ++ } + } + /* + * This steals pages from memory cgroups over softlimit +@@ -2092,6 +2098,8 @@ static void shrink_zones(int priority, s + + shrink_zone(priority, zone, sc); + } ++ ++ return should_abort_reclaim; + } + + static bool zone_reclaimable(struct zone *zone) +@@ -2156,7 +2164,9 @@ static unsigned long do_try_to_free_page + sc->nr_scanned = 0; + if (!priority) + disable_swap_token(sc->mem_cgroup); +- shrink_zones(priority, zonelist, sc); ++ if (shrink_zones(priority, zonelist, sc)) ++ break; ++ + /* + * Don't shrink slabs when reclaiming memory from + * over limit cgroups diff --git a/queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch b/queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch new file mode 100644 index 00000000000..5b10e6ae36a --- /dev/null +++ b/queue-3.0/vmscan-limit-direct-reclaim-for-higher-order-allocations.patch @@ -0,0 +1,73 @@ +From e0887c19b2daa140f20ca8104bdc5740f39dbb86 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Mon, 31 Oct 2011 17:09:31 -0700 +Subject: vmscan: limit direct reclaim for higher order allocations + +From: Rik van Riel + +commit e0887c19b2daa140f20ca8104bdc5740f39dbb86 upstream. + +Stable note: Not tracked on Bugzilla. 
THP and compaction was found to + aggressively reclaim pages and stall systems under different + situations that was addressed piecemeal over time. Paragraph + 3 of this changelog is the motivation for this patch. + +When suffering from memory fragmentation due to unfreeable pages, THP page +faults will repeatedly try to compact memory. Due to the unfreeable +pages, compaction fails. + +Needless to say, at that point page reclaim also fails to create free +contiguous 2MB areas. However, that doesn't stop the current code from +trying, over and over again, and freeing a minimum of 4MB (2UL << +sc->order pages) at every single invocation. + +This resulted in my 12GB system having 2-3GB free memory, a corresponding +amount of used swap and very sluggish response times. + +This can be avoided by having the direct reclaim code not reclaim from +zones that already have plenty of free memory available for compaction. + +If compaction still fails due to unmovable memory, doing additional +reclaim will only hurt the system, not help. + +[jweiner@redhat.com: change comment to explain the order check] +Signed-off-by: Rik van Riel +Acked-by: Johannes Weiner +Acked-by: Mel Gorman +Cc: Andrea Arcangeli +Reviewed-by: Minchan Kim +Signed-off-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2059,6 +2059,22 @@ static void shrink_zones(int priority, s + continue; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ ++ if (COMPACTION_BUILD) { ++ /* ++ * If we already have plenty of memory ++ * free for compaction, don't free any ++ * more. Even though compaction is ++ * invoked for any non-zero order, ++ * only frequent costly order ++ * reclamation is disruptive enough to ++ * become a noticable problem, like ++ * transparent huge page allocations. ++ */ ++ if (sc->order > PAGE_ALLOC_COSTLY_ORDER && ++ (compaction_suitable(zone, sc->order) || ++ compaction_deferred(zone))) ++ continue; ++ } + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and diff --git a/queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch b/queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch new file mode 100644 index 00000000000..5bfe0f70b93 --- /dev/null +++ b/queue-3.0/vmscan-reduce-wind-up-shrinker-nr-when-shrinker-can-t-do-work.patch @@ -0,0 +1,90 @@ +From 3567b59aa80ac4417002bf58e35dce5c777d4164 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Fri, 8 Jul 2011 14:14:36 +1000 +Subject: vmscan: reduce wind up shrinker->nr when shrinker can't do work + +From: Dave Chinner + +commit 3567b59aa80ac4417002bf58e35dce5c777d4164 upstream. + +Stable note: Not tracked in Bugzilla. This patch reduces excessive + reclaim of slab objects reducing the amount of information that + has to be brought back in from disk. The third and fourth paragram + in the series describes the impact. + +When a shrinker returns -1 to shrink_slab() to indicate it cannot do +any work given the current memory reclaim requirements, it adds the +entire total_scan count to shrinker->nr. The idea ehind this is that +whenteh shrinker is next called and can do work, it will do the work +of the previously aborted shrinker call as well. 
+ +However, if a filesystem is doing lots of allocation with GFP_NOFS +set, then we get many, many more aborts from the shrinkers than we +do successful calls. The result is that shrinker->nr winds up to +it's maximum permissible value (twice the current cache size) and +then when the next shrinker call that can do work is issued, it +has enough scan count built up to free the entire cache twice over. + +This manifests itself in the cache going from full to empty in a +matter of seconds, even when only a small part of the cache is +needed to be emptied to free sufficient memory. + +Under metadata intensive workloads on ext4 and XFS, I'm seeing the +VFS caches increase memory consumption up to 75% of memory (no page +cache pressure) over a period of 30-60s, and then the shrinker +empties them down to zero in the space of 2-3s. This cycle repeats +over and over again, with the shrinker completely trashing the inode +and dentry caches every minute or so the workload continues. + +This behaviour was made obvious by the shrink_slab tracepoints added +earlier in the series, and made worse by the patch that corrected +the concurrent accounting of shrinker->nr. + +To avoid this problem, stop repeated small increments of the total +scan value from winding shrinker->nr up to a value that can cause +the entire cache to be freed. We still need to allow it to wind up, +so use the delta as the "large scan" threshold check - if the delta +is more than a quarter of the entire cache size, then it is a large +scan and allowed to cause lots of windup because we are clearly +needing to free lots of memory. + +If it isn't a large scan then limit the total scan to half the size +of the cache so that windup never increases to consume the whole +cache. Reducing the total scan limit further does not allow enough +wind-up to maintain the current levels of performance, whilst a +higher threshold does not prevent the windup from freeing the entire +cache under sustained workloads. + +Signed-off-by: Dave Chinner +Signed-off-by: Al Viro +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -277,6 +277,21 @@ unsigned long shrink_slab(struct shrink_ + } + + /* ++ * We need to avoid excessive windup on filesystem shrinkers ++ * due to large numbers of GFP_NOFS allocations causing the ++ * shrinkers to return -1 all the time. This results in a large ++ * nr being built up so when a shrink that can do some work ++ * comes along it empties the entire cache due to nr >>> ++ * max_pass. This is bad for sustaining a working set in ++ * memory. ++ * ++ * Hence only allow the shrinker to scan the entire cache when ++ * a large delta change is calculated directly. ++ */ ++ if (delta < max_pass / 4) ++ total_scan = min(total_scan, max_pass / 2); ++ ++ /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. 
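As a rough illustration of the fix above, the following throwaway userspace program (not kernel code; all names and numbers are invented for the example) models how repeated aborted shrinker calls wind the deferred scan count up, and how the small-delta cap keeps one later call from scanning the whole cache twice over; only the two clamping expressions mirror the logic in the patch above:

#include <stdio.h>

/*
 * Toy model of shrinker->nr wind-up: many aborted (GFP_NOFS-style) calls
 * defer their scan work, and the small-delta cap from the patch above
 * limits how much of that backlog a single later call may process.
 */
static long clamp_total_scan(long total_scan, long delta, long max_pass)
{
        /* only a large delta (>= 1/4 of the cache) may wind up freely */
        if (delta < max_pass / 4 && total_scan > max_pass / 2)
                total_scan = max_pass / 2;

        /* never try to free more than twice the freeable entries */
        if (total_scan > max_pass * 2)
                total_scan = max_pass * 2;

        return total_scan;
}

int main(void)
{
        long max_pass = 100000;  /* freeable objects in the cache */
        long delta = 500;        /* scan work requested per call */
        long nr = 0;             /* deferred count carried between calls */

        /* 1000 aborted calls: each defers its delta instead of scanning */
        for (int i = 0; i < 1000; i++)
                nr += delta;

        printf("deferred scan count after aborts: %ld\n", nr);
        printf("scanned without the cap: %ld\n",
               nr + delta > max_pass * 2 ? max_pass * 2 : nr + delta);
        printf("scanned with the cap:    %ld\n",
               clamp_total_scan(nr + delta, delta, max_pass));
        return 0;
}

With these made-up numbers, the uncapped path would scan 200000 objects (the entire cache twice over), while the capped path scans at most 50000, half the cache.
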
diff --git a/queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch b/queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch new file mode 100644 index 00000000000..7ee038fefb8 --- /dev/null +++ b/queue-3.0/vmscan-shrinker-nr-updates-race-and-go-wrong.patch @@ -0,0 +1,114 @@ +From acf92b485cccf028177f46918e045c0c4e80ee10 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Fri, 8 Jul 2011 14:14:35 +1000 +Subject: vmscan: shrinker->nr updates race and go wrong + +From: Dave Chinner + +commit acf92b485cccf028177f46918e045c0c4e80ee10 upstream. + +Stable note: Not tracked in Bugzilla. This patch reduces excessive + reclaim of slab objects reducing the amount of information + that has to be brought back in from disk. + +shrink_slab() allows shrinkers to be called in parallel so the +struct shrinker can be updated concurrently. It does not provide any +exclusio for such updates, so we can get the shrinker->nr value +increasing or decreasing incorrectly. + +As a result, when a shrinker repeatedly returns a value of -1 (e.g. +a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire, +sometimes updating with the scan count that wasn't used, sometimes +losing it altogether. Worse is when a shrinker does work and that +update is lost due to racy updates, which means the shrinker will do +the work again! + +Fix this by making the total_scan calculations independent of +shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to +other updates via cmpxchg loops. + +Signed-off-by: Dave Chinner +Signed-off-by: Al Viro +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 45 ++++++++++++++++++++++++++++++++------------- + 1 file changed, 32 insertions(+), 13 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_ + unsigned long total_scan; + unsigned long max_pass; + int shrink_ret = 0; ++ long nr; ++ long new_nr; + ++ /* ++ * copy the current shrinker scan count into a local variable ++ * and zero it so that other concurrent shrinker invocations ++ * don't also do this scanning work. ++ */ ++ do { ++ nr = shrinker->nr; ++ } while (cmpxchg(&shrinker->nr, nr, 0) != nr); ++ ++ total_scan = nr; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + delta = (4 * nr_pages_scanned) / shrinker->seeks; + delta *= max_pass; + do_div(delta, lru_pages + 1); +- shrinker->nr += delta; +- if (shrinker->nr < 0) { ++ total_scan += delta; ++ if (total_scan < 0) { + printk(KERN_ERR "shrink_slab: %pF negative objects to " + "delete nr=%ld\n", +- shrinker->shrink, shrinker->nr); +- shrinker->nr = max_pass; ++ shrinker->shrink, total_scan); ++ total_scan = max_pass; + } + + /* +@@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_ + * never try to free more than twice the estimate number of + * freeable entries. + */ +- if (shrinker->nr > max_pass * 2) +- shrinker->nr = max_pass * 2; +- +- total_scan = shrinker->nr; +- shrinker->nr = 0; ++ if (total_scan > max_pass * 2) ++ total_scan = max_pass * 2; + +- trace_mm_shrink_slab_start(shrinker, shrink, total_scan, ++ trace_mm_shrink_slab_start(shrinker, shrink, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); + +@@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_ + cond_resched(); + } + +- shrinker->nr += total_scan; +- trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan, +- shrinker->nr); ++ /* ++ * move the unused scan count back into the shrinker in a ++ * manner that handles concurrent updates. 
If we exhausted the ++ * scan, there is no need to do an update. ++ */ ++ do { ++ nr = shrinker->nr; ++ new_nr = total_scan + nr; ++ if (total_scan <= 0) ++ break; ++ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); ++ ++ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + } + up_read(&shrinker_rwsem); + out: -- 2.47.3
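
As a closing illustration of the lockless handoff used in the last patch above, the same pattern can be modelled in a standalone userspace program, with C11 atomics standing in for the kernel's cmpxchg(). This is a sketch under that assumption: deferred_nr, take_deferred() and give_back() are invented names, and the values in main() are arbitrary.

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for shrinker->nr: work deferred by calls that could not scan. */
static _Atomic long deferred_nr;

/* Copy the current deferred count and zero it, so concurrent callers do
 * not pick up the same work twice (mirrors the first cmpxchg loop). */
static long take_deferred(void)
{
        long nr = atomic_load(&deferred_nr);

        while (!atomic_compare_exchange_weak(&deferred_nr, &nr, 0))
                ;       /* on failure, nr is refreshed with the observed value */
        return nr;
}

/* Hand the unused part of the scan count back, tolerating concurrent
 * updates (mirrors the second cmpxchg loop); skip the update entirely
 * if the scan was exhausted. */
static void give_back(long unused)
{
        long nr, new_nr;

        if (unused <= 0)
                return;
        do {
                nr = atomic_load(&deferred_nr);
                new_nr = nr + unused;
        } while (!atomic_compare_exchange_weak(&deferred_nr, &nr, new_nr));
}

int main(void)
{
        atomic_store(&deferred_nr, 128);        /* pretend earlier calls deferred 128 */

        long total_scan = take_deferred() + 64; /* deferred work plus this call's delta */
        long done = 100;                        /* pretend we only scanned this much */

        give_back(total_scan - done);
        printf("deferred_nr is now %ld\n", atomic_load(&deferred_nr));
        return 0;
}

Built with a C11 compiler (e.g. cc -std=c11), this prints 92: the 128 deferred entries plus the new delta of 64, minus the 100 actually scanned, are handed back for the next caller, and concurrent callers never lose or double-count an update.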