From: Greg Kroah-Hartman
Date: Wed, 28 Jan 2015 01:13:19 +0000 (-0800)
Subject: 3.14-stable patches
X-Git-Tag: v3.10.67~7
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=04ff371c3f81f73602b55dd9b413d0439783b161;p=thirdparty%2Fkernel%2Fstable-queue.git

3.14-stable patches

added patches:
	mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
	mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
	mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
	mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
---
diff --git a/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch b/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
new file mode 100644
index 00000000000..91e463a5d4f
--- /dev/null
+++ b/queue-3.14/mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
@@ -0,0 +1,147 @@
+From 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Wed, 6 Aug 2014 16:07:16 -0700
+Subject: mm: move zone->pages_scanned into a vmstat counter
+
+From: Mel Gorman
+
+commit 0d5d823ab4e608ec7b52ac4410de4cb74bbe0edd upstream.
+
+zone->pages_scanned is a write-intensive cache line during page reclaim
+and it's also updated during page free. Move the counter into vmstat to
+take advantage of the per-cpu updates and do not update it in the free
+paths unless necessary.
+
+On a small UMA machine running tiobench the difference is marginal. On
+a 4-node machine the overhead is more noticable. Note that automatic
+NUMA balancing was disabled for this test as otherwise the system CPU
+overhead is unpredictable.
+
+          3.16.0-rc3  3.16.0-rc3  3.16.0-rc3
+             vanillarearrange-v5   vmstat-v5
+User          746.94      759.78      774.56
+System      65336.22    58350.98    32847.27
+Elapsed     27553.52    27282.02    27415.04
+
+Note that the overhead reduction will vary depending on where exactly
+pages are allocated and freed.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/mmzone.h |    2 +-
+ mm/page_alloc.c        |   12 +++++++++---
+ mm/vmscan.c            |    7 ++++---
+ mm/vmstat.c            |    3 ++-
+ 4 files changed, 16 insertions(+), 8 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -143,6 +143,7 @@ enum zone_stat_item {
+ 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
+ 	NR_DIRTIED,		/* page dirtyings since bootup */
+ 	NR_WRITTEN,		/* page writings since bootup */
++	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
+ #ifdef CONFIG_NUMA
+ 	NUMA_HIT,		/* allocated in intended node */
+ 	NUMA_MISS,		/* allocated in non intended node */
+@@ -478,7 +479,6 @@ struct zone {
+
+ 	/* Fields commonly accessed by the page reclaim scanner */
+ 	spinlock_t		lru_lock;
+-	unsigned long		pages_scanned;	/* since last reclaim */
+ 	struct lruvec		lruvec;
+
+ 	/*
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -678,9 +678,12 @@ static void free_pcppages_bulk(struct zo
+ 	int migratetype = 0;
+ 	int batch_free = 0;
+ 	int to_free = count;
++	unsigned long nr_scanned;
+
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+ 	while (to_free) {
+ 		struct page *page;
+@@ -729,8 +732,11 @@ static void free_one_page(struct zone *z
+ 				unsigned int order,
+ 				int migratetype)
+ {
++	unsigned long nr_scanned;
+ 	spin_lock(&zone->lock);
+-	zone->pages_scanned = 0;
++	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
++	if (nr_scanned)
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+
+ 	__free_one_page(page, pfn, zone, order, migratetype);
+ 	if (unlikely(!is_migrate_isolate(migratetype)))
+@@ -3251,7 +3257,7 @@ void show_free_areas(unsigned int filter
+ 			K(zone_page_state(zone, NR_BOUNCE)),
+ 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+ 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+-			zone->pages_scanned,
++			K(zone_page_state(zone, NR_PAGES_SCANNED)),
+ 			(!zone_reclaimable(zone) ? "yes" : "no")
+ 			);
+ 		printk("lowmem_reserve[]:");
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pa
+
+ bool zone_reclaimable(struct zone *zone)
+ {
+-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
++	return zone_page_state(zone, NR_PAGES_SCANNED) <
++		zone_reclaimable_pages(zone) * 6;
+ }
+
+ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+@@ -1470,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to
+ 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
+ 	if (global_reclaim(sc)) {
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ 		if (current_is_kswapd())
+ 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ 		else
+@@ -1659,7 +1660,7 @@ static void shrink_active_list(unsigned
+ 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ 				     &nr_scanned, sc, isolate_mode, lru);
+ 	if (global_reclaim(sc))
+-		zone->pages_scanned += nr_scanned;
++		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+
+ 	reclaim_stat->recent_scanned[file] += nr_taken;
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
+ 	"nr_shmem",
+ 	"nr_dirtied",
+ 	"nr_written",
++	"nr_pages_scanned",
+
+ #ifdef CONFIG_NUMA
+ 	"numa_hit",
+@@ -1055,7 +1056,7 @@ static void zoneinfo_show_print(struct s
+ 		   min_wmark_pages(zone),
+ 		   low_wmark_pages(zone),
+ 		   high_wmark_pages(zone),
+-		   zone->pages_scanned,
++		   zone_page_state(zone, NR_PAGES_SCANNED),
+ 		   zone->spanned_pages,
+ 		   zone->present_pages,
+ 		   zone->managed_pages);
diff --git a/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch b/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
new file mode 100644
index 00000000000..8a3361042f8
--- /dev/null
+++ b/queue-3.14/mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
@@ -0,0 +1,40 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:23 2014
+From: Mel Gorman
+Date: Thu, 28 Aug 2014 19:35:44 +0100
+Subject: mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered
+
+From: Mel Gorman
+
+commit f7b5d647946aae1647bf5cd26c16b3a793c1ac49 upstream.
+
+The purpose of numa_zonelist_order=zone is to preserve lower zones for
+use with 32-bit devices. If locality is preferred then the
+numa_zonelist_order=node policy should be used.
+
+Unfortunately, the fair zone allocation policy overrides this by
+skipping zones on remote nodes until the lower one is found. While this
+makes sense from a page aging and performance perspective, it breaks the
+expected zonelist policy. This patch restores the expected behaviour
+for zone-list ordering.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/page_alloc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1980,7 +1980,7 @@ zonelist_scan:
+ 		 */
+ 		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
+-				continue;
++				break;
+ 			if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
+ 				continue;
+ 		}
diff --git a/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch b/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
new file mode 100644
index 00000000000..4bf082778aa
--- /dev/null
+++ b/queue-3.14/mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch
@@ -0,0 +1,237 @@
+From stable-owner@vger.kernel.org Thu Aug 28 11:44:22 2014
+From: Mel Gorman
+Date: Thu, 28 Aug 2014 19:35:45 +0100
+Subject: mm: page_alloc: reduce cost of the fair zone allocation policy
+
+From: Mel Gorman
+
+commit 4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 upstream.
+
+The fair zone allocation policy round-robins allocations between zones
+within a node to avoid age inversion problems during reclaim. If the
+first allocation fails, the batch counts are reset and a second attempt
+made before entering the slow path.
+
+One assumption made with this scheme is that batches expire at roughly
+the same time and the resets each time are justified. This assumption
+does not hold when zones reach their low watermark as the batches will
+be consumed at uneven rates. Allocation failure due to watermark
+depletion result in additional zonelist scans for the reset and another
+watermark check before hitting the slowpath.
+
+On UMA, the benefit is negligible -- around 0.25%. On 4-socket NUMA
+machine it's variable due to the variability of measuring overhead with
+the vmstat changes. The system CPU overhead comparison looks like
+
+          3.16.0-rc3  3.16.0-rc3    3.16.0-rc3
+             vanilla   vmstat-v5  lowercost-v5
+User          746.94      774.56        802.00
+System      65336.22    32847.27      40852.33
+Elapsed     27553.52    27415.04      27368.46
+
+However it is worth noting that the overall benchmark still completed
+faster and intuitively it makes sense to take as few passes as possible
+through the zonelists.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/linux/mmzone.h |    6 ++
+ mm/page_alloc.c        |  101 +++++++++++++++++++++++++------------------------
+ 2 files changed, 59 insertions(+), 48 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -529,6 +529,7 @@ typedef enum {
+ 	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+ 					 * many pages under writeback
+ 					 */
++	ZONE_FAIR_DEPLETED,		/* fair zone policy batch depleted */
+ } zone_flags_t;
+
+ static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
+@@ -566,6 +567,11 @@ static inline int zone_is_reclaim_locked
+ 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ }
+
++static inline int zone_is_fair_depleted(const struct zone *zone)
++{
++	return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
++}
++
+ static inline int zone_is_oom_locked(const struct zone *zone)
+ {
+ 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1614,6 +1614,9 @@ again:
+ 	}
+
+ 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
++	    !zone_is_fair_depleted(zone))
++		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+
+ 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
+ 	zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1938,6 +1941,18 @@ static inline void init_zone_allows_recl
+ }
+ #endif /* CONFIG_NUMA */
+
++static void reset_alloc_batches(struct zone *preferred_zone)
++{
++	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
++
++	do {
++		mod_zone_page_state(zone, NR_ALLOC_BATCH,
++			high_wmark_pages(zone) - low_wmark_pages(zone) -
++			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
++		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
++	} while (zone++ != preferred_zone);
++}
++
+ /*
+  * get_page_from_freelist goes through the zonelist trying to allocate
+  * a page.
+@@ -1955,8 +1970,12 @@ get_page_from_freelist(gfp_t gfp_mask, n
+ 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
+ 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ 				(gfp_mask & __GFP_WRITE);
++	int nr_fair_skipped = 0;
++	bool zonelist_rescan;
+
+ zonelist_scan:
++	zonelist_rescan = false;
++
+ 	/*
+ 	 * Scan zonelist, looking for a zone with enough free.
+ 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+@@ -1981,8 +2000,10 @@ zonelist_scan:
+ 		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
+ 				break;
+-			if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0)
++			if (zone_is_fair_depleted(zone)) {
++				nr_fair_skipped++;
+ 				continue;
++			}
+ 		}
+ 		/*
+ 		 * When allocating a page cache page for writing, we
+@@ -2088,13 +2109,7 @@ this_zone_full:
+ 			zlc_mark_zone_full(zonelist, z);
+ 	}
+
+-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+-		/* Disable zlc cache for second zonelist scan */
+-		zlc_active = 0;
+-		goto zonelist_scan;
+-	}
+-
+-	if (page)
++	if (page) {
+ 		/*
+ 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ 		 * necessary to allocate the page. The expectation is
+@@ -2103,8 +2118,37 @@ this_zone_full:
+ 		 * for !PFMEMALLOC purposes.
+ 		 */
+ 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++		return page;
++	}
+
+-	return page;
++	/*
++	 * The first pass makes sure allocations are spread fairly within the
++	 * local node. However, the local node might have free pages left
++	 * after the fairness batches are exhausted, and remote zones haven't
++	 * even been considered yet. Try once more without fairness, and
++	 * include remote zones now, before entering the slowpath and waking
++	 * kswapd: prefer spilling to a remote zone over swapping locally.
++	 */
++	if (alloc_flags & ALLOC_FAIR) {
++		alloc_flags &= ~ALLOC_FAIR;
++		if (nr_fair_skipped) {
++			zonelist_rescan = true;
++			reset_alloc_batches(preferred_zone);
++		}
++		if (nr_online_nodes > 1)
++			zonelist_rescan = true;
++	}
++
++	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
++		/* Disable zlc cache for second zonelist scan */
++		zlc_active = 0;
++		zonelist_rescan = true;
++	}
++
++	if (zonelist_rescan)
++		goto zonelist_scan;
++
++	return NULL;
+ }
+
+ /*
+@@ -2433,28 +2477,6 @@ __alloc_pages_high_priority(gfp_t gfp_ma
+ 	return page;
+ }
+
+-static void reset_alloc_batches(struct zonelist *zonelist,
+-				enum zone_type high_zoneidx,
+-				struct zone *preferred_zone)
+-{
+-	struct zoneref *z;
+-	struct zone *zone;
+-
+-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-		/*
+-		 * Only reset the batches of zones that were actually
+-		 * considered in the fairness pass, we don't want to
+-		 * trash fairness information for zones that are not
+-		 * actually part of this zonelist's round-robin cycle.
+-		 */
+-		if (!zone_local(preferred_zone, zone))
+-			continue;
+-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-			high_wmark_pages(zone) - low_wmark_pages(zone) -
+-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+-	}
+-}
+-
+ static void wake_all_kswapds(unsigned int order,
+ 			     struct zonelist *zonelist,
+ 			     enum zone_type high_zoneidx,
+@@ -2792,29 +2814,12 @@ retry_cpuset:
+ 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ 		alloc_flags |= ALLOC_CMA;
+ #endif
+-retry:
+ 	/* First allocation attempt */
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ 			zonelist, high_zoneidx, alloc_flags,
+ 			preferred_zone, classzone_idx, migratetype);
+ 	if (unlikely(!page)) {
+ 		/*
+-		 * The first pass makes sure allocations are spread
+-		 * fairly within the local node. However, the local
+-		 * node might have free pages left after the fairness
+-		 * batches are exhausted, and remote zones haven't
+-		 * even been considered yet. Try once more without
+-		 * fairness, and include remote zones now, before
+-		 * entering the slowpath and waking kswapd: prefer
+-		 * spilling to a remote zone over swapping locally.
+-		 */
+-		if (alloc_flags & ALLOC_FAIR) {
+-			reset_alloc_batches(zonelist, high_zoneidx,
+-					    preferred_zone);
+-			alloc_flags &= ~ALLOC_FAIR;
+-			goto retry;
+-		}
+-		/*
+ 		 * Runtime PM, block IO and its error handling path
+ 		 * can deadlock because I/O on the device might not
+ 		 * complete.
diff --git a/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch b/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
new file mode 100644
index 00000000000..6e6d5ee4a6c
--- /dev/null
+++ b/queue-3.14/mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
@@ -0,0 +1,37 @@
+From bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Wed, 6 Aug 2014 16:07:18 -0700
+Subject: mm: vmscan: only update per-cpu thresholds for online CPU
+
+From: Mel Gorman
+
+commit bb0b6dffa2ccfbd9747ad0cc87c7459622896e60 upstream.
+
+When kswapd is awake reclaiming, the per-cpu stat thresholds are lowered
+to get more accurate counts to avoid breaching watermarks. This
+threshold update iterates over all possible CPUs which is unnecessary.
+Only online CPUs need to be updated. If a new CPU is onlined,
+refresh_zone_stat_thresholds() will set the thresholds correctly.
+
+Signed-off-by: Mel Gorman
+Acked-by: Johannes Weiner
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Mel Gorman
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmstat.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_
+ 			continue;
+
+ 		threshold = (*calculate_pressure)(zone);
+-		for_each_possible_cpu(cpu)
++		for_each_online_cpu(cpu)
+ 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ 							= threshold;
+ 	}
diff --git a/queue-3.14/series b/queue-3.14/series
index f12aa9a328a..54104775ee3 100644
--- a/queue-3.14/series
+++ b/queue-3.14/series
@@ -69,3 +69,7 @@ vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch
 memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch
 mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch
 mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch
+mm-move-zone-pages_scanned-into-a-vmstat-counter.patch
+mm-vmscan-only-update-per-cpu-thresholds-for-online-cpu.patch
+mm-page_alloc-abort-fair-zone-allocation-policy-when-remotes-nodes-are-encountered.patch
+mm-page_alloc-reduce-cost-of-the-fair-zone-allocation-policy.patch