From: Greg Kroah-Hartman Date: Wed, 28 Jan 2015 01:06:36 +0000 (-0800) Subject: 3.14-stable patches X-Git-Tag: v3.10.67~8 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3fd061c6f9a68e317230683a2afcb79506958d1b;p=thirdparty%2Fkernel%2Fstable-queue.git 3.14-stable patches added patches: memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch mm-make-copy_pte_range-static-again.patch mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch --- diff --git a/queue-3.14/memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch b/queue-3.14/memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch new file mode 100644 index 00000000000..cfb00fff81c --- /dev/null +++ b/queue-3.14/memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch @@ -0,0 +1,90 @@ +From 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Wed, 6 Aug 2014 16:08:03 -0700 +Subject: memcg, vmscan: Fix forced scan of anonymous pages + +From: Jerome Marchand + +commit 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e upstream. + +When memory cgoups are enabled, the code that decides to force to scan +anonymous pages in get_scan_count() compares global values (free, +high_watermark) to a value that is restricted to a memory cgroup (file). +It make the code over-eager to force anon scan. + +For instance, it will force anon scan when scanning a memcg that is +mainly populated by anonymous page, even when there is plenty of file +pages to get rid of in others memcgs, even when swappiness == 0. It +breaks user's expectation about swappiness and hurts performance. + +This patch makes sure that forced anon scan only happens when there not +enough file pages for the all zone, not just in one random memcg. + +[hannes@cmpxchg.org: cleanups] +Signed-off-by: Jerome Marchand +Acked-by: Michal Hocko +Acked-by: Johannes Weiner +Reviewed-by: Rik van Riel +Cc: Mel Gorman +Signed-off-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmscan.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1847,7 +1847,7 @@ static void get_scan_count(struct lruvec + struct zone *zone = lruvec_zone(lruvec); + unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; +- unsigned long anon, file, free; ++ unsigned long anon, file; + bool force_scan = false; + unsigned long ap, fp; + enum lru_list lru; +@@ -1895,11 +1895,6 @@ static void get_scan_count(struct lruvec + goto out; + } + +- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + +- get_lru_size(lruvec, LRU_INACTIVE_ANON); +- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + +- get_lru_size(lruvec, LRU_INACTIVE_FILE); +- + /* + * If it's foreseeable that reclaiming the file cache won't be + * enough to get the zone back into a desirable shape, we have +@@ -1907,8 +1902,14 @@ static void get_scan_count(struct lruvec + * thrashing - remaining file pages alone. 
+ */ + if (global_reclaim(sc)) { +- free = zone_page_state(zone, NR_FREE_PAGES); +- if (unlikely(file + free <= high_wmark_pages(zone))) { ++ unsigned long zonefile; ++ unsigned long zonefree; ++ ++ zonefree = zone_page_state(zone, NR_FREE_PAGES); ++ zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + ++ zone_page_state(zone, NR_INACTIVE_FILE); ++ ++ if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; + } +@@ -1943,6 +1944,12 @@ static void get_scan_count(struct lruvec + * + * anon in [0], file in [1] + */ ++ ++ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + ++ get_lru_size(lruvec, LRU_INACTIVE_ANON); ++ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + ++ get_lru_size(lruvec, LRU_INACTIVE_FILE); ++ + spin_lock_irq(&zone->lru_lock); + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { + reclaim_stat->recent_scanned[0] /= 2; diff --git a/queue-3.14/mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch b/queue-3.14/mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch new file mode 100644 index 00000000000..2dcc5408a5d --- /dev/null +++ b/queue-3.14/mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch @@ -0,0 +1,53 @@ +From 888cf2db475a256fb0cda042140f73d7881f81fe Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Wed, 4 Jun 2014 16:10:34 -0700 +Subject: mm: avoid unnecessary atomic operations during end_page_writeback() + +From: Mel Gorman + +commit 888cf2db475a256fb0cda042140f73d7881f81fe upstream. + +If a page is marked for immediate reclaim then it is moved to the tail of +the LRU list. This occurs when the system is under enough memory pressure +for pages under writeback to reach the end of the LRU but we test for this +using atomic operations on every writeback. This patch uses an optimistic +non-atomic test first. It'll miss some pages in rare cases but the +consequences are not severe enough to warrant such a penalty. + +While the function does not dominate profiles during a simple dd test the +cost of it is reduced. + +73048 0.7428 vmlinux-3.15.0-rc5-mmotm-20140513 end_page_writeback +23740 0.2409 vmlinux-3.15.0-rc5-lessatomic end_page_writeback + +Signed-off-by: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -644,8 +644,17 @@ EXPORT_SYMBOL(unlock_page); + */ + void end_page_writeback(struct page *page) + { +- if (TestClearPageReclaim(page)) ++ /* ++ * TestClearPageReclaim could be used here but it is an atomic ++ * operation and overkill in this particular case. Failing to ++ * shuffle a page marked for immediate reclaim is too mild to ++ * justify taking an atomic operation penalty at the end of ++ * ever page writeback. 
++ */ ++ if (PageReclaim(page)) { ++ ClearPageReclaim(page); + rotate_reclaimable_page(page); ++ } + + if (!test_clear_page_writeback(page)) + BUG(); diff --git a/queue-3.14/mm-make-copy_pte_range-static-again.patch b/queue-3.14/mm-make-copy_pte_range-static-again.patch new file mode 100644 index 00000000000..5bf98e03628 --- /dev/null +++ b/queue-3.14/mm-make-copy_pte_range-static-again.patch @@ -0,0 +1,50 @@ +From 21bda264f4243f61dfcc485174055f12ad0530b4 Mon Sep 17 00:00:00 2001 +From: Jerome Marchand +Date: Wed, 6 Aug 2014 16:06:56 -0700 +Subject: mm: make copy_pte_range static again + +From: Jerome Marchand + +commit 21bda264f4243f61dfcc485174055f12ad0530b4 upstream. + +Commit 71e3aac0724f ("thp: transparent hugepage core") adds +copy_pte_range prototype to huge_mm.h. I'm not sure why (or if) this +function have been used outside of memory.c, but it currently isn't. +This patch makes copy_pte_range() static again. + +Signed-off-by: Jerome Marchand +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/huge_mm.h | 4 ---- + mm/memory.c | 2 +- + 2 files changed, 1 insertion(+), 5 deletions(-) + +--- a/include/linux/huge_mm.h ++++ b/include/linux/huge_mm.h +@@ -93,10 +93,6 @@ extern bool is_vma_temporary_stack(struc + #endif /* CONFIG_DEBUG_VM */ + + extern unsigned long transparent_hugepage_flags; +-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pmd_t *dst_pmd, pmd_t *src_pmd, +- struct vm_area_struct *vma, +- unsigned long addr, unsigned long end); + extern int split_huge_page_to_list(struct page *page, struct list_head *list); + static inline int split_huge_page(struct page *page) + { +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -878,7 +878,7 @@ out_set_pte: + return 0; + } + +-int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, ++static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { diff --git a/queue-3.14/mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch b/queue-3.14/mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch new file mode 100644 index 00000000000..9bb8c869d76 --- /dev/null +++ b/queue-3.14/mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch @@ -0,0 +1,48 @@ +From c0d73261f5c1355a35b8b40e871d31578ce0c044 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Wed, 6 Aug 2014 16:05:08 -0700 +Subject: mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault() + +From: Hugh Dickins + +commit c0d73261f5c1355a35b8b40e871d31578ce0c044 upstream. + +Use ACCESS_ONCE() in handle_pte_fault() when getting the entry or +orig_pte upon which all subsequent decisions and pte_same() tests will +be made. + +I have no evidence that its lack is responsible for the mm/filemap.c:202 +BUG_ON(page_mapped(page)) in __delete_from_page_cache() found by +trinity, and I am not optimistic that it will fix it. But I have found +no other explanation, and ACCESS_ONCE() here will surely not hurt. + +If gcc does re-access the pte before passing it down, then that would be +disastrous for correct page fault handling, and certainly could explain +the page_mapped() BUGs seen (concurrent fault causing page to be mapped +in a second time on top of itself: mapcount 2 for a single pte). + +Signed-off-by: Hugh Dickins +Cc: Sasha Levin +Cc: Linus Torvalds +Cc: "Kirill A. 
Shutemov" +Cc: Konstantin Khlebnikov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3646,7 +3646,7 @@ static int handle_pte_fault(struct mm_st + pte_t entry; + spinlock_t *ptl; + +- entry = *pte; ++ entry = ACCESS_ONCE(*pte); + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (vma->vm_ops) { diff --git a/queue-3.14/mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch b/queue-3.14/mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch new file mode 100644 index 00000000000..39787b6f89d --- /dev/null +++ b/queue-3.14/mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch @@ -0,0 +1,129 @@ +From 24b7e5819ad5cbef2b7c7376510862aa8319d240 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Wed, 6 Aug 2014 16:07:11 -0700 +Subject: mm: pagemap: avoid unnecessary overhead when tracepoints are deactivated + +From: Mel Gorman + +commit 24b7e5819ad5cbef2b7c7376510862aa8319d240 upstream. + +This was formerly the series "Improve sequential read throughput" which +noted some major differences in performance of tiobench since 3.0. +While there are a number of factors, two that dominated were the +introduction of the fair zone allocation policy and changes to CFQ. + +The behaviour of fair zone allocation policy makes more sense than +tiobench as a benchmark and CFQ defaults were not changed due to +insufficient benchmarking. + +This series is what's left. It's one functional fix to the fair zone +allocation policy when used on NUMA machines and a reduction of overhead +in general. tiobench was used for the comparison despite its flaws as +an IO benchmark as in this case we are primarily interested in the +overhead of page allocator and page reclaim activity. + +On UMA, it makes little difference to overhead + + 3.16.0-rc3 3.16.0-rc3 + vanilla lowercost-v5 +User 383.61 386.77 +System 403.83 401.74 +Elapsed 5411.50 5413.11 + +On a 4-socket NUMA machine it's a bit more noticable + + 3.16.0-rc3 3.16.0-rc3 + vanilla lowercost-v5 +User 746.94 802.00 +System 65336.22 40852.33 +Elapsed 27553.52 27368.46 + +This patch (of 6): + +The LRU insertion and activate tracepoints take PFN as a parameter +forcing the overhead to the caller. Move the overhead to the tracepoint +fast-assign method to ensure the cost is only incurred when the +tracepoint is active. 
+ +Signed-off-by: Mel Gorman +Acked-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/trace/events/pagemap.h | 16 +++++++--------- + mm/swap.c | 4 ++-- + 2 files changed, 9 insertions(+), 11 deletions(-) + +--- a/include/trace/events/pagemap.h ++++ b/include/trace/events/pagemap.h +@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, + + TP_PROTO( + struct page *page, +- unsigned long pfn, +- int lru, +- unsigned long flags ++ int lru + ), + +- TP_ARGS(page, pfn, lru, flags), ++ TP_ARGS(page, lru), + + TP_STRUCT__entry( + __field(struct page *, page ) +@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, + + TP_fast_assign( + __entry->page = page; +- __entry->pfn = pfn; ++ __entry->pfn = page_to_pfn(page); + __entry->lru = lru; +- __entry->flags = flags; ++ __entry->flags = trace_pagemap_flags(page); + ), + + /* Flag format is based on page-types.c formatting for pagemap */ +@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, + + TRACE_EVENT(mm_lru_activate, + +- TP_PROTO(struct page *page, unsigned long pfn), ++ TP_PROTO(struct page *page), + +- TP_ARGS(page, pfn), ++ TP_ARGS(page), + + TP_STRUCT__entry( + __field(struct page *, page ) +@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, + + TP_fast_assign( + __entry->page = page; +- __entry->pfn = pfn; ++ __entry->pfn = page_to_pfn(page); + ), + + /* Flag format is based on page-types.c formatting for pagemap */ +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -469,7 +469,7 @@ static void __activate_page(struct page + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(page, lruvec, lru); +- trace_mm_lru_activate(page, page_to_pfn(page)); ++ trace_mm_lru_activate(page); + + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(lruvec, file, 1); +@@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, lru); + update_page_reclaim_stat(lruvec, file, active); +- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); ++ trace_mm_lru_insertion(page, lru); + } + + /* diff --git a/queue-3.14/mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch b/queue-3.14/mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch new file mode 100644 index 00000000000..6f7549fe364 --- /dev/null +++ b/queue-3.14/mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch @@ -0,0 +1,367 @@ +From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Wed, 6 Aug 2014 16:07:14 -0700 +Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines + +From: Mel Gorman + +commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream. + +The arrangement of struct zone has changed over time and now it has +reached the point where there is some inappropriate sharing going on. +On x86-64 for example + +o The zone->node field is shared with the zone lock and zone->node is + accessed frequently from the page allocator due to the fair zone + allocation policy. 
+ +o span_seqlock is almost never used by shares a line with free_area + +o Some zone statistics share a cache line with the LRU lock so + reclaim-intensive and allocator-intensive workloads can bounce the cache + line on a stat update + +This patch rearranges struct zone to put read-only and read-mostly +fields together and then splits the page allocator intensive fields, the +zone statistics and the page reclaim intensive fields into their own +cache lines. Note that the type of lowmem_reserve changes due to the +watermark calculations being signed and avoiding a signed/unsigned +conversion there. + +On the test configuration I used the overall size of struct zone shrunk +by one cache line. On smaller machines, this is not likely to be +noticable. However, on a 4-node NUMA machine running tiobench the +system CPU overhead is reduced by this patch. + + 3.16.0-rc3 3.16.0-rc3 + vanillarearrange-v5r9 +User 746.94 759.78 +System 65336.22 58350.98 +Elapsed 27553.52 27282.02 + +Signed-off-by: Mel Gorman +Acked-by: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/mmzone.h | 205 +++++++++++++++++++++++++------------------------ + mm/page_alloc.c | 7 - + mm/vmstat.c | 4 + 3 files changed, 110 insertions(+), 106 deletions(-) + +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -321,19 +321,12 @@ enum zone_type { + #ifndef __GENERATING_BOUNDS_H + + struct zone { +- /* Fields commonly accessed by the page allocator */ ++ /* Read-mostly fields */ + + /* zone watermarks, access with *_wmark_pages(zone) macros */ + unsigned long watermark[NR_WMARK]; + + /* +- * When free pages are below this point, additional steps are taken +- * when reading the number of free pages to avoid per-cpu counter +- * drift allowing watermarks to be breached +- */ +- unsigned long percpu_drift_mark; +- +- /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several + * GB of ram we must reserve some of the lower zone memory (otherwise we risk +@@ -341,41 +334,26 @@ struct zone { + * on the higher zones). This array is recalculated at runtime if the + * sysctl_lowmem_reserve_ratio sysctl changes. + */ +- unsigned long lowmem_reserve[MAX_NR_ZONES]; +- +- /* +- * This is a per-zone reserve of pages that should not be +- * considered dirtyable memory. +- */ +- unsigned long dirty_balance_reserve; ++ long lowmem_reserve[MAX_NR_ZONES]; + + #ifdef CONFIG_NUMA + int node; ++#endif ++ + /* +- * zone reclaim becomes active if more unmapped pages exist. ++ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on ++ * this zone's LRU. Maintained by the pageout code. + */ +- unsigned long min_unmapped_pages; +- unsigned long min_slab_pages; +-#endif ++ unsigned int inactive_ratio; ++ ++ struct pglist_data *zone_pgdat; + struct per_cpu_pageset __percpu *pageset; ++ + /* +- * free areas of different sizes ++ * This is a per-zone reserve of pages that should not be ++ * considered dirtyable memory. 
+ */ +- spinlock_t lock; +-#if defined CONFIG_COMPACTION || defined CONFIG_CMA +- /* Set to true when the PG_migrate_skip bits should be cleared */ +- bool compact_blockskip_flush; +- +- /* pfn where compaction free scanner should start */ +- unsigned long compact_cached_free_pfn; +- /* pfn where async and sync compaction migration scanner should start */ +- unsigned long compact_cached_migrate_pfn[2]; +-#endif +-#ifdef CONFIG_MEMORY_HOTPLUG +- /* see spanned/present_pages for more description */ +- seqlock_t span_seqlock; +-#endif +- struct free_area free_area[MAX_ORDER]; ++ unsigned long dirty_balance_reserve; + + #ifndef CONFIG_SPARSEMEM + /* +@@ -385,71 +363,14 @@ struct zone { + unsigned long *pageblock_flags; + #endif /* CONFIG_SPARSEMEM */ + +-#ifdef CONFIG_COMPACTION +- /* +- * On compaction failure, 1<> PAGE_SHIFT */ + unsigned long zone_start_pfn; + +@@ -495,9 +416,11 @@ struct zone { + * adjust_managed_page_count() should be used instead of directly + * touching zone->managed_pages and totalram_pages. + */ ++ unsigned long managed_pages; + unsigned long spanned_pages; + unsigned long present_pages; +- unsigned long managed_pages; ++ ++ const char *name; + + /* + * Number of MIGRATE_RESEVE page block. To maintain for just +@@ -505,10 +428,92 @@ struct zone { + */ + int nr_migrate_reserve_block; + ++#ifdef CONFIG_MEMORY_HOTPLUG ++ /* see spanned/present_pages for more description */ ++ seqlock_t span_seqlock; ++#endif ++ + /* +- * rarely used fields: ++ * wait_table -- the array holding the hash table ++ * wait_table_hash_nr_entries -- the size of the hash table array ++ * wait_table_bits -- wait_table_size == (1 << wait_table_bits) ++ * ++ * The purpose of all these is to keep track of the people ++ * waiting for a page to become available and make them ++ * runnable again when possible. The trouble is that this ++ * consumes a lot of space, especially when so few things ++ * wait on pages at a given time. So instead of using ++ * per-page waitqueues, we use a waitqueue hash table. ++ * ++ * The bucket discipline is to sleep on the same queue when ++ * colliding and wake all in that wait queue when removing. ++ * When something wakes, it must check to be sure its page is ++ * truly available, a la thundering herd. The cost of a ++ * collision is great, but given the expected load of the ++ * table, they should be so rare as to be outweighed by the ++ * benefits from the saved space. ++ * ++ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the ++ * primary users of these fields, and in mm/page_alloc.c ++ * free_area_init_core() performs the initialization of them. 
+ */ +- const char *name; ++ wait_queue_head_t *wait_table; ++ unsigned long wait_table_hash_nr_entries; ++ unsigned long wait_table_bits; ++ ++ ZONE_PADDING(_pad1_) ++ ++ /* Write-intensive fields used from the page allocator */ ++ spinlock_t lock; ++ ++ /* free areas of different sizes */ ++ struct free_area free_area[MAX_ORDER]; ++ ++ /* zone flags, see below */ ++ unsigned long flags; ++ ++ ZONE_PADDING(_pad2_) ++ ++ /* Write-intensive fields used by page reclaim */ ++ ++ /* Fields commonly accessed by the page reclaim scanner */ ++ spinlock_t lru_lock; ++ unsigned long pages_scanned; /* since last reclaim */ ++ struct lruvec lruvec; ++ ++ /* ++ * When free pages are below this point, additional steps are taken ++ * when reading the number of free pages to avoid per-cpu counter ++ * drift allowing watermarks to be breached ++ */ ++ unsigned long percpu_drift_mark; ++ ++#if defined CONFIG_COMPACTION || defined CONFIG_CMA ++ /* pfn where compaction free scanner should start */ ++ unsigned long compact_cached_free_pfn; ++ /* pfn where async and sync compaction migration scanner should start */ ++ unsigned long compact_cached_migrate_pfn[2]; ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ /* ++ * On compaction failure, 1<lowmem_reserve[classzone_idx]; + int o; + long free_cma = 0; + +@@ -1725,7 +1724,7 @@ static bool __zone_watermark_ok(struct z + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + #endif + +- if (free_pages - free_cma <= min + lowmem_reserve) ++ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + return false; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ +@@ -3257,7 +3256,7 @@ void show_free_areas(unsigned int filter + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) +- printk(" %lu", zone->lowmem_reserve[i]); ++ printk(" %ld", zone->lowmem_reserve[i]); + printk("\n"); + } + +@@ -5585,7 +5584,7 @@ static void calculate_totalreserve_pages + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; +- unsigned long max = 0; ++ long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1065,10 +1065,10 @@ static void zoneinfo_show_print(struct s + zone_page_state(zone, i)); + + seq_printf(m, +- "\n protection: (%lu", ++ "\n protection: (%ld", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) +- seq_printf(m, ", %lu", zone->lowmem_reserve[i]); ++ seq_printf(m, ", %ld", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); diff --git a/queue-3.14/mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch b/queue-3.14/mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch new file mode 100644 index 00000000000..c72e71db5ea --- /dev/null +++ b/queue-3.14/mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch @@ -0,0 +1,87 @@ +From 14a4e2141e24304fff2c697be6382ffb83888185 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Wed, 6 Aug 2014 16:07:29 -0700 +Subject: mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode + +From: David Rientjes + +commit 14a4e2141e24304fff2c697be6382ffb83888185 upstream. 
+ +Commit 9f1b868a13ac ("mm: thp: khugepaged: add policy for finding target +node") improved the previous khugepaged logic which allocated a +transparent hugepages from the node of the first page being collapsed. + +However, it is still possible to collapse pages to remote memory which +may suffer from additional access latency. With the current policy, it +is possible that 255 pages (with PAGE_SHIFT == 12) will be collapsed +remotely if the majority are allocated from that node. + +When zone_reclaim_mode is enabled, it means the VM should make every +attempt to allocate locally to prevent NUMA performance degradation. In +this case, we do not want to collapse hugepages to remote nodes that +would suffer from increased access latency. Thus, when +zone_reclaim_mode is enabled, only allow collapsing to nodes with +RECLAIM_DISTANCE or less. + +There is no functional change for systems that disable +zone_reclaim_mode. + +Signed-off-by: David Rientjes +Cc: Dave Hansen +Cc: Andrea Arcangeli +Acked-by: Vlastimil Babka +Acked-by: Mel Gorman +Cc: Rik van Riel +Cc: "Kirill A. Shutemov" +Cc: Bob Liu +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2273,6 +2273,30 @@ static void khugepaged_alloc_sleep(void) + + static int khugepaged_node_load[MAX_NUMNODES]; + ++static bool khugepaged_scan_abort(int nid) ++{ ++ int i; ++ ++ /* ++ * If zone_reclaim_mode is disabled, then no extra effort is made to ++ * allocate memory locally. ++ */ ++ if (!zone_reclaim_mode) ++ return false; ++ ++ /* If there is a count for this node already, it must be acceptable */ ++ if (khugepaged_node_load[nid]) ++ return false; ++ ++ for (i = 0; i < MAX_NUMNODES; i++) { ++ if (!khugepaged_node_load[i]) ++ continue; ++ if (node_distance(nid, i) > RECLAIM_DISTANCE) ++ return true; ++ } ++ return false; ++} ++ + #ifdef CONFIG_NUMA + static int khugepaged_find_target_node(void) + { +@@ -2589,6 +2613,8 @@ static int khugepaged_scan_pmd(struct mm + * hit record. 
+ */ + node = page_to_nid(page); ++ if (khugepaged_scan_abort(node)) ++ goto out_unmap; + khugepaged_node_load[node]++; + VM_BUG_ON_PAGE(PageCompound(page), page); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) diff --git a/queue-3.14/series b/queue-3.14/series index 31609754fc2..f12aa9a328a 100644 --- a/queue-3.14/series +++ b/queue-3.14/series @@ -60,3 +60,12 @@ mm-do-not-use-atomic-operations-when-releasing-pages.patch mm-do-not-use-unnecessary-atomic-operations-when-adding-pages-to-the-lru.patch fs-buffer-do-not-use-unnecessary-atomic-operations-when-discarding-buffers.patch mm-non-atomically-mark-page-accessed-during-page-cache-allocation-where-possible.patch +mm-avoid-unnecessary-atomic-operations-during-end_page_writeback.patch +shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch +mm-memory.c-use-entry-access_once-pte-in-handle_pte_fault.patch +mm-thp-only-collapse-hugepages-to-nodes-with-affinity-for-zone_reclaim_mode.patch +mm-make-copy_pte_range-static-again.patch +vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch +memcg-vmscan-fix-forced-scan-of-anonymous-pages.patch +mm-pagemap-avoid-unnecessary-overhead-when-tracepoints-are-deactivated.patch +mm-rearrange-zone-fields-into-read-only-page-alloc-statistics-and-page-reclaim-lines.patch diff --git a/queue-3.14/shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch b/queue-3.14/shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch new file mode 100644 index 00000000000..e593ba3e06c --- /dev/null +++ b/queue-3.14/shmem-fix-init_page_accessed-use-to-stop-pagelru-bug.patch @@ -0,0 +1,88 @@ +From 66d2f4d28cd030220e7ea2a628993fcabcb956d1 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Wed, 2 Jul 2014 15:22:38 -0700 +Subject: shmem: fix init_page_accessed use to stop !PageLRU bug + +From: Hugh Dickins + +commit 66d2f4d28cd030220e7ea2a628993fcabcb956d1 upstream. + +Under shmem swapping load, I sometimes hit the VM_BUG_ON_PAGE(!PageLRU) +in isolate_lru_pages() at mm/vmscan.c:1281! + +Commit 2457aec63745 ("mm: non-atomically mark page accessed during page +cache allocation where possible") looks like interrupted work-in-progress. + +mm/filemap.c's call to init_page_accessed() is fine, but not mm/shmem.c's +- shmem_write_begin() is clearly wrong to use it after shmem_getpage(), +when the page is always visible in radix_tree, and often already on LRU. + +Revert change to shmem_write_begin(), and use init_page_accessed() or +mark_page_accessed() appropriately for SGP_WRITE in shmem_getpage_gfp(). + +SGP_WRITE also covers shmem_symlink(), which did not mark_page_accessed() +before; but since many other filesystems use [__]page_symlink(), which did +and does mark the page accessed, consider this as rectifying an oversight. + +Signed-off-by: Hugh Dickins +Acked-by: Mel Gorman +Cc: Johannes Weiner +Cc: Vlastimil Babka +Cc: Michal Hocko +Cc: Dave Hansen +Cc: Prabhakar Lad +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman +--- + mm/shmem.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1035,6 +1035,9 @@ repeat: + goto failed; + } + ++ if (page && sgp == SGP_WRITE) ++ mark_page_accessed(page); ++ + /* fallocated page? 
*/ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) +@@ -1116,6 +1119,9 @@ repeat: + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + ++ if (sgp == SGP_WRITE) ++ mark_page_accessed(page); ++ + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); +@@ -1142,6 +1148,9 @@ repeat: + + __SetPageSwapBacked(page); + __set_page_locked(page); ++ if (sgp == SGP_WRITE) ++ init_page_accessed(page); ++ + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (error) +@@ -1438,13 +1447,9 @@ shmem_write_begin(struct file *file, str + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + { +- int ret; + struct inode *inode = mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; +- ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); +- if (ret == 0 && *pagep) +- init_page_accessed(*pagep); +- return ret; ++ return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + } + + static int diff --git a/queue-3.14/vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch b/queue-3.14/vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch new file mode 100644 index 00000000000..cae32e8c632 --- /dev/null +++ b/queue-3.14/vmalloc-use-rcu-list-iterator-to-reduce-vmap_area_lock-contention.patch @@ -0,0 +1,109 @@ +From 474750aba88817c53f39424e5567b8e4acc4b39b Mon Sep 17 00:00:00 2001 +From: Joonsoo Kim +Date: Wed, 6 Aug 2014 16:05:06 -0700 +Subject: vmalloc: use rcu list iterator to reduce vmap_area_lock contention + +From: Joonsoo Kim + +commit 474750aba88817c53f39424e5567b8e4acc4b39b upstream. + +Richard Yao reported a month ago that his system have a trouble with +vmap_area_lock contention during performance analysis by /proc/meminfo. +Andrew asked why his analysis checks /proc/meminfo stressfully, but he +didn't answer it. + + https://lkml.org/lkml/2014/4/10/416 + +Although I'm not sure that this is right usage or not, there is a +solution reducing vmap_area_lock contention with no side-effect. That +is just to use rcu list iterator in get_vmalloc_info(). + +rcu can be used in this function because all RCU protocol is already +respected by writers, since Nick Piggin commit db64fe02258f1 ("mm: +rewrite vmap layer") back in linux-2.6.28 + +Specifically : + insertions use list_add_rcu(), + deletions use list_del_rcu() and kfree_rcu(). + +Note the rb tree is not used from rcu reader (it would not be safe), +only the vmap_area_list has full RCU protection. + +Note that __purge_vmap_area_lazy() already uses this rcu protection. + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + +Peter: + +: While rcu list traversal over the vmap_area_list is safe, this may +: arrive at different results than the spinlocked version. The rcu list +: traversal version will not be a 'snapshot' of a single, valid instant +: of the entire vmap_area_list, but rather a potential amalgam of +: different list states. + +Joonsoo: + +: Yes, you are right, but I don't think that we should be strict here. +: Meminfo is already not a 'snapshot' at specific time. While we try to get +: certain stats, the other stats can change. 
And, although we may arrive at +: different results than the spinlocked version, the difference would not be +: large and would not make serious side-effect. + +[edumazet@google.com: add more commit description] +Signed-off-by: Joonsoo Kim +Reported-by: Richard Yao +Acked-by: Eric Dumazet +Cc: Peter Hurley +Cc: Zhang Yanfei +Cc: Johannes Weiner +Cc: Andi Kleen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmalloc.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -2681,14 +2681,14 @@ void get_vmalloc_info(struct vmalloc_inf + + prev_end = VMALLOC_START; + +- spin_lock(&vmap_area_lock); ++ rcu_read_lock(); + + if (list_empty(&vmap_area_list)) { + vmi->largest_chunk = VMALLOC_TOTAL; + goto out; + } + +- list_for_each_entry(va, &vmap_area_list, list) { ++ list_for_each_entry_rcu(va, &vmap_area_list, list) { + unsigned long addr = va->va_start; + + /* +@@ -2715,7 +2715,7 @@ void get_vmalloc_info(struct vmalloc_inf + vmi->largest_chunk = VMALLOC_END - prev_end; + + out: +- spin_unlock(&vmap_area_lock); ++ rcu_read_unlock(); + } + #endif +
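
The last patch in this queue swaps a spinlock-protected list walk for an RCU list iterator on the reader side of get_vmalloc_info(). As a rough illustration of that pattern outside the kernel tree, the sketch below does the same thing in userspace with liburcu; the header and symbol names used here (urcu.h, urcu/rculist.h, cds_list_add_rcu, cds_list_for_each_entry_rcu, rcu_register_thread) are assumptions about that library and are not taken from the patches themselves, and the writer-side mutex and node removal are omitted for brevity.

/*
 * Illustrative sketch only, not part of the patch series: readers walk
 * the list under rcu_read_lock() instead of taking the writers' lock,
 * mirroring the get_vmalloc_info() change above.  liburcu API names are
 * assumptions.
 */
#include <stdio.h>
#include <stdlib.h>

#include <urcu.h>          /* rcu_read_lock(), rcu_register_thread(), ... */
#include <urcu/rculist.h>  /* cds_list_add_rcu(), cds_list_for_each_entry_rcu() */

struct area {
	unsigned long start;
	unsigned long end;
	struct cds_list_head list;
};

/* Writers would also serialize against each other with a mutex (omitted). */
static CDS_LIST_HEAD(area_list);

/* Writer side: publish a new node; concurrent readers keep a valid view. */
static void area_add(unsigned long start, unsigned long end)
{
	struct area *a = malloc(sizeof(*a));

	if (!a)
		abort();
	a->start = start;
	a->end = end;
	cds_list_add_rcu(&a->list, &area_list);
}

/* Reader side: the analogue of get_vmalloc_info() after the patch. */
static unsigned long total_size(void)
{
	struct area *a;
	unsigned long total = 0;

	rcu_read_lock();	/* instead of spin_lock() on the writers' lock */
	cds_list_for_each_entry_rcu(a, &area_list, list)
		total += a->end - a->start;
	rcu_read_unlock();

	return total;
}

int main(void)
{
	rcu_register_thread();	/* each thread using RCU must register */

	area_add(0x1000, 0x3000);
	area_add(0x8000, 0x9000);
	printf("total: %lu bytes\n", total_size());

	rcu_unregister_thread();
	return 0;
}

Under those assumptions it should build with something like: gcc rcu_list.c -lurcu. The point, as in the patch, is only that readers no longer contend on the lock that writers hold; as the commit message notes, the traversal is then not a single consistent snapshot, which is acceptable for statistics such as /proc/meminfo.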