From: Greg Kroah-Hartman Date: Tue, 7 Oct 2014 16:58:56 +0000 (-0700) Subject: 3.14-stable patches X-Git-Tag: v3.10.57~6 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=61dc750beb4505fba3557973fbdd4a385993674f;p=thirdparty%2Fkernel%2Fstable-queue.git 3.14-stable patches added patches: mm-filemap.c-avoid-always-dirtying-mapping-flags-on-o_direct.patch mm-optimize-put_mems_allowed-usage.patch mm-vmscan-respect-numa-policy-mask-when-shrinking-slab-on-direct-reclaim.patch mm-vmscan-shrink_slab-rename-max_pass-freeable.patch vmscan-reclaim_clean_pages_from_list-must-use-mod_zone_page_state.patch --- diff --git a/queue-3.14/mm-filemap.c-avoid-always-dirtying-mapping-flags-on-o_direct.patch b/queue-3.14/mm-filemap.c-avoid-always-dirtying-mapping-flags-on-o_direct.patch new file mode 100644 index 00000000000..1ae33013e7a --- /dev/null +++ b/queue-3.14/mm-filemap.c-avoid-always-dirtying-mapping-flags-on-o_direct.patch @@ -0,0 +1,53 @@ +From 7fcbbaf18392f0b17c95e2f033c8ccf87eecde1d Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Thu, 22 May 2014 11:54:16 -0700 +Subject: mm/filemap.c: avoid always dirtying mapping->flags on O_DIRECT + +From: Jens Axboe + +commit 7fcbbaf18392f0b17c95e2f033c8ccf87eecde1d upstream. + +In some testing I ran today (some fio jobs that spread over two nodes), +we end up spending 40% of the time in filemap_check_errors(). That +smells fishy. Looking further, this is basically what happens: + +blkdev_aio_read() + generic_file_aio_read() + filemap_write_and_wait_range() + if (!mapping->nr_pages) + filemap_check_errors() + +and filemap_check_errors() always attempts two test_and_clear_bit() on +the mapping flags, thus dirtying it for every single invocation. The +patch below tests each of these bits before clearing them, avoiding this +issue. In my test case (4-socket box), performance went from 1.7M IOPS +to 4.0M IOPS. + +Signed-off-by: Jens Axboe +Acked-by: Jeff Moyer +Cc: Al Viro +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -192,9 +192,11 @@ static int filemap_check_errors(struct a + { + int ret = 0; + /* Check for outstanding write errors */ +- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ++ if (test_bit(AS_ENOSPC, &mapping->flags) && ++ test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; +- if (test_and_clear_bit(AS_EIO, &mapping->flags)) ++ if (test_bit(AS_EIO, &mapping->flags) && ++ test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; + } diff --git a/queue-3.14/mm-optimize-put_mems_allowed-usage.patch b/queue-3.14/mm-optimize-put_mems_allowed-usage.patch new file mode 100644 index 00000000000..9ab158bfabe --- /dev/null +++ b/queue-3.14/mm-optimize-put_mems_allowed-usage.patch @@ -0,0 +1,288 @@ +From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Thu, 3 Apr 2014 14:47:24 -0700 +Subject: mm: optimize put_mems_allowed() usage + +From: Mel Gorman + +commit d26914d11751b23ca2e8747725f2cae10c2f2c1b upstream. + +Since put_mems_allowed() is strictly optional, its a seqcount retry, we +don't need to evaluate the function if the allocation was in fact +successful, saving a smp_rmb some loads and comparisons on some relative +fast-paths. 
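In caller terms, the seqcount is only re-read when the allocation actually failed. A minimal sketch of the retry loop this produces (modelled on the mm/page_alloc.c and mm/mempolicy.c hunks further down; try_the_allocation() is only a stand-in for the real allocation path, not an actual kernel function):

	unsigned int cpuset_mems_cookie;
	struct page *page;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	page = try_the_allocation(gfp_mask, order);	/* stand-in for the real allocation */
	/*
	 * Only a failed allocation pays for the smp_rmb() and the
	 * seqcount comparison in read_mems_allowed_retry(); a successful
	 * one returns without looking at mems_allowed_seq again.
	 */
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return page;

With the old naming, the "while (!put_mems_allowed(cpuset_mems_cookie) && !page)" form evaluated the seqcount check unconditionally, which is exactly the cost this patch avoids.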
+ +Since the naming, get/put_mems_allowed() does suggest a mandatory +pairing, rename the interface, as suggested by Mel, to resemble the +seqcount interface. + +This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(), +where it is important to note that the return value of the latter call +is inverted from its previous incarnation. + +Signed-off-by: Peter Zijlstra +Signed-off-by: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/cpuset.h | 27 ++++++++++++++------------- + kernel/cpuset.c | 2 +- + mm/filemap.c | 4 ++-- + mm/hugetlb.c | 4 ++-- + mm/mempolicy.c | 12 ++++++------ + mm/page_alloc.c | 8 ++++---- + mm/slab.c | 4 ++-- + mm/slub.c | 16 +++++++--------- + 8 files changed, 38 insertions(+), 39 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); + extern void cpuset_print_task_mems_allowed(struct task_struct *p); + + /* +- * get_mems_allowed is required when making decisions involving mems_allowed +- * such as during page allocation. mems_allowed can be updated in parallel +- * and depending on the new value an operation can fail potentially causing +- * process failure. A retry loop with get_mems_allowed and put_mems_allowed +- * prevents these artificial failures. ++ * read_mems_allowed_begin is required when making decisions involving ++ * mems_allowed such as during page allocation. mems_allowed can be updated in ++ * parallel and depending on the new value an operation can fail potentially ++ * causing process failure. A retry loop with read_mems_allowed_begin and ++ * read_mems_allowed_retry prevents these artificial failures. + */ +-static inline unsigned int get_mems_allowed(void) ++static inline unsigned int read_mems_allowed_begin(void) + { + return read_seqcount_begin(¤t->mems_allowed_seq); + } + + /* +- * If this returns false, the operation that took place after get_mems_allowed +- * may have failed. It is up to the caller to retry the operation if ++ * If this returns true, the operation that took place after ++ * read_mems_allowed_begin may have failed artificially due to a concurrent ++ * update of mems_allowed. It is up to the caller to retry the operation if + * appropriate. + */ +-static inline bool put_mems_allowed(unsigned int seq) ++static inline bool read_mems_allowed_retry(unsigned int seq) + { +- return !read_seqcount_retry(¤t->mems_allowed_seq, seq); ++ return read_seqcount_retry(¤t->mems_allowed_seq, seq); + } + + static inline void set_mems_allowed(nodemask_t nodemask) +@@ -225,14 +226,14 @@ static inline void set_mems_allowed(node + { + } + +-static inline unsigned int get_mems_allowed(void) ++static inline unsigned int read_mems_allowed_begin(void) + { + return 0; + } + +-static inline bool put_mems_allowed(unsigned int seq) ++static inline bool read_mems_allowed_retry(unsigned int seq) + { +- return true; ++ return false; + } + + #endif /* !CONFIG_CPUSETS */ +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask( + task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing +- * get_mems_allowed(). If at least one node remains unchanged and ++ * read_mems_allowed_begin(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. 
+ */ +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -520,10 +520,10 @@ struct page *__page_cache_alloc(gfp_t gf + if (cpuset_do_page_mem_spread()) { + unsigned int cpuset_mems_cookie; + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); +- } while (!put_mems_allowed(cpuset_mems_cookie) && !page); ++ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + + return page; + } +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vm + goto err; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + +@@ -562,7 +562,7 @@ retry_cpuset: + } + + mpol_cond_put(mpol); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp) + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. + * +- * Must be protected by get_mems_allowed() ++ * Must be protected by read_mems_allowed_begin() + */ + struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags, struct mempolicy **mpol, +@@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, st + + retry_cpuset: + pol = get_vma_policy(current, vma, addr); +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + unsigned nid; +@@ -2069,7 +2069,7 @@ retry_cpuset: + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return page; +@@ -2079,7 +2079,7 @@ retry_cpuset: + policy_nodemask(gfp, pol)); + if (unlikely(mpol_needs_cond_ref(pol))) + __mpol_put(pol); +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + } +@@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t g + pol = &default_policy; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * No reference counting needed for current->mempolicy +@@ -2126,7 +2126,7 @@ retry_cpuset: + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); + +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return page; +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u + return NULL; + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + + /* The preferred zone is used for statistics later */ + first_zones_zonelist(zonelist, high_zoneidx, +@@ -2791,7 +2791,7 @@ out: + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. 
+ */ +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) ++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + memcg_kmem_commit_charge(page, memcg, order); +@@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int f + goto out; + + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + ret = !node_isset(nid, cpuset_current_mems_allowed); +- } while (!put_mems_allowed(cpuset_mems_cookie)); ++ } while (read_mems_allowed_retry(cpuset_mems_cookie)); + out: + return ret; + } +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_ + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + + retry_cpuset: +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(slab_node(), flags); + + retry: +@@ -3180,7 +3180,7 @@ retry: + } + } + +- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) ++ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return obj; + } +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem + return NULL; + + do { +- cpuset_mems_cookie = get_mems_allowed(); ++ cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(slab_node(), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; +@@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem + object = get_partial_node(s, n, c, flags); + if (object) { + /* +- * Return the object even if +- * put_mems_allowed indicated that +- * the cpuset mems_allowed was +- * updated in parallel. It's a +- * harmless race between the alloc +- * and the cpuset update. ++ * Don't check read_mems_allowed_retry() ++ * here - if mems_allowed was updated in ++ * parallel, that was a harmless race ++ * between allocation and the cpuset ++ * update + */ +- put_mems_allowed(cpuset_mems_cookie); + return object; + } + } + } +- } while (!put_mems_allowed(cpuset_mems_cookie)); ++ } while (read_mems_allowed_retry(cpuset_mems_cookie)); + #endif + return NULL; + } diff --git a/queue-3.14/mm-vmscan-respect-numa-policy-mask-when-shrinking-slab-on-direct-reclaim.patch b/queue-3.14/mm-vmscan-respect-numa-policy-mask-when-shrinking-slab-on-direct-reclaim.patch new file mode 100644 index 00000000000..7b04de44cc8 --- /dev/null +++ b/queue-3.14/mm-vmscan-respect-numa-policy-mask-when-shrinking-slab-on-direct-reclaim.patch @@ -0,0 +1,45 @@ +From 99120b772b52853f9a2b829a21dd44d9b20558f1 Mon Sep 17 00:00:00 2001 +From: Vladimir Davydov +Date: Thu, 3 Apr 2014 14:47:19 -0700 +Subject: mm: vmscan: respect NUMA policy mask when shrinking slab on direct reclaim + +From: Vladimir Davydov + +commit 99120b772b52853f9a2b829a21dd44d9b20558f1 upstream. + +When direct reclaim is executed by a process bound to a set of NUMA +nodes, we should scan only those nodes when possible, but currently we +will scan kmem from all online nodes even if the kmem shrinker is NUMA +aware. That said, binding a process to a particular NUMA node won't +prevent it from shrinking inode/dentry caches from other nodes, which is +not good. Fix this. 
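The fix is to walk the zonelist with the nodemask-aware iterator, so the node mask handed to NUMA-aware shrinkers is built only from nodes the reclaiming task is allowed to use. A rough sketch of the resulting loop in do_try_to_free_pages() (the bookkeeping lines are paraphrased from 3.14 and are not part of the two-line hunk below; sc->nodemask is, as far as I can tell, the nodemask of the allocation that triggered reclaim):

	nodes_clear(shrink->nodes_to_scan);
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;
		/* only zones on allowed nodes are ever recorded here */
		node_set(zone_to_nid(zone), shrink->nodes_to_scan);
		lru_pages += zone_reclaimable_pages(zone);
	}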
+ +Signed-off-by: Vladimir Davydov +Cc: Mel Gorman +Cc: Michal Hocko +Cc: Johannes Weiner +Cc: Rik van Riel +Cc: Dave Chinner +Cc: Glauber Costa +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2424,8 +2424,8 @@ static unsigned long do_try_to_free_page + unsigned long lru_pages = 0; + + nodes_clear(shrink->nodes_to_scan); +- for_each_zone_zonelist(zone, z, zonelist, +- gfp_zone(sc->gfp_mask)) { ++ for_each_zone_zonelist_nodemask(zone, z, zonelist, ++ gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + diff --git a/queue-3.14/mm-vmscan-shrink_slab-rename-max_pass-freeable.patch b/queue-3.14/mm-vmscan-shrink_slab-rename-max_pass-freeable.patch new file mode 100644 index 00000000000..9577ada95a2 --- /dev/null +++ b/queue-3.14/mm-vmscan-shrink_slab-rename-max_pass-freeable.patch @@ -0,0 +1,111 @@ +From d5bc5fd3fcb7b8dfb431694a8c8052466504c10c Mon Sep 17 00:00:00 2001 +From: Vladimir Davydov +Date: Thu, 3 Apr 2014 14:47:32 -0700 +Subject: mm: vmscan: shrink_slab: rename max_pass -> freeable + +From: Vladimir Davydov + +commit d5bc5fd3fcb7b8dfb431694a8c8052466504c10c upstream. + +The name `max_pass' is misleading, because this variable actually keeps +the estimate number of freeable objects, not the maximal number of +objects we can scan in this pass, which can be twice that. Rename it to +reflect its actual meaning. + +Signed-off-by: Vladimir Davydov +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control * + unsigned long freed = 0; + unsigned long long delta; + long total_scan; +- long max_pass; ++ long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + +- max_pass = shrinker->count_objects(shrinker, shrinkctl); +- if (max_pass == 0) ++ freeable = shrinker->count_objects(shrinker, shrinkctl); ++ if (freeable == 0) + return 0; + + /* +@@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control * + + total_scan = nr; + delta = (4 * nr_pages_scanned) / shrinker->seeks; +- delta *= max_pass; ++ delta *= freeable; + do_div(delta, lru_pages + 1); + total_scan += delta; + if (total_scan < 0) { + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); +- total_scan = max_pass; ++ total_scan = freeable; + } + + /* +@@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control * + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> +- * max_pass. This is bad for sustaining a working set in ++ * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. 
+ */ +- if (delta < max_pass / 4) +- total_scan = min(total_scan, max_pass / 2); ++ if (delta < freeable / 4) ++ total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ +- if (total_scan > max_pass * 2) +- total_scan = max_pass * 2; ++ if (total_scan > freeable * 2) ++ total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_pages_scanned, lru_pages, +- max_pass, delta, total_scan); ++ freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one +@@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control * + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater +- * than the total number of objects on slab (max_pass), we must be ++ * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || +- total_scan >= max_pass) { ++ total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + diff --git a/queue-3.14/series b/queue-3.14/series index d72dfd9205b..bd61f8b7e81 100644 --- a/queue-3.14/series +++ b/queue-3.14/series @@ -28,3 +28,8 @@ mm-compaction-clean-up-code-on-success-of-ballon-isolation.patch mm-compaction-determine-isolation-mode-only-once.patch mm-compaction-ignore-pageblock-skip-when-manually-invoking-compaction.patch mm-readahead.c-fix-readahead-failure-for-memoryless-numa-nodes-and-limit-readahead-pages.patch +mm-optimize-put_mems_allowed-usage.patch +mm-filemap.c-avoid-always-dirtying-mapping-flags-on-o_direct.patch +mm-vmscan-respect-numa-policy-mask-when-shrinking-slab-on-direct-reclaim.patch +mm-vmscan-shrink_slab-rename-max_pass-freeable.patch +vmscan-reclaim_clean_pages_from_list-must-use-mod_zone_page_state.patch diff --git a/queue-3.14/vmscan-reclaim_clean_pages_from_list-must-use-mod_zone_page_state.patch b/queue-3.14/vmscan-reclaim_clean_pages_from_list-must-use-mod_zone_page_state.patch new file mode 100644 index 00000000000..c63290f8bd5 --- /dev/null +++ b/queue-3.14/vmscan-reclaim_clean_pages_from_list-must-use-mod_zone_page_state.patch @@ -0,0 +1,39 @@ +From 83da7510058736c09a14b9c17ec7d851940a4332 Mon Sep 17 00:00:00 2001 +From: Christoph Lameter +Date: Fri, 18 Apr 2014 15:07:10 -0700 +Subject: vmscan: reclaim_clean_pages_from_list() must use mod_zone_page_state() + +From: Christoph Lameter + +commit 83da7510058736c09a14b9c17ec7d851940a4332 upstream. + +Seems to be called with preemption enabled. Therefore it must use +mod_zone_page_state instead. + +Signed-off-by: Christoph Lameter +Reported-by: Grygorii Strashko +Tested-by: Grygorii Strashko +Cc: Tejun Heo +Cc: Santosh Shilimkar +Cc: Ingo Molnar +Cc: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1144,7 +1144,7 @@ unsigned long reclaim_clean_pages_from_l + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); +- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); ++ mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; + } +
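The one-liner above follows the usual vmstat convention as I read it: the double-underscore helper assumes the caller has already disabled preemption (or interrupts), while the plain helper protects the per-cpu counters itself and is therefore the safe choice in a context, like reclaim_clean_pages_from_list(), that can run preemptible. A small sketch of the distinction, given a struct zone *zone (illustrative only, not taken from the patch):

	long nr_pages = 16;	/* example delta */
	unsigned long flags;

	/* __ variant: caller must already be non-preemptible */
	local_irq_save(flags);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_pages);
	local_irq_restore(flags);

	/*
	 * Plain variant: provides its own protection for the per-cpu
	 * counter, so it is safe to call with preemption enabled.
	 */
	mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_pages);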