--- /dev/null
+From 7fcbbaf18392f0b17c95e2f033c8ccf87eecde1d Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@fb.com>
+Date: Thu, 22 May 2014 11:54:16 -0700
+Subject: mm/filemap.c: avoid always dirtying mapping->flags on O_DIRECT
+
+From: Jens Axboe <axboe@fb.com>
+
+commit 7fcbbaf18392f0b17c95e2f033c8ccf87eecde1d upstream.
+
+In some testing I ran today (some fio jobs that spread over two nodes),
+we ended up spending 40% of the time in filemap_check_errors(). That
+smells fishy. Looking further, this is basically what happens:
+
+blkdev_aio_read()
+ generic_file_aio_read()
+ filemap_write_and_wait_range()
+ if (!mapping->nr_pages)
+ filemap_check_errors()
+
+and filemap_check_errors() always attempts two test_and_clear_bit() calls
+on the mapping flags, thus dirtying that cacheline on every single
+invocation, even when no error bit is set. The patch below tests each of
+these bits before clearing them, avoiding this issue. In my test case
+(4-socket box), performance went from 1.7M IOPS to 4.0M IOPS.
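+
+For reference, the resulting check-before-clear pattern is sketched below
+(a minimal reconstruction of filemap_check_errors() after this patch; the
+authoritative change is the mm/filemap.c hunk that follows):
+
+  static int filemap_check_errors(struct address_space *mapping)
+  {
+          int ret = 0;
+
+          /*
+           * test_bit() is a plain read; only do the locked RMW
+           * (test_and_clear_bit) when the error bit is actually set,
+           * so an error-free mapping never has its flags word dirtied.
+           */
+          if (test_bit(AS_ENOSPC, &mapping->flags) &&
+              test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+                  ret = -ENOSPC;
+          if (test_bit(AS_EIO, &mapping->flags) &&
+              test_and_clear_bit(AS_EIO, &mapping->flags))
+                  ret = -EIO;
+          return ret;
+  }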
+
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Acked-by: Jeff Moyer <jmoyer@redhat.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/filemap.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -192,9 +192,11 @@ static int filemap_check_errors(struct a
+ {
+ int ret = 0;
+ /* Check for outstanding write errors */
+- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
++ if (test_bit(AS_ENOSPC, &mapping->flags) &&
++ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+- if (test_and_clear_bit(AS_EIO, &mapping->flags))
++ if (test_bit(AS_EIO, &mapping->flags) &&
++ test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+ return ret;
+ }
--- /dev/null
+From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Thu, 3 Apr 2014 14:47:24 -0700
+Subject: mm: optimize put_mems_allowed() usage
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit d26914d11751b23ca2e8747725f2cae10c2f2c1b upstream.
+
+Since put_mems_allowed() is strictly optional (it is just a seqcount
+retry), we don't need to evaluate the function if the allocation was in
+fact successful, saving an smp_rmb, some loads and comparisons on some
+relatively fast paths.
+
+Since the naming of get/put_mems_allowed() does suggest a mandatory
+pairing, rename the interface, as suggested by Mel, to resemble the
+seqcount interface.
+
+This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(),
+where it is important to note that the return value of the latter call
+is inverted from its previous incarnation.
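+
+As an illustration, a caller such as __page_cache_alloc() (sketched from
+the mm/filemap.c hunk below, where gfp is the function's gfp_t argument)
+ends up with a retry loop of this shape:
+
+  unsigned int cpuset_mems_cookie;
+  struct page *page;
+  int n;
+
+  do {
+          /* snapshot the mems_allowed seqcount */
+          cpuset_mems_cookie = read_mems_allowed_begin();
+          n = cpuset_mem_spread_node();
+          page = alloc_pages_exact_node(n, gfp, 0);
+          /*
+           * Retry only if the allocation failed and mems_allowed
+           * changed underneath us; note the inverted return value
+           * compared to the old put_mems_allowed().
+           */
+  } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));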
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpuset.h | 27 ++++++++++++++-------------
+ kernel/cpuset.c | 2 +-
+ mm/filemap.c | 4 ++--
+ mm/hugetlb.c | 4 ++--
+ mm/mempolicy.c | 12 ++++++------
+ mm/page_alloc.c | 8 ++++----
+ mm/slab.c | 4 ++--
+ mm/slub.c | 16 +++++++---------
+ 8 files changed, 38 insertions(+), 39 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void);
+ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+
+ /*
+- * get_mems_allowed is required when making decisions involving mems_allowed
+- * such as during page allocation. mems_allowed can be updated in parallel
+- * and depending on the new value an operation can fail potentially causing
+- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+- * prevents these artificial failures.
++ * read_mems_allowed_begin is required when making decisions involving
++ * mems_allowed such as during page allocation. mems_allowed can be updated in
++ * parallel and depending on the new value an operation can fail potentially
++ * causing process failure. A retry loop with read_mems_allowed_begin and
++ * read_mems_allowed_retry prevents these artificial failures.
+ */
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ return read_seqcount_begin(&current->mems_allowed_seq);
+ }
+
+ /*
+- * If this returns false, the operation that took place after get_mems_allowed
+- * may have failed. It is up to the caller to retry the operation if
++ * If this returns true, the operation that took place after
++ * read_mems_allowed_begin may have failed artificially due to a concurrent
++ * update of mems_allowed. It is up to the caller to retry the operation if
+ * appropriate.
+ */
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+- return !read_seqcount_retry(&current->mems_allowed_seq, seq);
++ return read_seqcount_retry(&current->mems_allowed_seq, seq);
+ }
+
+ static inline void set_mems_allowed(nodemask_t nodemask)
+@@ -225,14 +226,14 @@ static inline void set_mems_allowed(node
+ {
+ }
+
+-static inline unsigned int get_mems_allowed(void)
++static inline unsigned int read_mems_allowed_begin(void)
+ {
+ return 0;
+ }
+
+-static inline bool put_mems_allowed(unsigned int seq)
++static inline bool read_mems_allowed_retry(unsigned int seq)
+ {
+- return true;
++ return false;
+ }
+
+ #endif /* !CONFIG_CPUSETS */
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(
+ task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+- * get_mems_allowed(). If at least one node remains unchanged and
++ * read_mems_allowed_begin(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -520,10 +520,10 @@ struct page *__page_cache_alloc(gfp_t gf
+ if (cpuset_do_page_mem_spread()) {
+ unsigned int cpuset_mems_cookie;
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+- } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
++ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return page;
+ }
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vm
+ goto err;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask(h), &mpol, &nodemask);
+
+@@ -562,7 +562,7 @@ retry_cpuset:
+ }
+
+ mpol_cond_put(mpol);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1897,7 +1897,7 @@ int node_random(const nodemask_t *maskp)
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
+ *
+- * Must be protected by get_mems_allowed()
++ * Must be protected by read_mems_allowed_begin()
+ */
+ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags, struct mempolicy **mpol,
+@@ -2061,7 +2061,7 @@ alloc_pages_vma(gfp_t gfp, int order, st
+
+ retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ unsigned nid;
+@@ -2069,7 +2069,7 @@ retry_cpuset:
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
+@@ -2079,7 +2079,7 @@ retry_cpuset:
+ policy_nodemask(gfp, pol));
+ if (unlikely(mpol_needs_cond_ref(pol)))
+ __mpol_put(pol);
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
+ }
+@@ -2113,7 +2113,7 @@ struct page *alloc_pages_current(gfp_t g
+ pol = &default_policy;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /*
+ * No reference counting needed for current->mempolicy
+@@ -2126,7 +2126,7 @@ retry_cpuset:
+ policy_zonelist(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
+
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -2736,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+ return NULL;
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /* The preferred zone is used for statistics later */
+ first_zones_zonelist(zonelist, high_zoneidx,
+@@ -2791,7 +2791,7 @@ out:
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
++ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ memcg_kmem_commit_charge(page, memcg, order);
+@@ -3059,9 +3059,9 @@ bool skip_free_areas_node(unsigned int f
+ goto out;
+
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+- } while (!put_mems_allowed(cpuset_mems_cookie));
++ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ out:
+ return ret;
+ }
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -3122,7 +3122,7 @@ static void *fallback_alloc(struct kmem_
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+ retry_cpuset:
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(slab_node(), flags);
+
+ retry:
+@@ -3180,7 +3180,7 @@ retry:
+ }
+ }
+
+- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
++ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return obj;
+ }
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem
+ return NULL;
+
+ do {
+- cpuset_mems_cookie = get_mems_allowed();
++ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(slab_node(), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+@@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem
+ object = get_partial_node(s, n, c, flags);
+ if (object) {
+ /*
+- * Return the object even if
+- * put_mems_allowed indicated that
+- * the cpuset mems_allowed was
+- * updated in parallel. It's a
+- * harmless race between the alloc
+- * and the cpuset update.
++ * Don't check read_mems_allowed_retry()
++ * here - if mems_allowed was updated in
++ * parallel, that was a harmless race
++ * between allocation and the cpuset
++ * update
+ */
+- put_mems_allowed(cpuset_mems_cookie);
+ return object;
+ }
+ }
+ }
+- } while (!put_mems_allowed(cpuset_mems_cookie));
++ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ #endif
+ return NULL;
+ }
--- /dev/null
+From d5bc5fd3fcb7b8dfb431694a8c8052466504c10c Mon Sep 17 00:00:00 2001
+From: Vladimir Davydov <vdavydov@parallels.com>
+Date: Thu, 3 Apr 2014 14:47:32 -0700
+Subject: mm: vmscan: shrink_slab: rename max_pass -> freeable
+
+From: Vladimir Davydov <vdavydov@parallels.com>
+
+commit d5bc5fd3fcb7b8dfb431694a8c8052466504c10c upstream.
+
+The name `max_pass' is misleading, because this variable actually keeps
+the estimated number of freeable objects, not the maximal number of
+objects we can scan in this pass, which can be twice that. Rename it to
+reflect its actual meaning.
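+
+For context, the renamed variable feeds the shrink_slab_node() scan-count
+calculation, which reads roughly as follows (simplified from the hunks
+below; the negative-total_scan error path and tracing are omitted):
+
+  freeable = shrinker->count_objects(shrinker, shrinkctl);
+
+  /* scale the slab scan work by the page-reclaim pressure */
+  delta = (4 * nr_pages_scanned) / shrinker->seeks;
+  delta *= freeable;
+  do_div(delta, lru_pages + 1);
+  total_scan = nr + delta;
+
+  /* only let a large delta empty the whole cache */
+  if (delta < freeable / 4)
+          total_scan = min(total_scan, freeable / 2);
+
+  /* never try to free more than twice the freeable estimate */
+  if (total_scan > freeable * 2)
+          total_scan = freeable * 2;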
+
+Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 26 +++++++++++++-------------
+ 1 file changed, 13 insertions(+), 13 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+- long max_pass;
++ long freeable;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+- max_pass = shrinker->count_objects(shrinker, shrinkctl);
+- if (max_pass == 0)
++ freeable = shrinker->count_objects(shrinker, shrinkctl);
++ if (freeable == 0)
+ return 0;
+
+ /*
+@@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+- delta *= max_pass;
++ delta *= freeable;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+- total_scan = max_pass;
++ total_scan = freeable;
+ }
+
+ /*
+@@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+- * max_pass. This is bad for sustaining a working set in
++ * freeable. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+- if (delta < max_pass / 4)
+- total_scan = min(total_scan, max_pass / 2);
++ if (delta < freeable / 4)
++ total_scan = min(total_scan, freeable / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+- if (total_scan > max_pass * 2)
+- total_scan = max_pass * 2;
++ if (total_scan > freeable * 2)
++ total_scan = freeable * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+- max_pass, delta, total_scan);
++ freeable, delta, total_scan);
+
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+@@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+- * than the total number of objects on slab (max_pass), we must be
++ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+- total_scan >= max_pass) {
++ total_scan >= freeable) {
+ unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
+