git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.14-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 19 Nov 2014 20:51:03 +0000 (12:51 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 19 Nov 2014 20:51:03 +0000 (12:51 -0800)
added patches:
fs-superblock-avoid-locking-counting-inodes-and-dentries-before-reclaiming-them.patch
fs-superblock-unregister-sb-shrinker-before-kill_sb.patch
mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages.patch
mm-compaction-do-not-count-migratepages-when-unnecessary.patch
mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention.patch
mm-fix-direct-reclaim-writeback-regression.patch
mm-page_alloc-prevent-migrate_reserve-pages-from-being-misplaced.patch
mm-vmscan-use-proportional-scanning-during-direct-reclaim-and-full-scan-at-def_priority.patch
x86-mm-in-the-pte-swapout-page-reclaim-case-clear-the-accessed-bit-instead-of-flushing-the-tlb.patch

queue-3.14/fs-superblock-avoid-locking-counting-inodes-and-dentries-before-reclaiming-them.patch [new file with mode: 0644]
queue-3.14/fs-superblock-unregister-sb-shrinker-before-kill_sb.patch [new file with mode: 0644]
queue-3.14/mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages.patch [new file with mode: 0644]
queue-3.14/mm-compaction-do-not-count-migratepages-when-unnecessary.patch [new file with mode: 0644]
queue-3.14/mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention.patch [new file with mode: 0644]
queue-3.14/mm-fix-direct-reclaim-writeback-regression.patch [new file with mode: 0644]
queue-3.14/mm-page_alloc-prevent-migrate_reserve-pages-from-being-misplaced.patch [new file with mode: 0644]
queue-3.14/mm-vmscan-use-proportional-scanning-during-direct-reclaim-and-full-scan-at-def_priority.patch [new file with mode: 0644]
queue-3.14/series
queue-3.14/x86-mm-in-the-pte-swapout-page-reclaim-case-clear-the-accessed-bit-instead-of-flushing-the-tlb.patch [new file with mode: 0644]

diff --git a/queue-3.14/fs-superblock-avoid-locking-counting-inodes-and-dentries-before-reclaiming-them.patch b/queue-3.14/fs-superblock-avoid-locking-counting-inodes-and-dentries-before-reclaiming-them.patch
new file mode 100644 (file)
index 0000000..4df7cf6
--- /dev/null
@@ -0,0 +1,92 @@
+From d23da150a37c9fe3cc83dbaf71b3e37fd434ed52 Mon Sep 17 00:00:00 2001
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Wed, 4 Jun 2014 16:10:47 -0700
+Subject: fs/superblock: avoid locking counting inodes and dentries before reclaiming them
+
+From: Tim Chen <tim.c.chen@linux.intel.com>
+
+commit d23da150a37c9fe3cc83dbaf71b3e37fd434ed52 upstream.
+
+We remove the call to grab_super_passive() in super_cache_count().
+It becomes a scalability bottleneck when multiple threads are doing
+memory reclamation, e.g. when we are doing a large amount of file reads
+and the page cache is under pressure.  The cached objects quickly get
+reclaimed down to 0 and we abort the cache_scan() reclaim, but the
+counting creates a log jam acquiring the sb_lock.
+
+We are holding the shrinker_rwsem, which ensures the safety of the calls
+to list_lru_count_node() and s_op->nr_cached_objects().  The shrinker is
+now unregistered before ->kill_sb(), so the operation is safe while we
+are unmounting.
+
+The impact will depend heavily on the machine and the workload, but for a
+small machine running postmark tuned to use 4x RAM size the results were
+
+                                  3.15.0-rc5            3.15.0-rc5
+                                     vanilla         shrinker-v1r1
+Ops/sec Transactions         21.00 (  0.00%)       24.00 ( 14.29%)
+Ops/sec FilesCreate          39.00 (  0.00%)       44.00 ( 12.82%)
+Ops/sec CreateTransact       10.00 (  0.00%)       12.00 ( 20.00%)
+Ops/sec FilesDeleted       6202.00 (  0.00%)     6202.00 (  0.00%)
+Ops/sec DeleteTransact       11.00 (  0.00%)       12.00 (  9.09%)
+Ops/sec DataRead/MB          25.97 (  0.00%)       29.10 ( 12.05%)
+Ops/sec DataWrite/MB         49.99 (  0.00%)       56.02 ( 12.06%)
+
+ffsb running in a configuration that is meant to simulate a mail server showed
+
+                                 3.15.0-rc5             3.15.0-rc5
+                                    vanilla          shrinker-v1r1
+Ops/sec readall           9402.63 (  0.00%)      9567.97 (  1.76%)
+Ops/sec create            4695.45 (  0.00%)      4735.00 (  0.84%)
+Ops/sec delete             173.72 (  0.00%)       179.83 (  3.52%)
+Ops/sec Transactions     14271.80 (  0.00%)     14482.81 (  1.48%)
+Ops/sec Read                37.00 (  0.00%)        37.60 (  1.62%)
+Ops/sec Write               18.20 (  0.00%)        18.30 (  0.55%)
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Jan Kara <jack@suse.cz>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/super.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -114,9 +114,14 @@ static unsigned long super_cache_count(s
+       sb = container_of(shrink, struct super_block, s_shrink);
+-      if (!grab_super_passive(sb))
+-              return 0;
+-
++      /*
++       * Don't call grab_super_passive as it is a potential
++       * scalability bottleneck. The counts could get updated
++       * between super_cache_count and super_cache_scan anyway.
++       * Call to super_cache_count with shrinker_rwsem held
++       * ensures the safety of call to list_lru_count_node() and
++       * s_op->nr_cached_objects().
++       */
+       if (sb->s_op && sb->s_op->nr_cached_objects)
+               total_objects = sb->s_op->nr_cached_objects(sb,
+                                                sc->nid);
+@@ -127,7 +132,6 @@ static unsigned long super_cache_count(s
+                                                sc->nid);
+       total_objects = vfs_pressure_ratio(total_objects);
+-      drop_super(sb);
+       return total_objects;
+ }
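
The change above rests on the observation that a count used only as a reclaim heuristic can be read without taking the lock that protects the underlying lists, since the value can go stale between the count and the scan anyway. As a rough illustration of that split (not kernel code), here is a minimal userspace sketch in C; struct cache, cache_count() and cache_scan() are invented names for the example.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct cache {
        pthread_mutex_t lock;      /* protects the object list itself (not shown) */
        atomic_long nr_objects;    /* approximate count kept by the add/remove paths */
};

static struct cache c = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .nr_objects = 100,
};

/* super_cache_count() analogue: lock-free, a slightly stale value is fine. */
static long cache_count(void)
{
        return atomic_load_explicit(&c.nr_objects, memory_order_relaxed);
}

/* super_cache_scan() analogue: the lock is only taken when objects are reclaimed. */
static long cache_scan(long nr_to_scan)
{
        long reclaimed = 0;

        pthread_mutex_lock(&c.lock);
        while (nr_to_scan-- > 0 && atomic_load(&c.nr_objects) > 0) {
                atomic_fetch_sub(&c.nr_objects, 1);     /* detach one object */
                reclaimed++;
        }
        pthread_mutex_unlock(&c.lock);
        return reclaimed;
}

int main(void)
{
        if (cache_count() > 0)          /* cheap, contention-free check */
                printf("reclaimed %ld objects\n", cache_scan(32));
        return 0;
}

The count path stays contention-free, and the cost of the lock is paid only when objects are actually detached, which mirrors the super_cache_count()/super_cache_scan() split after the patch.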
diff --git a/queue-3.14/fs-superblock-unregister-sb-shrinker-before-kill_sb.patch b/queue-3.14/fs-superblock-unregister-sb-shrinker-before-kill_sb.patch
new file mode 100644 (file)
index 0000000..bbcf03a
--- /dev/null
@@ -0,0 +1,136 @@
+From 28f2cd4f6da24a1aa06c226618ed5ad69e13df64 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <david@fromorbit.com>
+Date: Wed, 4 Jun 2014 16:10:46 -0700
+Subject: fs/superblock: unregister sb shrinker before ->kill_sb()
+
+From: Dave Chinner <david@fromorbit.com>
+
+commit 28f2cd4f6da24a1aa06c226618ed5ad69e13df64 upstream.
+
+This series is aimed at regressions noticed during reclaim activity.  The
+first two patches are shrinker patches that were posted ages ago but never
+merged for reasons that are unclear to me.  I'm posting them again to see
+if there was a reason they were dropped or if they just got lost.  Dave?
+Tim?  The last patch adjusts proportional reclaim.  Yuanhan Liu, can you
+retest the vm scalability test cases on a larger machine?  Hugh, does this
+work for you on the memcg test cases?
+
+Based on ext4, I get the following results but unfortunately my larger
+test machines are all unavailable so this is based on a relatively small
+machine.
+
+postmark
+                                  3.15.0-rc5            3.15.0-rc5
+                                     vanilla       proportion-v1r4
+Ops/sec Transactions         21.00 (  0.00%)       25.00 ( 19.05%)
+Ops/sec FilesCreate          39.00 (  0.00%)       45.00 ( 15.38%)
+Ops/sec CreateTransact       10.00 (  0.00%)       12.00 ( 20.00%)
+Ops/sec FilesDeleted       6202.00 (  0.00%)     6202.00 (  0.00%)
+Ops/sec DeleteTransact       11.00 (  0.00%)       12.00 (  9.09%)
+Ops/sec DataRead/MB          25.97 (  0.00%)       30.02 ( 15.59%)
+Ops/sec DataWrite/MB         49.99 (  0.00%)       57.78 ( 15.58%)
+
+ffsb (mail server simulator)
+                                 3.15.0-rc5             3.15.0-rc5
+                                    vanilla        proportion-v1r4
+Ops/sec readall           9402.63 (  0.00%)      9805.74 (  4.29%)
+Ops/sec create            4695.45 (  0.00%)      4781.39 (  1.83%)
+Ops/sec delete             173.72 (  0.00%)       177.23 (  2.02%)
+Ops/sec Transactions     14271.80 (  0.00%)     14764.37 (  3.45%)
+Ops/sec Read                37.00 (  0.00%)        38.50 (  4.05%)
+Ops/sec Write               18.20 (  0.00%)        18.50 (  1.65%)
+
+dd of a large file
+                                3.15.0-rc5            3.15.0-rc5
+                                   vanilla       proportion-v1r4
+WallTime DownloadTar       75.00 (  0.00%)       61.00 ( 18.67%)
+WallTime DD               423.00 (  0.00%)      401.00 (  5.20%)
+WallTime Delete             2.00 (  0.00%)        5.00 (-150.00%)
+
+stutter (times mmap latency during large amounts of IO)
+
+                            3.15.0-rc5            3.15.0-rc5
+                               vanilla       proportion-v1r4
+Unit >5ms Delays  80252.0000 (  0.00%)  81523.0000 ( -1.58%)
+Unit Mmap min         8.2118 (  0.00%)      8.3206 ( -1.33%)
+Unit Mmap mean       17.4614 (  0.00%)     17.2868 (  1.00%)
+Unit Mmap stddev     24.9059 (  0.00%)     34.6771 (-39.23%)
+Unit Mmap max      2811.6433 (  0.00%)   2645.1398 (  5.92%)
+Unit Mmap 90%        20.5098 (  0.00%)     18.3105 ( 10.72%)
+Unit Mmap 93%        22.9180 (  0.00%)     20.1751 ( 11.97%)
+Unit Mmap 95%        25.2114 (  0.00%)     22.4988 ( 10.76%)
+Unit Mmap 99%        46.1430 (  0.00%)     43.5952 (  5.52%)
+Unit Ideal  Tput     85.2623 (  0.00%)     78.8906 (  7.47%)
+Unit Tput min        44.0666 (  0.00%)     43.9609 (  0.24%)
+Unit Tput mean       45.5646 (  0.00%)     45.2009 (  0.80%)
+Unit Tput stddev      0.9318 (  0.00%)      1.1084 (-18.95%)
+Unit Tput max        46.7375 (  0.00%)     46.7539 ( -0.04%)
+
+This patch (of 3):
+
+We would like to unregister the sb shrinker before ->kill_sb().  This will
+allow cached objects to be counted without calling grab_super_passive() to
+update the ref count on the sb.  We want to avoid locking during memory
+reclamation, especially when we are skipping the memory reclaim because we
+are out of cached objects.
+
+This is safe because grab_super_passive() does a try-lock on
+sb->s_umount now, and so if we are in the unmount process, it won't ever
+block.  That means the deadlock and races we used to avoid by using
+grab_super_passive() now look like this:
+
+        shrinker                        umount
+
+        down_read(shrinker_rwsem)
+                                        down_write(sb->s_umount)
+                                        shrinker_unregister
+                                          down_write(shrinker_rwsem)
+                                            <blocks>
+        grab_super_passive(sb)
+          down_read_trylock(sb->s_umount)
+            <fails>
+        <shrinker aborts>
+        ....
+        <shrinkers finish running>
+        up_read(shrinker_rwsem)
+                                          <unblocks>
+                                          <removes shrinker>
+                                          up_write(shrinker_rwsem)
+                                        ->kill_sb()
+                                        ....
+
+So it is safe to deregister the shrinker before ->kill_sb().
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Jan Kara <jack@suse.cz>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/super.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/fs/super.c
++++ b/fs/super.c
+@@ -278,10 +278,8 @@ void deactivate_locked_super(struct supe
+       struct file_system_type *fs = s->s_type;
+       if (atomic_dec_and_test(&s->s_active)) {
+               cleancache_invalidate_fs(s);
+-              fs->kill_sb(s);
+-
+-              /* caches are now gone, we can safely kill the shrinker now */
+               unregister_shrinker(&s->s_shrink);
++              fs->kill_sb(s);
+               put_filesystem(fs);
+               put_super(s);
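
The locking diagram above can be replayed almost literally in userspace: because the shrinker side only ever try-locks the superblock lock while holding the registry lock for read, it can never take part in a circular wait with unmount. A hypothetical sketch with POSIX rwlocks standing in for shrinker_rwsem and sb->s_umount (shrink() and umount_thread() are invented helpers, not kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t shrinker_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

/* Shrinker side: read-lock the registry, then only *try* the superblock lock. */
static void shrink(void)
{
        pthread_rwlock_rdlock(&shrinker_rwsem);
        if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
                /* ... count and reclaim cached objects ... */
                pthread_rwlock_unlock(&s_umount);
        } else {
                /* umount in progress: abort instead of blocking */
        }
        pthread_rwlock_unlock(&shrinker_rwsem);
}

/* Umount side: write-lock the sb, unregister the shrinker, then "kill" the sb. */
static void *umount_thread(void *arg)
{
        (void)arg;
        pthread_rwlock_wrlock(&s_umount);
        pthread_rwlock_wrlock(&shrinker_rwsem);   /* waits for running shrinkers */
        /* ... remove the shrinker here, then the ->kill_sb() equivalent ... */
        pthread_rwlock_unlock(&shrinker_rwsem);
        pthread_rwlock_unlock(&s_umount);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, umount_thread, NULL);
        shrink();
        pthread_join(t, NULL);
        puts("no deadlock: the shrinker never blocks on s_umount");
        return 0;
}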
diff --git a/queue-3.14/mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages.patch b/queue-3.14/mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages.patch
new file mode 100644 (file)
index 0000000..33a0587
--- /dev/null
@@ -0,0 +1,107 @@
+From e9ade569910a82614ff5f2c2cea2b65a8d785da4 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Wed, 4 Jun 2014 16:08:34 -0700
+Subject: mm/compaction: avoid rescanning pageblocks in isolate_freepages
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit e9ade569910a82614ff5f2c2cea2b65a8d785da4 upstream.
+
+The compaction free scanner in isolate_freepages() currently remembers PFN
+of the highest pageblock where it successfully isolates, to be used as the
+starting pageblock for the next invocation.  The rationale behind this is
+that page migration might return free pages to the allocator when
+migration fails and we don't want to skip them if the compaction
+continues.
+
+Since migration now returns free pages back to compaction code where they
+can be reused, this is no longer a concern.  This patch changes
+isolate_freepages() so that the PFN for restarting is updated with each
+pageblock where isolation is attempted.  Using stress-highalloc from
+mmtests, this resulted in 10% reduction of the pages scanned by the free
+scanner.
+
+Note that the somewhat similar functionality that records the highest
+successful pageblock in zone->compact_cached_free_pfn remains unchanged.
+This cache is used when the whole compaction is restarted, not for
+multiple invocations of the free scanner during a single compaction.
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+Acked-by: Michal Nazarewicz <mina86@mina86.com>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Rik van Riel <riel@redhat.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |   22 +++++++---------------
+ 1 file changed, 7 insertions(+), 15 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -688,7 +688,6 @@ static void isolate_freepages(struct zon
+       unsigned long block_start_pfn;  /* start of current pageblock */
+       unsigned long block_end_pfn;    /* end of current pageblock */
+       unsigned long low_pfn;       /* lowest pfn scanner is able to scan */
+-      unsigned long next_free_pfn; /* start pfn for scaning at next round */
+       int nr_freepages = cc->nr_freepages;
+       struct list_head *freelist = &cc->freepages;
+@@ -709,12 +708,6 @@ static void isolate_freepages(struct zon
+       low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+       /*
+-       * If no pages are isolated, the block_start_pfn < low_pfn check
+-       * will kick in.
+-       */
+-      next_free_pfn = 0;
+-
+-      /*
+        * Isolate free pages until enough are available to migrate the
+        * pages on cc->migratepages. We stop searching if the migrate
+        * and free page scanners meet or enough free pages are isolated.
+@@ -754,19 +747,19 @@ static void isolate_freepages(struct zon
+                       continue;
+               /* Found a block suitable for isolating free pages from */
++              cc->free_pfn = block_start_pfn;
+               isolated = isolate_freepages_block(cc, block_start_pfn,
+                                       block_end_pfn, freelist, false);
+               nr_freepages += isolated;
+               /*
+-               * Record the highest PFN we isolated pages from. When next
+-               * looking for free pages, the search will restart here as
+-               * page migration may have returned some pages to the allocator
++               * Set a flag that we successfully isolated in this pageblock.
++               * In the next loop iteration, zone->compact_cached_free_pfn
++               * will not be updated and thus it will effectively contain the
++               * highest pageblock we isolated pages from.
+                */
+-              if (isolated && next_free_pfn == 0) {
++              if (isolated)
+                       cc->finished_update_free = true;
+-                      next_free_pfn = block_start_pfn;
+-              }
+       }
+       /* split_free_page does not map the pages */
+@@ -777,9 +770,8 @@ static void isolate_freepages(struct zon
+        * so that compact_finished() may detect this
+        */
+       if (block_start_pfn < low_pfn)
+-              next_free_pfn = cc->migrate_pfn;
++              cc->free_pfn = cc->migrate_pfn;
+-      cc->free_pfn = next_free_pfn;
+       cc->nr_freepages = nr_freepages;
+ }
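
Reduced to its essentials, the change above moves where the free scanner's restart cursor is recorded: it is now updated for every pageblock visited rather than only for the highest block where isolation succeeded, so the next invocation never re-walks blocks that were already inspected. A toy, hypothetical model of the difference in plain C (block indices instead of PFNs; isolate_block() and scan_free_blocks() are invented stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define NR_BLOCKS 16

/* Pretend isolation only succeeds in a couple of blocks. */
static bool isolate_block(int block)
{
        return block == 9 || block == 3;
}

/* Scan downwards from 'start'; return the cursor for the next invocation. */
static int scan_free_blocks(int start, int *isolated)
{
        int cursor = start;

        for (int block = start; block >= 0 && *isolated < 2; block--) {
                cursor = block;         /* updated for every visited block */
                if (isolate_block(block))
                        (*isolated)++;
        }
        return cursor;
}

int main(void)
{
        int isolated = 0;
        int cursor = scan_free_blocks(NR_BLOCKS - 1, &isolated);

        printf("first pass stopped at block %d after %d isolations\n",
               cursor, isolated);

        /*
         * The next pass restarts below the last *visited* block, whereas the
         * old scheme restarted at the highest *successful* block (9) and
         * rescanned blocks 9..4 even though they were already inspected.
         */
        isolated = 0;
        cursor = scan_free_blocks(cursor - 1, &isolated);
        printf("second pass stopped at block %d\n", cursor);
        return 0;
}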
diff --git a/queue-3.14/mm-compaction-do-not-count-migratepages-when-unnecessary.patch b/queue-3.14/mm-compaction-do-not-count-migratepages-when-unnecessary.patch
new file mode 100644 (file)
index 0000000..0fab5c3
--- /dev/null
@@ -0,0 +1,167 @@
+From f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Wed, 4 Jun 2014 16:08:32 -0700
+Subject: mm/compaction: do not count migratepages when unnecessary
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 upstream.
+
+During compaction, update_nr_listpages() has been used to count remaining
+non-migrated and free pages after a call to migrate_pages().  The
+freepages counting has become unnecessary, and it turns out that
+migratepages counting is also unnecessary in most cases.
+
+The only situation when it's needed to count cc->migratepages is when
+migrate_pages() returns with a negative error code.  Otherwise, the
+non-negative return value is the number of pages that were not migrated,
+which is exactly the count of remaining pages in the cc->migratepages
+list.
+
+Furthermore, any non-zero count is only interesting for the tracepoint of
+mm_compaction_migratepages events, because after that all remaining
+unmigrated pages are put back and their count is set to 0.
+
+This patch therefore removes update_nr_listpages() completely, and changes
+the tracepoint definition so that the manual counting is done only when
+the tracepoint is enabled, and only when migrate_pages() returns a
+negative error code.
+
+Furthermore, migrate_pages() and the tracepoints won't be called when
+there's nothing to migrate.  This potentially avoids some wasted cycles
+and reduces the volume of uninteresting mm_compaction_migratepages events
+where "nr_migrated=0 nr_failed=0".  In the stress-highalloc mmtest, this
+was about 75% of the events.  The mm_compaction_isolate_migratepages event
+is better for determining that nothing was isolated for migration, and
+this one was just duplicating the info.
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+Acked-by: Michal Nazarewicz <mina86@mina86.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Rik van Riel <riel@redhat.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/trace/events/compaction.h |   25 +++++++++++++++++++++----
+ mm/compaction.c                   |   31 +++++++------------------------
+ 2 files changed, 28 insertions(+), 28 deletions(-)
+
+--- a/include/trace/events/compaction.h
++++ b/include/trace/events/compaction.h
+@@ -5,6 +5,7 @@
+ #define _TRACE_COMPACTION_H
+ #include <linux/types.h>
++#include <linux/list.h>
+ #include <linux/tracepoint.h>
+ #include <trace/events/gfpflags.h>
+@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_templ
+ TRACE_EVENT(mm_compaction_migratepages,
+-      TP_PROTO(unsigned long nr_migrated,
+-              unsigned long nr_failed),
++      TP_PROTO(unsigned long nr_all,
++              int migrate_rc,
++              struct list_head *migratepages),
+-      TP_ARGS(nr_migrated, nr_failed),
++      TP_ARGS(nr_all, migrate_rc, migratepages),
+       TP_STRUCT__entry(
+               __field(unsigned long, nr_migrated)
+@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
+       ),
+       TP_fast_assign(
+-              __entry->nr_migrated = nr_migrated;
++              unsigned long nr_failed = 0;
++              struct list_head *page_lru;
++
++              /*
++               * migrate_pages() returns either a non-negative number
++               * with the number of pages that failed migration, or an
++               * error code, in which case we need to count the remaining
++               * pages manually
++               */
++              if (migrate_rc >= 0)
++                      nr_failed = migrate_rc;
++              else
++                      list_for_each(page_lru, migratepages)
++                              nr_failed++;
++
++              __entry->nr_migrated = nr_all - nr_failed;
+               __entry->nr_failed = nr_failed;
+       ),
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -822,22 +822,6 @@ static void compaction_free(struct page
+       cc->nr_freepages++;
+ }
+-/*
+- * We cannot control nr_migratepages fully when migration is running as
+- * migrate_pages() has no knowledge of of compact_control.  When migration is
+- * complete, we count the number of pages on the list by hand.
+- */
+-static void update_nr_listpages(struct compact_control *cc)
+-{
+-      int nr_migratepages = 0;
+-      struct page *page;
+-
+-      list_for_each_entry(page, &cc->migratepages, lru)
+-              nr_migratepages++;
+-
+-      cc->nr_migratepages = nr_migratepages;
+-}
+-
+ /* possible outcome of isolate_migratepages */
+ typedef enum {
+       ISOLATE_ABORT,          /* Abort compaction now */
+@@ -1032,7 +1016,6 @@ static int compact_zone(struct zone *zon
+       migrate_prep_local();
+       while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+-              unsigned long nr_migrate, nr_remaining;
+               int err;
+               switch (isolate_migratepages(zone, cc)) {
+@@ -1047,20 +1030,20 @@ static int compact_zone(struct zone *zon
+                       ;
+               }
+-              nr_migrate = cc->nr_migratepages;
++              if (!cc->nr_migratepages)
++                      continue;
++
+               err = migrate_pages(&cc->migratepages, compaction_alloc,
+                               compaction_free, (unsigned long)cc, cc->mode,
+                               MR_COMPACTION);
+-              update_nr_listpages(cc);
+-              nr_remaining = cc->nr_migratepages;
+-              trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+-                                              nr_remaining);
++              trace_mm_compaction_migratepages(cc->nr_migratepages, err,
++                                                      &cc->migratepages);
+-              /* Release isolated pages not migrated */
++              /* All pages were either migrated or will be released */
++              cc->nr_migratepages = 0;
+               if (err) {
+                       putback_movable_pages(&cc->migratepages);
+-                      cc->nr_migratepages = 0;
+                       /*
+                        * migrate_pages() may return -ENOMEM when scanners meet
+                        * and we want compact_finished() to detect it
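
The tracepoint rework above follows a general pattern: let the return value carry the failure count on the common path, and walk the list only when tracing is enabled and the return value is an error code that hides the count. A hypothetical standalone C sketch of that pattern (the node list, count_failed() and trace_migratepages() here are invented stand-ins, not the kernel's tracepoint machinery):

#include <stdio.h>

struct node { struct node *next; };

/*
 * migrate_rc >= 0 already tells us how many entries failed; only walk the
 * remaining list when an error code hides that information.
 */
static unsigned long count_failed(long migrate_rc, const struct node *remaining)
{
        unsigned long nr_failed = 0;

        if (migrate_rc >= 0)
                return (unsigned long)migrate_rc;

        for (const struct node *n = remaining; n; n = n->next)
                nr_failed++;
        return nr_failed;
}

static int trace_enabled = 1;   /* stands in for the tracepoint's enable check */

static void trace_migratepages(unsigned long nr_all, long migrate_rc,
                               const struct node *remaining)
{
        if (!trace_enabled)
                return;         /* no counting at all when tracing is off */

        unsigned long nr_failed = count_failed(migrate_rc, remaining);

        printf("nr_migrated=%lu nr_failed=%lu\n", nr_all - nr_failed, nr_failed);
}

int main(void)
{
        struct node a = { NULL }, b = { &a };   /* two entries left unmigrated */

        trace_migratepages(5, 2, &b);    /* success path: rc is the failed count */
        trace_migratepages(5, -12, &b);  /* error path: walk the list (2 failed) */
        return 0;
}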
diff --git a/queue-3.14/mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention.patch b/queue-3.14/mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention.patch
new file mode 100644 (file)
index 0000000..52bc176
--- /dev/null
@@ -0,0 +1,185 @@
+From be9765722e6b7ece8263cbab857490332339bd6f Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Wed, 4 Jun 2014 16:10:41 -0700
+Subject: mm, compaction: properly signal and act upon lock and need_sched() contention
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit be9765722e6b7ece8263cbab857490332339bd6f upstream.
+
+Compaction uses the compact_checklock_irqsave() function to periodically check
+for lock contention and need_resched() to either abort async compaction,
+or to free the lock, schedule and retake the lock.  When aborting,
+cc->contended is set to signal the contended state to the caller.  Two
+problems have been identified in this mechanism.
+
+First, compaction also calls cond_resched() directly in both scanners when
+no lock is yet taken.  This call neither aborts async compaction nor sets
+cc->contended appropriately.  This patch introduces a new
+compact_should_abort() function to achieve both.  In isolate_freepages(),
+the check frequency is reduced to once per SWAP_CLUSTER_MAX pageblocks to
+match what the migration scanner does in the preliminary page checks.  In
+case a pageblock is found suitable for calling isolate_freepages_block(),
+the checks within it are done at a higher frequency.
+
+Second, isolate_freepages() does not check if isolate_freepages_block()
+aborted due to contention, and advances to the next pageblock.  This
+violates the principle of aborting on contention, and might result in
+pageblocks not being scanned completely, since the scanning cursor is
+advanced.  This problem was noticed in the code by Joonsoo Kim when
+reviewing related patches.  This patch makes isolate_freepages_block()
+check the cc->contended flag and abort.
+
+In case isolate_freepages() has already isolated some pages before
+aborting due to contention, page migration will proceed, which is OK since
+we do not want to waste the work that has been done, and page migration
+has its own checks for contention.  However, we do not want another isolation
+attempt by either of the scanners, so a cc->contended flag check is also
+added to compaction_alloc() and compact_finished() to make sure compaction
+is aborted right after the migration.
+
+The outcome of the patch should be reduced lock contention by async
+compaction and lower latencies for higher-order allocations where direct
+compaction is involved.
+
+[akpm@linux-foundation.org: fix typo in comment]
+Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+Cc: Michal Nazarewicz <mina86@mina86.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Rik van Riel <riel@redhat.com>
+Acked-by: Michal Nazarewicz <mina86@mina86.com>
+Tested-by: Shawn Guo <shawn.guo@linaro.org>
+Tested-by: Kevin Hilman <khilman@linaro.org>
+Tested-by: Stephen Warren <swarren@nvidia.com>
+Tested-by: Fabio Estevam <fabio.estevam@freescale.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |   54 ++++++++++++++++++++++++++++++++++++++++++++----------
+ mm/internal.h   |    5 ++++-
+ 2 files changed, 48 insertions(+), 11 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -222,6 +222,30 @@ static bool compact_checklock_irqsave(sp
+       return true;
+ }
++/*
++ * Aside from avoiding lock contention, compaction also periodically checks
++ * need_resched() and either schedules in sync compaction or aborts async
++ * compaction. This is similar to what compact_checklock_irqsave() does, but
++ * is used where no lock is concerned.
++ *
++ * Returns false when no scheduling was needed, or sync compaction scheduled.
++ * Returns true when async compaction should abort.
++ */
++static inline bool compact_should_abort(struct compact_control *cc)
++{
++      /* async compaction aborts if contended */
++      if (need_resched()) {
++              if (cc->mode == MIGRATE_ASYNC) {
++                      cc->contended = true;
++                      return true;
++              }
++
++              cond_resched();
++      }
++
++      return false;
++}
++
+ /* Returns true if the page is within a block suitable for migration to */
+ static bool suitable_migration_target(struct page *page)
+ {
+@@ -494,11 +518,8 @@ isolate_migratepages_range(struct zone *
+                       return 0;
+       }
+-      if (cond_resched()) {
+-              /* Async terminates prematurely on need_resched() */
+-              if (cc->mode == MIGRATE_ASYNC)
+-                      return 0;
+-      }
++      if (compact_should_abort(cc))
++              return 0;
+       /* Time to isolate some pages for migration */
+       for (; low_pfn < end_pfn; low_pfn++) {
+@@ -720,9 +741,11 @@ static void isolate_freepages(struct zon
+               /*
+                * This can iterate a massively long zone without finding any
+                * suitable migration targets, so periodically check if we need
+-               * to schedule.
++               * to schedule, or even abort async compaction.
+                */
+-              cond_resched();
++              if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
++                                              && compact_should_abort(cc))
++                      break;
+               if (!pfn_valid(block_start_pfn))
+                       continue;
+@@ -760,6 +783,13 @@ static void isolate_freepages(struct zon
+                */
+               if (isolated)
+                       cc->finished_update_free = true;
++
++              /*
++               * isolate_freepages_block() might have aborted due to async
++               * compaction being contended
++               */
++              if (cc->contended)
++                      break;
+       }
+       /* split_free_page does not map the pages */
+@@ -786,9 +816,13 @@ static struct page *compaction_alloc(str
+       struct compact_control *cc = (struct compact_control *)data;
+       struct page *freepage;
+-      /* Isolate free pages if necessary */
++      /*
++       * Isolate free pages if necessary, and if we are not aborting due to
++       * contention.
++       */
+       if (list_empty(&cc->freepages)) {
+-              isolate_freepages(cc->zone, cc);
++              if (!cc->contended)
++                      isolate_freepages(cc->zone, cc);
+               if (list_empty(&cc->freepages))
+                       return NULL;
+@@ -858,7 +892,7 @@ static int compact_finished(struct zone
+       unsigned int order;
+       unsigned long watermark;
+-      if (fatal_signal_pending(current))
++      if (cc->contended || fatal_signal_pending(current))
+               return COMPACT_PARTIAL;
+       /* Compaction run completes if the migrate and free scanner meet */
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -144,7 +144,10 @@ struct compact_control {
+       int order;                      /* order a direct compactor needs */
+       int migratetype;                /* MOVABLE, RECLAIMABLE etc */
+       struct zone *zone;
+-      bool contended;                 /* True if a lock was contended */
++      bool contended;                 /* True if a lock was contended, or
++                                       * need_resched() true during async
++                                       * compaction
++                                       */
+ };
+ unsigned long
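
compact_should_abort() in the patch above is a small cooperative-scheduling helper: a pending reschedule aborts asynchronous work and records the contention, while synchronous work simply yields and carries on. A hypothetical userspace rendering, with an atomic flag standing in for need_resched() and sched_yield() for cond_resched() (the struct and helper names are borrowed loosely, not kernel code):

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC };

struct compact_control {
        enum migrate_mode mode;
        bool contended;          /* reported back to the caller on abort */
};

static atomic_bool resched_needed = true;   /* stands in for need_resched() */

/* Returns true when async work should abort; sync work yields and continues. */
static bool should_abort(struct compact_control *cc)
{
        if (atomic_load(&resched_needed)) {
                if (cc->mode == MIGRATE_ASYNC) {
                        cc->contended = true;
                        return true;
                }
                sched_yield();
        }
        return false;
}

int main(void)
{
        struct compact_control async_cc = { MIGRATE_ASYNC, false };
        struct compact_control sync_cc  = { MIGRATE_SYNC,  false };
        bool aborted;

        aborted = should_abort(&async_cc);
        printf("async: abort=%d contended=%d\n", aborted, async_cc.contended);

        aborted = should_abort(&sync_cc);
        printf("sync:  abort=%d contended=%d\n", aborted, sync_cc.contended);
        return 0;
}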
diff --git a/queue-3.14/mm-fix-direct-reclaim-writeback-regression.patch b/queue-3.14/mm-fix-direct-reclaim-writeback-regression.patch
new file mode 100644 (file)
index 0000000..0b73b7a
--- /dev/null
@@ -0,0 +1,90 @@
+From 8bdd638091605dc66d92c57c4b80eb87fffc15f7 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Sat, 26 Jul 2014 12:58:23 -0700
+Subject: mm: fix direct reclaim writeback regression
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 8bdd638091605dc66d92c57c4b80eb87fffc15f7 upstream.
+
+Shortly before 3.16-rc1, Dave Jones reported:
+
+  WARNING: CPU: 3 PID: 19721 at fs/xfs/xfs_aops.c:971
+           xfs_vm_writepage+0x5ce/0x630 [xfs]()
+  CPU: 3 PID: 19721 Comm: trinity-c61 Not tainted 3.15.0+ #3
+  Call Trace:
+    xfs_vm_writepage+0x5ce/0x630 [xfs]
+    shrink_page_list+0x8f9/0xb90
+    shrink_inactive_list+0x253/0x510
+    shrink_lruvec+0x563/0x6c0
+    shrink_zone+0x3b/0x100
+    shrink_zones+0x1f1/0x3c0
+    try_to_free_pages+0x164/0x380
+    __alloc_pages_nodemask+0x822/0xc90
+    alloc_pages_vma+0xaf/0x1c0
+    handle_mm_fault+0xa31/0xc50
+  etc.
+
+ 970   if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ 971                   PF_MEMALLOC))
+
+I did not respond at the time, because a glance at the PageDirty block
+in shrink_page_list() quickly shows that this is impossible: we don't do
+writeback on file pages (other than tmpfs) from direct reclaim nowadays.
+Dave was hallucinating, but it would have been disrespectful to say so.
+
+However, my own /var/log/messages now shows similar complaints
+
+  WARNING: CPU: 1 PID: 28814 at fs/ext4/inode.c:1881 ext4_writepage+0xa7/0x38b()
+  WARNING: CPU: 0 PID: 27347 at fs/ext4/inode.c:1764 ext4_writepage+0xa7/0x38b()
+
+from stressing some mmotm trees during July.
+
+Could a dirty xfs or ext4 file page somehow get marked PageSwapBacked,
+so fail shrink_page_list()'s page_is_file_cache() test, and so proceed
+to mapping->a_ops->writepage()?
+
+Yes, 3.16-rc1's commit 68711a746345 ("mm, migration: add destination
+page freeing callback") has provided such a way to compaction: if
+migrating a SwapBacked page fails, its newpage may be put back on the
+list for later use with PageSwapBacked still set, and nothing will clear
+it.
+
+Whether that can do anything worse than issue WARN_ON_ONCEs, and get
+some statistics wrong, is unclear: easier to fix than to think through
+the consequences.
+
+Fixing it here, before the put_new_page(), addresses the bug directly,
+but is probably the worst place to fix it.  Page migration is doing too
+many parts of the job on too many levels: fixing it in
+move_to_new_page() to complement its SetPageSwapBacked would be
+preferable, except why is it (and newpage->mapping and newpage->index)
+done there, rather than down in migrate_page_move_mapping(), once we are
+sure of success? Not a cleanup to get into right now, especially not
+with memcg cleanups coming in 3.17.
+
+Reported-by: Dave Jones <davej@redhat.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/migrate.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -993,9 +993,10 @@ out:
+        * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+        * during isolation.
+        */
+-      if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
++      if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
++              ClearPageSwapBacked(newpage);
+               put_new_page(newpage, private);
+-      else
++      } else
+               putback_lru_page(newpage);
+       if (result) {
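
Setting aside the page-migration specifics, the one-line fix above is an instance of clearing per-object state before an object is handed back through a caller-supplied free callback, so that a later reuse cannot inherit a stale flag. A hypothetical generic sketch (the buffer pool and names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct buffer {
        bool swap_backed;        /* stale state that must not leak into reuse */
        /* ... payload ... */
};

/* The callback owner may recycle the buffer, so hand it back clean. */
static void release_buffer(struct buffer *buf, void (*put_cb)(struct buffer *))
{
        buf->swap_backed = false;     /* the fix: clear before giving it back */
        put_cb(buf);
}

static void put_to_pool(struct buffer *buf)
{
        printf("returned to pool, swap_backed=%d\n", buf->swap_backed);
}

int main(void)
{
        struct buffer b = { .swap_backed = true };

        release_buffer(&b, put_to_pool);
        return 0;
}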
diff --git a/queue-3.14/mm-page_alloc-prevent-migrate_reserve-pages-from-being-misplaced.patch b/queue-3.14/mm-page_alloc-prevent-migrate_reserve-pages-from-being-misplaced.patch
new file mode 100644 (file)
index 0000000..9acb6a1
--- /dev/null
@@ -0,0 +1,134 @@
+From 5bcc9f86ef09a933255ee66bd899d4601785dad5 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Wed, 4 Jun 2014 16:07:22 -0700
+Subject: mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 5bcc9f86ef09a933255ee66bd899d4601785dad5 upstream.
+
+It is useful when MIGRATE_RESERVE pages do not get misplaced on the
+free_list of another migratetype; otherwise they might get allocated
+prematurely and e.g. fragment the MIGRATE_RESERVE pageblocks.
+While this cannot be avoided completely when allocating new
+MIGRATE_RESERVE pageblocks in the min_free_kbytes sysctl handler, we
+should prevent the misplacement where possible.
+
+Currently, it is possible for the misplacement to happen when a
+MIGRATE_RESERVE page is allocated on pcplist through rmqueue_bulk() as a
+fallback for other desired migratetype, and then later freed back
+through free_pcppages_bulk() without being actually used.  This happens
+because free_pcppages_bulk() uses get_freepage_migratetype() to choose
+the free_list, and rmqueue_bulk() calls set_freepage_migratetype() with
+the *desired* migratetype and not the page's original MIGRATE_RESERVE
+migratetype.
+
+This patch fixes the problem by moving the call to
+set_freepage_migratetype() from rmqueue_bulk() down to
+__rmqueue_smallest() and __rmqueue_fallback() where the actual page's
+migratetype (e.g.  from which free_list the page is taken from) is used.
+Note that this migratetype might be different from the pageblock's
+migratetype due to freepage stealing decisions.  This is OK, as page
+stealing never uses MIGRATE_RESERVE as a fallback, and also takes care
+to leave all MIGRATE_CMA pages on the correct freelist.
+
+Therefore, as an additional benefit, the call to
+get_pageblock_migratetype() from rmqueue_bulk() when CMA is enabled, can
+be removed completely.  This relies on the fact that MIGRATE_CMA
+pageblocks are created only during system init, and the above.  The
+related is_migrate_isolate() check is also unnecessary, as memory
+isolation has other ways to move pages between freelists, and drain pcp
+lists containing pages that should be isolated.  The buffered_rmqueue()
+can also benefit from calling get_freepage_migratetype() instead of
+get_pageblock_migratetype().
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Yong-Taek Lee <ytk.lee@samsung.com>
+Reported-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+Suggested-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Suggested-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Marek Szyprowski <m.szyprowski@samsung.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Michal Nazarewicz <mina86@mina86.com>
+Cc: "Wang, Yalin" <Yalin.Wang@sonymobile.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |   23 +++++++++++++----------
+ 1 file changed, 13 insertions(+), 10 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -943,6 +943,7 @@ struct page *__rmqueue_smallest(struct z
+               rmv_page_order(page);
+               area->nr_free--;
+               expand(zone, page, order, current_order, area, migratetype);
++              set_freepage_migratetype(page, migratetype);
+               return page;
+       }
+@@ -1069,7 +1070,9 @@ static int try_to_steal_freepages(struct
+       /*
+        * When borrowing from MIGRATE_CMA, we need to release the excess
+-       * buddy pages to CMA itself.
++       * buddy pages to CMA itself. We also ensure the freepage_migratetype
++       * is set to CMA so it is returned to the correct freelist in case
++       * the page ends up being not actually allocated from the pcp lists.
+        */
+       if (is_migrate_cma(fallback_type))
+               return fallback_type;
+@@ -1137,6 +1140,12 @@ __rmqueue_fallback(struct zone *zone, in
+                       expand(zone, page, order, current_order, area,
+                              new_type);
++                      /* The freepage_migratetype may differ from pageblock's
++                       * migratetype depending on the decisions in
++                       * try_to_steal_freepages. This is OK as long as it does
++                       * not differ for MIGRATE_CMA type.
++                       */
++                      set_freepage_migratetype(page, new_type);
+                       trace_mm_page_alloc_extfrag(page, order, current_order,
+                               start_migratetype, migratetype, new_type);
+@@ -1187,7 +1196,7 @@ static int rmqueue_bulk(struct zone *zon
+                       unsigned long count, struct list_head *list,
+                       int migratetype, int cold)
+ {
+-      int mt = migratetype, i;
++      int i;
+       spin_lock(&zone->lock);
+       for (i = 0; i < count; ++i) {
+@@ -1208,14 +1217,8 @@ static int rmqueue_bulk(struct zone *zon
+                       list_add(&page->lru, list);
+               else
+                       list_add_tail(&page->lru, list);
+-              if (IS_ENABLED(CONFIG_CMA)) {
+-                      mt = get_pageblock_migratetype(page);
+-                      if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
+-                              mt = migratetype;
+-              }
+-              set_freepage_migratetype(page, mt);
+               list = &page->lru;
+-              if (is_migrate_cma(mt))
++              if (is_migrate_cma(get_freepage_migratetype(page)))
+                       __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+                                             -(1 << order));
+       }
+@@ -1584,7 +1587,7 @@ again:
+               if (!page)
+                       goto failed;
+               __mod_zone_freepage_state(zone, -(1 << order),
+-                                        get_pageblock_migratetype(page));
++                                        get_freepage_migratetype(page));
+       }
+       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
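
The essence of the fix above is that a page must be tagged with the free list it was actually taken from, not the migratetype the caller asked for, or a later free via the per-cpu lists will put it back on the wrong list. A hypothetical two-list model in plain C (rmqueue() and free_page_to() are invented names for the sketch, not the kernel functions):

#include <stdio.h>

enum mt { MIGRATE_MOVABLE, MIGRATE_RESERVE, NR_TYPES };

static int freelist[NR_TYPES] = { 0, 4 };   /* only the reserve list has pages */
static const char *const name[NR_TYPES] = { "movable", "reserve" };

/* Take a page, falling back to another list; record the list actually used. */
static enum mt rmqueue(enum mt wanted, enum mt *tag)
{
        enum mt used = freelist[wanted] ? wanted : MIGRATE_RESERVE;

        freelist[used]--;
        *tag = used;        /* the fix: remember the real source, not 'wanted' */
        return used;
}

static void free_page_to(enum mt tag)
{
        freelist[tag]++;    /* goes back to the list recorded in the tag */
}

int main(void)
{
        enum mt tag;

        rmqueue(MIGRATE_MOVABLE, &tag);
        printf("allocated from the %s list\n", name[tag]);
        free_page_to(tag);
        printf("reserve list restored to %d pages\n", freelist[MIGRATE_RESERVE]);
        return 0;
}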
diff --git a/queue-3.14/mm-vmscan-use-proportional-scanning-during-direct-reclaim-and-full-scan-at-def_priority.patch b/queue-3.14/mm-vmscan-use-proportional-scanning-during-direct-reclaim-and-full-scan-at-def_priority.patch
new file mode 100644 (file)
index 0000000..8bc5613
--- /dev/null
@@ -0,0 +1,144 @@
+From 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 4 Jun 2014 16:10:49 -0700
+Subject: mm: vmscan: use proportional scanning during direct reclaim and full scan at DEF_PRIORITY
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 upstream.
+
+Commit "mm: vmscan: obey proportional scanning requirements for kswapd"
+ensured that file/anon lists were scanned proportionally for reclaim from
+kswapd but ignored it for direct reclaim.  The intent was to minimise
+direct reclaim latency, but Yuanhan Liu pointed out that it substitutes one
+long stall for many small stalls and distorts aging for normal workloads
+like streaming readers/writers.  Hugh Dickins pointed out that a
+side-effect of the same commit was that when one LRU list dropped to zero,
+the entirety of the other list was shrunk, leading to excessive
+reclaim in memcgs.  This patch scans the file/anon lists proportionally
+for direct reclaim to similarly age pages whether reclaimed by kswapd or
+direct reclaim, but takes care to abort reclaim if one LRU drops to zero
+after reclaiming the requested number of pages.
+
+Based on ext4 and using the Intel VM scalability test
+
+                                              3.15.0-rc5            3.15.0-rc5
+                                                shrinker            proportion
+Unit  lru-file-readonce    elapsed      5.3500 (  0.00%)      5.4200 ( -1.31%)
+Unit  lru-file-readonce time_range      0.2700 (  0.00%)      0.1400 ( 48.15%)
+Unit  lru-file-readonce time_stddv      0.1148 (  0.00%)      0.0536 ( 53.33%)
+Unit lru-file-readtwice    elapsed      8.1700 (  0.00%)      8.1700 (  0.00%)
+Unit lru-file-readtwice time_range      0.4300 (  0.00%)      0.2300 ( 46.51%)
+Unit lru-file-readtwice time_stddv      0.1650 (  0.00%)      0.0971 ( 41.16%)
+
+The test cases run multiple dd instances reading sparse files.  The results are within
+the noise for the small test machine.  The impact of the patch is more noticeable in the vmstats
+
+                            3.15.0-rc5  3.15.0-rc5
+                              shrinker  proportion
+Minor Faults                     35154       36784
+Major Faults                       611        1305
+Swap Ins                           394        1651
+Swap Outs                         4394        5891
+Allocation stalls               118616       44781
+Direct pages scanned           4935171     4602313
+Kswapd pages scanned          15921292    16258483
+Kswapd pages reclaimed        15913301    16248305
+Direct pages reclaimed         4933368     4601133
+Kswapd efficiency                  99%         99%
+Kswapd velocity             670088.047  682555.961
+Direct efficiency                  99%         99%
+Direct velocity             207709.217  193212.133
+Percentage direct scans            23%         22%
+Page writes by reclaim        4858.000    6232.000
+Page writes file                   464         341
+Page writes anon                  4394        5891
+
+Note that there are fewer allocation stalls even though the amount
+of direct reclaim scanning is very approximately the same.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Tim Chen <tim.c.chen@linux.intel.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Jan Kara <jack@suse.cz>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   36 +++++++++++++++++++++++++-----------
+ 1 file changed, 25 insertions(+), 11 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2018,13 +2018,27 @@ static void shrink_lruvec(struct lruvec
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
+-      bool scan_adjusted = false;
++      bool scan_adjusted;
+       get_scan_count(lruvec, sc, nr);
+       /* Record the original scan target for proportional adjustments later */
+       memcpy(targets, nr, sizeof(nr));
++      /*
++       * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
++       * event that can occur when there is little memory pressure e.g.
++       * multiple streaming readers/writers. Hence, we do not abort scanning
++       * when the requested number of pages are reclaimed when scanning at
++       * DEF_PRIORITY on the assumption that the fact we are direct
++       * reclaiming implies that kswapd is not keeping up and it is best to
++       * do a batch of work at once. For memcg reclaim one check is made to
++       * abort proportional reclaim if either the file or anon lru has already
++       * dropped to zero at the first pass.
++       */
++      scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
++                       sc->priority == DEF_PRIORITY);
++
+       blk_start_plug(&plug);
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                       nr[LRU_INACTIVE_FILE]) {
+@@ -2045,17 +2059,8 @@ static void shrink_lruvec(struct lruvec
+                       continue;
+               /*
+-               * For global direct reclaim, reclaim only the number of pages
+-               * requested. Less care is taken to scan proportionally as it
+-               * is more important to minimise direct reclaim stall latency
+-               * than it is to properly age the LRU lists.
+-               */
+-              if (global_reclaim(sc) && !current_is_kswapd())
+-                      break;
+-
+-              /*
+                * For kswapd and memcg, reclaim at least the number of pages
+-               * requested. Ensure that the anon and file LRUs shrink
++               * requested. Ensure that the anon and file LRUs are scanned
+                * proportionally what was requested by get_scan_count(). We
+                * stop reclaiming one LRU and reduce the amount scanning
+                * proportional to the original scan target.
+@@ -2063,6 +2068,15 @@ static void shrink_lruvec(struct lruvec
+               nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+               nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
++              /*
++               * It's just vindictive to attack the larger once the smaller
++               * has gone to zero.  And given the way we stop scanning the
++               * smaller below, this makes sure that we only make one nudge
++               * towards proportionality once we've got nr_to_reclaim.
++               */
++              if (!nr_file || !nr_anon)
++                      break;
++
+               if (nr_file > nr_anon) {
+                       unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+                                               targets[LRU_ACTIVE_ANON] + 1;
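
The proportional step the patch depends on is plain arithmetic: once nr_to_reclaim pages have been reclaimed, the LRU side with the larger remaining target is trimmed so that both sides end up scanned by roughly the same fraction of their original targets. A hypothetical, much-simplified rendering of that adjustment (the real shrink_lruvec() additionally splits each side into active and inactive lists and handles cases this sketch ignores):

#include <stdio.h>

/*
 * original[]  - scan targets as computed up front
 * remaining[] - what is still left to scan when nr_to_reclaim was hit
 * Trim the side with more work left so both sides keep roughly the same
 * scanned fraction of their original targets.
 */
static void rebalance(unsigned long original[2], unsigned long remaining[2])
{
        int small = remaining[0] <= remaining[1] ? 0 : 1;
        int large = 1 - small;

        /* fraction of the smaller side already scanned, in percent */
        unsigned long done = original[small] - remaining[small];
        unsigned long pct = done * 100 / (original[small] + 1);  /* +1 avoids /0 */

        /* the larger side should stop after roughly the same fraction */
        unsigned long target = original[large] * pct / 100;
        unsigned long scanned = original[large] - remaining[large];

        remaining[large] = target > scanned ? target - scanned : 0;
}

int main(void)
{
        unsigned long original[2]  = { 1000, 4000 };   /* anon, file scan targets */
        unsigned long remaining[2] = {  600, 3800 };   /* anon is 40% done, file 5% */
        unsigned long file_scanned = original[1] - remaining[1];

        rebalance(original, remaining);
        printf("file side: %lu more pages to scan, ~%lu%% of its target in total\n",
               remaining[1],
               (file_scanned + remaining[1]) * 100 / original[1]);
        return 0;
}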
diff --git a/queue-3.14/series b/queue-3.14/series
index ce9e2dc39d4bd79ae50a6e030fe74e1927b59960..e3d431defa862718e0e56ce44cc2dd5b7d8d222f 100644 (file)
@@ -111,3 +111,12 @@ mm-compaction-return-failed-migration-target-pages-back-to-freelist.patch
 mm-compaction-add-per-zone-migration-pfn-cache-for-async-compaction.patch
 mm-compaction-embed-migration-mode-in-compact_control.patch
 mm-compaction-terminate-async-compaction-when-rescheduling.patch
+mm-compaction-do-not-count-migratepages-when-unnecessary.patch
+mm-compaction-avoid-rescanning-pageblocks-in-isolate_freepages.patch
+mm-compaction-properly-signal-and-act-upon-lock-and-need_sched-contention.patch
+x86-mm-in-the-pte-swapout-page-reclaim-case-clear-the-accessed-bit-instead-of-flushing-the-tlb.patch
+mm-fix-direct-reclaim-writeback-regression.patch
+fs-superblock-unregister-sb-shrinker-before-kill_sb.patch
+fs-superblock-avoid-locking-counting-inodes-and-dentries-before-reclaiming-them.patch
+mm-vmscan-use-proportional-scanning-during-direct-reclaim-and-full-scan-at-def_priority.patch
+mm-page_alloc-prevent-migrate_reserve-pages-from-being-misplaced.patch
diff --git a/queue-3.14/x86-mm-in-the-pte-swapout-page-reclaim-case-clear-the-accessed-bit-instead-of-flushing-the-tlb.patch b/queue-3.14/x86-mm-in-the-pte-swapout-page-reclaim-case-clear-the-accessed-bit-instead-of-flushing-the-tlb.patch
new file mode 100644 (file)
index 0000000..7cd6a8e
--- /dev/null
@@ -0,0 +1,79 @@
+From b13b1d2d8692b437203de7a404c6b809d2cc4d99 Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@kernel.org>
+Date: Tue, 8 Apr 2014 15:58:09 +0800
+Subject: x86/mm: In the PTE swapout page reclaim case clear the accessed bit instead of flushing the TLB
+
+From: Shaohua Li <shli@kernel.org>
+
+commit b13b1d2d8692b437203de7a404c6b809d2cc4d99 upstream.
+
+We use the accessed bit to age a page at page reclaim time,
+and currently we also flush the TLB when doing so.
+
+But in some workloads TLB flush overhead is very heavy. In my
+simple multithreaded app with a lot of swap to several PCIe
+SSDs, removing the TLB flush gives about a 20%~30% swapout
+speedup.
+
+Fortunately just removing the TLB flush is a valid optimization:
+on x86 CPUs, clearing the accessed bit without a TLB flush
+doesn't cause data corruption.
+
+It could cause incorrect page aging and the (mistaken) reclaim of
+hot pages, but the chance of that should be relatively low.
+
+So as a performance optimization don't flush the TLB when
+clearing the accessed bit, it will eventually be flushed by
+a context switch or a VM operation anyway. [ In the rare
+event of it not getting flushed for a long time the delay
+shouldn't really matter because there's no real memory
+pressure for swapout to react to. ]
+
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Shaohua Li <shli@fusionio.com>
+Acked-by: Rik van Riel <riel@redhat.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Hugh Dickins <hughd@google.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: linux-mm@kvack.org
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20140408075809.GA1764@kernel.org
+[ Rewrote the changelog and the code comments. ]
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/pgtable.c |   21 ++++++++++++++-------
+ 1 file changed, 14 insertions(+), 7 deletions(-)
+
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -399,13 +399,20 @@ int pmdp_test_and_clear_young(struct vm_
+ int ptep_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pte_t *ptep)
+ {
+-      int young;
+-
+-      young = ptep_test_and_clear_young(vma, address, ptep);
+-      if (young)
+-              flush_tlb_page(vma, address);
+-
+-      return young;
++      /*
++       * On x86 CPUs, clearing the accessed bit without a TLB flush
++       * doesn't cause data corruption. [ It could cause incorrect
++       * page aging and the (mistaken) reclaim of hot pages, but the
++       * chance of that should be relatively low. ]
++       *
++       * So as a performance optimization don't flush the TLB when
++       * clearing the accessed bit, it will eventually be flushed by
++       * a context switch or a VM operation anyway. [ In the rare
++       * event of it not getting flushed for a long time the delay
++       * shouldn't really matter because there's no real memory
++       * pressure for swapout to react to. ]
++       */
++      return ptep_test_and_clear_young(vma, address, ptep);
+ }
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE