git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.14-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 7 Oct 2014 04:16:21 +0000 (21:16 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 7 Oct 2014 04:16:21 +0000 (21:16 -0700)
added patches:
lib-plist-add-helper-functions.patch
lib-plist-add-plist_requeue.patch
mm-compaction-avoid-isolating-pinned-pages.patch
mm-exclude-memoryless-nodes-from-zone_reclaim.patch
swap-change-swap_info-singly-linked-list-to-list_head.patch
swap-change-swap_list_head-to-plist-add-swap_avail_head.patch

queue-3.14/lib-plist-add-helper-functions.patch [new file with mode: 0644]
queue-3.14/lib-plist-add-plist_requeue.patch [new file with mode: 0644]
queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch [new file with mode: 0644]
queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch [new file with mode: 0644]
queue-3.14/series
queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch [new file with mode: 0644]
queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch [new file with mode: 0644]

diff --git a/queue-3.14/lib-plist-add-helper-functions.patch b/queue-3.14/lib-plist-add-helper-functions.patch
new file mode 100644 (file)
index 0000000..6d76a63
--- /dev/null
@@ -0,0 +1,121 @@
+From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:55 -0700
+Subject: lib/plist: add helper functions
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit fd16618e12a05df79a3439d72d5ffdac5d34f3da upstream.
+
+Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
+define and initialize a struct plist_head.
+
+Add plist_for_each_continue() and plist_for_each_entry_continue(),
+equivalent to list_for_each_continue() and list_for_each_entry_continue(),
+to iterate over a plist continuing after the current position.
+
+Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
+and ->next, implemented by list_prev_entry() and list_next_entry(), to
+access the prev/next struct plist_node entry.  These are needed because
+unlike struct list_head, direct access of the prev/next struct plist_node
+isn't possible; the list must be navigated via the contained struct
+list_head.  e.g.  instead of accessing the prev by list_prev_entry(node,
+node_list) it can be accessed by plist_prev(node).
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/plist.h |   43 +++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 43 insertions(+)
+
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -98,6 +98,13 @@ struct plist_node {
+ }
+ /**
++ * PLIST_HEAD - declare and init plist_head
++ * @head:     name for struct plist_head variable
++ */
++#define PLIST_HEAD(head) \
++      struct plist_head head = PLIST_HEAD_INIT(head)
++
++/**
+  * PLIST_NODE_INIT - static struct plist_node initializer
+  * @node:     struct plist_node variable name
+  * @__prio:   initial node priority
+@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node
+        list_for_each_entry(pos, &(head)->node_list, node_list)
+ /**
++ * plist_for_each_continue - continue iteration over the plist
++ * @pos:      the type * to use as a loop cursor
++ * @head:     the head for your list
++ *
++ * Continue to iterate over plist, continuing after the current position.
++ */
++#define plist_for_each_continue(pos, head)    \
++       list_for_each_entry_continue(pos, &(head)->node_list, node_list)
++
++/**
+  * plist_for_each_safe - iterate safely over a plist of given type
+  * @pos:      the type * to use as a loop counter
+  * @n:        another type * to use as temporary storage
+@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node
+        list_for_each_entry(pos, &(head)->node_list, mem.node_list)
+ /**
++ * plist_for_each_entry_continue - continue iteration over list of given type
++ * @pos:      the type * to use as a loop cursor
++ * @head:     the head for your list
++ * @m:                the name of the list_struct within the struct
++ *
++ * Continue to iterate over list of given type, continuing after
++ * the current position.
++ */
++#define plist_for_each_entry_continue(pos, head, m)   \
++      list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
++
++/**
+  * plist_for_each_entry_safe - iterate safely over list of given type
+  * @pos:      the type * to use as a loop counter
+  * @n:                another type * to use as temporary storage
+@@ -229,6 +258,20 @@ static inline int plist_node_empty(const
+ #endif
+ /**
++ * plist_next - get the next entry in list
++ * @pos:      the type * to cursor
++ */
++#define plist_next(pos) \
++      list_next_entry(pos, node_list)
++
++/**
++ * plist_prev - get the prev entry in list
++ * @pos:      the type * to cursor
++ */
++#define plist_prev(pos) \
++      list_prev_entry(pos, node_list)
++
++/**
+  * plist_first - return the first node (and thus, highest priority)
+  * @head:     the &struct plist_head pointer
+  *
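
A note on the helpers added above: a plist entry is only reachable through its
embedded struct list_head, so plist_next()/plist_prev() are simply
list_next_entry()/list_prev_entry() applied to that embedded node.  The
userspace sketch below illustrates the same container_of-based navigation with
mock types; struct item, item_next() and item_prev() are illustrative
stand-ins, not the kernel plist API.

/*
 * Minimal userspace analogue of plist_next()/plist_prev(): entries are
 * only reachable through an embedded list node, so "next entry" means
 * container_of(node.next).  Mock types only; not the kernel plist API.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_node { struct list_node *prev, *next; };

struct item {
	int prio;
	struct list_node node;		/* like plist_node.node_list */
};

/* analogues of plist_next()/plist_prev(): step via the embedded node */
#define item_next(pos) container_of((pos)->node.next, struct item, node)
#define item_prev(pos) container_of((pos)->node.prev, struct item, node)

static void node_add_tail(struct list_node *new, struct list_node *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

int main(void)
{
	struct list_node head = { &head, &head };
	struct item a = { .prio = 1 }, b = { .prio = 2 }, c = { .prio = 3 };

	node_add_tail(&a.node, &head);
	node_add_tail(&b.node, &head);
	node_add_tail(&c.node, &head);

	/* from b, step to its neighbours without touching the head */
	printf("next of b: prio %d\n", item_next(&b)->prio);	/* 3 */
	printf("prev of b: prio %d\n", item_prev(&b)->prio);	/* 1 */
	return 0;
}
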
diff --git a/queue-3.14/lib-plist-add-plist_requeue.patch b/queue-3.14/lib-plist-add-plist_requeue.patch
new file mode 100644 (file)
index 0000000..0b6ede4
--- /dev/null
@@ -0,0 +1,145 @@
+From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:57 -0700
+Subject: lib/plist: add plist_requeue
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit a75f232ce0fe38bd01301899ecd97ffd0254316a upstream.
+
+Add plist_requeue(), which moves the specified plist_node after all other
+same-priority plist_nodes in the list.  This is essentially an optimized
+plist_del() followed by plist_add().
+
+This is needed by swap, which (with the next patch in this set) uses a
+plist of available swap devices.  When a swap device (either a swap
+partition or swap file) is added to the system with swapon(), the device
+is added to a plist, ordered by the swap device's priority.  When swap
+needs to allocate a page from one of the swap devices, it takes the page
+from the first swap device on the plist, which is the highest priority
+swap device.  The swap device is left in the plist until all its pages are
+used, and then removed from the plist when it becomes full.
+
+However, as described in man 2 swapon, swap must allocate pages from swap
+devices with the same priority in round-robin order; to do this, on each
+swap page allocation, swap uses a page from the first swap device in the
+plist, and then calls plist_requeue() to move that swap device entry to
+after any other same-priority swap devices.  The next swap page allocation
+will again use a page from the first swap device in the plist and requeue
+it, and so on, resulting in round-robin usage of equal-priority swap
+devices.
+
+Also add a plist_test_requeue() test function, for use by plist_test() to
+test the plist_requeue() function.
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/plist.h |    2 +
+ lib/plist.c           |   52 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 54 insertions(+)
+
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -141,6 +141,8 @@ static inline void plist_node_init(struc
+ extern void plist_add(struct plist_node *node, struct plist_head *head);
+ extern void plist_del(struct plist_node *node, struct plist_head *head);
++extern void plist_requeue(struct plist_node *node, struct plist_head *head);
++
+ /**
+  * plist_for_each - iterate over the plist
+  * @pos:      the type * to use as a loop counter
+--- a/lib/plist.c
++++ b/lib/plist.c
+@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node,
+       plist_check_head(head);
+ }
++/**
++ * plist_requeue - Requeue @node at end of same-prio entries.
++ *
++ * This is essentially an optimized plist_del() followed by
++ * plist_add().  It moves an entry already in the plist to
++ * after any other same-priority entries.
++ *
++ * @node:     &struct plist_node pointer - entry to be moved
++ * @head:     &struct plist_head pointer - list head
++ */
++void plist_requeue(struct plist_node *node, struct plist_head *head)
++{
++      struct plist_node *iter;
++      struct list_head *node_next = &head->node_list;
++
++      plist_check_head(head);
++      BUG_ON(plist_head_empty(head));
++      BUG_ON(plist_node_empty(node));
++
++      if (node == plist_last(head))
++              return;
++
++      iter = plist_next(node);
++
++      if (node->prio != iter->prio)
++              return;
++
++      plist_del(node, head);
++
++      plist_for_each_continue(iter, head) {
++              if (node->prio != iter->prio) {
++                      node_next = &iter->node_list;
++                      break;
++              }
++      }
++      list_add_tail(&node->node_list, node_next);
++
++      plist_check_head(head);
++}
++
+ #ifdef CONFIG_DEBUG_PI_LIST
+ #include <linux/sched.h>
+ #include <linux/module.h>
+@@ -170,6 +210,14 @@ static void __init plist_test_check(int
+       BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+ }
++static void __init plist_test_requeue(struct plist_node *node)
++{
++      plist_requeue(node, &test_head);
++
++      if (node != plist_last(&test_head))
++              BUG_ON(node->prio == plist_next(node)->prio);
++}
++
+ static int  __init plist_test(void)
+ {
+       int nr_expect = 0, i, loop;
+@@ -193,6 +241,10 @@ static int  __init plist_test(void)
+                       nr_expect--;
+               }
+               plist_test_check(nr_expect);
++              if (!plist_node_empty(test_node + i)) {
++                      plist_test_requeue(test_node + i);
++                      plist_test_check(nr_expect);
++              }
+       }
+       for (i = 0; i < ARRAY_SIZE(test_node); i++) {
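
plist_requeue() implements the round-robin policy described above: take the
first entry, then move it behind any other entries of the same priority so the
next allocation picks a different equal-priority device.  Below is a minimal
userspace sketch of that requeue-after-equal-priority idea; the singly-linked
struct swap_dev and requeue_first() are hypothetical stand-ins, not the
kernel's plist code.

/*
 * Sketch of the plist_requeue() idea: the list is kept with the
 * preferred (higher-priority) devices first, and after each use the
 * first entry is moved behind any other entries of equal priority.
 */
#include <stdio.h>

struct swap_dev {
	const char *name;
	int prio;
	struct swap_dev *next;
};

/* move *headp behind the last same-priority entry, if any */
static void requeue_first(struct swap_dev **headp)
{
	struct swap_dev *first = *headp, *pos;

	if (!first || !first->next || first->next->prio != first->prio)
		return;			/* no equal-priority sibling follows */

	*headp = first->next;		/* unlink the first entry */
	for (pos = first->next; pos->next && pos->next->prio == first->prio;
	     pos = pos->next)
		;			/* find the last same-priority entry */
	first->next = pos->next;	/* relink right after it */
	pos->next = first;
}

int main(void)
{
	struct swap_dev c = { "swapC", 5, NULL };
	struct swap_dev b = { "swapB", 10, &c };
	struct swap_dev a = { "swapA", 10, &b };
	struct swap_dev *head = &a;

	/* four allocations alternate between the two priority-10 devices */
	for (int i = 0; i < 4; i++) {
		printf("allocate from %s\n", head->name);
		requeue_first(&head);
	}
	return 0;
}

The output alternates swapA, swapB, swapA, swapB: the two priority-10 devices
are used round-robin while the priority-5 device is never touched, which is
the behaviour get_swap_page() wants from plist_requeue().
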
diff --git a/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch b/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch
new file mode 100644 (file)
index 0000000..2402b7d
--- /dev/null
@@ -0,0 +1,69 @@
+From 119d6d59dcc0980dcd581fdadb6b2033b512a473 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Thu, 3 Apr 2014 14:48:00 -0700
+Subject: mm, compaction: avoid isolating pinned pages
+
+From: David Rientjes <rientjes@google.com>
+
+commit 119d6d59dcc0980dcd581fdadb6b2033b512a473 upstream.
+
+Page migration will fail for memory that is pinned in memory with, for
+example, get_user_pages().  In this case, it is unnecessary to take
+zone->lru_lock or isolate the page and pass it to page migration,
+which will ultimately fail.
+
+This is a racy check; the page can still change from under us, but in
+that case we'll just fail later when attempting to move the page.
+
+This avoids very expensive memory compaction when faulting transparent
+hugepages after pinning a lot of memory with a Mellanox driver.
+
+On a 128GB machine and pinning ~120GB of memory, before this patch we
+see the enormous disparity in the number of page migration failures
+because of the pinning (from /proc/vmstat):
+
+       compact_pages_moved 8450
+       compact_pagemigrate_failed 15614415
+
+0.05% of pages isolated are successfully migrated and explicitly
+triggering memory compaction takes 102 seconds.  After the patch:
+
+       compact_pages_moved 9197
+       compact_pagemigrate_failed 7
+
+99.9% of pages isolated are now successfully migrated in this
+configuration and memory compaction takes less than one second.
+
+Signed-off-by: David Rientjes <rientjes@google.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Greg Thelen <gthelen@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -584,6 +584,15 @@ isolate_migratepages_range(struct zone *
+                       continue;
+               }
++              /*
++               * Migration will fail if an anonymous page is pinned in memory,
++               * so avoid taking lru_lock and isolating it unnecessarily in an
++               * admittedly racy check.
++               */
++              if (!page_mapping(page) &&
++                  page_count(page) > page_mapcount(page))
++                      continue;
++
+               /* Check if it is ok to still hold the lock */
+               locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+                                                               locked, cc);
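
The check added above treats an anonymous page (one with no address_space
mapping) whose reference count exceeds its map count as likely pinned, e.g. by
get_user_pages(), and skips it before taking zone->lru_lock.  The standalone
sketch below restates that heuristic and the migration-success arithmetic from
the changelog; struct mock_page and likely_pinned() are illustrative only, not
the kernel's struct page API.

/*
 * Pin heuristic from the patch, restated on a mock page descriptor:
 * for a page with no mapping (anonymous), any reference beyond the
 * mapcount suggests an extra pin, so compaction skips it.
 */
#include <stdbool.h>
#include <stdio.h>

struct mock_page {
	void *mapping;	/* NULL for anonymous pages in this sketch */
	int count;	/* total references */
	int mapcount;	/* references from page tables */
};

static bool likely_pinned(const struct mock_page *page)
{
	return page->mapping == NULL && page->count > page->mapcount;
}

int main(void)
{
	struct mock_page anon_mapped = { NULL, 1, 1 };	/* only mapped */
	struct mock_page anon_pinned = { NULL, 3, 1 };	/* extra refs held */

	printf("mapped-only anon page pinned? %d\n", likely_pinned(&anon_mapped));
	printf("gup'd anon page pinned?       %d\n", likely_pinned(&anon_pinned));

	/* migration success rates quoted in the changelog */
	printf("before: %.2f%% of isolated pages migrated\n",
	       100.0 * 8450 / (8450 + 15614415));
	printf("after:  %.2f%% of isolated pages migrated\n",
	       100.0 * 9197 / (9197 + 7));
	return 0;
}
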
diff --git a/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch b/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch
new file mode 100644 (file)
index 0000000..1cc1633
--- /dev/null
@@ -0,0 +1,72 @@
+From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.cz>
+Date: Mon, 7 Apr 2014 15:37:01 -0700
+Subject: mm: exclude memoryless nodes from zone_reclaim
+
+From: Michal Hocko <mhocko@suse.cz>
+
+commit 70ef57e6c22c3323dce179b7d0d433c479266612 upstream.
+
+We had a report about strange OOM killer strikes on a PPC machine
+although there was a lot of free swap and tons of anonymous memory
+which could be swapped out.  In the end it turned out that the OOM was
+a side effect of zone reclaim, which wasn't unmapping and swapping out,
+and so the system was pushed into OOM.  Although this sounds like a bug
+somewhere in the kswapd vs. zone reclaim vs. direct reclaim
+interaction, numactl on the said hardware suggests that zone reclaim
+should not have been enabled in the first place:
+
+  node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  node 0 size: 0 MB
+  node 0 free: 0 MB
+  node 2 cpus:
+  node 2 size: 7168 MB
+  node 2 free: 6019 MB
+  node distances:
+  node   0   2
+  0:  10  40
+  2:  40  10
+
+So all the CPUs are associated with Node0, which doesn't have any
+memory, while Node2 contains all the available memory.  The node
+distances cause zone_reclaim_mode to be enabled automatically.
+
+Zone reclaim is intended to keep allocations local, but this doesn't
+make any sense on memoryless nodes.  So let's exclude such nodes in
+init_zone_allows_reclaim(), which evaluates zone reclaim behavior and
+sets up the suitable reclaim_nodes.
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+Acked-by: David Rientjes <rientjes@google.com>
+Acked-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
+Tested-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allow
+ {
+       int i;
+-      for_each_online_node(i)
++      for_each_node_state(i, N_MEMORY)
+               if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+                       node_set(i, NODE_DATA(nid)->reclaim_nodes);
+               else
+@@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(in
+       pgdat->node_id = nid;
+       pgdat->node_start_pfn = node_start_pfn;
+-      init_zone_allows_reclaim(nid);
++      if (node_state(nid, N_MEMORY))
++              init_zone_allows_reclaim(nid);
+ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+       get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ #endif
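
The fix narrows the loop in init_zone_allows_reclaim() to nodes that actually
have memory and skips the call entirely for memoryless nodes.  The userspace
sketch below replays the changelog's topology (node 0 with CPUs but no memory,
node 2 with all the memory, distance 40 between them) to show why walking every
online node enabled zone_reclaim_mode while walking only N_MEMORY nodes leaves
it off; the distance table and helper names are mock-ups, not kernel code.

/*
 * With the changelog's topology, a memoryless node at distance 40
 * trips the RECLAIM_DISTANCE test; considering only nodes that have
 * memory leaves zone_reclaim_mode disabled.
 */
#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_DISTANCE 30	/* default threshold used by the kernel */

static const int nodes[] = { 0, 2 };
static const bool has_memory[] = { false, true };	/* node 0, node 2 */

static int node_distance(int a, int b)
{
	return a == b ? 10 : 40;	/* from the numactl output above */
}

static bool reclaim_mode_for(int nid, bool memory_nodes_only)
{
	bool reclaim = false;

	for (unsigned int i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
		if (memory_nodes_only && !has_memory[i])
			continue;	/* the fix: skip memoryless nodes */
		if (node_distance(nid, nodes[i]) > RECLAIM_DISTANCE)
			reclaim = true;
	}
	return reclaim;
}

int main(void)
{
	/* evaluate for node 2, the only node that actually has memory */
	printf("all online nodes -> zone_reclaim_mode = %d\n",
	       reclaim_mode_for(2, false));	/* 1: node 0 is 40 away */
	printf("N_MEMORY nodes   -> zone_reclaim_mode = %d\n",
	       reclaim_mode_for(2, true));	/* 0: only node 2 itself */
	return 0;
}
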
diff --git a/queue-3.14/series b/queue-3.14/series
index a3580981162eb7154917c484d9c6634f266a6c1a..8b843668db672c5e6c1d35fca47ffd2a19387ca8 100644 (file)
@@ -13,3 +13,9 @@ mm-thp-move-invariant-bug-check-out-of-loop-in-__split_huge_page_map.patch
 mm-numa-do-not-mark-ptes-pte_numa-when-splitting-huge-pages.patch
 media-vb2-fix-vbi-poll-regression.patch
 jiffies-fix-timeval-conversion-to-jiffies.patch
+mm-exclude-memoryless-nodes-from-zone_reclaim.patch
+swap-change-swap_info-singly-linked-list-to-list_head.patch
+lib-plist-add-helper-functions.patch
+lib-plist-add-plist_requeue.patch
+swap-change-swap_list_head-to-plist-add-swap_avail_head.patch
+mm-compaction-avoid-isolating-pinned-pages.patch
diff --git a/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch b/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch
new file mode 100644 (file)
index 0000000..13fefb3
--- /dev/null
@@ -0,0 +1,480 @@
+From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:53 -0700
+Subject: swap: change swap_info singly-linked list to list_head
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit adfab836f4908deb049a5128082719e689eed964 upstream.
+
+The logic controlling the singly-linked list of swap_info_struct entries
+for all active, i.e.  swapon'ed, swap targets is rather complex, because:
+
+ - it stores the entries in priority order
+ - there is a pointer to the highest priority entry
+ - there is a pointer to the highest priority not-full entry
+ - there is a highest_priority_index variable set outside the swap_lock
+ - swap entries of equal priority should be used equally
+
+This complexity leads to bugs such as https://lkml.org/lkml/2014/2/13/181,
+where different-priority swap targets are incorrectly used equally.
+
+That bug probably could be solved with the existing singly-linked lists,
+but I think it would only add more complexity to the already difficult to
+understand get_swap_page() swap_list iteration logic.
+
+The first patch changes from a singly-linked list to a doubly-linked list
+using list_heads; the highest_priority_index and related code are removed
+and get_swap_page() starts each iteration at the highest priority
+swap_info entry, even if it's full.  While this does introduce unnecessary
+list iteration (i.e.  Schlemiel the painter's algorithm) in the case where
+one or more of the highest priority entries are full, the iteration and
+manipulation code is much simpler and behaves correctly re: the above bug;
+and the fourth patch removes the unnecessary iteration.
+
+The second patch adds some minor plist helper functions; nothing new
+really, just functions to match existing regular list functions.  These
+are used by the next two patches.
+
+The third patch adds plist_requeue(), which is used by get_swap_page() in
+the next patch - it performs the requeueing of same-priority entries
+(which moves the entry to the end of its priority in the plist), so that
+all equal-priority swap_info_structs get used equally.
+
+The fourth patch converts the main list into a plist, and adds a new plist
+that contains only swap_info entries that are both active and not full.
+As Mel suggested, using plists allows removing all the ordering code from
+swap - plists handle ordering automatically.  The list naming is also
+clarified now that there are two lists, with the original list changed
+from swap_list_head to swap_active_head and the new list named
+swap_avail_head.  A new spinlock is also added for the new list, so
+swap_info entries can be added or removed from the new list immediately as
+they become full or not full.
+
+This patch (of 4):
+
+Replace the singly-linked list tracking active, i.e.  swapon'ed,
+swap_info_struct entries with a doubly-linked list using struct
+list_heads.  Simplify the logic iterating and manipulating the list of
+entries, especially get_swap_page(), by using standard list_head
+functions, and removing the highest priority iteration logic.
+
+The change fixes the bug:
+https://lkml.org/lkml/2014/2/13/181
+in which different priority swap entries after the highest priority entry
+are incorrectly used equally in pairs.  The swap behavior is now as
+advertised, i.e. different priority swap entries are used in order, and
+equal priority swap targets are used concurrently.
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/swap.h     |    7 -
+ include/linux/swapfile.h |    2 
+ mm/frontswap.c           |   13 +--
+ mm/swapfile.c            |  171 +++++++++++++++++++----------------------------
+ 4 files changed, 78 insertions(+), 115 deletions(-)
+
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,8 +214,8 @@ struct percpu_cluster {
+ struct swap_info_struct {
+       unsigned long   flags;          /* SWP_USED etc: see above */
+       signed short    prio;           /* swap priority of this type */
++      struct list_head list;          /* entry in swap list */
+       signed char     type;           /* strange name for an index */
+-      signed char     next;           /* next type on the swap list */
+       unsigned int    max;            /* extent of the swap_map */
+       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
+       struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
+@@ -255,11 +255,6 @@ struct swap_info_struct {
+       struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ };
+-struct swap_list_t {
+-      int head;       /* head of priority-ordered swapfile list */
+-      int next;       /* swapfile to be used next */
+-};
+-
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+  * want to expose them to the dozens of source files that include swap.h
+  */
+ extern spinlock_t swap_lock;
+-extern struct swap_list_t swap_list;
++extern struct list_head swap_list_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_are
+ static unsigned long __frontswap_curr_pages(void)
+ {
+-      int type;
+       unsigned long totalpages = 0;
+       struct swap_info_struct *si = NULL;
+       assert_spin_locked(&swap_lock);
+-      for (type = swap_list.head; type >= 0; type = si->next) {
+-              si = swap_info[type];
++      list_for_each_entry(si, &swap_list_head, list)
+               totalpages += atomic_read(&si->frontswap_pages);
+-      }
+       return totalpages;
+ }
+@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsig
+       int si_frontswap_pages;
+       unsigned long total_pages_to_unuse = total;
+       unsigned long pages = 0, pages_to_unuse = 0;
+-      int type;
+       assert_spin_locked(&swap_lock);
+-      for (type = swap_list.head; type >= 0; type = si->next) {
+-              si = swap_info[type];
++      list_for_each_entry(si, &swap_list_head, list) {
+               si_frontswap_pages = atomic_read(&si->frontswap_pages);
+               if (total_pages_to_unuse < si_frontswap_pages) {
+                       pages = pages_to_unuse = total_pages_to_unuse;
+@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsig
+               }
+               vm_unacct_memory(pages);
+               *unused = pages_to_unuse;
+-              *swapid = type;
++              *swapid = si->type;
+               ret = 0;
+               break;
+       }
+@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long targ
+       /*
+        * we don't want to hold swap_lock while doing a very
+        * lengthy try_to_unuse, but swap_list may change
+-       * so restart scan from swap_list.head each time
++       * so restart scan from swap_list_head each time
+        */
+       spin_lock(&swap_lock);
+       ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
+ /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+ long total_swap_pages;
+ static int least_priority;
+-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
+ static const char Bad_file[] = "Bad swap file entry ";
+ static const char Unused_file[] = "Unused swap file entry ";
+ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+-struct swap_list_t swap_list = {-1, -1};
++/*
++ * all active swap_info_structs
++ * protected with swap_lock, and ordered by priority.
++ */
++LIST_HEAD(swap_list_head);
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+@@ -640,66 +643,54 @@ no_page:
+ swp_entry_t get_swap_page(void)
+ {
+-      struct swap_info_struct *si;
++      struct swap_info_struct *si, *next;
+       pgoff_t offset;
+-      int type, next;
+-      int wrapped = 0;
+-      int hp_index;
++      struct list_head *tmp;
+       spin_lock(&swap_lock);
+       if (atomic_long_read(&nr_swap_pages) <= 0)
+               goto noswap;
+       atomic_long_dec(&nr_swap_pages);
+-      for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+-              hp_index = atomic_xchg(&highest_priority_index, -1);
+-              /*
+-               * highest_priority_index records current highest priority swap
+-               * type which just frees swap entries. If its priority is
+-               * higher than that of swap_list.next swap type, we use it.  It
+-               * isn't protected by swap_lock, so it can be an invalid value
+-               * if the corresponding swap type is swapoff. We double check
+-               * the flags here. It's even possible the swap type is swapoff
+-               * and swapon again and its priority is changed. In such rare
+-               * case, low prority swap type might be used, but eventually
+-               * high priority swap will be used after several rounds of
+-               * swap.
+-               */
+-              if (hp_index != -1 && hp_index != type &&
+-                  swap_info[type]->prio < swap_info[hp_index]->prio &&
+-                  (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+-                      type = hp_index;
+-                      swap_list.next = type;
+-              }
+-
+-              si = swap_info[type];
+-              next = si->next;
+-              if (next < 0 ||
+-                  (!wrapped && si->prio != swap_info[next]->prio)) {
+-                      next = swap_list.head;
+-                      wrapped++;
+-              }
+-
++      list_for_each(tmp, &swap_list_head) {
++              si = list_entry(tmp, typeof(*si), list);
+               spin_lock(&si->lock);
+-              if (!si->highest_bit) {
+-                      spin_unlock(&si->lock);
+-                      continue;
+-              }
+-              if (!(si->flags & SWP_WRITEOK)) {
++              if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+                       spin_unlock(&si->lock);
+                       continue;
+               }
+-              swap_list.next = next;
++              /*
++               * rotate the current swap_info that we're going to use
++               * to after any other swap_info that have the same prio,
++               * so that all equal-priority swap_info get used equally
++               */
++              next = si;
++              list_for_each_entry_continue(next, &swap_list_head, list) {
++                      if (si->prio != next->prio)
++                              break;
++                      list_rotate_left(&si->list);
++                      next = si;
++              }
+               spin_unlock(&swap_lock);
+               /* This is called for allocating swap entry for cache */
+               offset = scan_swap_map(si, SWAP_HAS_CACHE);
+               spin_unlock(&si->lock);
+               if (offset)
+-                      return swp_entry(type, offset);
++                      return swp_entry(si->type, offset);
+               spin_lock(&swap_lock);
+-              next = swap_list.next;
++              /*
++               * if we got here, it's likely that si was almost full before,
++               * and since scan_swap_map() can drop the si->lock, multiple
++               * callers probably all tried to get a page from the same si
++               * and it filled up before we could get one.  So we need to
++               * try again.  Since we dropped the swap_lock, there may now
++               * be non-full higher priority swap_infos, and this si may have
++               * even been removed from the list (although very unlikely).
++               * Let's start over.
++               */
++              tmp = &swap_list_head;
+       }
+       atomic_long_inc(&nr_swap_pages);
+@@ -766,27 +757,6 @@ out:
+       return NULL;
+ }
+-/*
+- * This swap type frees swap entry, check if it is the highest priority swap
+- * type which just frees swap entry. get_swap_page() uses
+- * highest_priority_index to search highest priority swap type. The
+- * swap_info_struct.lock can't protect us if there are multiple swap types
+- * active, so we use atomic_cmpxchg.
+- */
+-static void set_highest_priority_index(int type)
+-{
+-      int old_hp_index, new_hp_index;
+-
+-      do {
+-              old_hp_index = atomic_read(&highest_priority_index);
+-              if (old_hp_index != -1 &&
+-                      swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+-                      break;
+-              new_hp_index = type;
+-      } while (atomic_cmpxchg(&highest_priority_index,
+-              old_hp_index, new_hp_index) != old_hp_index);
+-}
+-
+ static unsigned char swap_entry_free(struct swap_info_struct *p,
+                                    swp_entry_t entry, unsigned char usage)
+ {
+@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(str
+                       p->lowest_bit = offset;
+               if (offset > p->highest_bit)
+                       p->highest_bit = offset;
+-              set_highest_priority_index(p->type);
+               atomic_long_inc(&nr_swap_pages);
+               p->inuse_pages--;
+               frontswap_invalidate_page(p->type, offset);
+@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swa
+                               unsigned char *swap_map,
+                               struct swap_cluster_info *cluster_info)
+ {
+-      int i, prev;
++      struct swap_info_struct *si;
+       if (prio >= 0)
+               p->prio = prio;
+@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swa
+       atomic_long_add(p->pages, &nr_swap_pages);
+       total_swap_pages += p->pages;
+-      /* insert swap space into swap_list: */
+-      prev = -1;
+-      for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+-              if (p->prio >= swap_info[i]->prio)
+-                      break;
+-              prev = i;
++      assert_spin_locked(&swap_lock);
++      BUG_ON(!list_empty(&p->list));
++      /*
++       * insert into swap list; the list is in priority order,
++       * so that get_swap_page() can get a page from the highest
++       * priority swap_info_struct with available page(s), and
++       * swapoff can adjust the auto-assigned (i.e. negative) prio
++       * values for any lower-priority swap_info_structs when
++       * removing a negative-prio swap_info_struct
++       */
++      list_for_each_entry(si, &swap_list_head, list) {
++              if (p->prio >= si->prio) {
++                      list_add_tail(&p->list, &si->list);
++                      return;
++              }
+       }
+-      p->next = i;
+-      if (prev < 0)
+-              swap_list.head = swap_list.next = p->type;
+-      else
+-              swap_info[prev]->next = p->type;
++      /*
++       * this covers two cases:
++       * 1) p->prio is less than all existing prio
++       * 2) the swap list is empty
++       */
++      list_add_tail(&p->list, &swap_list_head);
+ }
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+       struct address_space *mapping;
+       struct inode *inode;
+       struct filename *pathname;
+-      int i, type, prev;
+-      int err;
++      int err, found = 0;
+       unsigned int old_block_size;
+       if (!capable(CAP_SYS_ADMIN))
+@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
+               goto out;
+       mapping = victim->f_mapping;
+-      prev = -1;
+       spin_lock(&swap_lock);
+-      for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+-              p = swap_info[type];
++      list_for_each_entry(p, &swap_list_head, list) {
+               if (p->flags & SWP_WRITEOK) {
+-                      if (p->swap_file->f_mapping == mapping)
++                      if (p->swap_file->f_mapping == mapping) {
++                              found = 1;
+                               break;
++                      }
+               }
+-              prev = type;
+       }
+-      if (type < 0) {
++      if (!found) {
+               err = -EINVAL;
+               spin_unlock(&swap_lock);
+               goto out_dput;
+@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
+               spin_unlock(&swap_lock);
+               goto out_dput;
+       }
+-      if (prev < 0)
+-              swap_list.head = p->next;
+-      else
+-              swap_info[prev]->next = p->next;
+-      if (type == swap_list.next) {
+-              /* just pick something that's safe... */
+-              swap_list.next = swap_list.head;
+-      }
+       spin_lock(&p->lock);
+       if (p->prio < 0) {
+-              for (i = p->next; i >= 0; i = swap_info[i]->next)
+-                      swap_info[i]->prio = p->prio--;
++              struct swap_info_struct *si = p;
++
++              list_for_each_entry_continue(si, &swap_list_head, list) {
++                      si->prio++;
++              }
+               least_priority++;
+       }
++      list_del_init(&p->list);
+       atomic_long_sub(p->pages, &nr_swap_pages);
+       total_swap_pages -= p->pages;
+       p->flags &= ~SWP_WRITEOK;
+@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+       spin_unlock(&swap_lock);
+       set_current_oom_origin();
+-      err = try_to_unuse(type, false, 0); /* force all pages to be unused */
++      err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+       clear_current_oom_origin();
+       if (err) {
+@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+       frontswap_map = frontswap_map_get(p);
+       spin_unlock(&p->lock);
+       spin_unlock(&swap_lock);
+-      frontswap_invalidate_area(type);
++      frontswap_invalidate_area(p->type);
+       frontswap_map_set(p, NULL);
+       mutex_unlock(&swapon_mutex);
+       free_percpu(p->percpu_cluster);
+@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+       vfree(cluster_info);
+       vfree(frontswap_map);
+       /* Destroy swap account information */
+-      swap_cgroup_swapoff(type);
++      swap_cgroup_swapoff(p->type);
+       inode = mapping->host;
+       if (S_ISBLK(inode->i_mode)) {
+@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_sw
+                */
+       }
+       INIT_LIST_HEAD(&p->first_swap_extent.list);
++      INIT_LIST_HEAD(&p->list);
+       p->flags = SWP_USED;
+-      p->next = -1;
+       spin_unlock(&swap_lock);
+       spin_lock_init(&p->lock);
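
With the list_head conversion above, swapon inserts each device into a single
priority-ordered list (the _enable_swap_info() hunk) and get_swap_page()
rotates equal-priority entries with list_rotate_left().  The sketch below
mirrors just the ordered-insert part in userspace with a hypothetical
singly-linked struct swap_dev; the kernel walks a struct list_head and uses
list_add_tail() instead.

/*
 * Priority-ordered insertion, as in the new _enable_swap_info(): link
 * the new device in front of the first entry whose priority is not
 * higher, keeping the highest-priority devices at the head.
 */
#include <stdio.h>

struct swap_dev {
	const char *name;
	int prio;
	struct swap_dev *next;
};

static void insert_by_prio(struct swap_dev **headp, struct swap_dev *p)
{
	struct swap_dev **pos = headp;

	while (*pos && p->prio < (*pos)->prio)
		pos = &(*pos)->next;	/* keep highest priority first */
	p->next = *pos;
	*pos = p;
}

int main(void)
{
	struct swap_dev a = { "swapA", 5 }, b = { "swapB", 10 }, c = { "swapC", -1 };
	struct swap_dev *head = NULL, *pos;

	insert_by_prio(&head, &a);
	insert_by_prio(&head, &b);
	insert_by_prio(&head, &c);	/* auto-assigned negative prio goes last */

	for (pos = head; pos; pos = pos->next)
		printf("%s (prio %d)\n", pos->name, pos->prio);
	return 0;
}

The devices print as swapB (10), swapA (5), swapC (-1): highest priority first,
which is the order get_swap_page() wants to try them in.
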
diff --git a/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch b/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch
new file mode 100644 (file)
index 0000000..b1ecacf
--- /dev/null
@@ -0,0 +1,367 @@
+From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:59 -0700
+Subject: swap: change swap_list_head to plist, add swap_avail_head
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit 18ab4d4ced0817421e6db6940374cc39d28d65da upstream.
+
+Originally get_swap_page() started iterating through the singly-linked
+list of swap_info_structs using swap_list.next or highest_priority_index,
+which both were intended to point to the highest priority active swap
+target that was not full.  The first patch in this series changed the
+singly-linked list to a doubly-linked list, and removed the logic to start
+at the highest priority non-full entry; it starts scanning at the highest
+priority entry each time, even if the entry is full.
+
+Replace the manually ordered swap_list_head with a plist, swap_active_head.
+Add a new plist, swap_avail_head.  The original swap_active_head plist
+contains all active swap_info_structs, as before, while the new
+swap_avail_head plist contains only swap_info_structs that are active and
+available, i.e. not full.  Add a new spinlock, swap_avail_lock, to protect
+the swap_avail_head list.
+
+Mel Gorman suggested using plists since they internally handle ordering
+the list entries based on priority, which is exactly what swap was doing
+manually.  All the ordering code is now removed, and swap_info_struct
+entries are simply added to their corresponding plist and automatically
+ordered correctly.
+
+Using a new plist for available swap_info_structs simplifies and
+optimizes get_swap_page(), which no longer has to iterate over full
+swap_info_structs.  Using a new spinlock for the swap_avail_head plist
+allows each swap_info_struct to add or remove itself from the plist
+when it becomes full or not-full; previously this was not possible
+because the swap_info_struct->lock is held when it changes between
+full and not-full, and the swap_lock protecting the main
+swap_active_head must be taken before any swap_info_struct->lock.
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/swap.h     |    3 
+ include/linux/swapfile.h |    2 
+ mm/frontswap.c           |    6 -
+ mm/swapfile.c            |  145 +++++++++++++++++++++++++++++------------------
+ 4 files changed, 97 insertions(+), 59 deletions(-)
+
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,7 +214,8 @@ struct percpu_cluster {
+ struct swap_info_struct {
+       unsigned long   flags;          /* SWP_USED etc: see above */
+       signed short    prio;           /* swap priority of this type */
+-      struct list_head list;          /* entry in swap list */
++      struct plist_node list;         /* entry in swap_active_head */
++      struct plist_node avail_list;   /* entry in swap_avail_head */
+       signed char     type;           /* strange name for an index */
+       unsigned int    max;            /* extent of the swap_map */
+       unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+  * want to expose them to the dozens of source files that include swap.h
+  */
+ extern spinlock_t swap_lock;
+-extern struct list_head swap_list_head;
++extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pa
+       struct swap_info_struct *si = NULL;
+       assert_spin_locked(&swap_lock);
+-      list_for_each_entry(si, &swap_list_head, list)
++      plist_for_each_entry(si, &swap_active_head, list)
+               totalpages += atomic_read(&si->frontswap_pages);
+       return totalpages;
+ }
+@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsig
+       unsigned long pages = 0, pages_to_unuse = 0;
+       assert_spin_locked(&swap_lock);
+-      list_for_each_entry(si, &swap_list_head, list) {
++      plist_for_each_entry(si, &swap_active_head, list) {
+               si_frontswap_pages = atomic_read(&si->frontswap_pages);
+               if (total_pages_to_unuse < si_frontswap_pages) {
+                       pages = pages_to_unuse = total_pages_to_unuse;
+@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long targ
+       /*
+        * we don't want to hold swap_lock while doing a very
+        * lengthy try_to_unuse, but swap_list may change
+-       * so restart scan from swap_list_head each time
++       * so restart scan from swap_active_head each time
+        */
+       spin_lock(&swap_lock);
+       ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unu
+  * all active swap_info_structs
+  * protected with swap_lock, and ordered by priority.
+  */
+-LIST_HEAD(swap_list_head);
++PLIST_HEAD(swap_active_head);
++
++/*
++ * all available (active, not full) swap_info_structs
++ * protected with swap_avail_lock, ordered by priority.
++ * This is used by get_swap_page() instead of swap_active_head
++ * because swap_active_head includes all swap_info_structs,
++ * but get_swap_page() doesn't need to look at full ones.
++ * This uses its own lock instead of swap_lock because when a
++ * swap_info_struct changes between not-full/full, it needs to
++ * add/remove itself to/from this list, but the swap_info_struct->lock
++ * is held and the locking order requires swap_lock to be taken
++ * before any swap_info_struct->lock.
++ */
++static PLIST_HEAD(swap_avail_head);
++static DEFINE_SPINLOCK(swap_avail_lock);
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+@@ -594,6 +609,9 @@ checks:
+       if (si->inuse_pages == si->pages) {
+               si->lowest_bit = si->max;
+               si->highest_bit = 0;
++              spin_lock(&swap_avail_lock);
++              plist_del(&si->avail_list, &swap_avail_head);
++              spin_unlock(&swap_avail_lock);
+       }
+       si->swap_map[offset] = usage;
+       inc_cluster_info_page(si, si->cluster_info, offset);
+@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void)
+ {
+       struct swap_info_struct *si, *next;
+       pgoff_t offset;
+-      struct list_head *tmp;
+-      spin_lock(&swap_lock);
+       if (atomic_long_read(&nr_swap_pages) <= 0)
+               goto noswap;
+       atomic_long_dec(&nr_swap_pages);
+-      list_for_each(tmp, &swap_list_head) {
+-              si = list_entry(tmp, typeof(*si), list);
++      spin_lock(&swap_avail_lock);
++
++start_over:
++      plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
++              /* requeue si to after same-priority siblings */
++              plist_requeue(&si->avail_list, &swap_avail_head);
++              spin_unlock(&swap_avail_lock);
+               spin_lock(&si->lock);
+               if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
++                      spin_lock(&swap_avail_lock);
++                      if (plist_node_empty(&si->avail_list)) {
++                              spin_unlock(&si->lock);
++                              goto nextsi;
++                      }
++                      WARN(!si->highest_bit,
++                           "swap_info %d in list but !highest_bit\n",
++                           si->type);
++                      WARN(!(si->flags & SWP_WRITEOK),
++                           "swap_info %d in list but !SWP_WRITEOK\n",
++                           si->type);
++                      plist_del(&si->avail_list, &swap_avail_head);
+                       spin_unlock(&si->lock);
+-                      continue;
++                      goto nextsi;
+               }
+-              /*
+-               * rotate the current swap_info that we're going to use
+-               * to after any other swap_info that have the same prio,
+-               * so that all equal-priority swap_info get used equally
+-               */
+-              next = si;
+-              list_for_each_entry_continue(next, &swap_list_head, list) {
+-                      if (si->prio != next->prio)
+-                              break;
+-                      list_rotate_left(&si->list);
+-                      next = si;
+-              }
+-
+-              spin_unlock(&swap_lock);
+               /* This is called for allocating swap entry for cache */
+               offset = scan_swap_map(si, SWAP_HAS_CACHE);
+               spin_unlock(&si->lock);
+               if (offset)
+                       return swp_entry(si->type, offset);
+-              spin_lock(&swap_lock);
++              pr_debug("scan_swap_map of si %d failed to find offset\n",
++                     si->type);
++              spin_lock(&swap_avail_lock);
++nextsi:
+               /*
+                * if we got here, it's likely that si was almost full before,
+                * and since scan_swap_map() can drop the si->lock, multiple
+                * callers probably all tried to get a page from the same si
+-               * and it filled up before we could get one.  So we need to
+-               * try again.  Since we dropped the swap_lock, there may now
+-               * be non-full higher priority swap_infos, and this si may have
+-               * even been removed from the list (although very unlikely).
+-               * Let's start over.
++               * and it filled up before we could get one; or, the si filled
++               * up between us dropping swap_avail_lock and taking si->lock.
++               * Since we dropped the swap_avail_lock, the swap_avail_head
++               * list may have been modified; so if next is still in the
++               * swap_avail_head list then try it, otherwise start over.
+                */
+-              tmp = &swap_list_head;
++              if (plist_node_empty(&next->avail_list))
++                      goto start_over;
+       }
++      spin_unlock(&swap_avail_lock);
++
+       atomic_long_inc(&nr_swap_pages);
+ noswap:
+-      spin_unlock(&swap_lock);
+       return (swp_entry_t) {0};
+ }
+@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(str
+               dec_cluster_info_page(p, p->cluster_info, offset);
+               if (offset < p->lowest_bit)
+                       p->lowest_bit = offset;
+-              if (offset > p->highest_bit)
++              if (offset > p->highest_bit) {
++                      bool was_full = !p->highest_bit;
+                       p->highest_bit = offset;
++                      if (was_full && (p->flags & SWP_WRITEOK)) {
++                              spin_lock(&swap_avail_lock);
++                              WARN_ON(!plist_node_empty(&p->avail_list));
++                              if (plist_node_empty(&p->avail_list))
++                                      plist_add(&p->avail_list,
++                                                &swap_avail_head);
++                              spin_unlock(&swap_avail_lock);
++                      }
++              }
+               atomic_long_inc(&nr_swap_pages);
+               p->inuse_pages--;
+               frontswap_invalidate_page(p->type, offset);
+@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swa
+                               unsigned char *swap_map,
+                               struct swap_cluster_info *cluster_info)
+ {
+-      struct swap_info_struct *si;
+-
+       if (prio >= 0)
+               p->prio = prio;
+       else
+               p->prio = --least_priority;
++      /*
++       * the plist prio is negated because plist ordering is
++       * low-to-high, while swap ordering is high-to-low
++       */
++      p->list.prio = -p->prio;
++      p->avail_list.prio = -p->prio;
+       p->swap_map = swap_map;
+       p->cluster_info = cluster_info;
+       p->flags |= SWP_WRITEOK;
+@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swa
+       total_swap_pages += p->pages;
+       assert_spin_locked(&swap_lock);
+-      BUG_ON(!list_empty(&p->list));
+-      /*
+-       * insert into swap list; the list is in priority order,
+-       * so that get_swap_page() can get a page from the highest
+-       * priority swap_info_struct with available page(s), and
+-       * swapoff can adjust the auto-assigned (i.e. negative) prio
+-       * values for any lower-priority swap_info_structs when
+-       * removing a negative-prio swap_info_struct
+-       */
+-      list_for_each_entry(si, &swap_list_head, list) {
+-              if (p->prio >= si->prio) {
+-                      list_add_tail(&p->list, &si->list);
+-                      return;
+-              }
+-      }
+       /*
+-       * this covers two cases:
+-       * 1) p->prio is less than all existing prio
+-       * 2) the swap list is empty
++       * both lists are plists, and thus priority ordered.
++       * swap_active_head needs to be priority ordered for swapoff(),
++       * which on removal of any swap_info_struct with an auto-assigned
++       * (i.e. negative) priority increments the auto-assigned priority
++       * of any lower-priority swap_info_structs.
++       * swap_avail_head needs to be priority ordered for get_swap_page(),
++       * which allocates swap pages from the highest available priority
++       * swap_info_struct.
+        */
+-      list_add_tail(&p->list, &swap_list_head);
++      plist_add(&p->list, &swap_active_head);
++      spin_lock(&swap_avail_lock);
++      plist_add(&p->avail_list, &swap_avail_head);
++      spin_unlock(&swap_avail_lock);
+ }
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+       mapping = victim->f_mapping;
+       spin_lock(&swap_lock);
+-      list_for_each_entry(p, &swap_list_head, list) {
++      plist_for_each_entry(p, &swap_active_head, list) {
+               if (p->flags & SWP_WRITEOK) {
+                       if (p->swap_file->f_mapping == mapping) {
+                               found = 1;
+@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __us
+               spin_unlock(&swap_lock);
+               goto out_dput;
+       }
++      spin_lock(&swap_avail_lock);
++      plist_del(&p->avail_list, &swap_avail_head);
++      spin_unlock(&swap_avail_lock);
+       spin_lock(&p->lock);
+       if (p->prio < 0) {
+               struct swap_info_struct *si = p;
+-              list_for_each_entry_continue(si, &swap_list_head, list) {
++              plist_for_each_entry_continue(si, &swap_active_head, list) {
+                       si->prio++;
++                      si->list.prio--;
++                      si->avail_list.prio--;
+               }
+               least_priority++;
+       }
+-      list_del_init(&p->list);
++      plist_del(&p->list, &swap_active_head);
+       atomic_long_sub(p->pages, &nr_swap_pages);
+       total_swap_pages -= p->pages;
+       p->flags &= ~SWP_WRITEOK;
+@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_sw
+                */
+       }
+       INIT_LIST_HEAD(&p->first_swap_extent.list);
+-      INIT_LIST_HEAD(&p->list);
++      plist_node_init(&p->list, 0);
++      plist_node_init(&p->avail_list, 0);
+       p->flags = SWP_USED;
+       spin_unlock(&swap_lock);
+       spin_lock_init(&p->lock);
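
Both new plists are keyed on the negated swap priority, because a plist sorts
its nodes in ascending key order while swap prefers the numerically highest
priority first.  A quick standalone illustration of that negation trick using
qsort(); the struct and comparator below are illustrative only, not kernel
code.

/*
 * Sorting on -prio in ascending order yields devices in descending
 * swap priority, which is what the negated plist key achieves.
 */
#include <stdio.h>
#include <stdlib.h>

struct swap_dev {
	const char *name;
	int prio;	/* user-visible swap priority, higher is preferred */
	int key;	/* what the plist would sort on: -prio */
};

static int by_key_ascending(const void *a, const void *b)
{
	const struct swap_dev *da = a, *db = b;

	return da->key - db->key;	/* ascending, like plist ordering */
}

int main(void)
{
	struct swap_dev devs[] = {
		{ "swapA",  5,  -5 },
		{ "swapB", 10, -10 },
		{ "swapC", -1,   1 },	/* auto-assigned negative priority */
	};
	const size_t n = sizeof(devs) / sizeof(devs[0]);

	qsort(devs, n, sizeof(devs[0]), by_key_ascending);

	/* ascending -prio == descending prio: swapB, swapA, swapC */
	for (size_t i = 0; i < n; i++)
		printf("%s: prio %d (plist key %d)\n",
		       devs[i].name, devs[i].prio, devs[i].key);
	return 0;
}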