From: Greg Kroah-Hartman Date: Tue, 7 Oct 2014 04:16:21 +0000 (-0700) Subject: 3.14-stable patches X-Git-Tag: v3.10.57~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f38cf2954be8eeb6922793be7857bdf8e42acab9;p=thirdparty%2Fkernel%2Fstable-queue.git 3.14-stable patches added patches: lib-plist-add-helper-functions.patch lib-plist-add-plist_requeue.patch mm-compaction-avoid-isolating-pinned-pages.patch mm-exclude-memoryless-nodes-from-zone_reclaim.patch swap-change-swap_info-singly-linked-list-to-list_head.patch swap-change-swap_list_head-to-plist-add-swap_avail_head.patch --- diff --git a/queue-3.14/lib-plist-add-helper-functions.patch b/queue-3.14/lib-plist-add-helper-functions.patch new file mode 100644 index 00000000000..6d76a636f75 --- /dev/null +++ b/queue-3.14/lib-plist-add-helper-functions.patch @@ -0,0 +1,121 @@ +From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:55 -0700 +Subject: lib/plist: add helper functions + +From: Dan Streetman + +commit fd16618e12a05df79a3439d72d5ffdac5d34f3da upstream. + +Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to +define and initialize a struct plist_head. + +Add plist_for_each_continue() and plist_for_each_entry_continue(), +equivalent to list_for_each_continue() and list_for_each_entry_continue(), +to iterate over a plist continuing after the current position. + +Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev +and ->next, implemented by list_prev_entry() and list_next_entry(), to +access the prev/next struct plist_node entry. These are needed because +unlike struct list_head, direct access of the prev/next struct plist_node +isn't possible; the list must be navigated via the contained struct +list_head. e.g. instead of accessing the prev by list_prev_entry(node, +node_list) it can be accessed by plist_prev(node). + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Paul Gortmaker +Cc: Steven Rostedt +Cc: Thomas Gleixner +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Peter Zijlstra +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +--- a/include/linux/plist.h ++++ b/include/linux/plist.h +@@ -98,6 +98,13 @@ struct plist_node { + } + + /** ++ * PLIST_HEAD - declare and init plist_head ++ * @head: name for struct plist_head variable ++ */ ++#define PLIST_HEAD(head) \ ++ struct plist_head head = PLIST_HEAD_INIT(head) ++ ++/** + * PLIST_NODE_INIT - static struct plist_node initializer + * @node: struct plist_node variable name + * @__prio: initial node priority +@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node + list_for_each_entry(pos, &(head)->node_list, node_list) + + /** ++ * plist_for_each_continue - continue iteration over the plist ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * ++ * Continue to iterate over plist, continuing after the current position. 
++ */ ++#define plist_for_each_continue(pos, head) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, node_list) ++ ++/** + * plist_for_each_safe - iterate safely over a plist of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node + list_for_each_entry(pos, &(head)->node_list, mem.node_list) + + /** ++ * plist_for_each_entry_continue - continue iteration over list of given type ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * @m: the name of the list_struct within the struct ++ * ++ * Continue to iterate over list of given type, continuing after ++ * the current position. ++ */ ++#define plist_for_each_entry_continue(pos, head, m) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) ++ ++/** + * plist_for_each_entry_safe - iterate safely over list of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -229,6 +258,20 @@ static inline int plist_node_empty(const + #endif + + /** ++ * plist_next - get the next entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_next(pos) \ ++ list_next_entry(pos, node_list) ++ ++/** ++ * plist_prev - get the prev entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_prev(pos) \ ++ list_prev_entry(pos, node_list) ++ ++/** + * plist_first - return the first node (and thus, highest priority) + * @head: the &struct plist_head pointer + * diff --git a/queue-3.14/lib-plist-add-plist_requeue.patch b/queue-3.14/lib-plist-add-plist_requeue.patch new file mode 100644 index 00000000000..0b6ede44a9e --- /dev/null +++ b/queue-3.14/lib-plist-add-plist_requeue.patch @@ -0,0 +1,145 @@ +From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:57 -0700 +Subject: lib/plist: add plist_requeue + +From: Dan Streetman + +commit a75f232ce0fe38bd01301899ecd97ffd0254316a upstream. + +Add plist_requeue(), which moves the specified plist_node after all other +same-priority plist_nodes in the list. This is essentially an optimized +plist_del() followed by plist_add(). + +This is needed by swap, which (with the next patch in this set) uses a +plist of available swap devices. When a swap device (either a swap +partition or swap file) are added to the system with swapon(), the device +is added to a plist, ordered by the swap device's priority. When swap +needs to allocate a page from one of the swap devices, it takes the page +from the first swap device on the plist, which is the highest priority +swap device. The swap device is left in the plist until all its pages are +used, and then removed from the plist when it becomes full. + +However, as described in man 2 swapon, swap must allocate pages from swap +devices with the same priority in round-robin order; to do this, on each +swap page allocation, swap uses a page from the first swap device in the +plist, and then calls plist_requeue() to move that swap device entry to +after any other same-priority swap devices. The next swap page allocation +will again use a page from the first swap device in the plist and requeue +it, and so on, resulting in round-robin usage of equal-priority swap +devices. + +Also add plist_test_requeue() test function, for use by plist_test() to +test plist_requeue() function. 
+ +Signed-off-by: Dan Streetman +Cc: Steven Rostedt +Cc: Peter Zijlstra +Acked-by: Mel Gorman +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/plist.h | 2 + + lib/plist.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 54 insertions(+) + +--- a/include/linux/plist.h ++++ b/include/linux/plist.h +@@ -141,6 +141,8 @@ static inline void plist_node_init(struc + extern void plist_add(struct plist_node *node, struct plist_head *head); + extern void plist_del(struct plist_node *node, struct plist_head *head); + ++extern void plist_requeue(struct plist_node *node, struct plist_head *head); ++ + /** + * plist_for_each - iterate over the plist + * @pos: the type * to use as a loop counter +--- a/lib/plist.c ++++ b/lib/plist.c +@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, + plist_check_head(head); + } + ++/** ++ * plist_requeue - Requeue @node at end of same-prio entries. ++ * ++ * This is essentially an optimized plist_del() followed by ++ * plist_add(). It moves an entry already in the plist to ++ * after any other same-priority entries. ++ * ++ * @node: &struct plist_node pointer - entry to be moved ++ * @head: &struct plist_head pointer - list head ++ */ ++void plist_requeue(struct plist_node *node, struct plist_head *head) ++{ ++ struct plist_node *iter; ++ struct list_head *node_next = &head->node_list; ++ ++ plist_check_head(head); ++ BUG_ON(plist_head_empty(head)); ++ BUG_ON(plist_node_empty(node)); ++ ++ if (node == plist_last(head)) ++ return; ++ ++ iter = plist_next(node); ++ ++ if (node->prio != iter->prio) ++ return; ++ ++ plist_del(node, head); ++ ++ plist_for_each_continue(iter, head) { ++ if (node->prio != iter->prio) { ++ node_next = &iter->node_list; ++ break; ++ } ++ } ++ list_add_tail(&node->node_list, node_next); ++ ++ plist_check_head(head); ++} ++ + #ifdef CONFIG_DEBUG_PI_LIST + #include + #include +@@ -170,6 +210,14 @@ static void __init plist_test_check(int + BUG_ON(prio_pos->prio_list.next != &first->prio_list); + } + ++static void __init plist_test_requeue(struct plist_node *node) ++{ ++ plist_requeue(node, &test_head); ++ ++ if (node != plist_last(&test_head)) ++ BUG_ON(node->prio == plist_next(node)->prio); ++} ++ + static int __init plist_test(void) + { + int nr_expect = 0, i, loop; +@@ -193,6 +241,10 @@ static int __init plist_test(void) + nr_expect--; + } + plist_test_check(nr_expect); ++ if (!plist_node_empty(test_node + i)) { ++ plist_test_requeue(test_node + i); ++ plist_test_check(nr_expect); ++ } + } + + for (i = 0; i < ARRAY_SIZE(test_node); i++) { diff --git a/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch b/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch new file mode 100644 index 00000000000..2402b7d239f --- /dev/null +++ b/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch @@ -0,0 +1,69 @@ +From 119d6d59dcc0980dcd581fdadb6b2033b512a473 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Thu, 3 Apr 2014 14:48:00 -0700 +Subject: mm, compaction: avoid isolating pinned pages + +From: David Rientjes + +commit 119d6d59dcc0980dcd581fdadb6b2033b512a473 upstream. + +Page migration will fail for memory that is pinned in memory with, for +example, get_user_pages(). 
In this case, it is unnecessary to take +zone->lru_lock or isolating the page and passing it to page migration +which will ultimately fail. + +This is a racy check, the page can still change from under us, but in +that case we'll just fail later when attempting to move the page. + +This avoids very expensive memory compaction when faulting transparent +hugepages after pinning a lot of memory with a Mellanox driver. + +On a 128GB machine and pinning ~120GB of memory, before this patch we +see the enormous disparity in the number of page migration failures +because of the pinning (from /proc/vmstat): + + compact_pages_moved 8450 + compact_pagemigrate_failed 15614415 + +0.05% of pages isolated are successfully migrated and explicitly +triggering memory compaction takes 102 seconds. After the patch: + + compact_pages_moved 9197 + compact_pagemigrate_failed 7 + +99.9% of pages isolated are now successfully migrated in this +configuration and memory compaction takes less than one second. + +Signed-off-by: David Rientjes +Acked-by: Hugh Dickins +Acked-by: Mel Gorman +Cc: Joonsoo Kim +Cc: Rik van Riel +Cc: Greg Thelen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -584,6 +584,15 @@ isolate_migratepages_range(struct zone * + continue; + } + ++ /* ++ * Migration will fail if an anonymous page is pinned in memory, ++ * so avoid taking lru_lock and isolating it unnecessarily in an ++ * admittedly racy check. ++ */ ++ if (!page_mapping(page) && ++ page_count(page) > page_mapcount(page)) ++ continue; ++ + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); diff --git a/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch b/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch new file mode 100644 index 00000000000..1cc1633d863 --- /dev/null +++ b/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch @@ -0,0 +1,72 @@ +From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Mon, 7 Apr 2014 15:37:01 -0700 +Subject: mm: exclude memoryless nodes from zone_reclaim + +From: Michal Hocko + +commit 70ef57e6c22c3323dce179b7d0d433c479266612 upstream. + +We had a report about strange OOM killer strikes on a PPC machine +although there was a lot of swap free and a tons of anonymous memory +which could be swapped out. In the end it turned out that the OOM was a +side effect of zone reclaim which wasn't unmapping and swapping out and +so the system was pushed to the OOM. Although this sounds like a bug +somewhere in the kswapd vs. zone reclaim vs. direct reclaim +interaction numactl on the said hardware suggests that the zone reclaim +should not have been set in the first place: + + node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + node 0 size: 0 MB + node 0 free: 0 MB + node 2 cpus: + node 2 size: 7168 MB + node 2 free: 6019 MB + node distances: + node 0 2 + 0: 10 40 + 2: 40 10 + +So all the CPUs are associated with Node0 which doesn't have any memory +while Node2 contains all the available memory. Node distances cause an +automatic zone_reclaim_mode enabling. + +Zone reclaim is intended to keep the allocations local but this doesn't +make any sense on the memoryless nodes. 
So let's exclude such nodes for +init_zone_allows_reclaim which evaluates zone reclaim behavior and +suitable reclaim_nodes. + +Signed-off-by: Michal Hocko +Acked-by: David Rientjes +Acked-by: Nishanth Aravamudan +Tested-by: Nishanth Aravamudan +Acked-by: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allow + { + int i; + +- for_each_online_node(i) ++ for_each_node_state(i, N_MEMORY) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) + node_set(i, NODE_DATA(nid)->reclaim_nodes); + else +@@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(in + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; +- init_zone_allows_reclaim(nid); ++ if (node_state(nid, N_MEMORY)) ++ init_zone_allows_reclaim(nid); + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + #endif diff --git a/queue-3.14/series b/queue-3.14/series index a3580981162..8b843668db6 100644 --- a/queue-3.14/series +++ b/queue-3.14/series @@ -13,3 +13,9 @@ mm-thp-move-invariant-bug-check-out-of-loop-in-__split_huge_page_map.patch mm-numa-do-not-mark-ptes-pte_numa-when-splitting-huge-pages.patch media-vb2-fix-vbi-poll-regression.patch jiffies-fix-timeval-conversion-to-jiffies.patch +mm-exclude-memoryless-nodes-from-zone_reclaim.patch +swap-change-swap_info-singly-linked-list-to-list_head.patch +lib-plist-add-helper-functions.patch +lib-plist-add-plist_requeue.patch +swap-change-swap_list_head-to-plist-add-swap_avail_head.patch +mm-compaction-avoid-isolating-pinned-pages.patch diff --git a/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch b/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch new file mode 100644 index 00000000000..13fefb31d2d --- /dev/null +++ b/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch @@ -0,0 +1,480 @@ +From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:53 -0700 +Subject: swap: change swap_info singly-linked list to list_head + +From: Dan Streetman + +commit adfab836f4908deb049a5128082719e689eed964 upstream. + +The logic controlling the singly-linked list of swap_info_struct entries +for all active, i.e. swapon'ed, swap targets is rather complex, because: + + - it stores the entries in priority order + - there is a pointer to the highest priority entry + - there is a pointer to the highest priority not-full entry + - there is a highest_priority_index variable set outside the swap_lock + - swap entries of equal priority should be used equally + +this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181 +where different priority swap targets are incorrectly used equally. + +That bug probably could be solved with the existing singly-linked lists, +but I think it would only add more complexity to the already difficult to +understand get_swap_page() swap_list iteration logic. + +The first patch changes from a singly-linked list to a doubly-linked list +using list_heads; the highest_priority_index and related code are removed +and get_swap_page() starts each iteration at the highest priority +swap_info entry, even if it's full. While this does introduce unnecessary +list iteration (i.e. 
Schlemiel the painter's algorithm) in the case where +one or more of the highest priority entries are full, the iteration and +manipulation code is much simpler and behaves correctly re: the above bug; +and the fourth patch removes the unnecessary iteration. + +The second patch adds some minor plist helper functions; nothing new +really, just functions to match existing regular list functions. These +are used by the next two patches. + +The third patch adds plist_requeue(), which is used by get_swap_page() in +the next patch - it performs the requeueing of same-priority entries +(which moves the entry to the end of its priority in the plist), so that +all equal-priority swap_info_structs get used equally. + +The fourth patch converts the main list into a plist, and adds a new plist +that contains only swap_info entries that are both active and not full. +As Mel suggested using plists allows removing all the ordering code from +swap - plists handle ordering automatically. The list naming is also +clarified now that there are two lists, with the original list changed +from swap_list_head to swap_active_head and the new list named +swap_avail_head. A new spinlock is also added for the new list, so +swap_info entries can be added or removed from the new list immediately as +they become full or not full. + +This patch (of 4): + +Replace the singly-linked list tracking active, i.e. swapon'ed, +swap_info_struct entries with a doubly-linked list using struct +list_heads. Simplify the logic iterating and manipulating the list of +entries, especially get_swap_page(), by using standard list_head +functions, and removing the highest priority iteration logic. + +The change fixes the bug: +https://lkml.org/lkml/2014/2/13/181 +in which different priority swap entries after the highest priority entry +are incorrectly used equally in pairs. The swap behavior is now as +advertised, i.e. different priority swap entries are used in order, and +equal priority swap targets are used concurrently. + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Steven Rostedt +Cc: Peter Zijlstra +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 7 - + include/linux/swapfile.h | 2 + mm/frontswap.c | 13 +-- + mm/swapfile.c | 171 +++++++++++++++++++---------------------------- + 4 files changed, 78 insertions(+), 115 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -214,8 +214,8 @@ struct percpu_cluster { + struct swap_info_struct { + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ ++ struct list_head list; /* entry in swap list */ + signed char type; /* strange name for an index */ +- signed char next; /* next type on the swap list */ + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ +@@ -255,11 +255,6 @@ struct swap_info_struct { + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ + }; + +-struct swap_list_t { +- int head; /* head of priority-ordered swapfile list */ +- int next; /* swapfile to be used next */ +-}; +- + /* linux/mm/page_alloc.c */ + extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -6,7 +6,7 @@ + * want to expose them to the dozens of source files that include swap.h + */ + extern spinlock_t swap_lock; +-extern struct swap_list_t swap_list; ++extern struct list_head swap_list_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); + +--- a/mm/frontswap.c ++++ b/mm/frontswap.c +@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_are + + static unsigned long __frontswap_curr_pages(void) + { +- int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ list_for_each_entry(si, &swap_list_head, list) + totalpages += atomic_read(&si->frontswap_pages); +- } + return totalpages; + } + +@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsig + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; +- int type; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ list_for_each_entry(si, &swap_list_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; +@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsig + } + vm_unacct_memory(pages); + *unused = pages_to_unuse; +- *swapid = type; ++ *swapid = si->type; + ret = 0; + break; + } +@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long targ + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change +- * so restart scan from swap_list.head each time ++ * so restart scan from swap_list_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages; + /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ + long total_swap_pages; + static int least_priority; +-static atomic_t highest_priority_index = ATOMIC_INIT(-1); + + static const char Bad_file[] = "Bad swap file entry "; + static const char Unused_file[] = "Unused swap file entry "; + static const char Bad_offset[] = "Bad swap offset entry "; + static const char Unused_offset[] = "Unused swap offset entry "; + +-struct swap_list_t swap_list = {-1, -1}; ++/* ++ * all active swap_info_structs ++ * protected with swap_lock, and ordered by priority. 
++ */ ++LIST_HEAD(swap_list_head); + + struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +@@ -640,66 +643,54 @@ no_page: + + swp_entry_t get_swap_page(void) + { +- struct swap_info_struct *si; ++ struct swap_info_struct *si, *next; + pgoff_t offset; +- int type, next; +- int wrapped = 0; +- int hp_index; ++ struct list_head *tmp; + + spin_lock(&swap_lock); + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + +- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { +- hp_index = atomic_xchg(&highest_priority_index, -1); +- /* +- * highest_priority_index records current highest priority swap +- * type which just frees swap entries. If its priority is +- * higher than that of swap_list.next swap type, we use it. It +- * isn't protected by swap_lock, so it can be an invalid value +- * if the corresponding swap type is swapoff. We double check +- * the flags here. It's even possible the swap type is swapoff +- * and swapon again and its priority is changed. In such rare +- * case, low prority swap type might be used, but eventually +- * high priority swap will be used after several rounds of +- * swap. +- */ +- if (hp_index != -1 && hp_index != type && +- swap_info[type]->prio < swap_info[hp_index]->prio && +- (swap_info[hp_index]->flags & SWP_WRITEOK)) { +- type = hp_index; +- swap_list.next = type; +- } +- +- si = swap_info[type]; +- next = si->next; +- if (next < 0 || +- (!wrapped && si->prio != swap_info[next]->prio)) { +- next = swap_list.head; +- wrapped++; +- } +- ++ list_for_each(tmp, &swap_list_head) { ++ si = list_entry(tmp, typeof(*si), list); + spin_lock(&si->lock); +- if (!si->highest_bit) { +- spin_unlock(&si->lock); +- continue; +- } +- if (!(si->flags & SWP_WRITEOK)) { ++ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_unlock(&si->lock); + continue; + } + +- swap_list.next = next; ++ /* ++ * rotate the current swap_info that we're going to use ++ * to after any other swap_info that have the same prio, ++ * so that all equal-priority swap_info get used equally ++ */ ++ next = si; ++ list_for_each_entry_continue(next, &swap_list_head, list) { ++ if (si->prio != next->prio) ++ break; ++ list_rotate_left(&si->list); ++ next = si; ++ } + + spin_unlock(&swap_lock); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) +- return swp_entry(type, offset); ++ return swp_entry(si->type, offset); + spin_lock(&swap_lock); +- next = swap_list.next; ++ /* ++ * if we got here, it's likely that si was almost full before, ++ * and since scan_swap_map() can drop the si->lock, multiple ++ * callers probably all tried to get a page from the same si ++ * and it filled up before we could get one. So we need to ++ * try again. Since we dropped the swap_lock, there may now ++ * be non-full higher priority swap_infos, and this si may have ++ * even been removed from the list (although very unlikely). ++ * Let's start over. ++ */ ++ tmp = &swap_list_head; + } + + atomic_long_inc(&nr_swap_pages); +@@ -766,27 +757,6 @@ out: + return NULL; + } + +-/* +- * This swap type frees swap entry, check if it is the highest priority swap +- * type which just frees swap entry. get_swap_page() uses +- * highest_priority_index to search highest priority swap type. The +- * swap_info_struct.lock can't protect us if there are multiple swap types +- * active, so we use atomic_cmpxchg. 
+- */ +-static void set_highest_priority_index(int type) +-{ +- int old_hp_index, new_hp_index; +- +- do { +- old_hp_index = atomic_read(&highest_priority_index); +- if (old_hp_index != -1 && +- swap_info[old_hp_index]->prio >= swap_info[type]->prio) +- break; +- new_hp_index = type; +- } while (atomic_cmpxchg(&highest_priority_index, +- old_hp_index, new_hp_index) != old_hp_index); +-} +- + static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) + { +@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(str + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; +- set_highest_priority_index(p->type); + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); +@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swa + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) + { +- int i, prev; ++ struct swap_info_struct *si; + + if (prio >= 0) + p->prio = prio; +@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swa + atomic_long_add(p->pages, &nr_swap_pages); + total_swap_pages += p->pages; + +- /* insert swap space into swap_list: */ +- prev = -1; +- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { +- if (p->prio >= swap_info[i]->prio) +- break; +- prev = i; ++ assert_spin_locked(&swap_lock); ++ BUG_ON(!list_empty(&p->list)); ++ /* ++ * insert into swap list; the list is in priority order, ++ * so that get_swap_page() can get a page from the highest ++ * priority swap_info_struct with available page(s), and ++ * swapoff can adjust the auto-assigned (i.e. negative) prio ++ * values for any lower-priority swap_info_structs when ++ * removing a negative-prio swap_info_struct ++ */ ++ list_for_each_entry(si, &swap_list_head, list) { ++ if (p->prio >= si->prio) { ++ list_add_tail(&p->list, &si->list); ++ return; ++ } + } +- p->next = i; +- if (prev < 0) +- swap_list.head = swap_list.next = p->type; +- else +- swap_info[prev]->next = p->type; ++ /* ++ * this covers two cases: ++ * 1) p->prio is less than all existing prio ++ * 2) the swap list is empty ++ */ ++ list_add_tail(&p->list, &swap_list_head); + } + + static void enable_swap_info(struct swap_info_struct *p, int prio, +@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + struct address_space *mapping; + struct inode *inode; + struct filename *pathname; +- int i, type, prev; +- int err; ++ int err, found = 0; + unsigned int old_block_size; + + if (!capable(CAP_SYS_ADMIN)) +@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __us + goto out; + + mapping = victim->f_mapping; +- prev = -1; + spin_lock(&swap_lock); +- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { +- p = swap_info[type]; ++ list_for_each_entry(p, &swap_list_head, list) { + if (p->flags & SWP_WRITEOK) { +- if (p->swap_file->f_mapping == mapping) ++ if (p->swap_file->f_mapping == mapping) { ++ found = 1; + break; ++ } + } +- prev = type; + } +- if (type < 0) { ++ if (!found) { + err = -EINVAL; + spin_unlock(&swap_lock); + goto out_dput; +@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + goto out_dput; + } +- if (prev < 0) +- swap_list.head = p->next; +- else +- swap_info[prev]->next = p->next; +- if (type == swap_list.next) { +- /* just pick something that's safe... 
*/ +- swap_list.next = swap_list.head; +- } + spin_lock(&p->lock); + if (p->prio < 0) { +- for (i = p->next; i >= 0; i = swap_info[i]->next) +- swap_info[i]->prio = p->prio--; ++ struct swap_info_struct *si = p; ++ ++ list_for_each_entry_continue(si, &swap_list_head, list) { ++ si->prio++; ++ } + least_priority++; + } ++ list_del_init(&p->list); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; +@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + + set_current_oom_origin(); +- err = try_to_unuse(type, false, 0); /* force all pages to be unused */ ++ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + clear_current_oom_origin(); + + if (err) { +@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + frontswap_map = frontswap_map_get(p); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +- frontswap_invalidate_area(type); ++ frontswap_invalidate_area(p->type); + frontswap_map_set(p, NULL); + mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); +@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account information */ +- swap_cgroup_swapoff(type); ++ swap_cgroup_swapoff(p->type); + + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { +@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_sw + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); ++ INIT_LIST_HEAD(&p->list); + p->flags = SWP_USED; +- p->next = -1; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock); + diff --git a/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch b/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch new file mode 100644 index 00000000000..b1ecacffcfc --- /dev/null +++ b/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch @@ -0,0 +1,367 @@ +From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:59 -0700 +Subject: swap: change swap_list_head to plist, add swap_avail_head + +From: Dan Streetman + +commit 18ab4d4ced0817421e6db6940374cc39d28d65da upstream. + +Originally get_swap_page() started iterating through the singly-linked +list of swap_info_structs using swap_list.next or highest_priority_index, +which both were intended to point to the highest priority active swap +target that was not full. The first patch in this series changed the +singly-linked list to a doubly-linked list, and removed the logic to start +at the highest priority non-full entry; it starts scanning at the highest +priority entry each time, even if the entry is full. + +Replace the manually ordered swap_list_head with a plist, swap_active_head. +Add a new plist, swap_avail_head. The original swap_active_head plist +contains all active swap_info_structs, as before, while the new +swap_avail_head plist contains only swap_info_structs that are active and +available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect +the swap_avail_head list. + +Mel Gorman suggested using plists since they internally handle ordering +the list entries based on priority, which is exactly what swap was doing +manually. All the ordering code is now removed, and swap_info_struct +entries and simply added to their corresponding plist and automatically +ordered correctly. 
+ +Using a new plist for available swap_info_structs simplifies and +optimizes get_swap_page(), which no longer has to iterate over full +swap_info_structs. Using a new spinlock for swap_avail_head plist +allows each swap_info_struct to add or remove themselves from the +plist when they become full or not-full; previously they could not +do so because the swap_info_struct->lock is held when they change +from full<->not-full, and the swap_lock protecting the main +swap_active_head must be ordered before any swap_info_struct->lock. + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Shaohua Li +Cc: Steven Rostedt +Cc: Peter Zijlstra +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 3 + include/linux/swapfile.h | 2 + mm/frontswap.c | 6 - + mm/swapfile.c | 145 +++++++++++++++++++++++++++++------------------ + 4 files changed, 97 insertions(+), 59 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -214,7 +214,8 @@ struct percpu_cluster { + struct swap_info_struct { + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ +- struct list_head list; /* entry in swap list */ ++ struct plist_node list; /* entry in swap_active_head */ ++ struct plist_node avail_list; /* entry in swap_avail_head */ + signed char type; /* strange name for an index */ + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -6,7 +6,7 @@ + * want to expose them to the dozens of source files that include swap.h + */ + extern spinlock_t swap_lock; +-extern struct list_head swap_list_head; ++extern struct plist_head swap_active_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); + +--- a/mm/frontswap.c ++++ b/mm/frontswap.c +@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pa + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); +- list_for_each_entry(si, &swap_list_head, list) ++ plist_for_each_entry(si, &swap_active_head, list) + totalpages += atomic_read(&si->frontswap_pages); + return totalpages; + } +@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsig + unsigned long pages = 0, pages_to_unuse = 0; + + assert_spin_locked(&swap_lock); +- list_for_each_entry(si, &swap_list_head, list) { ++ plist_for_each_entry(si, &swap_active_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; +@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long targ + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change +- * so restart scan from swap_list_head each time ++ * so restart scan from swap_active_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unu + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. 
+ */ +-LIST_HEAD(swap_list_head); ++PLIST_HEAD(swap_active_head); ++ ++/* ++ * all available (active, not full) swap_info_structs ++ * protected with swap_avail_lock, ordered by priority. ++ * This is used by get_swap_page() instead of swap_active_head ++ * because swap_active_head includes all swap_info_structs, ++ * but get_swap_page() doesn't need to look at full ones. ++ * This uses its own lock instead of swap_lock because when a ++ * swap_info_struct changes between not-full/full, it needs to ++ * add/remove itself to/from this list, but the swap_info_struct->lock ++ * is held and the locking order requires swap_lock to be taken ++ * before any swap_info_struct->lock. ++ */ ++static PLIST_HEAD(swap_avail_head); ++static DEFINE_SPINLOCK(swap_avail_lock); + + struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +@@ -594,6 +609,9 @@ checks: + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; ++ spin_lock(&swap_avail_lock); ++ plist_del(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); +@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void) + { + struct swap_info_struct *si, *next; + pgoff_t offset; +- struct list_head *tmp; + +- spin_lock(&swap_lock); + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + +- list_for_each(tmp, &swap_list_head) { +- si = list_entry(tmp, typeof(*si), list); ++ spin_lock(&swap_avail_lock); ++ ++start_over: ++ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { ++ /* requeue si to after same-priority siblings */ ++ plist_requeue(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&si->lock); + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ if (plist_node_empty(&si->avail_list)) { ++ spin_unlock(&si->lock); ++ goto nextsi; ++ } ++ WARN(!si->highest_bit, ++ "swap_info %d in list but !highest_bit\n", ++ si->type); ++ WARN(!(si->flags & SWP_WRITEOK), ++ "swap_info %d in list but !SWP_WRITEOK\n", ++ si->type); ++ plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&si->lock); +- continue; ++ goto nextsi; + } + +- /* +- * rotate the current swap_info that we're going to use +- * to after any other swap_info that have the same prio, +- * so that all equal-priority swap_info get used equally +- */ +- next = si; +- list_for_each_entry_continue(next, &swap_list_head, list) { +- if (si->prio != next->prio) +- break; +- list_rotate_left(&si->list); +- next = si; +- } +- +- spin_unlock(&swap_lock); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) + return swp_entry(si->type, offset); +- spin_lock(&swap_lock); ++ pr_debug("scan_swap_map of si %d failed to find offset\n", ++ si->type); ++ spin_lock(&swap_avail_lock); ++nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si +- * and it filled up before we could get one. So we need to +- * try again. Since we dropped the swap_lock, there may now +- * be non-full higher priority swap_infos, and this si may have +- * even been removed from the list (although very unlikely). +- * Let's start over. 
++ * and it filled up before we could get one; or, the si filled ++ * up between us dropping swap_avail_lock and taking si->lock. ++ * Since we dropped the swap_avail_lock, the swap_avail_head ++ * list may have been modified; so if next is still in the ++ * swap_avail_head list then try it, otherwise start over. + */ +- tmp = &swap_list_head; ++ if (plist_node_empty(&next->avail_list)) ++ goto start_over; + } + ++ spin_unlock(&swap_avail_lock); ++ + atomic_long_inc(&nr_swap_pages); + noswap: +- spin_unlock(&swap_lock); + return (swp_entry_t) {0}; + } + +@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(str + dec_cluster_info_page(p, p->cluster_info, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; +- if (offset > p->highest_bit) ++ if (offset > p->highest_bit) { ++ bool was_full = !p->highest_bit; + p->highest_bit = offset; ++ if (was_full && (p->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ WARN_ON(!plist_node_empty(&p->avail_list)); ++ if (plist_node_empty(&p->avail_list)) ++ plist_add(&p->avail_list, ++ &swap_avail_head); ++ spin_unlock(&swap_avail_lock); ++ } ++ } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); +@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swa + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) + { +- struct swap_info_struct *si; +- + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; ++ /* ++ * the plist prio is negated because plist ordering is ++ * low-to-high, while swap ordering is high-to-low ++ */ ++ p->list.prio = -p->prio; ++ p->avail_list.prio = -p->prio; + p->swap_map = swap_map; + p->cluster_info = cluster_info; + p->flags |= SWP_WRITEOK; +@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swa + total_swap_pages += p->pages; + + assert_spin_locked(&swap_lock); +- BUG_ON(!list_empty(&p->list)); +- /* +- * insert into swap list; the list is in priority order, +- * so that get_swap_page() can get a page from the highest +- * priority swap_info_struct with available page(s), and +- * swapoff can adjust the auto-assigned (i.e. negative) prio +- * values for any lower-priority swap_info_structs when +- * removing a negative-prio swap_info_struct +- */ +- list_for_each_entry(si, &swap_list_head, list) { +- if (p->prio >= si->prio) { +- list_add_tail(&p->list, &si->list); +- return; +- } +- } + /* +- * this covers two cases: +- * 1) p->prio is less than all existing prio +- * 2) the swap list is empty ++ * both lists are plists, and thus priority ordered. ++ * swap_active_head needs to be priority ordered for swapoff(), ++ * which on removal of any swap_info_struct with an auto-assigned ++ * (i.e. negative) priority increments the auto-assigned priority ++ * of any lower-priority swap_info_structs. ++ * swap_avail_head needs to be priority ordered for get_swap_page(), ++ * which allocates swap pages from the highest available priority ++ * swap_info_struct. 
+ */ +- list_add_tail(&p->list, &swap_list_head); ++ plist_add(&p->list, &swap_active_head); ++ spin_lock(&swap_avail_lock); ++ plist_add(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + + static void enable_swap_info(struct swap_info_struct *p, int prio, +@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + + mapping = victim->f_mapping; + spin_lock(&swap_lock); +- list_for_each_entry(p, &swap_list_head, list) { ++ plist_for_each_entry(p, &swap_active_head, list) { + if (p->flags & SWP_WRITEOK) { + if (p->swap_file->f_mapping == mapping) { + found = 1; +@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + goto out_dput; + } ++ spin_lock(&swap_avail_lock); ++ plist_del(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&p->lock); + if (p->prio < 0) { + struct swap_info_struct *si = p; + +- list_for_each_entry_continue(si, &swap_list_head, list) { ++ plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; ++ si->list.prio--; ++ si->avail_list.prio--; + } + least_priority++; + } +- list_del_init(&p->list); ++ plist_del(&p->list, &swap_active_head); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; +@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_sw + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); +- INIT_LIST_HEAD(&p->list); ++ plist_node_init(&p->list, 0); ++ plist_node_init(&p->avail_list, 0); + p->flags = SWP_USED; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock);
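
The round-robin behaviour that plist_requeue() provides for same-priority swap devices, described in the lib/plist and swap plist commit messages above, can be pictured with a small standalone program. The sketch below is illustrative only and is not part of any patch in this queue: it uses a plain sorted array instead of the kernel's plist, and every name in it (struct swap_dev, pick_and_requeue, the device names) is invented for the example. It shows the rule get_swap_page() follows after these patches: allocate from the first (highest-priority) available entry, then move that entry behind its equal-priority siblings so that equal-priority devices are used in turn.

/*
 * Illustrative sketch only -- not part of the patches above.
 * Mimics the round-robin rule from plist_requeue()/get_swap_page():
 * take the first (highest-priority) entry, then move it behind any
 * other entries that share its priority.  Fullness of a device and
 * removal from the list are not modelled here.
 */
#include <stdio.h>

struct swap_dev {
	const char *name;
	int prio;		/* higher value = preferred, like swap priority */
};

/* Kept sorted by descending priority, like swap_avail_head. */
static struct swap_dev devs[] = {
	{ "sda2", 10 }, { "sdb2", 10 }, { "sdc2", 5 },
};
static const int ndevs = sizeof(devs) / sizeof(devs[0]);

/* Use devs[0], then requeue it after its equal-priority siblings. */
static struct swap_dev pick_and_requeue(void)
{
	struct swap_dev chosen = devs[0];
	int i = 0;

	/* shift same-priority siblings up past the chosen entry */
	while (i + 1 < ndevs && devs[i + 1].prio == chosen.prio) {
		devs[i] = devs[i + 1];
		i++;
	}
	devs[i] = chosen;	/* reinsert after its siblings */
	return chosen;
}

int main(void)
{
	int n;

	for (n = 0; n < 4; n++) {
		struct swap_dev d = pick_and_requeue();
		printf("allocate from %s (prio %d)\n", d.name, d.prio);
	}
	return 0;
}

Built with any C compiler, the loop prints sda2, sdb2, sda2, sdb2: the two priority-10 entries alternate, while the priority-5 entry is never touched. In the kernel, the lower-priority device would only be used once the higher-priority devices fill up and drop themselves off swap_avail_head, which this sketch does not model.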