--- /dev/null
+From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:55 -0700
+Subject: lib/plist: add helper functions
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit fd16618e12a05df79a3439d72d5ffdac5d34f3da upstream.
+
+Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to
+define and initialize a struct plist_head.
+
+Add plist_for_each_continue() and plist_for_each_entry_continue(),
+equivalent to list_for_each_continue() and list_for_each_entry_continue(),
+to iterate over a plist continuing after the current position.
+
+Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev
+and ->next, implemented by list_prev_entry() and list_next_entry(), to
+access the prev/next struct plist_node entry. These are needed because,
+unlike with struct list_head, direct access to the prev/next struct
+plist_node isn't possible; the list must be navigated via the contained
+struct list_head. For example, instead of accessing the prev entry with
+list_prev_entry(node, node_list), it can be accessed with plist_prev(node).
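+
+As a rough usage sketch (struct item, item_head and walk_after() are
+made-up names for illustration, not part of this patch):
+
+  struct item {
+          int value;
+          struct plist_node node;   /* init with plist_node_init() */
+  };
+
+  static PLIST_HEAD(item_head);     /* define and init the plist_head */
+
+  static void walk_after(struct plist_node *start)
+  {
+          struct plist_node *pos = start;
+
+          /* resume the priority-ordered walk after 'start' */
+          plist_for_each_continue(pos, &item_head) {
+                  struct item *it = container_of(pos, struct item, node);
+
+                  pr_debug("value %d prio %d\n", it->value, pos->prio);
+          }
+
+          /* prev/next neighbours without touching node_list directly */
+          if (start != plist_last(&item_head))
+                  pr_debug("next prio %d\n", plist_next(start)->prio);
+          if (start != plist_first(&item_head))
+                  pr_debug("prev prio %d\n", plist_prev(start)->prio);
+  }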
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 43 insertions(+)
+
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -98,6 +98,13 @@ struct plist_node {
+ }
+
+ /**
++ * PLIST_HEAD - declare and init plist_head
++ * @head: name for struct plist_head variable
++ */
++#define PLIST_HEAD(head) \
++ struct plist_head head = PLIST_HEAD_INIT(head)
++
++/**
+ * PLIST_NODE_INIT - static struct plist_node initializer
+ * @node: struct plist_node variable name
+ * @__prio: initial node priority
+@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node
+ list_for_each_entry(pos, &(head)->node_list, node_list)
+
+ /**
++ * plist_for_each_continue - continue iteration over the plist
++ * @pos: the type * to use as a loop cursor
++ * @head: the head for your list
++ *
++ * Continue to iterate over plist, continuing after the current position.
++ */
++#define plist_for_each_continue(pos, head) \
++ list_for_each_entry_continue(pos, &(head)->node_list, node_list)
++
++/**
+ * plist_for_each_safe - iterate safely over a plist of given type
+ * @pos: the type * to use as a loop counter
+ * @n: another type * to use as temporary storage
+@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node
+ list_for_each_entry(pos, &(head)->node_list, mem.node_list)
+
+ /**
++ * plist_for_each_entry_continue - continue iteration over list of given type
++ * @pos: the type * to use as a loop cursor
++ * @head: the head for your list
++ * @m: the name of the list_struct within the struct
++ *
++ * Continue to iterate over list of given type, continuing after
++ * the current position.
++ */
++#define plist_for_each_entry_continue(pos, head, m) \
++ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
++
++/**
+ * plist_for_each_entry_safe - iterate safely over list of given type
+ * @pos: the type * to use as a loop counter
+ * @n: another type * to use as temporary storage
+@@ -229,6 +258,20 @@ static inline int plist_node_empty(const
+ #endif
+
+ /**
++ * plist_next - get the next entry in list
++ * @pos: the type * to cursor
++ */
++#define plist_next(pos) \
++ list_next_entry(pos, node_list)
++
++/**
++ * plist_prev - get the prev entry in list
++ * @pos: the type * to cursor
++ */
++#define plist_prev(pos) \
++ list_prev_entry(pos, node_list)
++
++/**
+ * plist_first - return the first node (and thus, highest priority)
+ * @head: the &struct plist_head pointer
+ *
--- /dev/null
+From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:57 -0700
+Subject: lib/plist: add plist_requeue
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit a75f232ce0fe38bd01301899ecd97ffd0254316a upstream.
+
+Add plist_requeue(), which moves the specified plist_node after all other
+same-priority plist_nodes in the list. This is essentially an optimized
+plist_del() followed by plist_add().
+
+This is needed by swap, which (with the next patch in this set) uses a
+plist of available swap devices. When a swap device (either a swap
+partition or a swap file) is added to the system with swapon(), the
+device is added to a plist, ordered by the swap device's priority. When swap
+needs to allocate a page from one of the swap devices, it takes the page
+from the first swap device on the plist, which is the highest priority
+swap device. The swap device is left in the plist until all its pages are
+used, and then removed from the plist when it becomes full.
+
+However, as described in man 2 swapon, swap must allocate pages from swap
+devices with the same priority in round-robin order; to do this, on each
+swap page allocation, swap uses a page from the first swap device in the
+plist, and then calls plist_requeue() to move that swap device entry to
+after any other same-priority swap devices. The next swap page allocation
+will again use a page from the first swap device in the plist and requeue
+it, and so on, resulting in round-robin usage of equal-priority swap
+devices.
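+
+A minimal sketch of that round-robin pattern (generic names, not the
+actual swap code, which arrives in the next patch):
+
+  node = plist_first(&head);    /* highest-priority entry */
+  /* ... allocate a page from 'node' ... */
+  plist_requeue(node, &head);   /* 'node' now sits after its same-prio
+                                 * peers, so the next plist_first()
+                                 * returns a different equal-priority
+                                 * entry if one exists */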
+
+Also add a plist_test_requeue() test function, for use by plist_test() to
+test the plist_requeue() function.
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/plist.h | 2 +
+ lib/plist.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 54 insertions(+)
+
+--- a/include/linux/plist.h
++++ b/include/linux/plist.h
+@@ -141,6 +141,8 @@ static inline void plist_node_init(struc
+ extern void plist_add(struct plist_node *node, struct plist_head *head);
+ extern void plist_del(struct plist_node *node, struct plist_head *head);
+
++extern void plist_requeue(struct plist_node *node, struct plist_head *head);
++
+ /**
+ * plist_for_each - iterate over the plist
+ * @pos: the type * to use as a loop counter
+--- a/lib/plist.c
++++ b/lib/plist.c
+@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node,
+ plist_check_head(head);
+ }
+
++/**
++ * plist_requeue - Requeue @node at end of same-prio entries.
++ *
++ * This is essentially an optimized plist_del() followed by
++ * plist_add(). It moves an entry already in the plist to
++ * after any other same-priority entries.
++ *
++ * @node: &struct plist_node pointer - entry to be moved
++ * @head: &struct plist_head pointer - list head
++ */
++void plist_requeue(struct plist_node *node, struct plist_head *head)
++{
++ struct plist_node *iter;
++ struct list_head *node_next = &head->node_list;
++
++ plist_check_head(head);
++ BUG_ON(plist_head_empty(head));
++ BUG_ON(plist_node_empty(node));
++
++ if (node == plist_last(head))
++ return;
++
++ iter = plist_next(node);
++
++ if (node->prio != iter->prio)
++ return;
++
++ plist_del(node, head);
++
++ plist_for_each_continue(iter, head) {
++ if (node->prio != iter->prio) {
++ node_next = &iter->node_list;
++ break;
++ }
++ }
++ list_add_tail(&node->node_list, node_next);
++
++ plist_check_head(head);
++}
++
+ #ifdef CONFIG_DEBUG_PI_LIST
+ #include <linux/sched.h>
+ #include <linux/module.h>
+@@ -170,6 +210,14 @@ static void __init plist_test_check(int
+ BUG_ON(prio_pos->prio_list.next != &first->prio_list);
+ }
+
++static void __init plist_test_requeue(struct plist_node *node)
++{
++ plist_requeue(node, &test_head);
++
++ if (node != plist_last(&test_head))
++ BUG_ON(node->prio == plist_next(node)->prio);
++}
++
+ static int __init plist_test(void)
+ {
+ int nr_expect = 0, i, loop;
+@@ -193,6 +241,10 @@ static int __init plist_test(void)
+ nr_expect--;
+ }
+ plist_test_check(nr_expect);
++ if (!plist_node_empty(test_node + i)) {
++ plist_test_requeue(test_node + i);
++ plist_test_check(nr_expect);
++ }
+ }
+
+ for (i = 0; i < ARRAY_SIZE(test_node); i++) {
--- /dev/null
+From 119d6d59dcc0980dcd581fdadb6b2033b512a473 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Thu, 3 Apr 2014 14:48:00 -0700
+Subject: mm, compaction: avoid isolating pinned pages
+
+From: David Rientjes <rientjes@google.com>
+
+commit 119d6d59dcc0980dcd581fdadb6b2033b512a473 upstream.
+
+Page migration will fail for memory that is pinned in memory with, for
+example, get_user_pages(). In this case, it is unnecessary to take
+zone->lru_lock or to isolate the page and pass it to page migration,
+which will ultimately fail.
+
+This check is racy; the page can still change from under us, but in
+that case we'll just fail later when attempting to move the page.
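+
+The test itself is cheap (it is the one added in the hunk below): a page
+with no page_mapping(), e.g. an anonymous page, whose reference count
+exceeds its map count has extra references such as get_user_pages()
+pins, so isolating it for migration would be pointless:
+
+  if (!page_mapping(page) && page_count(page) > page_mapcount(page))
+          continue;     /* skip it; migration would fail anyway */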
+
+This avoids very expensive memory compaction when faulting transparent
+hugepages after pinning a lot of memory with a Mellanox driver.
+
+On a 128GB machine with ~120GB of memory pinned, before this patch we
+see an enormous disparity in the number of page migration failures
+caused by the pinning (from /proc/vmstat):
+
+ compact_pages_moved 8450
+ compact_pagemigrate_failed 15614415
+
+0.05% of pages isolated are successfully migrated, and explicitly
+triggering memory compaction takes 102 seconds. After the patch:
+
+ compact_pages_moved 9197
+ compact_pagemigrate_failed 7
+
+99.9% of pages isolated are now successfully migrated in this
+configuration and memory compaction takes less than one second.
+
+Signed-off-by: David Rientjes <rientjes@google.com>
+Acked-by: Hugh Dickins <hughd@google.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Greg Thelen <gthelen@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -584,6 +584,15 @@ isolate_migratepages_range(struct zone *
+ continue;
+ }
+
++ /*
++ * Migration will fail if an anonymous page is pinned in memory,
++ * so avoid taking lru_lock and isolating it unnecessarily in an
++ * admittedly racy check.
++ */
++ if (!page_mapping(page) &&
++ page_count(page) > page_mapcount(page))
++ continue;
++
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
--- /dev/null
+From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.cz>
+Date: Mon, 7 Apr 2014 15:37:01 -0700
+Subject: mm: exclude memoryless nodes from zone_reclaim
+
+From: Michal Hocko <mhocko@suse.cz>
+
+commit 70ef57e6c22c3323dce179b7d0d433c479266612 upstream.
+
+We had a report about strange OOM killer strikes on a PPC machine,
+although there was a lot of free swap and tons of anonymous memory
+which could have been swapped out. In the end it turned out that the
+OOM was a side effect of zone reclaim, which wasn't unmapping and
+swapping out, so the system was pushed to OOM. Although this sounds
+like a bug somewhere in the kswapd vs. zone reclaim vs. direct reclaim
+interaction, numactl on the said hardware suggests that zone reclaim
+should not have been enabled in the first place:
+
+ node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ node 0 size: 0 MB
+ node 0 free: 0 MB
+ node 2 cpus:
+ node 2 size: 7168 MB
+ node 2 free: 6019 MB
+ node distances:
+ node 0 2
+ 0: 10 40
+ 2: 40 10
+
+So all the CPUs are associated with Node0, which doesn't have any memory,
+while Node2 contains all the available memory. The inter-node distance (40)
+exceeds RECLAIM_DISTANCE, so zone_reclaim_mode gets enabled automatically.
+
+Zone reclaim is intended to keep allocations local, but this doesn't
+make any sense on memoryless nodes. So let's exclude such nodes from
+init_zone_allows_reclaim(), which evaluates the zone reclaim behavior and
+the suitable reclaim_nodes.
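+
+A sketch of the resulting check (assuming the pre-existing else branch,
+unchanged here, which enables zone_reclaim_mode for distant nodes):
+
+  static void __paginginit init_zone_allows_reclaim(int nid)
+  {
+          int i;
+
+          /* only nodes that actually have memory are considered */
+          for_each_node_state(i, N_MEMORY)
+                  if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+                          node_set(i, NODE_DATA(nid)->reclaim_nodes);
+                  else
+                          zone_reclaim_mode = 1;
+  }
+
+free_area_init_node() correspondingly calls it only for N_MEMORY nodes.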
+
+Signed-off-by: Michal Hocko <mhocko@suse.cz>
+Acked-by: David Rientjes <rientjes@google.com>
+Acked-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
+Tested-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allow
+ {
+ int i;
+
+- for_each_online_node(i)
++ for_each_node_state(i, N_MEMORY)
+ if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ else
+@@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(in
+
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = node_start_pfn;
+- init_zone_allows_reclaim(nid);
++ if (node_state(nid, N_MEMORY))
++ init_zone_allows_reclaim(nid);
+ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ #endif
mm-numa-do-not-mark-ptes-pte_numa-when-splitting-huge-pages.patch
media-vb2-fix-vbi-poll-regression.patch
jiffies-fix-timeval-conversion-to-jiffies.patch
+mm-exclude-memoryless-nodes-from-zone_reclaim.patch
+swap-change-swap_info-singly-linked-list-to-list_head.patch
+lib-plist-add-helper-functions.patch
+lib-plist-add-plist_requeue.patch
+swap-change-swap_list_head-to-plist-add-swap_avail_head.patch
+mm-compaction-avoid-isolating-pinned-pages.patch
--- /dev/null
+From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:53 -0700
+Subject: swap: change swap_info singly-linked list to list_head
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit adfab836f4908deb049a5128082719e689eed964 upstream.
+
+The logic controlling the singly-linked list of swap_info_struct entries
+for all active, i.e. swapon'ed, swap targets is rather complex, because:
+
+ - it stores the entries in priority order
+ - there is a pointer to the highest priority entry
+ - there is a pointer to the highest priority not-full entry
+ - there is a highest_priority_index variable set outside the swap_lock
+ - swap entries of equal priority should be used equally
+
+This complexity leads to bugs such as https://lkml.org/lkml/2014/2/13/181,
+where different-priority swap targets are incorrectly used equally.
+
+That bug probably could be solved with the existing singly-linked lists,
+but I think it would only add more complexity to the already difficult to
+understand get_swap_page() swap_list iteration logic.
+
+The first patch changes from a singly-linked list to a doubly-linked list
+using list_heads; the highest_priority_index and related code are removed
+and get_swap_page() starts each iteration at the highest priority
+swap_info entry, even if it's full. While this does introduce unnecessary
+list iteration (i.e. Schlemiel the painter's algorithm) in the case where
+one or more of the highest priority entries are full, the iteration and
+manipulation code is much simpler and behaves correctly re: the above bug;
+and the fourth patch removes the unnecessary iteration.
+
+The second patch adds some minor plist helper functions; nothing new
+really, just functions to match existing regular list functions. These
+are used by the next two patches.
+
+The third patch adds plist_requeue(), which is used by get_swap_page() in
+the next patch - it performs the requeueing of same-priority entries
+(which moves the entry to the end of its priority in the plist), so that
+all equal-priority swap_info_structs get used equally.
+
+The fourth patch converts the main list into a plist, and adds a new plist
+that contains only swap_info entries that are both active and not full.
+As Mel suggested, using plists allows removing all the ordering code from
+swap; plists handle ordering automatically. The list naming is also
+clarified now that there are two lists, with the original list changed
+from swap_list_head to swap_active_head and the new list named
+swap_avail_head. A new spinlock is also added for the new list, so
+swap_info entries can be added or removed from the new list immediately as
+they become full or not full.
+
+This patch (of 4):
+
+Replace the singly-linked list tracking active, i.e. swapon'ed,
+swap_info_struct entries with a doubly-linked list using struct
+list_heads. Simplify the logic iterating and manipulating the list of
+entries, especially get_swap_page(), by using standard list_head
+functions, and removing the highest priority iteration logic.
+
+The change fixes the bug:
+https://lkml.org/lkml/2014/2/13/181
+in which different priority swap entries after the highest priority entry
+are incorrectly used equally in pairs. The swap behavior is now as
+advertised, i.e. different priority swap entries are used in order, and
+equal priority swap targets are used concurrently.
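+
+The only subtle part of the new get_swap_page() loop is how the entry
+about to be used is rotated behind its equal-priority peers; condensed
+from the hunk below:
+
+  /* si: the swap_info_struct we are about to allocate from */
+  next = si;
+  list_for_each_entry_continue(next, &swap_list_head, list) {
+          if (si->prio != next->prio)
+                  break;
+          /*
+           * list_rotate_left(&si->list) moves the entry following si
+           * to just before si, i.e. si drifts back one slot per
+           * iteration until it ends up after all its same-prio peers.
+           */
+          list_rotate_left(&si->list);
+          next = si;
+  }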
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/swap.h | 7 -
+ include/linux/swapfile.h | 2
+ mm/frontswap.c | 13 +--
+ mm/swapfile.c | 171 +++++++++++++++++++----------------------------
+ 4 files changed, 78 insertions(+), 115 deletions(-)
+
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,8 +214,8 @@ struct percpu_cluster {
+ struct swap_info_struct {
+ unsigned long flags; /* SWP_USED etc: see above */
+ signed short prio; /* swap priority of this type */
++ struct list_head list; /* entry in swap list */
+ signed char type; /* strange name for an index */
+- signed char next; /* next type on the swap list */
+ unsigned int max; /* extent of the swap_map */
+ unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
+@@ -255,11 +255,6 @@ struct swap_info_struct {
+ struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
+ };
+
+-struct swap_list_t {
+- int head; /* head of priority-ordered swapfile list */
+- int next; /* swapfile to be used next */
+-};
+-
+ /* linux/mm/page_alloc.c */
+ extern unsigned long totalram_pages;
+ extern unsigned long totalreserve_pages;
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+ * want to expose them to the dozens of source files that include swap.h
+ */
+ extern spinlock_t swap_lock;
+-extern struct swap_list_t swap_list;
++extern struct list_head swap_list_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_are
+
+ static unsigned long __frontswap_curr_pages(void)
+ {
+- int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = si->next) {
+- si = swap_info[type];
++ list_for_each_entry(si, &swap_list_head, list)
+ totalpages += atomic_read(&si->frontswap_pages);
+- }
+ return totalpages;
+ }
+
+@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsig
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+- int type;
+
+ assert_spin_locked(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = si->next) {
+- si = swap_info[type];
++ list_for_each_entry(si, &swap_list_head, list) {
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsig
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+- *swapid = type;
++ *swapid = si->type;
+ ret = 0;
+ break;
+ }
+@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long targ
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+- * so restart scan from swap_list.head each time
++ * so restart scan from swap_list_head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
+ /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+ long total_swap_pages;
+ static int least_priority;
+-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
+
+ static const char Bad_file[] = "Bad swap file entry ";
+ static const char Unused_file[] = "Unused swap file entry ";
+ static const char Bad_offset[] = "Bad swap offset entry ";
+ static const char Unused_offset[] = "Unused swap offset entry ";
+
+-struct swap_list_t swap_list = {-1, -1};
++/*
++ * all active swap_info_structs
++ * protected with swap_lock, and ordered by priority.
++ */
++LIST_HEAD(swap_list_head);
+
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+@@ -640,66 +643,54 @@ no_page:
+
+ swp_entry_t get_swap_page(void)
+ {
+- struct swap_info_struct *si;
++ struct swap_info_struct *si, *next;
+ pgoff_t offset;
+- int type, next;
+- int wrapped = 0;
+- int hp_index;
++ struct list_head *tmp;
+
+ spin_lock(&swap_lock);
+ if (atomic_long_read(&nr_swap_pages) <= 0)
+ goto noswap;
+ atomic_long_dec(&nr_swap_pages);
+
+- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+- hp_index = atomic_xchg(&highest_priority_index, -1);
+- /*
+- * highest_priority_index records current highest priority swap
+- * type which just frees swap entries. If its priority is
+- * higher than that of swap_list.next swap type, we use it. It
+- * isn't protected by swap_lock, so it can be an invalid value
+- * if the corresponding swap type is swapoff. We double check
+- * the flags here. It's even possible the swap type is swapoff
+- * and swapon again and its priority is changed. In such rare
+- * case, low prority swap type might be used, but eventually
+- * high priority swap will be used after several rounds of
+- * swap.
+- */
+- if (hp_index != -1 && hp_index != type &&
+- swap_info[type]->prio < swap_info[hp_index]->prio &&
+- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+- type = hp_index;
+- swap_list.next = type;
+- }
+-
+- si = swap_info[type];
+- next = si->next;
+- if (next < 0 ||
+- (!wrapped && si->prio != swap_info[next]->prio)) {
+- next = swap_list.head;
+- wrapped++;
+- }
+-
++ list_for_each(tmp, &swap_list_head) {
++ si = list_entry(tmp, typeof(*si), list);
+ spin_lock(&si->lock);
+- if (!si->highest_bit) {
+- spin_unlock(&si->lock);
+- continue;
+- }
+- if (!(si->flags & SWP_WRITEOK)) {
++ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_unlock(&si->lock);
+ continue;
+ }
+
+- swap_list.next = next;
++ /*
++ * rotate the current swap_info that we're going to use
++ * to after any other swap_info that have the same prio,
++ * so that all equal-priority swap_info get used equally
++ */
++ next = si;
++ list_for_each_entry_continue(next, &swap_list_head, list) {
++ if (si->prio != next->prio)
++ break;
++ list_rotate_left(&si->list);
++ next = si;
++ }
+
+ spin_unlock(&swap_lock);
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ spin_unlock(&si->lock);
+ if (offset)
+- return swp_entry(type, offset);
++ return swp_entry(si->type, offset);
+ spin_lock(&swap_lock);
+- next = swap_list.next;
++ /*
++ * if we got here, it's likely that si was almost full before,
++ * and since scan_swap_map() can drop the si->lock, multiple
++ * callers probably all tried to get a page from the same si
++ * and it filled up before we could get one. So we need to
++ * try again. Since we dropped the swap_lock, there may now
++ * be non-full higher priority swap_infos, and this si may have
++ * even been removed from the list (although very unlikely).
++ * Let's start over.
++ */
++ tmp = &swap_list_head;
+ }
+
+ atomic_long_inc(&nr_swap_pages);
+@@ -766,27 +757,6 @@ out:
+ return NULL;
+ }
+
+-/*
+- * This swap type frees swap entry, check if it is the highest priority swap
+- * type which just frees swap entry. get_swap_page() uses
+- * highest_priority_index to search highest priority swap type. The
+- * swap_info_struct.lock can't protect us if there are multiple swap types
+- * active, so we use atomic_cmpxchg.
+- */
+-static void set_highest_priority_index(int type)
+-{
+- int old_hp_index, new_hp_index;
+-
+- do {
+- old_hp_index = atomic_read(&highest_priority_index);
+- if (old_hp_index != -1 &&
+- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+- break;
+- new_hp_index = type;
+- } while (atomic_cmpxchg(&highest_priority_index,
+- old_hp_index, new_hp_index) != old_hp_index);
+-}
+-
+ static unsigned char swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+ {
+@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(str
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+ p->highest_bit = offset;
+- set_highest_priority_index(p->type);
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swa
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
+ {
+- int i, prev;
++ struct swap_info_struct *si;
+
+ if (prio >= 0)
+ p->prio = prio;
+@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swa
+ atomic_long_add(p->pages, &nr_swap_pages);
+ total_swap_pages += p->pages;
+
+- /* insert swap space into swap_list: */
+- prev = -1;
+- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+- if (p->prio >= swap_info[i]->prio)
+- break;
+- prev = i;
++ assert_spin_locked(&swap_lock);
++ BUG_ON(!list_empty(&p->list));
++ /*
++ * insert into swap list; the list is in priority order,
++ * so that get_swap_page() can get a page from the highest
++ * priority swap_info_struct with available page(s), and
++ * swapoff can adjust the auto-assigned (i.e. negative) prio
++ * values for any lower-priority swap_info_structs when
++ * removing a negative-prio swap_info_struct
++ */
++ list_for_each_entry(si, &swap_list_head, list) {
++ if (p->prio >= si->prio) {
++ list_add_tail(&p->list, &si->list);
++ return;
++ }
+ }
+- p->next = i;
+- if (prev < 0)
+- swap_list.head = swap_list.next = p->type;
+- else
+- swap_info[prev]->next = p->type;
++ /*
++ * this covers two cases:
++ * 1) p->prio is less than all existing prio
++ * 2) the swap list is empty
++ */
++ list_add_tail(&p->list, &swap_list_head);
+ }
+
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ struct address_space *mapping;
+ struct inode *inode;
+ struct filename *pathname;
+- int i, type, prev;
+- int err;
++ int err, found = 0;
+ unsigned int old_block_size;
+
+ if (!capable(CAP_SYS_ADMIN))
+@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ goto out;
+
+ mapping = victim->f_mapping;
+- prev = -1;
+ spin_lock(&swap_lock);
+- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+- p = swap_info[type];
++ list_for_each_entry(p, &swap_list_head, list) {
+ if (p->flags & SWP_WRITEOK) {
+- if (p->swap_file->f_mapping == mapping)
++ if (p->swap_file->f_mapping == mapping) {
++ found = 1;
+ break;
++ }
+ }
+- prev = type;
+ }
+- if (type < 0) {
++ if (!found) {
+ err = -EINVAL;
+ spin_unlock(&swap_lock);
+ goto out_dput;
+@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
+- if (prev < 0)
+- swap_list.head = p->next;
+- else
+- swap_info[prev]->next = p->next;
+- if (type == swap_list.next) {
+- /* just pick something that's safe... */
+- swap_list.next = swap_list.head;
+- }
+ spin_lock(&p->lock);
+ if (p->prio < 0) {
+- for (i = p->next; i >= 0; i = swap_info[i]->next)
+- swap_info[i]->prio = p->prio--;
++ struct swap_info_struct *si = p;
++
++ list_for_each_entry_continue(si, &swap_list_head, list) {
++ si->prio++;
++ }
+ least_priority++;
+ }
++ list_del_init(&p->list);
+ atomic_long_sub(p->pages, &nr_swap_pages);
+ total_swap_pages -= p->pages;
+ p->flags &= ~SWP_WRITEOK;
+@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ spin_unlock(&swap_lock);
+
+ set_current_oom_origin();
+- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
++ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ clear_current_oom_origin();
+
+ if (err) {
+@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ frontswap_map = frontswap_map_get(p);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+- frontswap_invalidate_area(type);
++ frontswap_invalidate_area(p->type);
+ frontswap_map_set(p, NULL);
+ mutex_unlock(&swapon_mutex);
+ free_percpu(p->percpu_cluster);
+@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ vfree(cluster_info);
+ vfree(frontswap_map);
+ /* Destroy swap account information */
+- swap_cgroup_swapoff(type);
++ swap_cgroup_swapoff(p->type);
+
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_sw
+ */
+ }
+ INIT_LIST_HEAD(&p->first_swap_extent.list);
++ INIT_LIST_HEAD(&p->list);
+ p->flags = SWP_USED;
+- p->next = -1;
+ spin_unlock(&swap_lock);
+ spin_lock_init(&p->lock);
+
--- /dev/null
+From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001
+From: Dan Streetman <ddstreet@ieee.org>
+Date: Wed, 4 Jun 2014 16:09:59 -0700
+Subject: swap: change swap_list_head to plist, add swap_avail_head
+
+From: Dan Streetman <ddstreet@ieee.org>
+
+commit 18ab4d4ced0817421e6db6940374cc39d28d65da upstream.
+
+Originally get_swap_page() started iterating through the singly-linked
+list of swap_info_structs using swap_list.next or highest_priority_index,
+both of which were intended to point to the highest priority active swap
+target that was not full. The first patch in this series changed the
+singly-linked list to a doubly-linked list, and removed the logic to start
+at the highest priority non-full entry; it starts scanning at the highest
+priority entry each time, even if the entry is full.
+
+Replace the manually ordered swap_list_head with a plist, swap_active_head.
+Add a new plist, swap_avail_head. The original swap_active_head plist
+contains all active swap_info_structs, as before, while the new
+swap_avail_head plist contains only swap_info_structs that are active and
+available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect
+the swap_avail_head list.
+
+Mel Gorman suggested using plists since they internally handle ordering
+the list entries based on priority, which is exactly what swap was doing
+manually. All the ordering code is now removed, and swap_info_struct
+entries are simply added to their corresponding plist and automatically
+ordered correctly.
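+
+Since plists order low-to-high while swap priorities are highest-first,
+the plist node priorities are simply the negated swap priorities
+(condensed from _enable_swap_info() in the hunk below):
+
+  p->list.prio = -p->prio;
+  p->avail_list.prio = -p->prio;
+  ...
+  plist_add(&p->list, &swap_active_head);
+  spin_lock(&swap_avail_lock);
+  plist_add(&p->avail_list, &swap_avail_head);
+  spin_unlock(&swap_avail_lock);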
+
+Using a new plist for available swap_info_structs simplifies and
+optimizes get_swap_page(), which no longer has to iterate over full
+swap_info_structs. Using a new spinlock for the swap_avail_head plist
+allows each swap_info_struct to add or remove itself from the
+plist when it becomes full or not full; previously it could not
+do so because the swap_info_struct->lock is held when it changes
+between full and not-full, and the swap_lock protecting the main
+swap_active_head must be ordered before any swap_info_struct->lock.
+
+Signed-off-by: Dan Streetman <ddstreet@ieee.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Cc: Shaohua Li <shli@fusionio.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+Cc: Weijie Yang <weijieut@gmail.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Bob Liu <bob.liu@oracle.com>
+Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/swap.h | 3
+ include/linux/swapfile.h | 2
+ mm/frontswap.c | 6 -
+ mm/swapfile.c | 145 +++++++++++++++++++++++++++++------------------
+ 4 files changed, 97 insertions(+), 59 deletions(-)
+
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -214,7 +214,8 @@ struct percpu_cluster {
+ struct swap_info_struct {
+ unsigned long flags; /* SWP_USED etc: see above */
+ signed short prio; /* swap priority of this type */
+- struct list_head list; /* entry in swap list */
++ struct plist_node list; /* entry in swap_active_head */
++ struct plist_node avail_list; /* entry in swap_avail_head */
+ signed char type; /* strange name for an index */
+ unsigned int max; /* extent of the swap_map */
+ unsigned char *swap_map; /* vmalloc'ed array of usage counts */
+--- a/include/linux/swapfile.h
++++ b/include/linux/swapfile.h
+@@ -6,7 +6,7 @@
+ * want to expose them to the dozens of source files that include swap.h
+ */
+ extern spinlock_t swap_lock;
+-extern struct list_head swap_list_head;
++extern struct plist_head swap_active_head;
+ extern struct swap_info_struct *swap_info[];
+ extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+--- a/mm/frontswap.c
++++ b/mm/frontswap.c
+@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pa
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+- list_for_each_entry(si, &swap_list_head, list)
++ plist_for_each_entry(si, &swap_active_head, list)
+ totalpages += atomic_read(&si->frontswap_pages);
+ return totalpages;
+ }
+@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsig
+ unsigned long pages = 0, pages_to_unuse = 0;
+
+ assert_spin_locked(&swap_lock);
+- list_for_each_entry(si, &swap_list_head, list) {
++ plist_for_each_entry(si, &swap_active_head, list) {
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long targ
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+- * so restart scan from swap_list_head each time
++ * so restart scan from swap_active_head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unu
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+-LIST_HEAD(swap_list_head);
++PLIST_HEAD(swap_active_head);
++
++/*
++ * all available (active, not full) swap_info_structs
++ * protected with swap_avail_lock, ordered by priority.
++ * This is used by get_swap_page() instead of swap_active_head
++ * because swap_active_head includes all swap_info_structs,
++ * but get_swap_page() doesn't need to look at full ones.
++ * This uses its own lock instead of swap_lock because when a
++ * swap_info_struct changes between not-full/full, it needs to
++ * add/remove itself to/from this list, but the swap_info_struct->lock
++ * is held and the locking order requires swap_lock to be taken
++ * before any swap_info_struct->lock.
++ */
++static PLIST_HEAD(swap_avail_head);
++static DEFINE_SPINLOCK(swap_avail_lock);
+
+ struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+@@ -594,6 +609,9 @@ checks:
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
++ spin_lock(&swap_avail_lock);
++ plist_del(&si->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ }
+ si->swap_map[offset] = usage;
+ inc_cluster_info_page(si, si->cluster_info, offset);
+@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void)
+ {
+ struct swap_info_struct *si, *next;
+ pgoff_t offset;
+- struct list_head *tmp;
+
+- spin_lock(&swap_lock);
+ if (atomic_long_read(&nr_swap_pages) <= 0)
+ goto noswap;
+ atomic_long_dec(&nr_swap_pages);
+
+- list_for_each(tmp, &swap_list_head) {
+- si = list_entry(tmp, typeof(*si), list);
++ spin_lock(&swap_avail_lock);
++
++start_over:
++ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
++ /* requeue si to after same-priority siblings */
++ plist_requeue(&si->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ spin_lock(&si->lock);
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
++ spin_lock(&swap_avail_lock);
++ if (plist_node_empty(&si->avail_list)) {
++ spin_unlock(&si->lock);
++ goto nextsi;
++ }
++ WARN(!si->highest_bit,
++ "swap_info %d in list but !highest_bit\n",
++ si->type);
++ WARN(!(si->flags & SWP_WRITEOK),
++ "swap_info %d in list but !SWP_WRITEOK\n",
++ si->type);
++ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&si->lock);
+- continue;
++ goto nextsi;
+ }
+
+- /*
+- * rotate the current swap_info that we're going to use
+- * to after any other swap_info that have the same prio,
+- * so that all equal-priority swap_info get used equally
+- */
+- next = si;
+- list_for_each_entry_continue(next, &swap_list_head, list) {
+- if (si->prio != next->prio)
+- break;
+- list_rotate_left(&si->list);
+- next = si;
+- }
+-
+- spin_unlock(&swap_lock);
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ spin_unlock(&si->lock);
+ if (offset)
+ return swp_entry(si->type, offset);
+- spin_lock(&swap_lock);
++ pr_debug("scan_swap_map of si %d failed to find offset\n",
++ si->type);
++ spin_lock(&swap_avail_lock);
++nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+- * and it filled up before we could get one. So we need to
+- * try again. Since we dropped the swap_lock, there may now
+- * be non-full higher priority swap_infos, and this si may have
+- * even been removed from the list (although very unlikely).
+- * Let's start over.
++ * and it filled up before we could get one; or, the si filled
++ * up between us dropping swap_avail_lock and taking si->lock.
++ * Since we dropped the swap_avail_lock, the swap_avail_head
++ * list may have been modified; so if next is still in the
++ * swap_avail_head list then try it, otherwise start over.
+ */
+- tmp = &swap_list_head;
++ if (plist_node_empty(&next->avail_list))
++ goto start_over;
+ }
+
++ spin_unlock(&swap_avail_lock);
++
+ atomic_long_inc(&nr_swap_pages);
+ noswap:
+- spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+ }
+
+@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(str
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+- if (offset > p->highest_bit)
++ if (offset > p->highest_bit) {
++ bool was_full = !p->highest_bit;
+ p->highest_bit = offset;
++ if (was_full && (p->flags & SWP_WRITEOK)) {
++ spin_lock(&swap_avail_lock);
++ WARN_ON(!plist_node_empty(&p->avail_list));
++ if (plist_node_empty(&p->avail_list))
++ plist_add(&p->avail_list,
++ &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
++ }
++ }
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swa
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
+ {
+- struct swap_info_struct *si;
+-
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
++ /*
++ * the plist prio is negated because plist ordering is
++ * low-to-high, while swap ordering is high-to-low
++ */
++ p->list.prio = -p->prio;
++ p->avail_list.prio = -p->prio;
+ p->swap_map = swap_map;
+ p->cluster_info = cluster_info;
+ p->flags |= SWP_WRITEOK;
+@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swa
+ total_swap_pages += p->pages;
+
+ assert_spin_locked(&swap_lock);
+- BUG_ON(!list_empty(&p->list));
+- /*
+- * insert into swap list; the list is in priority order,
+- * so that get_swap_page() can get a page from the highest
+- * priority swap_info_struct with available page(s), and
+- * swapoff can adjust the auto-assigned (i.e. negative) prio
+- * values for any lower-priority swap_info_structs when
+- * removing a negative-prio swap_info_struct
+- */
+- list_for_each_entry(si, &swap_list_head, list) {
+- if (p->prio >= si->prio) {
+- list_add_tail(&p->list, &si->list);
+- return;
+- }
+- }
+ /*
+- * this covers two cases:
+- * 1) p->prio is less than all existing prio
+- * 2) the swap list is empty
++ * both lists are plists, and thus priority ordered.
++ * swap_active_head needs to be priority ordered for swapoff(),
++ * which on removal of any swap_info_struct with an auto-assigned
++ * (i.e. negative) priority increments the auto-assigned priority
++ * of any lower-priority swap_info_structs.
++ * swap_avail_head needs to be priority ordered for get_swap_page(),
++ * which allocates swap pages from the highest available priority
++ * swap_info_struct.
+ */
+- list_add_tail(&p->list, &swap_list_head);
++ plist_add(&p->list, &swap_active_head);
++ spin_lock(&swap_avail_lock);
++ plist_add(&p->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ }
+
+ static void enable_swap_info(struct swap_info_struct *p, int prio,
+@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
+
+ mapping = victim->f_mapping;
+ spin_lock(&swap_lock);
+- list_for_each_entry(p, &swap_list_head, list) {
++ plist_for_each_entry(p, &swap_active_head, list) {
+ if (p->flags & SWP_WRITEOK) {
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
+@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __us
+ spin_unlock(&swap_lock);
+ goto out_dput;
+ }
++ spin_lock(&swap_avail_lock);
++ plist_del(&p->avail_list, &swap_avail_head);
++ spin_unlock(&swap_avail_lock);
+ spin_lock(&p->lock);
+ if (p->prio < 0) {
+ struct swap_info_struct *si = p;
+
+- list_for_each_entry_continue(si, &swap_list_head, list) {
++ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
++ si->list.prio--;
++ si->avail_list.prio--;
+ }
+ least_priority++;
+ }
+- list_del_init(&p->list);
++ plist_del(&p->list, &swap_active_head);
+ atomic_long_sub(p->pages, &nr_swap_pages);
+ total_swap_pages -= p->pages;
+ p->flags &= ~SWP_WRITEOK;
+@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_sw
+ */
+ }
+ INIT_LIST_HEAD(&p->first_swap_extent.list);
+- INIT_LIST_HEAD(&p->list);
++ plist_node_init(&p->list, 0);
++ plist_node_init(&p->avail_list, 0);
+ p->flags = SWP_USED;
+ spin_unlock(&swap_lock);
+ spin_lock_init(&p->lock);