From: Greg Kroah-Hartman Date: Tue, 7 Oct 2014 04:16:21 +0000 (-0700) Subject: 3.14-stable patches X-Git-Tag: v3.10.57~10 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f38cf2954be8eeb6922793be7857bdf8e42acab9;p=thirdparty%2Fkernel%2Fstable-queue.git 3.14-stable patches added patches: lib-plist-add-helper-functions.patch lib-plist-add-plist_requeue.patch mm-compaction-avoid-isolating-pinned-pages.patch mm-exclude-memoryless-nodes-from-zone_reclaim.patch swap-change-swap_info-singly-linked-list-to-list_head.patch swap-change-swap_list_head-to-plist-add-swap_avail_head.patch --- diff --git a/queue-3.14/lib-plist-add-helper-functions.patch b/queue-3.14/lib-plist-add-helper-functions.patch new file mode 100644 index 00000000000..6d76a636f75 --- /dev/null +++ b/queue-3.14/lib-plist-add-helper-functions.patch @@ -0,0 +1,121 @@ +From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:55 -0700 +Subject: lib/plist: add helper functions + +From: Dan Streetman + +commit fd16618e12a05df79a3439d72d5ffdac5d34f3da upstream. + +Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to +define and initialize a struct plist_head. + +Add plist_for_each_continue() and plist_for_each_entry_continue(), +equivalent to list_for_each_continue() and list_for_each_entry_continue(), +to iterate over a plist continuing after the current position. + +Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev +and ->next, implemented by list_prev_entry() and list_next_entry(), to +access the prev/next struct plist_node entry. These are needed because +unlike struct list_head, direct access of the prev/next struct plist_node +isn't possible; the list must be navigated via the contained struct +list_head. e.g. instead of accessing the prev by list_prev_entry(node, +node_list) it can be accessed by plist_prev(node). + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Paul Gortmaker +Cc: Steven Rostedt +Cc: Thomas Gleixner +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Peter Zijlstra +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/plist.h | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +--- a/include/linux/plist.h ++++ b/include/linux/plist.h +@@ -98,6 +98,13 @@ struct plist_node { + } + + /** ++ * PLIST_HEAD - declare and init plist_head ++ * @head: name for struct plist_head variable ++ */ ++#define PLIST_HEAD(head) \ ++ struct plist_head head = PLIST_HEAD_INIT(head) ++ ++/** + * PLIST_NODE_INIT - static struct plist_node initializer + * @node: struct plist_node variable name + * @__prio: initial node priority +@@ -143,6 +150,16 @@ extern void plist_del(struct plist_node + list_for_each_entry(pos, &(head)->node_list, node_list) + + /** ++ * plist_for_each_continue - continue iteration over the plist ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * ++ * Continue to iterate over plist, continuing after the current position. 
++ */ ++#define plist_for_each_continue(pos, head) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, node_list) ++ ++/** + * plist_for_each_safe - iterate safely over a plist of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -163,6 +180,18 @@ extern void plist_del(struct plist_node + list_for_each_entry(pos, &(head)->node_list, mem.node_list) + + /** ++ * plist_for_each_entry_continue - continue iteration over list of given type ++ * @pos: the type * to use as a loop cursor ++ * @head: the head for your list ++ * @m: the name of the list_struct within the struct ++ * ++ * Continue to iterate over list of given type, continuing after ++ * the current position. ++ */ ++#define plist_for_each_entry_continue(pos, head, m) \ ++ list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) ++ ++/** + * plist_for_each_entry_safe - iterate safely over list of given type + * @pos: the type * to use as a loop counter + * @n: another type * to use as temporary storage +@@ -229,6 +258,20 @@ static inline int plist_node_empty(const + #endif + + /** ++ * plist_next - get the next entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_next(pos) \ ++ list_next_entry(pos, node_list) ++ ++/** ++ * plist_prev - get the prev entry in list ++ * @pos: the type * to cursor ++ */ ++#define plist_prev(pos) \ ++ list_prev_entry(pos, node_list) ++ ++/** + * plist_first - return the first node (and thus, highest priority) + * @head: the &struct plist_head pointer + * diff --git a/queue-3.14/lib-plist-add-plist_requeue.patch b/queue-3.14/lib-plist-add-plist_requeue.patch new file mode 100644 index 00000000000..0b6ede44a9e --- /dev/null +++ b/queue-3.14/lib-plist-add-plist_requeue.patch @@ -0,0 +1,145 @@ +From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:57 -0700 +Subject: lib/plist: add plist_requeue + +From: Dan Streetman + +commit a75f232ce0fe38bd01301899ecd97ffd0254316a upstream. + +Add plist_requeue(), which moves the specified plist_node after all other +same-priority plist_nodes in the list. This is essentially an optimized +plist_del() followed by plist_add(). + +This is needed by swap, which (with the next patch in this set) uses a +plist of available swap devices. When a swap device (either a swap +partition or swap file) are added to the system with swapon(), the device +is added to a plist, ordered by the swap device's priority. When swap +needs to allocate a page from one of the swap devices, it takes the page +from the first swap device on the plist, which is the highest priority +swap device. The swap device is left in the plist until all its pages are +used, and then removed from the plist when it becomes full. + +However, as described in man 2 swapon, swap must allocate pages from swap +devices with the same priority in round-robin order; to do this, on each +swap page allocation, swap uses a page from the first swap device in the +plist, and then calls plist_requeue() to move that swap device entry to +after any other same-priority swap devices. The next swap page allocation +will again use a page from the first swap device in the plist and requeue +it, and so on, resulting in round-robin usage of equal-priority swap +devices. + +Also add plist_test_requeue() test function, for use by plist_test() to +test plist_requeue() function. 
+ +Signed-off-by: Dan Streetman +Cc: Steven Rostedt +Cc: Peter Zijlstra +Acked-by: Mel Gorman +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/plist.h | 2 + + lib/plist.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 54 insertions(+) + +--- a/include/linux/plist.h ++++ b/include/linux/plist.h +@@ -141,6 +141,8 @@ static inline void plist_node_init(struc + extern void plist_add(struct plist_node *node, struct plist_head *head); + extern void plist_del(struct plist_node *node, struct plist_head *head); + ++extern void plist_requeue(struct plist_node *node, struct plist_head *head); ++ + /** + * plist_for_each - iterate over the plist + * @pos: the type * to use as a loop counter +--- a/lib/plist.c ++++ b/lib/plist.c +@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, + plist_check_head(head); + } + ++/** ++ * plist_requeue - Requeue @node at end of same-prio entries. ++ * ++ * This is essentially an optimized plist_del() followed by ++ * plist_add(). It moves an entry already in the plist to ++ * after any other same-priority entries. ++ * ++ * @node: &struct plist_node pointer - entry to be moved ++ * @head: &struct plist_head pointer - list head ++ */ ++void plist_requeue(struct plist_node *node, struct plist_head *head) ++{ ++ struct plist_node *iter; ++ struct list_head *node_next = &head->node_list; ++ ++ plist_check_head(head); ++ BUG_ON(plist_head_empty(head)); ++ BUG_ON(plist_node_empty(node)); ++ ++ if (node == plist_last(head)) ++ return; ++ ++ iter = plist_next(node); ++ ++ if (node->prio != iter->prio) ++ return; ++ ++ plist_del(node, head); ++ ++ plist_for_each_continue(iter, head) { ++ if (node->prio != iter->prio) { ++ node_next = &iter->node_list; ++ break; ++ } ++ } ++ list_add_tail(&node->node_list, node_next); ++ ++ plist_check_head(head); ++} ++ + #ifdef CONFIG_DEBUG_PI_LIST + #include + #include +@@ -170,6 +210,14 @@ static void __init plist_test_check(int + BUG_ON(prio_pos->prio_list.next != &first->prio_list); + } + ++static void __init plist_test_requeue(struct plist_node *node) ++{ ++ plist_requeue(node, &test_head); ++ ++ if (node != plist_last(&test_head)) ++ BUG_ON(node->prio == plist_next(node)->prio); ++} ++ + static int __init plist_test(void) + { + int nr_expect = 0, i, loop; +@@ -193,6 +241,10 @@ static int __init plist_test(void) + nr_expect--; + } + plist_test_check(nr_expect); ++ if (!plist_node_empty(test_node + i)) { ++ plist_test_requeue(test_node + i); ++ plist_test_check(nr_expect); ++ } + } + + for (i = 0; i < ARRAY_SIZE(test_node); i++) { diff --git a/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch b/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch new file mode 100644 index 00000000000..2402b7d239f --- /dev/null +++ b/queue-3.14/mm-compaction-avoid-isolating-pinned-pages.patch @@ -0,0 +1,69 @@ +From 119d6d59dcc0980dcd581fdadb6b2033b512a473 Mon Sep 17 00:00:00 2001 +From: David Rientjes +Date: Thu, 3 Apr 2014 14:48:00 -0700 +Subject: mm, compaction: avoid isolating pinned pages + +From: David Rientjes + +commit 119d6d59dcc0980dcd581fdadb6b2033b512a473 upstream. + +Page migration will fail for memory that is pinned in memory with, for +example, get_user_pages(). 
In this case, it is unnecessary to take +zone->lru_lock or isolating the page and passing it to page migration +which will ultimately fail. + +This is a racy check, the page can still change from under us, but in +that case we'll just fail later when attempting to move the page. + +This avoids very expensive memory compaction when faulting transparent +hugepages after pinning a lot of memory with a Mellanox driver. + +On a 128GB machine and pinning ~120GB of memory, before this patch we +see the enormous disparity in the number of page migration failures +because of the pinning (from /proc/vmstat): + + compact_pages_moved 8450 + compact_pagemigrate_failed 15614415 + +0.05% of pages isolated are successfully migrated and explicitly +triggering memory compaction takes 102 seconds. After the patch: + + compact_pages_moved 9197 + compact_pagemigrate_failed 7 + +99.9% of pages isolated are now successfully migrated in this +configuration and memory compaction takes less than one second. + +Signed-off-by: David Rientjes +Acked-by: Hugh Dickins +Acked-by: Mel Gorman +Cc: Joonsoo Kim +Cc: Rik van Riel +Cc: Greg Thelen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -584,6 +584,15 @@ isolate_migratepages_range(struct zone * + continue; + } + ++ /* ++ * Migration will fail if an anonymous page is pinned in memory, ++ * so avoid taking lru_lock and isolating it unnecessarily in an ++ * admittedly racy check. ++ */ ++ if (!page_mapping(page) && ++ page_count(page) > page_mapcount(page)) ++ continue; ++ + /* Check if it is ok to still hold the lock */ + locked = compact_checklock_irqsave(&zone->lru_lock, &flags, + locked, cc); diff --git a/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch b/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch new file mode 100644 index 00000000000..1cc1633d863 --- /dev/null +++ b/queue-3.14/mm-exclude-memoryless-nodes-from-zone_reclaim.patch @@ -0,0 +1,72 @@ +From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Mon, 7 Apr 2014 15:37:01 -0700 +Subject: mm: exclude memoryless nodes from zone_reclaim + +From: Michal Hocko + +commit 70ef57e6c22c3323dce179b7d0d433c479266612 upstream. + +We had a report about strange OOM killer strikes on a PPC machine +although there was a lot of swap free and a tons of anonymous memory +which could be swapped out. In the end it turned out that the OOM was a +side effect of zone reclaim which wasn't unmapping and swapping out and +so the system was pushed to the OOM. Although this sounds like a bug +somewhere in the kswapd vs. zone reclaim vs. direct reclaim +interaction numactl on the said hardware suggests that the zone reclaim +should not have been set in the first place: + + node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + node 0 size: 0 MB + node 0 free: 0 MB + node 2 cpus: + node 2 size: 7168 MB + node 2 free: 6019 MB + node distances: + node 0 2 + 0: 10 40 + 2: 40 10 + +So all the CPUs are associated with Node0 which doesn't have any memory +while Node2 contains all the available memory. Node distances cause an +automatic zone_reclaim_mode enabling. + +Zone reclaim is intended to keep the allocations local but this doesn't +make any sense on the memoryless nodes. 
So let's exclude such nodes for +init_zone_allows_reclaim which evaluates zone reclaim behavior and +suitable reclaim_nodes. + +Signed-off-by: Michal Hocko +Acked-by: David Rientjes +Acked-by: Nishanth Aravamudan +Tested-by: Nishanth Aravamudan +Acked-by: Mel Gorman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1869,7 +1869,7 @@ static void __paginginit init_zone_allow + { + int i; + +- for_each_online_node(i) ++ for_each_node_state(i, N_MEMORY) + if (node_distance(nid, i) <= RECLAIM_DISTANCE) + node_set(i, NODE_DATA(nid)->reclaim_nodes); + else +@@ -4933,7 +4933,8 @@ void __paginginit free_area_init_node(in + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; +- init_zone_allows_reclaim(nid); ++ if (node_state(nid, N_MEMORY)) ++ init_zone_allows_reclaim(nid); + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + #endif diff --git a/queue-3.14/series b/queue-3.14/series index a3580981162..8b843668db6 100644 --- a/queue-3.14/series +++ b/queue-3.14/series @@ -13,3 +13,9 @@ mm-thp-move-invariant-bug-check-out-of-loop-in-__split_huge_page_map.patch mm-numa-do-not-mark-ptes-pte_numa-when-splitting-huge-pages.patch media-vb2-fix-vbi-poll-regression.patch jiffies-fix-timeval-conversion-to-jiffies.patch +mm-exclude-memoryless-nodes-from-zone_reclaim.patch +swap-change-swap_info-singly-linked-list-to-list_head.patch +lib-plist-add-helper-functions.patch +lib-plist-add-plist_requeue.patch +swap-change-swap_list_head-to-plist-add-swap_avail_head.patch +mm-compaction-avoid-isolating-pinned-pages.patch diff --git a/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch b/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch new file mode 100644 index 00000000000..13fefb31d2d --- /dev/null +++ b/queue-3.14/swap-change-swap_info-singly-linked-list-to-list_head.patch @@ -0,0 +1,480 @@ +From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:53 -0700 +Subject: swap: change swap_info singly-linked list to list_head + +From: Dan Streetman + +commit adfab836f4908deb049a5128082719e689eed964 upstream. + +The logic controlling the singly-linked list of swap_info_struct entries +for all active, i.e. swapon'ed, swap targets is rather complex, because: + + - it stores the entries in priority order + - there is a pointer to the highest priority entry + - there is a pointer to the highest priority not-full entry + - there is a highest_priority_index variable set outside the swap_lock + - swap entries of equal priority should be used equally + +this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181 +where different priority swap targets are incorrectly used equally. + +That bug probably could be solved with the existing singly-linked lists, +but I think it would only add more complexity to the already difficult to +understand get_swap_page() swap_list iteration logic. + +The first patch changes from a singly-linked list to a doubly-linked list +using list_heads; the highest_priority_index and related code are removed +and get_swap_page() starts each iteration at the highest priority +swap_info entry, even if it's full. While this does introduce unnecessary +list iteration (i.e. 
Schlemiel the painter's algorithm) in the case where +one or more of the highest priority entries are full, the iteration and +manipulation code is much simpler and behaves correctly re: the above bug; +and the fourth patch removes the unnecessary iteration. + +The second patch adds some minor plist helper functions; nothing new +really, just functions to match existing regular list functions. These +are used by the next two patches. + +The third patch adds plist_requeue(), which is used by get_swap_page() in +the next patch - it performs the requeueing of same-priority entries +(which moves the entry to the end of its priority in the plist), so that +all equal-priority swap_info_structs get used equally. + +The fourth patch converts the main list into a plist, and adds a new plist +that contains only swap_info entries that are both active and not full. +As Mel suggested using plists allows removing all the ordering code from +swap - plists handle ordering automatically. The list naming is also +clarified now that there are two lists, with the original list changed +from swap_list_head to swap_active_head and the new list named +swap_avail_head. A new spinlock is also added for the new list, so +swap_info entries can be added or removed from the new list immediately as +they become full or not full. + +This patch (of 4): + +Replace the singly-linked list tracking active, i.e. swapon'ed, +swap_info_struct entries with a doubly-linked list using struct +list_heads. Simplify the logic iterating and manipulating the list of +entries, especially get_swap_page(), by using standard list_head +functions, and removing the highest priority iteration logic. + +The change fixes the bug: +https://lkml.org/lkml/2014/2/13/181 +in which different priority swap entries after the highest priority entry +are incorrectly used equally in pairs. The swap behavior is now as +advertised, i.e. different priority swap entries are used in order, and +equal priority swap targets are used concurrently. + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Shaohua Li +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Steven Rostedt +Cc: Peter Zijlstra +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 7 - + include/linux/swapfile.h | 2 + mm/frontswap.c | 13 +-- + mm/swapfile.c | 171 +++++++++++++++++++---------------------------- + 4 files changed, 78 insertions(+), 115 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -214,8 +214,8 @@ struct percpu_cluster { + struct swap_info_struct { + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ ++ struct list_head list; /* entry in swap list */ + signed char type; /* strange name for an index */ +- signed char next; /* next type on the swap list */ + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ +@@ -255,11 +255,6 @@ struct swap_info_struct { + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ + }; + +-struct swap_list_t { +- int head; /* head of priority-ordered swapfile list */ +- int next; /* swapfile to be used next */ +-}; +- + /* linux/mm/page_alloc.c */ + extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -6,7 +6,7 @@ + * want to expose them to the dozens of source files that include swap.h + */ + extern spinlock_t swap_lock; +-extern struct swap_list_t swap_list; ++extern struct list_head swap_list_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); + +--- a/mm/frontswap.c ++++ b/mm/frontswap.c +@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_are + + static unsigned long __frontswap_curr_pages(void) + { +- int type; + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ list_for_each_entry(si, &swap_list_head, list) + totalpages += atomic_read(&si->frontswap_pages); +- } + return totalpages; + } + +@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsig + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; +- int type; + + assert_spin_locked(&swap_lock); +- for (type = swap_list.head; type >= 0; type = si->next) { +- si = swap_info[type]; ++ list_for_each_entry(si, &swap_list_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; +@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsig + } + vm_unacct_memory(pages); + *unused = pages_to_unuse; +- *swapid = type; ++ *swapid = si->type; + ret = 0; + break; + } +@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long targ + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change +- * so restart scan from swap_list.head each time ++ * so restart scan from swap_list_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages; + /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ + long total_swap_pages; + static int least_priority; +-static atomic_t highest_priority_index = ATOMIC_INIT(-1); + + static const char Bad_file[] = "Bad swap file entry "; + static const char Unused_file[] = "Unused swap file entry "; + static const char Bad_offset[] = "Bad swap offset entry "; + static const char Unused_offset[] = "Unused swap offset entry "; + +-struct swap_list_t swap_list = {-1, -1}; ++/* ++ * all active swap_info_structs ++ * protected with swap_lock, and ordered by priority. 
++ */ ++LIST_HEAD(swap_list_head); + + struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +@@ -640,66 +643,54 @@ no_page: + + swp_entry_t get_swap_page(void) + { +- struct swap_info_struct *si; ++ struct swap_info_struct *si, *next; + pgoff_t offset; +- int type, next; +- int wrapped = 0; +- int hp_index; ++ struct list_head *tmp; + + spin_lock(&swap_lock); + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + +- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { +- hp_index = atomic_xchg(&highest_priority_index, -1); +- /* +- * highest_priority_index records current highest priority swap +- * type which just frees swap entries. If its priority is +- * higher than that of swap_list.next swap type, we use it. It +- * isn't protected by swap_lock, so it can be an invalid value +- * if the corresponding swap type is swapoff. We double check +- * the flags here. It's even possible the swap type is swapoff +- * and swapon again and its priority is changed. In such rare +- * case, low prority swap type might be used, but eventually +- * high priority swap will be used after several rounds of +- * swap. +- */ +- if (hp_index != -1 && hp_index != type && +- swap_info[type]->prio < swap_info[hp_index]->prio && +- (swap_info[hp_index]->flags & SWP_WRITEOK)) { +- type = hp_index; +- swap_list.next = type; +- } +- +- si = swap_info[type]; +- next = si->next; +- if (next < 0 || +- (!wrapped && si->prio != swap_info[next]->prio)) { +- next = swap_list.head; +- wrapped++; +- } +- ++ list_for_each(tmp, &swap_list_head) { ++ si = list_entry(tmp, typeof(*si), list); + spin_lock(&si->lock); +- if (!si->highest_bit) { +- spin_unlock(&si->lock); +- continue; +- } +- if (!(si->flags & SWP_WRITEOK)) { ++ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_unlock(&si->lock); + continue; + } + +- swap_list.next = next; ++ /* ++ * rotate the current swap_info that we're going to use ++ * to after any other swap_info that have the same prio, ++ * so that all equal-priority swap_info get used equally ++ */ ++ next = si; ++ list_for_each_entry_continue(next, &swap_list_head, list) { ++ if (si->prio != next->prio) ++ break; ++ list_rotate_left(&si->list); ++ next = si; ++ } + + spin_unlock(&swap_lock); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) +- return swp_entry(type, offset); ++ return swp_entry(si->type, offset); + spin_lock(&swap_lock); +- next = swap_list.next; ++ /* ++ * if we got here, it's likely that si was almost full before, ++ * and since scan_swap_map() can drop the si->lock, multiple ++ * callers probably all tried to get a page from the same si ++ * and it filled up before we could get one. So we need to ++ * try again. Since we dropped the swap_lock, there may now ++ * be non-full higher priority swap_infos, and this si may have ++ * even been removed from the list (although very unlikely). ++ * Let's start over. ++ */ ++ tmp = &swap_list_head; + } + + atomic_long_inc(&nr_swap_pages); +@@ -766,27 +757,6 @@ out: + return NULL; + } + +-/* +- * This swap type frees swap entry, check if it is the highest priority swap +- * type which just frees swap entry. get_swap_page() uses +- * highest_priority_index to search highest priority swap type. The +- * swap_info_struct.lock can't protect us if there are multiple swap types +- * active, so we use atomic_cmpxchg. 
+- */ +-static void set_highest_priority_index(int type) +-{ +- int old_hp_index, new_hp_index; +- +- do { +- old_hp_index = atomic_read(&highest_priority_index); +- if (old_hp_index != -1 && +- swap_info[old_hp_index]->prio >= swap_info[type]->prio) +- break; +- new_hp_index = type; +- } while (atomic_cmpxchg(&highest_priority_index, +- old_hp_index, new_hp_index) != old_hp_index); +-} +- + static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) + { +@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(str + p->lowest_bit = offset; + if (offset > p->highest_bit) + p->highest_bit = offset; +- set_highest_priority_index(p->type); + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); +@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swa + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) + { +- int i, prev; ++ struct swap_info_struct *si; + + if (prio >= 0) + p->prio = prio; +@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swa + atomic_long_add(p->pages, &nr_swap_pages); + total_swap_pages += p->pages; + +- /* insert swap space into swap_list: */ +- prev = -1; +- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { +- if (p->prio >= swap_info[i]->prio) +- break; +- prev = i; ++ assert_spin_locked(&swap_lock); ++ BUG_ON(!list_empty(&p->list)); ++ /* ++ * insert into swap list; the list is in priority order, ++ * so that get_swap_page() can get a page from the highest ++ * priority swap_info_struct with available page(s), and ++ * swapoff can adjust the auto-assigned (i.e. negative) prio ++ * values for any lower-priority swap_info_structs when ++ * removing a negative-prio swap_info_struct ++ */ ++ list_for_each_entry(si, &swap_list_head, list) { ++ if (p->prio >= si->prio) { ++ list_add_tail(&p->list, &si->list); ++ return; ++ } + } +- p->next = i; +- if (prev < 0) +- swap_list.head = swap_list.next = p->type; +- else +- swap_info[prev]->next = p->type; ++ /* ++ * this covers two cases: ++ * 1) p->prio is less than all existing prio ++ * 2) the swap list is empty ++ */ ++ list_add_tail(&p->list, &swap_list_head); + } + + static void enable_swap_info(struct swap_info_struct *p, int prio, +@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + struct address_space *mapping; + struct inode *inode; + struct filename *pathname; +- int i, type, prev; +- int err; ++ int err, found = 0; + unsigned int old_block_size; + + if (!capable(CAP_SYS_ADMIN)) +@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __us + goto out; + + mapping = victim->f_mapping; +- prev = -1; + spin_lock(&swap_lock); +- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { +- p = swap_info[type]; ++ list_for_each_entry(p, &swap_list_head, list) { + if (p->flags & SWP_WRITEOK) { +- if (p->swap_file->f_mapping == mapping) ++ if (p->swap_file->f_mapping == mapping) { ++ found = 1; + break; ++ } + } +- prev = type; + } +- if (type < 0) { ++ if (!found) { + err = -EINVAL; + spin_unlock(&swap_lock); + goto out_dput; +@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + goto out_dput; + } +- if (prev < 0) +- swap_list.head = p->next; +- else +- swap_info[prev]->next = p->next; +- if (type == swap_list.next) { +- /* just pick something that's safe... 
*/ +- swap_list.next = swap_list.head; +- } + spin_lock(&p->lock); + if (p->prio < 0) { +- for (i = p->next; i >= 0; i = swap_info[i]->next) +- swap_info[i]->prio = p->prio--; ++ struct swap_info_struct *si = p; ++ ++ list_for_each_entry_continue(si, &swap_list_head, list) { ++ si->prio++; ++ } + least_priority++; + } ++ list_del_init(&p->list); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; +@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + + set_current_oom_origin(); +- err = try_to_unuse(type, false, 0); /* force all pages to be unused */ ++ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + clear_current_oom_origin(); + + if (err) { +@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + frontswap_map = frontswap_map_get(p); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +- frontswap_invalidate_area(type); ++ frontswap_invalidate_area(p->type); + frontswap_map_set(p, NULL); + mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); +@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account information */ +- swap_cgroup_swapoff(type); ++ swap_cgroup_swapoff(p->type); + + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { +@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_sw + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); ++ INIT_LIST_HEAD(&p->list); + p->flags = SWP_USED; +- p->next = -1; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock); + diff --git a/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch b/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch new file mode 100644 index 00000000000..b1ecacffcfc --- /dev/null +++ b/queue-3.14/swap-change-swap_list_head-to-plist-add-swap_avail_head.patch @@ -0,0 +1,367 @@ +From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Wed, 4 Jun 2014 16:09:59 -0700 +Subject: swap: change swap_list_head to plist, add swap_avail_head + +From: Dan Streetman + +commit 18ab4d4ced0817421e6db6940374cc39d28d65da upstream. + +Originally get_swap_page() started iterating through the singly-linked +list of swap_info_structs using swap_list.next or highest_priority_index, +which both were intended to point to the highest priority active swap +target that was not full. The first patch in this series changed the +singly-linked list to a doubly-linked list, and removed the logic to start +at the highest priority non-full entry; it starts scanning at the highest +priority entry each time, even if the entry is full. + +Replace the manually ordered swap_list_head with a plist, swap_active_head. +Add a new plist, swap_avail_head. The original swap_active_head plist +contains all active swap_info_structs, as before, while the new +swap_avail_head plist contains only swap_info_structs that are active and +available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect +the swap_avail_head list. + +Mel Gorman suggested using plists since they internally handle ordering +the list entries based on priority, which is exactly what swap was doing +manually. All the ordering code is now removed, and swap_info_struct +entries and simply added to their corresponding plist and automatically +ordered correctly. 
+ +Using a new plist for available swap_info_structs simplifies and +optimizes get_swap_page(), which no longer has to iterate over full +swap_info_structs. Using a new spinlock for swap_avail_head plist +allows each swap_info_struct to add or remove themselves from the +plist when they become full or not-full; previously they could not +do so because the swap_info_struct->lock is held when they change +from full<->not-full, and the swap_lock protecting the main +swap_active_head must be ordered before any swap_info_struct->lock. + +Signed-off-by: Dan Streetman +Acked-by: Mel Gorman +Cc: Shaohua Li +Cc: Steven Rostedt +Cc: Peter Zijlstra +Cc: Hugh Dickins +Cc: Dan Streetman +Cc: Michal Hocko +Cc: Christian Ehrhardt +Cc: Weijie Yang +Cc: Rik van Riel +Cc: Johannes Weiner +Cc: Bob Liu +Cc: Paul Gortmaker +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/swap.h | 3 + include/linux/swapfile.h | 2 + mm/frontswap.c | 6 - + mm/swapfile.c | 145 +++++++++++++++++++++++++++++------------------ + 4 files changed, 97 insertions(+), 59 deletions(-) + +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -214,7 +214,8 @@ struct percpu_cluster { + struct swap_info_struct { + unsigned long flags; /* SWP_USED etc: see above */ + signed short prio; /* swap priority of this type */ +- struct list_head list; /* entry in swap list */ ++ struct plist_node list; /* entry in swap_active_head */ ++ struct plist_node avail_list; /* entry in swap_avail_head */ + signed char type; /* strange name for an index */ + unsigned int max; /* extent of the swap_map */ + unsigned char *swap_map; /* vmalloc'ed array of usage counts */ +--- a/include/linux/swapfile.h ++++ b/include/linux/swapfile.h +@@ -6,7 +6,7 @@ + * want to expose them to the dozens of source files that include swap.h + */ + extern spinlock_t swap_lock; +-extern struct list_head swap_list_head; ++extern struct plist_head swap_active_head; + extern struct swap_info_struct *swap_info[]; + extern int try_to_unuse(unsigned int, bool, unsigned long); + +--- a/mm/frontswap.c ++++ b/mm/frontswap.c +@@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pa + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); +- list_for_each_entry(si, &swap_list_head, list) ++ plist_for_each_entry(si, &swap_active_head, list) + totalpages += atomic_read(&si->frontswap_pages); + return totalpages; + } +@@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsig + unsigned long pages = 0, pages_to_unuse = 0; + + assert_spin_locked(&swap_lock); +- list_for_each_entry(si, &swap_list_head, list) { ++ plist_for_each_entry(si, &swap_active_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; +@@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long targ + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change +- * so restart scan from swap_list_head each time ++ * so restart scan from swap_active_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unu + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. 
+ */ +-LIST_HEAD(swap_list_head); ++PLIST_HEAD(swap_active_head); ++ ++/* ++ * all available (active, not full) swap_info_structs ++ * protected with swap_avail_lock, ordered by priority. ++ * This is used by get_swap_page() instead of swap_active_head ++ * because swap_active_head includes all swap_info_structs, ++ * but get_swap_page() doesn't need to look at full ones. ++ * This uses its own lock instead of swap_lock because when a ++ * swap_info_struct changes between not-full/full, it needs to ++ * add/remove itself to/from this list, but the swap_info_struct->lock ++ * is held and the locking order requires swap_lock to be taken ++ * before any swap_info_struct->lock. ++ */ ++static PLIST_HEAD(swap_avail_head); ++static DEFINE_SPINLOCK(swap_avail_lock); + + struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +@@ -594,6 +609,9 @@ checks: + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; ++ spin_lock(&swap_avail_lock); ++ plist_del(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); +@@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void) + { + struct swap_info_struct *si, *next; + pgoff_t offset; +- struct list_head *tmp; + +- spin_lock(&swap_lock); + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + +- list_for_each(tmp, &swap_list_head) { +- si = list_entry(tmp, typeof(*si), list); ++ spin_lock(&swap_avail_lock); ++ ++start_over: ++ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { ++ /* requeue si to after same-priority siblings */ ++ plist_requeue(&si->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&si->lock); + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ if (plist_node_empty(&si->avail_list)) { ++ spin_unlock(&si->lock); ++ goto nextsi; ++ } ++ WARN(!si->highest_bit, ++ "swap_info %d in list but !highest_bit\n", ++ si->type); ++ WARN(!(si->flags & SWP_WRITEOK), ++ "swap_info %d in list but !SWP_WRITEOK\n", ++ si->type); ++ plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&si->lock); +- continue; ++ goto nextsi; + } + +- /* +- * rotate the current swap_info that we're going to use +- * to after any other swap_info that have the same prio, +- * so that all equal-priority swap_info get used equally +- */ +- next = si; +- list_for_each_entry_continue(next, &swap_list_head, list) { +- if (si->prio != next->prio) +- break; +- list_rotate_left(&si->list); +- next = si; +- } +- +- spin_unlock(&swap_lock); + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) + return swp_entry(si->type, offset); +- spin_lock(&swap_lock); ++ pr_debug("scan_swap_map of si %d failed to find offset\n", ++ si->type); ++ spin_lock(&swap_avail_lock); ++nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si +- * and it filled up before we could get one. So we need to +- * try again. Since we dropped the swap_lock, there may now +- * be non-full higher priority swap_infos, and this si may have +- * even been removed from the list (although very unlikely). +- * Let's start over. 
++ * and it filled up before we could get one; or, the si filled ++ * up between us dropping swap_avail_lock and taking si->lock. ++ * Since we dropped the swap_avail_lock, the swap_avail_head ++ * list may have been modified; so if next is still in the ++ * swap_avail_head list then try it, otherwise start over. + */ +- tmp = &swap_list_head; ++ if (plist_node_empty(&next->avail_list)) ++ goto start_over; + } + ++ spin_unlock(&swap_avail_lock); ++ + atomic_long_inc(&nr_swap_pages); + noswap: +- spin_unlock(&swap_lock); + return (swp_entry_t) {0}; + } + +@@ -798,8 +822,18 @@ static unsigned char swap_entry_free(str + dec_cluster_info_page(p, p->cluster_info, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; +- if (offset > p->highest_bit) ++ if (offset > p->highest_bit) { ++ bool was_full = !p->highest_bit; + p->highest_bit = offset; ++ if (was_full && (p->flags & SWP_WRITEOK)) { ++ spin_lock(&swap_avail_lock); ++ WARN_ON(!plist_node_empty(&p->avail_list)); ++ if (plist_node_empty(&p->avail_list)) ++ plist_add(&p->avail_list, ++ &swap_avail_head); ++ spin_unlock(&swap_avail_lock); ++ } ++ } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); +@@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swa + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) + { +- struct swap_info_struct *si; +- + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; ++ /* ++ * the plist prio is negated because plist ordering is ++ * low-to-high, while swap ordering is high-to-low ++ */ ++ p->list.prio = -p->prio; ++ p->avail_list.prio = -p->prio; + p->swap_map = swap_map; + p->cluster_info = cluster_info; + p->flags |= SWP_WRITEOK; +@@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swa + total_swap_pages += p->pages; + + assert_spin_locked(&swap_lock); +- BUG_ON(!list_empty(&p->list)); +- /* +- * insert into swap list; the list is in priority order, +- * so that get_swap_page() can get a page from the highest +- * priority swap_info_struct with available page(s), and +- * swapoff can adjust the auto-assigned (i.e. negative) prio +- * values for any lower-priority swap_info_structs when +- * removing a negative-prio swap_info_struct +- */ +- list_for_each_entry(si, &swap_list_head, list) { +- if (p->prio >= si->prio) { +- list_add_tail(&p->list, &si->list); +- return; +- } +- } + /* +- * this covers two cases: +- * 1) p->prio is less than all existing prio +- * 2) the swap list is empty ++ * both lists are plists, and thus priority ordered. ++ * swap_active_head needs to be priority ordered for swapoff(), ++ * which on removal of any swap_info_struct with an auto-assigned ++ * (i.e. negative) priority increments the auto-assigned priority ++ * of any lower-priority swap_info_structs. ++ * swap_avail_head needs to be priority ordered for get_swap_page(), ++ * which allocates swap pages from the highest available priority ++ * swap_info_struct. 
+ */ +- list_add_tail(&p->list, &swap_list_head); ++ plist_add(&p->list, &swap_active_head); ++ spin_lock(&swap_avail_lock); ++ plist_add(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + } + + static void enable_swap_info(struct swap_info_struct *p, int prio, +@@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + + mapping = victim->f_mapping; + spin_lock(&swap_lock); +- list_for_each_entry(p, &swap_list_head, list) { ++ plist_for_each_entry(p, &swap_active_head, list) { + if (p->flags & SWP_WRITEOK) { + if (p->swap_file->f_mapping == mapping) { + found = 1; +@@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __us + spin_unlock(&swap_lock); + goto out_dput; + } ++ spin_lock(&swap_avail_lock); ++ plist_del(&p->avail_list, &swap_avail_head); ++ spin_unlock(&swap_avail_lock); + spin_lock(&p->lock); + if (p->prio < 0) { + struct swap_info_struct *si = p; + +- list_for_each_entry_continue(si, &swap_list_head, list) { ++ plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; ++ si->list.prio--; ++ si->avail_list.prio--; + } + least_priority++; + } +- list_del_init(&p->list); ++ plist_del(&p->list, &swap_active_head); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; +@@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_sw + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); +- INIT_LIST_HEAD(&p->list); ++ plist_node_init(&p->list, 0); ++ plist_node_init(&p->avail_list, 0); + p->flags = SWP_USED; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock);
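
The round-robin behaviour that plist_requeue() provides for same-priority swap devices, described in the lib/plist and swap plist commit messages above, can be pictured with a small standalone program. The sketch below is illustrative only and is not part of any patch in this queue: it uses a plain sorted array instead of the kernel's plist, and every name in it (struct swap_dev, pick_and_requeue, the device names) is invented for the example. It shows the rule get_swap_page() follows after these patches: allocate from the first (highest-priority) available entry, then move that entry behind its equal-priority siblings so that equal-priority devices are used in turn.

/*
 * Illustrative sketch only -- not part of the patches above.
 * Mimics the round-robin rule from plist_requeue()/get_swap_page():
 * take the first (highest-priority) entry, then move it behind any
 * other entries that share its priority.  Fullness of a device and
 * removal from the list are not modelled here.
 */
#include <stdio.h>

struct swap_dev {
	const char *name;
	int prio;		/* higher value = preferred, like swap priority */
};

/* Kept sorted by descending priority, like swap_avail_head. */
static struct swap_dev devs[] = {
	{ "sda2", 10 }, { "sdb2", 10 }, { "sdc2", 5 },
};
static const int ndevs = sizeof(devs) / sizeof(devs[0]);

/* Use devs[0], then requeue it after its equal-priority siblings. */
static struct swap_dev pick_and_requeue(void)
{
	struct swap_dev chosen = devs[0];
	int i = 0;

	/* shift same-priority siblings up past the chosen entry */
	while (i + 1 < ndevs && devs[i + 1].prio == chosen.prio) {
		devs[i] = devs[i + 1];
		i++;
	}
	devs[i] = chosen;	/* reinsert after its siblings */
	return chosen;
}

int main(void)
{
	int n;

	for (n = 0; n < 4; n++) {
		struct swap_dev d = pick_and_requeue();
		printf("allocate from %s (prio %d)\n", d.name, d.prio);
	}
	return 0;
}

Built with any C compiler, the loop prints sda2, sdb2, sda2, sdb2: the two priority-10 entries alternate, while the priority-5 entry is never touched. In the kernel, the lower-priority device would only be used once the higher-priority devices fill up and drop themselves off swap_avail_head, which this sketch does not model.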