From 26d099b58d3b8d138f0cf0ba31522fc40039bc72 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sat, 3 May 2014 14:30:17 -0400
Subject: [PATCH] 3.14-stable patches

added patches:
	hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
	iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
	iser-target-match-frmr-descriptors-to-available-session-tags.patch
	mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
	mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
	mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
	mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
	sh-fix-format-string-bug-in-stack-tracer.patch
---
 ...alue-of-sysctl_hung_task_timeout_sec.patch |  66 +++++
 ...put-for-write_pending-in-tx_comp_err.patch | 149 ++++++++++++
 ...escriptors-to-available-session-tags.patch | 117 +++++++++
 ...-large-number-of-hugepages-are-freed.patch |  89 +++++++
 ...to-remote-nodes-before-waking-kswapd.patch | 227 ++++++++++++++++++
 ...ter-should-lock_page-before-mlocking.patch |  90 +++++++
 ...-pages-just-because-free-file-is-low.patch |  73 ++++++
 queue-3.14/series                             |   8 +
 ...ix-format-string-bug-in-stack-tracer.patch |  40 +++
 9 files changed, 859 insertions(+)
 create mode 100644 queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
 create mode 100644 queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
 create mode 100644 queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch
 create mode 100644 queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
 create mode 100644 queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
 create mode 100644 queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
 create mode 100644 queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
 create mode 100644 queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch

diff --git a/queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch b/queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
new file mode 100644
index 00000000000..a2c28d0fd0a
--- /dev/null
+++ b/queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
@@ -0,0 +1,66 @@
+From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001
+From: Liu Hua
+Date: Mon, 7 Apr 2014 15:38:57 -0700
+Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec"
+
+From: Liu Hua
+
+commit 80df28476505ed4e6701c3448c63c9229a50c655 upstream.
+
+As sysctl_hung_task_timeout_sec is unsigned long, when this value is
+larger than LONG_MAX/HZ, the function schedule_timeout_interruptible in
+watchdog will return immediately without sleeping and will print:
+
+	schedule_timeout: wrong timeout value ffffffffffffff83
+
+and then the function watchdog will call schedule_timeout_interruptible
+again and again. The screen will be filled with
+
+	"schedule_timeout: wrong timeout value ffffffffffffff83"
+
+This patch adds a check and correction in sysctl so that
+schedule_timeout_interruptible always gets a valid parameter.
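
[ Editor's note: the sketch below is not part of the queue. It shows, in
  plain user-space C, the overflow the changelog above describes: once the
  configured seconds exceed LONG_MAX/HZ, the seconds-to-jiffies conversion
  wraps negative, which is exactly the condition schedule_timeout()
  rejects. The HZ value and the variable names are assumptions made for
  the demo only. ]

	#include <limits.h>
	#include <stdio.h>

	#define HZ 250			/* assumed config; kernels use 100-1000 */

	int main(void)
	{
		unsigned long timeout_secs = ULONG_MAX / 2;	/* > LONG_MAX/HZ */
		long timeout_jiffies = timeout_secs * HZ;	/* wraps negative */

		/* schedule_timeout() refuses negative values with this message */
		if (timeout_jiffies < 0)
			printf("schedule_timeout: wrong timeout value %lx\n",
			       (unsigned long)timeout_jiffies);

		/* the patch clamps the sysctl to this ceiling via .extra2 */
		printf("largest safe setting: %ld (LONG_MAX/HZ)\n", LONG_MAX / HZ);
		return 0;
	}
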
+
+Signed-off-by: Liu Hua
+Tested-by: Satoru Takeuchi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ Documentation/sysctl/kernel.txt |    1 +
+ kernel/sysctl.c                 |    6 ++++++
+ 2 files changed, 7 insertions(+)
+
+--- a/Documentation/sysctl/kernel.txt
++++ b/Documentation/sysctl/kernel.txt
+@@ -317,6 +317,7 @@ for more than this value report a warnin
+ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+ 
+ 0: means infinite timeout - no checking done.
++Possible values to set are in range {0..LONG_MAX/HZ}.
+ 
+ ==============================================================
+ 
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -144,6 +144,11 @@ static int min_percpu_pagelist_fract = 8
+ static int ngroups_max = NGROUPS_MAX;
+ static const int cap_last_cap = CAP_LAST_CAP;
+ 
++/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
++#ifdef CONFIG_DETECT_HUNG_TASK
++static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
++#endif
++
+ #ifdef CONFIG_INOTIFY_USER
+ #include <linux/inotify.h>
+ #endif
+@@ -995,6 +1000,7 @@ static struct ctl_table kern_table[] = {
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dohung_task_timeout_secs,
++		.extra2		= &hung_task_timeout_max,
+ 	},
+ 	{
+ 		.procname	= "hung_task_warnings",
diff --git a/queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch b/queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
new file mode 100644
index 00000000000..1bf6aaf13a1
--- /dev/null
+++ b/queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
@@ -0,0 +1,149 @@
+From nab@linux-iscsi.org Sat May 3 14:15:37 2014
+From: "Nicholas A. Bellinger"
+Date: Fri, 2 May 2014 21:26:30 +0000
+Subject: [PATCH-v3.14.y 2/2] iser-target: Add missing se_cmd put for WRITE_PENDING in tx_comp_err
+To: target-devel
+Cc: Greg-KH, stable, Nicholas Bellinger, Or Gerlitz
+Message-ID: <1399065990-30552-3-git-send-email-nab@linux-iscsi.org>
+
+
+From: Nicholas Bellinger
+
+commit 03e7848a64ed535a30f5d7fc6dede2d5a6a2534b upstream.
+
+This patch fixes a bug where outstanding RDMA_READs with WRITE_PENDING
+status require an extra target_put_sess_cmd() in isert_put_cmd() code
+when called from isert_cq_tx_comp_err() + isert_cq_drain_comp_llist()
+context during session shutdown.
+
+The extra kref PUT is required so that transport_generic_free_cmd()
+invokes the last target_put_sess_cmd() -> target_release_cmd_kref(),
+which will complete(&se_cmd->cmd_wait_comp) the outstanding se_cmd
+descriptor with WRITE_PENDING status, and wake up the completion in
+target_wait_for_sess_cmds() to invoke TFO->release_cmd().
+
+The bug was manifesting itself in target_wait_for_sess_cmds() where
+a se_cmd descriptor with WRITE_PENDING status would end up sleeping
+indefinitely.
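
[ Editor's note: not part of the queue. The hang described above is the
  generic kref-plus-completion teardown pattern: the waiter only wakes
  when the final put runs the release callback, so one missing put means
  an indefinite sleep. A minimal kernel-style sketch, with toy types
  invented for illustration: ]

	#include <linux/kernel.h>
	#include <linux/kref.h>
	#include <linux/completion.h>

	struct toy_cmd {
		struct kref		kref;	/* one ref per outstanding user */
		struct completion	done;	/* fired by the final put */
	};

	static void toy_release(struct kref *kref)
	{
		struct toy_cmd *cmd = container_of(kref, struct toy_cmd, kref);

		complete(&cmd->done);		/* wakes toy_wait_for_cmd() */
	}

	static void toy_put(struct toy_cmd *cmd)
	{
		kref_put(&cmd->kref, toy_release);
	}

	/*
	 * Shutdown side, analogous to target_wait_for_sess_cmds(): if any
	 * path (here, the WRITE_PENDING comp_err path) forgets its
	 * toy_put(), the refcount never reaches zero and this sleeps forever.
	 */
	static void toy_wait_for_cmd(struct toy_cmd *cmd)
	{
		toy_put(cmd);
		wait_for_completion(&cmd->done);
	}
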
+
+Acked-by: Sagi Grimberg
+Cc: Or Gerlitz
+Signed-off-by: Nicholas Bellinger
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/infiniband/ulp/isert/ib_isert.c |   35 +++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/drivers/infiniband/ulp/isert/ib_isert.c
++++ b/drivers/infiniband/ulp/isert/ib_isert.c
+@@ -1456,7 +1456,7 @@ isert_unreg_rdma(struct isert_cmd *isert
+ }
+ 
+ static void
+-isert_put_cmd(struct isert_cmd *isert_cmd)
++isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
+ {
+ 	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ 	struct isert_conn *isert_conn = isert_cmd->conn;
+@@ -1472,8 +1472,21 @@ isert_put_cmd(struct isert_cm
+ 		list_del_init(&cmd->i_conn_node);
+ 		spin_unlock_bh(&conn->cmd_lock);
+ 
+-		if (cmd->data_direction == DMA_TO_DEVICE)
++		if (cmd->data_direction == DMA_TO_DEVICE) {
+ 			iscsit_stop_dataout_timer(cmd);
++			/*
++			 * Check for special case during comp_err where
++			 * WRITE_PENDING has been handed off from core,
++			 * but requires an extra target_put_sess_cmd()
++			 * before transport_generic_free_cmd() below.
++			 */
++			if (comp_err &&
++			    cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) {
++				struct se_cmd *se_cmd = &cmd->se_cmd;
++
++				target_put_sess_cmd(se_cmd->se_sess, se_cmd);
++			}
++		}
+ 
+ 		device->unreg_rdma_mem(isert_cmd, isert_conn);
+ 		transport_generic_free_cmd(&cmd->se_cmd, 0);
+@@ -1528,7 +1541,7 @@ isert_unmap_tx_desc(struct iser_tx_desc
+ 
+ static void
+ isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd,
+-		     struct ib_device *ib_dev)
++		     struct ib_device *ib_dev, bool comp_err)
+ {
+ 	if (isert_cmd->pdu_buf_dma != 0) {
+ 		pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n");
+@@ -1538,7 +1551,7 @@ isert_completion_put(struct iser_tx_desc
+ 	}
+ 
+ 	isert_unmap_tx_desc(tx_desc, ib_dev);
+-	isert_put_cmd(isert_cmd);
++	isert_put_cmd(isert_cmd, comp_err);
+ }
+ 
+ static void
+@@ -1582,14 +1595,14 @@ isert_do_control_comp(struct work_struct
+ 		iscsit_tmr_post_handler(cmd, cmd->conn);
+ 
+ 		cmd->i_state = ISTATE_SENT_STATUS;
+-		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+ 		break;
+ 	case ISTATE_SEND_REJECT:
+ 		pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n");
+ 		atomic_dec(&isert_conn->post_send_buf_count);
+ 
+ 		cmd->i_state = ISTATE_SENT_STATUS;
+-		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+ 		break;
+ 	case ISTATE_SEND_LOGOUTRSP:
+ 		pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n");
+@@ -1603,7 +1616,7 @@ isert_do_control_comp(struct work_struct
+ 	case ISTATE_SEND_TEXTRSP:
+ 		atomic_dec(&isert_conn->post_send_buf_count);
+ 		cmd->i_state = ISTATE_SENT_STATUS;
+-		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+ 		break;
+ 	default:
+ 		pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state);
+@@ -1634,7 +1647,7 @@ isert_response_completion(struct iser_tx
+ 	atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+ 
+ 	cmd->i_state = ISTATE_SENT_STATUS;
+-	isert_completion_put(tx_desc, isert_cmd, ib_dev);
++	isert_completion_put(tx_desc, isert_cmd, ib_dev, false);
+ }
+ 
+ static void
+@@ -1715,7 +1728,7 @@ isert_cq_drain_comp_llist(struct isert_c
+ 		wr = &t->isert_cmd->rdma_wr;
+ 
+ 		atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+-		isert_completion_put(t, t->isert_cmd, ib_dev);
++		isert_completion_put(t, t->isert_cmd, ib_dev, true);
+ 	}
+ }
+ 
+@@ -1734,14 +1747,14 @@ isert_cq_tx_comp_err(struct iser_tx_desc
+ 		wr = &t->isert_cmd->rdma_wr;
+ 
+ 		atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+-		isert_completion_put(t, t->isert_cmd, ib_dev);
++		isert_completion_put(t, t->isert_cmd, ib_dev, true);
+ 	}
+ 	tx_desc->comp_llnode_batch = NULL;
+ 
+ 	if (!isert_cmd)
+ 		isert_unmap_tx_desc(tx_desc, ib_dev);
+ 	else
+-		isert_completion_put(tx_desc, isert_cmd, ib_dev);
++		isert_completion_put(tx_desc, isert_cmd, ib_dev, true);
+ }
+ 
+ static void
diff --git a/queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch b/queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch
new file mode 100644
index 00000000000..b8b7f6b2002
--- /dev/null
+++ b/queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch
@@ -0,0 +1,117 @@
+From nab@linux-iscsi.org Sat May 3 14:15:20 2014
+From: "Nicholas A. Bellinger"
+Date: Fri, 2 May 2014 21:26:29 +0000
+Subject: iser-target: Match FRMR descriptors to available session tags
+To: target-devel
+Cc: Greg-KH, stable, Nicholas Bellinger, Sagi Grimberg, Or Gerlitz
+Message-ID: <1399065990-30552-2-git-send-email-nab@linux-iscsi.org>
+
+From: Nicholas Bellinger
+
+commit f46d6a8a01d6bbd83a97140f30a72a89b038807b upstream.
+
+This patch changes isert_conn_create_fastreg_pool() to follow
+logic in iscsi_target_locate_portal() for determining how many
+FRMR descriptors to allocate, based upon the number of possible
+per-session command slots that are available.
+
+This addresses an OOPS in isert_reg_rdma() where, due to the use
+of ISCSI_DEF_XMIT_CMDS_MAX, a bogus fast_reg_descriptor could end
+up being returned when the number of active tags exceeded the
+original hardcoded maximum.
+
+Note this also includes moving isert_conn_create_fastreg_pool()
+from isert_connect_request() to isert_put_login_tx() before
+posting the final Login Response PDU, in order to determine the
+se_nacl->queue_depth (e.g. number of tags) per session the target
+will be enforcing.
+
+v2 changes:
+ - Move isert_conn->conn_fr_pool list_head init into
+   isert_conn_request()
+v3 changes:
+ - Drop unnecessary list_empty() check in isert_reg_rdma()
+   (Sagi)
+
+Cc: Sagi Grimberg
+Cc: Or Gerlitz
+Signed-off-by: Nicholas Bellinger
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/infiniband/ulp/isert/ib_isert.c |   35 ++++++++++++++++++--------------
+ 1 file changed, 20 insertions(+), 15 deletions(-)
+
+--- a/drivers/infiniband/ulp/isert/ib_isert.c
++++ b/drivers/infiniband/ulp/isert/ib_isert.c
+@@ -436,11 +436,18 @@ isert_conn_create_fastreg_pool(struct is
+ {
+ 	struct fast_reg_descriptor *fr_desc;
+ 	struct isert_device *device = isert_conn->conn_device;
+-	int i, ret;
++	struct se_session *se_sess = isert_conn->conn->sess->se_sess;
++	struct se_node_acl *se_nacl = se_sess->se_node_acl;
++	int i, ret, tag_num;
++	/*
++	 * Setup the number of FRMRs based upon the number of tags
++	 * available to session in iscsi_target_locate_portal().
++	 */
++	tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
++	tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
+ 
+-	INIT_LIST_HEAD(&isert_conn->conn_fr_pool);
+ 	isert_conn->conn_fr_pool_size = 0;
+-	for (i = 0; i < ISCSI_DEF_XMIT_CMDS_MAX; i++) {
++	for (i = 0; i < tag_num; i++) {
+ 		fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
+ 		if (!fr_desc) {
+ 			pr_err("Failed to allocate fast_reg descriptor\n");
+@@ -498,6 +505,7 @@ isert_connect_request(struct rdma_cm_id
+ 	kref_get(&isert_conn->conn_kref);
+ 	mutex_init(&isert_conn->conn_mutex);
+ 	spin_lock_init(&isert_conn->conn_lock);
++	INIT_LIST_HEAD(&isert_conn->conn_fr_pool);
+ 
+ 	cma_id->context = isert_conn;
+ 	isert_conn->conn_cm_id = cma_id;
+@@ -569,15 +577,6 @@ isert_connect_request(struct rdma_cm_id
+ 		goto out_mr;
+ 	}
+ 
+-	if (device->use_fastreg) {
+-		ret = isert_conn_create_fastreg_pool(isert_conn);
+-		if (ret) {
+-			pr_err("Conn: %p failed to create fastreg pool\n",
+-			       isert_conn);
+-			goto out_fastreg;
+-		}
+-	}
+-
+ 	ret = isert_conn_setup_qp(isert_conn, cma_id);
+ 	if (ret)
+ 		goto out_conn_dev;
+@@ -591,9 +590,6 @@ isert_connect_request(struct rdma_cm_id
+ 	return 0;
+ 
+ out_conn_dev:
+-	if (device->use_fastreg)
+-		isert_conn_free_fastreg_pool(isert_conn);
+-out_fastreg:
+ 	ib_dereg_mr(isert_conn->conn_mr);
+ out_mr:
+ 	ib_dealloc_pd(isert_conn->conn_pd);
+@@ -967,6 +963,15 @@ isert_put_login_tx(struct iscsi_conn *co
+ 	}
+ 	if (!login->login_failed) {
+ 		if (login->login_complete) {
++			if (isert_conn->conn_device->use_fastreg) {
++				ret = isert_conn_create_fastreg_pool(isert_conn);
++				if (ret) {
++					pr_err("Conn: %p failed to create"
++					       " fastreg pool\n", isert_conn);
++					return ret;
++				}
++			}
++
+ 			ret = isert_alloc_rx_descriptors(isert_conn);
+ 			if (ret)
+ 				return ret;
diff --git a/queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch b/queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
new file mode 100644
index 00000000000..69b3369a33b
--- /dev/null
+++ b/queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
@@ -0,0 +1,89 @@
+From 55f67141a8927b2be3e51840da37b8a2320143ed Mon Sep 17 00:00:00 2001
+From: "Mizuma, Masayoshi"
+Date: Mon, 7 Apr 2014 15:37:54 -0700
+Subject: mm: hugetlb: fix softlockup when a large number of hugepages are freed.
+
+From: "Mizuma, Masayoshi"
+
+commit 55f67141a8927b2be3e51840da37b8a2320143ed upstream.
+
+When I decrease the value of nr_hugepage in procfs a lot, softlockup
+happens. It is because there is no chance of context switch during this
+process.
+
+On the other hand, when I allocate a large number of hugepages, there is
+some chance of context switch. Hence softlockup doesn't happen during
+this process. So it's necessary to add a context switch in the freeing
+process, just as in the allocating process, to avoid softlockup.
+
+When I freed 12 TB of hugepages with kernel-2.6.32-358.el6, the freeing
+process occupied a CPU over 150 seconds and the following softlockup
+message appeared twice or more.
+
+$ echo 6000000 > /proc/sys/vm/nr_hugepages
+$ cat /proc/sys/vm/nr_hugepages
+6000000
+$ grep ^Huge /proc/meminfo
+HugePages_Total:   6000000
+HugePages_Free:    6000000
+HugePages_Rsvd:          0
+HugePages_Surp:          0
+Hugepagesize:         2048 kB
+$ echo 0 > /proc/sys/vm/nr_hugepages
+
+BUG: soft lockup - CPU#16 stuck for 67s! [sh:12883] ...
+Pid: 12883, comm: sh Not tainted 2.6.32-358.el6.x86_64 #1
+Call Trace:
+  free_pool_huge_page+0xb8/0xd0
+  set_max_huge_pages+0x128/0x190
+  hugetlb_sysctl_handler_common+0x113/0x140
+  hugetlb_sysctl_handler+0x1e/0x20
+  proc_sys_call_handler+0x97/0xd0
+  proc_sys_write+0x14/0x20
+  vfs_write+0xb8/0x1a0
+  sys_write+0x51/0x90
+  __audit_syscall_exit+0x265/0x290
+  system_call_fastpath+0x16/0x1b
+
+I have not confirmed this problem with upstream kernels because I am not
+able to prepare a machine equipped with 12TB memory now. However I
+confirmed that the amount of decreasing hugepages was directly
+proportional to the amount of required time.
+
+I measured required times on a smaller machine. It showed that 130-145
+hugepages decreased per millisecond.
+
+  Amount of decreasing     Required time      Decreasing rate
+  hugepages                (msec)             (pages/msec)
+  ------------------------------------------------------------
+  10,000 pages == 20GB     70 -  74           135-142
+  30,000 pages == 60GB    208 - 229           131-144
+
+This means that, at this rate, decrementing 6TB of hugepages will
+trigger a softlockup with the default threshold of 20 seconds.
+
+Signed-off-by: Masayoshi Mizuma
+Cc: Joonsoo Kim
+Cc: Michal Hocko
+Cc: Wanpeng Li
+Cc: Aneesh Kumar
+Cc: KOSAKI Motohiro
+Cc: Naoya Horiguchi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/hugetlb.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1509,6 +1509,7 @@ static unsigned long set_max_huge_pages(
+ 	while (min_count < persistent_huge_pages(h)) {
+ 		if (!free_pool_huge_page(h, nodes_allowed, 0))
+ 			break;
++		cond_resched_lock(&hugetlb_lock);
+ 	}
+ 	while (count < persistent_huge_pages(h)) {
+ 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
diff --git a/queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch b/queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
new file mode 100644
index 00000000000..4b724f5aaff
--- /dev/null
+++ b/queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
@@ -0,0 +1,227 @@
+From 3a025760fc158b3726eac89ee95d7f29599e9dfa Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Mon, 7 Apr 2014 15:37:48 -0700
+Subject: mm: page_alloc: spill to remote nodes before waking kswapd
+
+From: Johannes Weiner
+
+commit 3a025760fc158b3726eac89ee95d7f29599e9dfa upstream.
+
+On NUMA systems, a node may start thrashing cache or even swap anonymous
+pages while there are still free pages on remote nodes.
+
+This is a result of commits 81c0a2bb515f ("mm: page_alloc: fair zone
+allocator policy") and fff4068cba48 ("mm: page_alloc: revert NUMA aspect
+of fair allocation policy").
+
+Before those changes, the allocator would first try all allowed zones,
+including those on remote nodes, before waking any kswapds. But now,
+the allocator fastpath doubles as the fairness pass, which in turn can
+only consider the local node to prevent remote spilling based on
+exhausted fairness batches alone. Remote nodes are only considered in
+the slowpath, after the kswapds are woken up. But if remote nodes still
+have free memory, kswapd should not be woken to rebalance the local node
+or it may thrash cache or swap prematurely.
+
+Fix this by adding one more unfair pass over the zonelist that is
+allowed to spill to remote nodes after the local fairness pass fails but
+before entering the slowpath and waking the kswapds.
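
[ Editor's note: not part of the queue. The toy model below compiles as
  plain C and mirrors the allocation order this changelog describes: a
  fair, local-only first pass, then a batch reset and one unfair pass
  that may spill to remote zones, and only if both fail would the
  slowpath wake kswapd. All types and the batch refill constant are
  invented for illustration. ]

	#include <stdbool.h>
	#include <stddef.h>

	struct zone {
		long	alloc_batch;	/* stands in for NR_ALLOC_BATCH */
		long	free_pages;
		bool	local;		/* on the preferred node? */
	};

	static struct zone *try_alloc(struct zone *z, size_t n, bool fair)
	{
		for (size_t i = 0; i < n; i++) {
			if (fair && (!z[i].local || z[i].alloc_batch <= 0))
				continue;	/* fair pass: local zones with batch left */
			if (z[i].free_pages > 0) {
				z[i].free_pages--;
				if (fair)
					z[i].alloc_batch--;
				return &z[i];
			}
		}
		return NULL;
	}

	static struct zone *alloc_page_model(struct zone *zones, size_t n)
	{
		struct zone *z = try_alloc(zones, n, true);	/* ALLOC_FAIR */

		if (!z) {
			for (size_t i = 0; i < n; i++)	/* reset_alloc_batches() */
				if (zones[i].local)
					zones[i].alloc_batch = 100;
			z = try_alloc(zones, n, false);	/* unfair: remote spill OK */
		}
		return z;	/* only a NULL here would wake kswapd (slowpath) */
	}
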
+
+This also gets rid of the GFP_THISNODE exemption from the fairness
+protocol because the unfair pass is no longer tied to kswapd, which
+GFP_THISNODE is not allowed to wake up.
+
+However, because remote spills can be more frequent now - we prefer them
+over local kswapd reclaim - the allocation batches on remote nodes could
+underflow more heavily. When resetting the batches, use
+atomic_long_read() directly instead of zone_page_state() to calculate the
+delta as the latter filters negative counter values.
+
+Signed-off-by: Johannes Weiner
+Acked-by: Rik van Riel
+Acked-by: Mel Gorman
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/internal.h   |    1 
+ mm/page_alloc.c |   89 ++++++++++++++++++++++++++++----------------------------
+ 2 files changed, 46 insertions(+), 44 deletions(-)
+
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_l
+ #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+ #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+ #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
++#define ALLOC_FAIR		0x100 /* fair zone allocation */
+ 
+ #endif	/* __MM_INTERNAL_H */
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1238,15 +1238,6 @@ void drain_zone_pages(struct zone *zone,
+ 	}
+ 	local_irq_restore(flags);
+ }
+-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+-{
+-	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+-}
+-#else
+-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+-{
+-	return false;
+-}
+ #endif
+ 
+ /*
+@@ -1583,12 +1574,7 @@ again:
+ 					  get_pageblock_migratetype(page));
+ 	}
+ 
+-	/*
+-	 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+-	 * aging protocol, so they can't be fair.
+-	 */
+-	if (!gfp_thisnode_allocation(gfp_flags))
+-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ 
+ 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
+ 	zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1954,23 +1940,12 @@ zonelist_scan:
+ 		 * zone size to ensure fair page aging. The zone a
+ 		 * page was allocated in should have no effect on the
+ 		 * time the page has in memory before being reclaimed.
+-		 *
+-		 * Try to stay in local zones in the fastpath. If
+-		 * that fails, the slowpath is entered, which will do
+-		 * another pass starting with the local zones, but
+-		 * ultimately fall back to remote zones that do not
+-		 * partake in the fairness round-robin cycle of this
+-		 * zonelist.
+-		 *
+-		 * NOTE: GFP_THISNODE allocations do not partake in
+-		 * the kswapd aging protocol, so they can't be fair.
+		 */
+-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+-		    !gfp_thisnode_allocation(gfp_mask)) {
+-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+-				continue;
++		if (alloc_flags & ALLOC_FAIR) {
+ 			if (!zone_local(preferred_zone, zone))
+ 				continue;
++			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
++				continue;
+ 		}
+ 		/*
+ 		 * When allocating a page cache page for writing, we
+@@ -2408,32 +2383,40 @@ __alloc_pages_high_priority(gfp_t gfp_ma
+ 	return page;
+ }
+ 
+-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+-			     struct zonelist *zonelist,
+-			     enum zone_type high_zoneidx,
+-			     struct zone *preferred_zone)
++static void reset_alloc_batches(struct zonelist *zonelist,
++				enum zone_type high_zoneidx,
++				struct zone *preferred_zone)
+ {
+ 	struct zoneref *z;
+ 	struct zone *zone;
+ 
+ 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-		if (!(gfp_mask & __GFP_NO_KSWAPD))
+-			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+ 		/*
+ 		 * Only reset the batches of zones that were actually
+-		 * considered in the fast path, we don't want to
+-		 * thrash fairness information for zones that are not
++		 * considered in the fairness pass, we don't want to
++		 * trash fairness information for zones that are not
+ 		 * actually part of this zonelist's round-robin cycle.
+ 		 */
+ 		if (!zone_local(preferred_zone, zone))
+ 			continue;
+ 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-			high_wmark_pages(zone) -
+-			low_wmark_pages(zone) -
+-			zone_page_state(zone, NR_ALLOC_BATCH));
++			high_wmark_pages(zone) - low_wmark_pages(zone) -
++			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+ 	}
+ }
+ 
++static void wake_all_kswapds(unsigned int order,
++			     struct zonelist *zonelist,
++			     enum zone_type high_zoneidx,
++			     struct zone *preferred_zone)
++{
++	struct zoneref *z;
++	struct zone *zone;
++
++	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
++		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
++}
++
+ static inline int
+ gfp_to_alloc_flags(gfp_t gfp_mask)
+ {
+@@ -2522,12 +2505,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+ 	 * allowed per node queues are empty and that nodes are
+ 	 * over allocated.
+ 	 */
+-	if (gfp_thisnode_allocation(gfp_mask))
++	if (IS_ENABLED(CONFIG_NUMA) &&
++	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ 		goto nopage;
+ 
+ restart:
+-	prepare_slowpath(gfp_mask, order, zonelist,
+-			 high_zoneidx, preferred_zone);
++	if (!(gfp_mask & __GFP_NO_KSWAPD))
++		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+ 
+ 	/*
+ 	 * OK, we're below the kswapd watermark and have kicked background
+@@ -2711,7 +2695,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+ 	struct page *page = NULL;
+ 	int migratetype = allocflags_to_migratetype(gfp_mask);
+ 	unsigned int cpuset_mems_cookie;
+-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
++	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ 	struct mem_cgroup *memcg = NULL;
+ 
+ 	gfp_mask &= gfp_allowed_mask;
+@@ -2752,12 +2736,29 @@ retry_cpuset:
+ 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ 		alloc_flags |= ALLOC_CMA;
+ #endif
++retry:
+ 	/* First allocation attempt */
+ 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ 			zonelist, high_zoneidx, alloc_flags,
+ 			preferred_zone, migratetype);
+ 	if (unlikely(!page)) {
+ 		/*
++		 * The first pass makes sure allocations are spread
++		 * fairly within the local node. However, the local
++		 * node might have free pages left after the fairness
++		 * batches are exhausted, and remote zones haven't
++		 * even been considered yet. Try once more without
++		 * fairness, and include remote zones now, before
++		 * entering the slowpath and waking kswapd: prefer
++		 * spilling to a remote zone over swapping locally.
++		 */
++		if (alloc_flags & ALLOC_FAIR) {
++			reset_alloc_batches(zonelist, high_zoneidx,
++					    preferred_zone);
++			alloc_flags &= ~ALLOC_FAIR;
++			goto retry;
++		}
++		/*
+ 		 * Runtime PM, block IO and its error handling path
+ 		 * can deadlock because I/O on the device might not
+ 		 * complete.
diff --git a/queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch b/queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
new file mode 100644
index 00000000000..3ae5ab3ad99
--- /dev/null
+++ b/queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
@@ -0,0 +1,90 @@
+From 57e68e9cd65b4b8eb4045a1e0d0746458502554c Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka
+Date: Mon, 7 Apr 2014 15:37:50 -0700
+Subject: mm: try_to_unmap_cluster() should lock_page() before mlocking
+
+From: Vlastimil Babka
+
+commit 57e68e9cd65b4b8eb4045a1e0d0746458502554c upstream.
+
+A BUG_ON(!PageLocked) was triggered in mlock_vma_page() by Sasha Levin
+fuzzing with trinity. The call site try_to_unmap_cluster() does not lock
+the pages other than its check_page parameter (which is already locked).
+
+The BUG_ON in mlock_vma_page() is not documented and its purpose is
+somewhat unclear, but apparently it serializes against page migration,
+which could otherwise fail to transfer the PG_mlocked flag. This would
+not be fatal, as the page would be eventually encountered again, but
+NR_MLOCK accounting would become distorted nevertheless. This patch adds
+a comment to the BUG_ON in mlock_vma_page() and munlock_vma_page() to that
+effect.
+
+The call site try_to_unmap_cluster() is fixed so that for page !=
+check_page, trylock_page() is attempted (to avoid possible deadlocks as we
+already have check_page locked) and mlock_vma_page() is performed only
+upon success. If the page lock cannot be obtained, the page is left
+without PG_mlocked, which is again not a problem in the whole unevictable
+memory design.
+
+Signed-off-by: Vlastimil Babka
+Signed-off-by: Bob Liu
+Reported-by: Sasha Levin
+Cc: Wanpeng Li
+Cc: Michel Lespinasse
+Cc: KOSAKI Motohiro
+Acked-by: Rik van Riel
+Cc: David Rientjes
+Cc: Mel Gorman
+Cc: Hugh Dickins
+Cc: Joonsoo Kim
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/mlock.c |    2 ++
+ mm/rmap.c  |   14 ++++++++++++--
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
+  */
+ void mlock_vma_page(struct page *page)
+ {
++	/* Serialize with page migration */
+ 	BUG_ON(!PageLocked(page));
+ 
+ 	if (!TestSetPageMlocked(page)) {
+@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct pag
+ 	unsigned int nr_pages;
+ 	struct zone *zone = page_zone(page);
+ 
++	/* For try_to_munlock() and to serialize with page migration */
+ 	BUG_ON(!PageLocked(page));
+ 
+ 	/*
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1322,9 +1322,19 @@ static int try_to_unmap_cluster(unsigned
+ 		BUG_ON(!page || PageAnon(page));
+ 
+ 		if (locked_vma) {
+-			mlock_vma_page(page);	/* no-op if already mlocked */
+-			if (page == check_page)
++			if (page == check_page) {
++				/* we know we have check_page locked */
++				mlock_vma_page(page);
+ 				ret = SWAP_MLOCK;
++			} else if (trylock_page(page)) {
++				/*
++				 * If we can lock the page, perform mlock.
++				 * Otherwise leave the page alone, it will be
++				 * eventually encountered again later.
++				 */
++				mlock_vma_page(page);
++				unlock_page(page);
++			}
+ 			continue;	/* don't unmap */
+ 		}
+ 
diff --git a/queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch b/queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
new file mode 100644
index 00000000000..24c63b34dac
--- /dev/null
+++ b/queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
@@ -0,0 +1,73 @@
+From 0bf1457f0cfca7bc026a82323ad34bcf58ad035d Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Tue, 8 Apr 2014 16:04:10 -0700
+Subject: mm: vmscan: do not swap anon pages just because free+file is low
+
+From: Johannes Weiner
+
+commit 0bf1457f0cfca7bc026a82323ad34bcf58ad035d upstream.
+
+Page reclaim force-scans / swaps anonymous pages when file cache drops
+below the high watermark of a zone in order to prevent what little cache
+remains from thrashing.
+
+However, on bigger machines the high watermark value can be quite large
+and when the workload is dominated by a static anonymous/shmem set, the
+file set might just be a small window of used-once cache. In such
+situations, the VM starts swapping heavily when instead it should be
+recycling the no longer used cache.
+
+This is a longer-standing problem, but it's more likely to trigger after
+commit 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy")
+because file pages can no longer accumulate in a single zone and are
+dispersed into smaller fractions among the available zones.
+
+To resolve this, do not force scan anon when file pages are low but
+instead rely on the scan/rotation ratios to make the right prediction.
+
+Signed-off-by: Johannes Weiner
+Acked-by: Rafael Aquini
+Cc: Rik van Riel
+Cc: Mel Gorman
+Cc: Hugh Dickins
+Cc: Suleiman Souhlal
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/vmscan.c |   16 +---------------
+ 1 file changed, 1 insertion(+), 15 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1848,7 +1848,7 @@ static void get_scan_count(struct lruvec
+ 	struct zone *zone = lruvec_zone(lruvec);
+ 	unsigned long anon_prio, file_prio;
+ 	enum scan_balance scan_balance;
+-	unsigned long anon, file, free;
++	unsigned long anon, file;
+ 	bool force_scan = false;
+ 	unsigned long ap, fp;
+ 	enum lru_list lru;
+@@ -1902,20 +1902,6 @@ static void get_scan_count(struct lruvec
+ 		get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ 
+ 	/*
+-	 * If it's foreseeable that reclaiming the file cache won't be
+-	 * enough to get the zone back into a desirable shape, we have
+-	 * to swap. Better start now and leave the - probably heavily
+-	 * thrashing - remaining file pages alone.
+-	 */
+-	if (global_reclaim(sc)) {
+-		free = zone_page_state(zone, NR_FREE_PAGES);
+-		if (unlikely(file + free <= high_wmark_pages(zone))) {
+-			scan_balance = SCAN_ANON;
+-			goto out;
+-		}
+-	}
+-
+-	/*
+ 	 * There is enough inactive page cache, do not reclaim
+ 	 * anything from the anonymous working set right now.
+ 	 */
diff --git a/queue-3.14/series b/queue-3.14/series
index 3342233b4ba..2a9ecc0c009 100644
--- a/queue-3.14/series
+++ b/queue-3.14/series
@@ -130,3 +130,11 @@ usb-unbind-all-interfaces-before-rebinding-any.patch
 mtip32xx-set-queue-bounce-limit.patch
 mtip32xx-unmap-the-dma-segments-before-completing-the-io-request.patch
 mtip32xx-mtip_async_complete-bug-fixes.patch
+iser-target-match-frmr-descriptors-to-available-session-tags.patch
+iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
+sh-fix-format-string-bug-in-stack-tracer.patch
+mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
+mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
+mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
+mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
+hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
diff --git a/queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch b/queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch
new file mode 100644
index 00000000000..142ff0b7463
--- /dev/null
+++ b/queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch
@@ -0,0 +1,40 @@
+From a0c32761e73c9999cbf592b702f284221fea8040 Mon Sep 17 00:00:00 2001
+From: Matt Fleming
+Date: Thu, 3 Apr 2014 14:46:20 -0700
+Subject: sh: fix format string bug in stack tracer
+
+From: Matt Fleming
+
+commit a0c32761e73c9999cbf592b702f284221fea8040 upstream.
+
+Kees reported the following error:
+
+   arch/sh/kernel/dumpstack.c: In function 'print_trace_address':
+   arch/sh/kernel/dumpstack.c:118:2: error: format not a string literal and no format arguments [-Werror=format-security]
+
+Use the "%s" format so that it's impossible to interpret 'data' as a
+format string.
+
+Signed-off-by: Matt Fleming
+Reported-by: Kees Cook
+Acked-by: Kees Cook
+Cc: Paul Mundt
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/sh/kernel/dumpstack.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/sh/kernel/dumpstack.c
++++ b/arch/sh/kernel/dumpstack.c
+@@ -115,7 +115,7 @@ static int print_trace_stack(void *data,
+  */
+ static void print_trace_address(void *data, unsigned long addr, int reliable)
+ {
+-	printk(data);
++	printk("%s", (char *)data);
+ 	printk_address(addr, reliable);
+ }
+ 
-- 
2.47.3
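
[ Editor's note: not part of the queue. The user-space demo below shows
  the bug class the last patch fixes: passing externally supplied text as
  a printf/printk format string lets stray '%' directives walk a
  nonexistent argument list. gcc's -Wformat-security flags the bad call,
  just as it did for arch/sh above. Function and variable names mimic the
  patched code but are otherwise invented. ]

	#include <stdio.h>

	static void print_trace_address_bad(const char *data)
	{
		printf(data);		/* data is parsed as a format string */
	}

	static void print_trace_address_good(const char *data)
	{
		printf("%s", data);	/* data is printed literally */
	}

	int main(void)
	{
		/* imagine this prefix came from an untrusted source */
		const char *data = " [trace] %lx %lx %lx";

		print_trace_address_good(data);	/* safe: prints the %lx literally */
		print_trace_address_bad(data);	/* undefined behavior: leaks stack */
		putchar('\n');
		return 0;
	}
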