3.14-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 3 May 2014 18:30:17 +0000 (14:30 -0400)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 3 May 2014 18:30:17 +0000 (14:30 -0400)
added patches:
hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
iser-target-match-frmr-descriptors-to-available-session-tags.patch
mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
sh-fix-format-string-bug-in-stack-tracer.patch

queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch [new file with mode: 0644]
queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch [new file with mode: 0644]
queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch [new file with mode: 0644]
queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch [new file with mode: 0644]
queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch [new file with mode: 0644]
queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch [new file with mode: 0644]
queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch [new file with mode: 0644]
queue-3.14/series
queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch [new file with mode: 0644]

diff --git a/queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch b/queue-3.14/hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
new file mode 100644 (file)
index 0000000..a2c28d0
--- /dev/null
@@ -0,0 +1,66 @@
+From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001
+From: Liu Hua <sdu.liu@huawei.com>
+Date: Mon, 7 Apr 2014 15:38:57 -0700
+Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec"
+
+From: Liu Hua <sdu.liu@huawei.com>
+
+commit 80df28476505ed4e6701c3448c63c9229a50c655 upstream.
+
+As sysctl_hung_task_timeout_sec is an unsigned long, when this value is
+larger than LONG_MAX/HZ the function schedule_timeout_interruptible in
+the watchdog will return immediately, without sleeping, and print:
+
+  schedule_timeout: wrong timeout value ffffffffffffff83
+
+and then the watchdog function will call schedule_timeout_interruptible
+again and again.  The screen will be filled with
+
+       "schedule_timeout: wrong timeout value ffffffffffffff83"
+
+This patch adds a check and correction in sysctl so that the function
+schedule_timeout_interruptible always gets a valid parameter.
+
+Signed-off-by: Liu Hua <sdu.liu@huawei.com>
+Tested-by: Satoru Takeuchi <satoru.takeuchi@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/sysctl/kernel.txt |    1 +
+ kernel/sysctl.c                 |    6 ++++++
+ 2 files changed, 7 insertions(+)
+
+--- a/Documentation/sysctl/kernel.txt
++++ b/Documentation/sysctl/kernel.txt
+@@ -317,6 +317,7 @@ for more than this value report a warnin
+ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+ 0: means infinite timeout - no checking done.
++Possible values to set are in range {0..LONG_MAX/HZ}.
+ ==============================================================
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -144,6 +144,11 @@ static int min_percpu_pagelist_fract = 8
+ static int ngroups_max = NGROUPS_MAX;
+ static const int cap_last_cap = CAP_LAST_CAP;
++/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */
++#ifdef CONFIG_DETECT_HUNG_TASK
++static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
++#endif
++
+ #ifdef CONFIG_INOTIFY_USER
+ #include <linux/inotify.h>
+ #endif
+@@ -995,6 +1000,7 @@ static struct ctl_table kern_table[] = {
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = proc_dohung_task_timeout_secs,
++              .extra2         = &hung_task_timeout_max,
+       },
+       {
+               .procname       = "hung_task_warnings",
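
For context, a minimal user-space C sketch of why a seconds value above LONG_MAX/HZ ends up as a negative timeout once converted to jiffies, which is exactly what the clamp added above prevents. HZ is hard-coded to 1000 here purely for illustration; it is not the kernel's configured tick rate.

	#include <limits.h>
	#include <stdio.h>

	#define HZ 1000	/* assumed tick rate, for illustration only */

	int main(void)
	{
		unsigned long secs = (unsigned long)(LONG_MAX / HZ) + 1;
		long timeout = (long)(secs * HZ);	/* the jiffies value the watchdog would pass on */

		if (secs > LONG_MAX / HZ)
			printf("%lu seconds exceeds LONG_MAX/HZ and must be clamped\n", secs);

		/* without the clamp, the converted timeout wraps negative: */
		printf("converted timeout: %ld jiffies\n", timeout);
		return 0;
	}
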
diff --git a/queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch b/queue-3.14/iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
new file mode 100644 (file)
index 0000000..1bf6aaf
--- /dev/null
@@ -0,0 +1,149 @@
+From nab@linux-iscsi.org  Sat May  3 14:15:37 2014
+From: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
+Date: Fri,  2 May 2014 21:26:30 +0000
+Subject: [PATCH-v3.14.y 2/2] iser-target: Add missing se_cmd put for WRITE_PENDING in tx_comp_err
+To: target-devel <target-devel@vger.kernel.org>
+Cc: Greg-KH <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>, Nicholas Bellinger <nab@linux-iscsi.org>, Or Gerlitz <ogerlitz@mellanox.com>
+Message-ID: <1399065990-30552-3-git-send-email-nab@linux-iscsi.org>
+
+
+From: Nicholas Bellinger <nab@linux-iscsi.org>
+
+commit 03e7848a64ed535a30f5d7fc6dede2d5a6a2534b upstream.
+
+This patch fixes a bug where outstanding RDMA_READs with WRITE_PENDING
+status require an extra target_put_sess_cmd() in isert_put_cmd() code
+when called from isert_cq_tx_comp_err() + isert_cq_drain_comp_llist()
+context during session shutdown.
+
+The extra kref PUT is required so that transport_generic_free_cmd()
+invokes the last target_put_sess_cmd() -> target_release_cmd_kref(),
+which will complete(&se_cmd->cmd_wait_comp) the outstanding se_cmd
+descriptor with WRITE_PENDING status, and awake the completion in
+target_wait_for_sess_cmds() to invoke TFO->release_cmd().
+
+The bug was manifesting itself in target_wait_for_sess_cmds() where
+a se_cmd descriptor with WRITE_PENDING status would end up sleeping
+indefinitely.
+
+Acked-by: Sagi Grimberg <sagig@mellanox.com>
+Cc: Or Gerlitz <ogerlitz@mellanox.com>
+Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/ulp/isert/ib_isert.c |   35 +++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/drivers/infiniband/ulp/isert/ib_isert.c
++++ b/drivers/infiniband/ulp/isert/ib_isert.c
+@@ -1456,7 +1456,7 @@ isert_unreg_rdma(struct isert_cmd *isert
+ }
+ static void
+-isert_put_cmd(struct isert_cmd *isert_cmd)
++isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
+ {
+       struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+       struct isert_conn *isert_conn = isert_cmd->conn;
+@@ -1472,8 +1472,21 @@ isert_put_cmd(struct isert_cmd *isert_cm
+                       list_del_init(&cmd->i_conn_node);
+               spin_unlock_bh(&conn->cmd_lock);
+-              if (cmd->data_direction == DMA_TO_DEVICE)
++              if (cmd->data_direction == DMA_TO_DEVICE) {
+                       iscsit_stop_dataout_timer(cmd);
++                      /*
++                       * Check for special case during comp_err where
++                       * WRITE_PENDING has been handed off from core,
++                       * but requires an extra target_put_sess_cmd()
++                       * before transport_generic_free_cmd() below.
++                       */
++                      if (comp_err &&
++                          cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) {
++                              struct se_cmd *se_cmd = &cmd->se_cmd;
++
++                              target_put_sess_cmd(se_cmd->se_sess, se_cmd);
++                      }
++              }
+               device->unreg_rdma_mem(isert_cmd, isert_conn);
+               transport_generic_free_cmd(&cmd->se_cmd, 0);
+@@ -1528,7 +1541,7 @@ isert_unmap_tx_desc(struct iser_tx_desc
+ static void
+ isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd,
+-                   struct ib_device *ib_dev)
++                   struct ib_device *ib_dev, bool comp_err)
+ {
+       if (isert_cmd->pdu_buf_dma != 0) {
+               pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n");
+@@ -1538,7 +1551,7 @@ isert_completion_put(struct iser_tx_desc
+       }
+       isert_unmap_tx_desc(tx_desc, ib_dev);
+-      isert_put_cmd(isert_cmd);
++      isert_put_cmd(isert_cmd, comp_err);
+ }
+ static void
+@@ -1582,14 +1595,14 @@ isert_do_control_comp(struct work_struct
+               iscsit_tmr_post_handler(cmd, cmd->conn);
+               cmd->i_state = ISTATE_SENT_STATUS;
+-              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+               break;
+       case ISTATE_SEND_REJECT:
+               pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n");
+               atomic_dec(&isert_conn->post_send_buf_count);
+               cmd->i_state = ISTATE_SENT_STATUS;
+-              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+               break;
+       case ISTATE_SEND_LOGOUTRSP:
+               pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n");
+@@ -1603,7 +1616,7 @@ isert_do_control_comp(struct work_struct
+       case ISTATE_SEND_TEXTRSP:
+               atomic_dec(&isert_conn->post_send_buf_count);
+               cmd->i_state = ISTATE_SENT_STATUS;
+-              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev);
++              isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+               break;
+       default:
+               pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state);
+@@ -1634,7 +1647,7 @@ isert_response_completion(struct iser_tx
+       atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+       cmd->i_state = ISTATE_SENT_STATUS;
+-      isert_completion_put(tx_desc, isert_cmd, ib_dev);
++      isert_completion_put(tx_desc, isert_cmd, ib_dev, false);
+ }
+ static void
+@@ -1715,7 +1728,7 @@ isert_cq_drain_comp_llist(struct isert_c
+               wr = &t->isert_cmd->rdma_wr;
+               atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+-              isert_completion_put(t, t->isert_cmd, ib_dev);
++              isert_completion_put(t, t->isert_cmd, ib_dev, true);
+       }
+ }
+@@ -1734,14 +1747,14 @@ isert_cq_tx_comp_err(struct iser_tx_desc
+               wr = &t->isert_cmd->rdma_wr;
+               atomic_sub(wr->send_wr_num + 1, &isert_conn->post_send_buf_count);
+-              isert_completion_put(t, t->isert_cmd, ib_dev);
++              isert_completion_put(t, t->isert_cmd, ib_dev, true);
+       }
+       tx_desc->comp_llnode_batch = NULL;
+       if (!isert_cmd)
+               isert_unmap_tx_desc(tx_desc, ib_dev);
+       else
+-              isert_completion_put(tx_desc, isert_cmd, ib_dev);
++              isert_completion_put(tx_desc, isert_cmd, ib_dev, true);
+ }
+ static void
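
As a rough, generic sketch of the hang described above (plain pthreads, not the iser-target code): a waiter that only returns once a command's reference count reaches zero will sleep forever if one code path never issues its put, which is what the extra target_put_sess_cmd() restores on the error path.

	#include <pthread.h>
	#include <stdio.h>

	struct cmd {
		int refs;
		pthread_mutex_t lock;
		pthread_cond_t done;
	};

	static void cmd_put(struct cmd *c)
	{
		pthread_mutex_lock(&c->lock);
		if (--c->refs == 0)
			pthread_cond_signal(&c->done);	/* final put wakes the waiter */
		pthread_mutex_unlock(&c->lock);
	}

	static void cmd_wait(struct cmd *c)	/* plays the role of target_wait_for_sess_cmds() */
	{
		pthread_mutex_lock(&c->lock);
		while (c->refs > 0)
			pthread_cond_wait(&c->done, &c->lock);
		pthread_mutex_unlock(&c->lock);
	}

	int main(void)
	{
		struct cmd c = { 2, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER };

		cmd_put(&c);	/* the put added for WRITE_PENDING on the error path */
		cmd_put(&c);	/* the final put from the normal free path */
		cmd_wait(&c);	/* returns only because both puts happened */
		printf("command released\n");
		return 0;
	}
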
diff --git a/queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch b/queue-3.14/iser-target-match-frmr-descriptors-to-available-session-tags.patch
new file mode 100644 (file)
index 0000000..b8b7f6b
--- /dev/null
@@ -0,0 +1,117 @@
+From nab@linux-iscsi.org  Sat May  3 14:15:20 2014
+From: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
+Date: Fri,  2 May 2014 21:26:29 +0000
+Subject: iser-target: Match FRMR descriptors to available session tags
+To: target-devel <target-devel@vger.kernel.org>
+Cc: Greg-KH <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>, Nicholas Bellinger <nab@linux-iscsi.org>, Sagi Grimberg <sagig@mellanox.com>, Or Gerlitz <ogerlitz@mellanox.com>
+Message-ID: <1399065990-30552-2-git-send-email-nab@linux-iscsi.org>
+
+From: Nicholas Bellinger <nab@linux-iscsi.org>
+
+commit f46d6a8a01d6bbd83a97140f30a72a89b038807b upstream.
+
+This patch changes isert_conn_create_fastreg_pool() to follow
+logic in iscsi_target_locate_portal() for determining how many
+FRMR descriptors to allocate based upon the number of possible
+per-session command slots that are available.
+
+This addresses an oops in isert_reg_rdma() where, due to the use of
+ISCSI_DEF_XMIT_CMDS_MAX, the code could end up returning a bogus
+fast_reg_descriptor once the number of active tags exceeded the
+original hardcoded maximum.
+
+Note this also includes moving isert_conn_create_fastreg_pool()
+from isert_connect_request() to isert_put_login_tx() before
+posting the final Login Response PDU in order to determine the
+se_nacl->queue_depth (eg: number of tags) per session the target
+will be enforcing.
+
+v2 changes:
+  - Move isert_conn->conn_fr_pool list_head init into
+    isert_conn_request()
+v3 changes:
+  - Drop unnecessary list_empty() check in isert_reg_rdma()
+    (Sagi)
+
+Cc: Sagi Grimberg <sagig@mellanox.com>
+Cc: Or Gerlitz <ogerlitz@mellanox.com>
+Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/infiniband/ulp/isert/ib_isert.c |   35 ++++++++++++++++++--------------
+ 1 file changed, 20 insertions(+), 15 deletions(-)
+
+--- a/drivers/infiniband/ulp/isert/ib_isert.c
++++ b/drivers/infiniband/ulp/isert/ib_isert.c
+@@ -436,11 +436,18 @@ isert_conn_create_fastreg_pool(struct is
+ {
+       struct fast_reg_descriptor *fr_desc;
+       struct isert_device *device = isert_conn->conn_device;
+-      int i, ret;
++      struct se_session *se_sess = isert_conn->conn->sess->se_sess;
++      struct se_node_acl *se_nacl = se_sess->se_node_acl;
++      int i, ret, tag_num;
++      /*
++       * Setup the number of FRMRs based upon the number of tags
++       * available to session in iscsi_target_locate_portal().
++       */
++      tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
++      tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
+-      INIT_LIST_HEAD(&isert_conn->conn_fr_pool);
+       isert_conn->conn_fr_pool_size = 0;
+-      for (i = 0; i < ISCSI_DEF_XMIT_CMDS_MAX; i++) {
++      for (i = 0; i < tag_num; i++) {
+               fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
+               if (!fr_desc) {
+                       pr_err("Failed to allocate fast_reg descriptor\n");
+@@ -498,6 +505,7 @@ isert_connect_request(struct rdma_cm_id
+       kref_get(&isert_conn->conn_kref);
+       mutex_init(&isert_conn->conn_mutex);
+       spin_lock_init(&isert_conn->conn_lock);
++      INIT_LIST_HEAD(&isert_conn->conn_fr_pool);
+       cma_id->context = isert_conn;
+       isert_conn->conn_cm_id = cma_id;
+@@ -569,15 +577,6 @@ isert_connect_request(struct rdma_cm_id
+               goto out_mr;
+       }
+-      if (device->use_fastreg) {
+-              ret = isert_conn_create_fastreg_pool(isert_conn);
+-              if (ret) {
+-                      pr_err("Conn: %p failed to create fastreg pool\n",
+-                             isert_conn);
+-                      goto out_fastreg;
+-              }
+-      }
+-
+       ret = isert_conn_setup_qp(isert_conn, cma_id);
+       if (ret)
+               goto out_conn_dev;
+@@ -591,9 +590,6 @@ isert_connect_request(struct rdma_cm_id
+       return 0;
+ out_conn_dev:
+-      if (device->use_fastreg)
+-              isert_conn_free_fastreg_pool(isert_conn);
+-out_fastreg:
+       ib_dereg_mr(isert_conn->conn_mr);
+ out_mr:
+       ib_dealloc_pd(isert_conn->conn_pd);
+@@ -967,6 +963,15 @@ isert_put_login_tx(struct iscsi_conn *co
+       }
+       if (!login->login_failed) {
+               if (login->login_complete) {
++                      if (isert_conn->conn_device->use_fastreg) {
++                              ret = isert_conn_create_fastreg_pool(isert_conn);
++                              if (ret) {
++                                      pr_err("Conn: %p failed to create"
++                                             " fastreg pool\n", isert_conn);
++                                      return ret;
++                              }
++                      }
++
+                       ret = isert_alloc_rx_descriptors(isert_conn);
+                       if (ret)
+                               return ret;
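
The sizing rule described above reduces to the arithmetic in this small standalone sketch; the ISCSIT_MIN_TAGS and ISCSIT_EXTRA_TAGS values below are placeholders chosen for illustration, not necessarily the real header constants.

	#include <stdio.h>

	#define ISCSIT_MIN_TAGS   16	/* assumed value, for illustration */
	#define ISCSIT_EXTRA_TAGS  8	/* assumed value, for illustration */

	static unsigned int frmr_pool_size(unsigned int queue_depth)
	{
		unsigned int tag_num;

		tag_num = queue_depth > ISCSIT_MIN_TAGS ? queue_depth : ISCSIT_MIN_TAGS;
		return tag_num * 2 + ISCSIT_EXTRA_TAGS;
	}

	int main(void)
	{
		/* a session negotiating a queue depth of 128 gets 264 descriptors */
		printf("%u\n", frmr_pool_size(128));
		return 0;
	}
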
diff --git a/queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch b/queue-3.14/mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
new file mode 100644 (file)
index 0000000..69b3369
--- /dev/null
@@ -0,0 +1,89 @@
+From 55f67141a8927b2be3e51840da37b8a2320143ed Mon Sep 17 00:00:00 2001
+From: "Mizuma, Masayoshi" <m.mizuma@jp.fujitsu.com>
+Date: Mon, 7 Apr 2014 15:37:54 -0700
+Subject: mm: hugetlb: fix softlockup when a large number of hugepages are freed.
+
+From: "Mizuma, Masayoshi" <m.mizuma@jp.fujitsu.com>
+
+commit 55f67141a8927b2be3e51840da37b8a2320143ed upstream.
+
+When I decrease the value of nr_hugepages in procfs by a large amount, a
+softlockup happens.  It is because there is no chance for a context switch
+during this process.
+
+On the other hand, when I allocate a large number of hugepages, there is
+some chance of a context switch, so a softlockup doesn't happen during
+that process.  It is therefore necessary to give the freeing process the
+same chance to reschedule as the allocating process to avoid the softlockup.
+
+When I freed 12 TB of hugepages with kernel-2.6.32-358.el6, the freeing
+process occupied a CPU for over 150 seconds and the following softlockup
+message appeared twice or more.
+
+$ echo 6000000 > /proc/sys/vm/nr_hugepages
+$ cat /proc/sys/vm/nr_hugepages
+6000000
+$ grep ^Huge /proc/meminfo
+HugePages_Total:   6000000
+HugePages_Free:    6000000
+HugePages_Rsvd:        0
+HugePages_Surp:        0
+Hugepagesize:       2048 kB
+$ echo 0 > /proc/sys/vm/nr_hugepages
+
+BUG: soft lockup - CPU#16 stuck for 67s! [sh:12883] ...
+Pid: 12883, comm: sh Not tainted 2.6.32-358.el6.x86_64 #1
+Call Trace:
+  free_pool_huge_page+0xb8/0xd0
+  set_max_huge_pages+0x128/0x190
+  hugetlb_sysctl_handler_common+0x113/0x140
+  hugetlb_sysctl_handler+0x1e/0x20
+  proc_sys_call_handler+0x97/0xd0
+  proc_sys_write+0x14/0x20
+  vfs_write+0xb8/0x1a0
+  sys_write+0x51/0x90
+  __audit_syscall_exit+0x265/0x290
+  system_call_fastpath+0x16/0x1b
+
+I have not confirmed this problem with upstream kernels because I am not
+able to prepare a machine equipped with 12 TB of memory right now.
+However, I confirmed that the required time was directly proportional to
+the number of hugepages being decreased.
+
+I measured the required times on a smaller machine; it showed 130-145
+hugepages decreased per millisecond.
+
+  Amount of decreasing     Required time      Decreasing rate
+  hugepages                     (msec)         (pages/msec)
+  ------------------------------------------------------------
+  10,000 pages == 20GB         70 -  74          135-142
+  30,000 pages == 60GB        208 - 229          131-144
+
+At this decreasing rate, freeing 6 TB of hugepages will trigger a
+softlockup with the default 20-second threshold.
+
+Signed-off-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1509,6 +1509,7 @@ static unsigned long set_max_huge_pages(
+       while (min_count < persistent_huge_pages(h)) {
+               if (!free_pool_huge_page(h, nodes_allowed, 0))
+                       break;
++              cond_resched_lock(&hugetlb_lock);
+       }
+       while (count < persistent_huge_pages(h)) {
+               if (!adjust_pool_surplus(h, nodes_allowed, 1))
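
A quick back-of-the-envelope check of the numbers quoted above, assuming 2 MB hugepages and the ~135 pages/msec freeing rate measured on the smaller machine: freeing 6 TB worth of pages busy-loops well past the default 20-second softlockup threshold unless the loop reschedules.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long bytes = 6ULL << 40;			/* 6 TB */
		unsigned long long pages = bytes / (2ULL << 20);	/* 2 MB hugepages */
		unsigned long long msecs = pages / 135;			/* ~135 pages/msec */

		/* prints roughly 3145728 pages, ~23.3 s, i.e. above the 20 s threshold */
		printf("%llu pages, ~%llu.%llu s to free\n",
		       pages, msecs / 1000, (msecs % 1000) / 100);
		return 0;
	}
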
diff --git a/queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch b/queue-3.14/mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
new file mode 100644 (file)
index 0000000..4b724f5
--- /dev/null
@@ -0,0 +1,227 @@
+From 3a025760fc158b3726eac89ee95d7f29599e9dfa Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Mon, 7 Apr 2014 15:37:48 -0700
+Subject: mm: page_alloc: spill to remote nodes before waking kswapd
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 3a025760fc158b3726eac89ee95d7f29599e9dfa upstream.
+
+On NUMA systems, a node may start thrashing cache or even swap anonymous
+pages while there are still free pages on remote nodes.
+
+This is a result of commits 81c0a2bb515f ("mm: page_alloc: fair zone
+allocator policy") and fff4068cba48 ("mm: page_alloc: revert NUMA aspect
+of fair allocation policy").
+
+Before those changes, the allocator would first try all allowed zones,
+including those on remote nodes, before waking any kswapds.  But now,
+the allocator fastpath doubles as the fairness pass, which in turn can
+only consider the local node to prevent remote spilling based on
+exhausted fairness batches alone.  Remote nodes are only considered in
+the slowpath, after the kswapds are woken up.  But if remote nodes still
+have free memory, kswapd should not be woken to rebalance the local node
+or it may thrash cache or swap prematurely.
+
+Fix this by adding one more unfair pass over the zonelist that is
+allowed to spill to remote nodes after the local fairness pass fails but
+before entering the slowpath and waking the kswapds.
+
+This also gets rid of the GFP_THISNODE exemption from the fairness
+protocol because the unfair pass is no longer tied to kswapd, which
+GFP_THISNODE is not allowed to wake up.
+
+However, because remote spills can be more frequent now - we prefer them
+over local kswapd reclaim - the allocation batches on remote nodes could
+underflow more heavily.  When resetting the batches, use
+atomic_long_read() directly instead of zone_page_state() to calculate the
+delta as the latter filters negative counter values.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Rik van Riel <riel@redhat.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/internal.h   |    1 
+ mm/page_alloc.c |   89 ++++++++++++++++++++++++++++----------------------------
+ 2 files changed, 46 insertions(+), 44 deletions(-)
+
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_l
+ #define ALLOC_HIGH            0x20 /* __GFP_HIGH set */
+ #define ALLOC_CPUSET          0x40 /* check for correct cpuset */
+ #define ALLOC_CMA             0x80 /* allow allocations from CMA areas */
++#define ALLOC_FAIR            0x100 /* fair zone allocation */
+ #endif        /* __MM_INTERNAL_H */
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -1238,15 +1238,6 @@ void drain_zone_pages(struct zone *zone,
+       }
+       local_irq_restore(flags);
+ }
+-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+-{
+-      return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+-}
+-#else
+-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+-{
+-      return false;
+-}
+ #endif
+ /*
+@@ -1583,12 +1574,7 @@ again:
+                                         get_pageblock_migratetype(page));
+       }
+-      /*
+-       * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+-       * aging protocol, so they can't be fair.
+-       */
+-      if (!gfp_thisnode_allocation(gfp_flags))
+-              __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
++      __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       __count_zone_vm_events(PGALLOC, zone, 1 << order);
+       zone_statistics(preferred_zone, zone, gfp_flags);
+@@ -1954,23 +1940,12 @@ zonelist_scan:
+                * zone size to ensure fair page aging.  The zone a
+                * page was allocated in should have no effect on the
+                * time the page has in memory before being reclaimed.
+-               *
+-               * Try to stay in local zones in the fastpath.  If
+-               * that fails, the slowpath is entered, which will do
+-               * another pass starting with the local zones, but
+-               * ultimately fall back to remote zones that do not
+-               * partake in the fairness round-robin cycle of this
+-               * zonelist.
+-               *
+-               * NOTE: GFP_THISNODE allocations do not partake in
+-               * the kswapd aging protocol, so they can't be fair.
+                */
+-              if ((alloc_flags & ALLOC_WMARK_LOW) &&
+-                  !gfp_thisnode_allocation(gfp_mask)) {
+-                      if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+-                              continue;
++              if (alloc_flags & ALLOC_FAIR) {
+                       if (!zone_local(preferred_zone, zone))
+                               continue;
++                      if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
++                              continue;
+               }
+               /*
+                * When allocating a page cache page for writing, we
+@@ -2408,32 +2383,40 @@ __alloc_pages_high_priority(gfp_t gfp_ma
+       return page;
+ }
+-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+-                           struct zonelist *zonelist,
+-                           enum zone_type high_zoneidx,
+-                           struct zone *preferred_zone)
++static void reset_alloc_batches(struct zonelist *zonelist,
++                              enum zone_type high_zoneidx,
++                              struct zone *preferred_zone)
+ {
+       struct zoneref *z;
+       struct zone *zone;
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+-              if (!(gfp_mask & __GFP_NO_KSWAPD))
+-                      wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+               /*
+                * Only reset the batches of zones that were actually
+-               * considered in the fast path, we don't want to
+-               * thrash fairness information for zones that are not
++               * considered in the fairness pass, we don't want to
++               * trash fairness information for zones that are not
+                * actually part of this zonelist's round-robin cycle.
+                */
+               if (!zone_local(preferred_zone, zone))
+                       continue;
+               mod_zone_page_state(zone, NR_ALLOC_BATCH,
+-                                  high_wmark_pages(zone) -
+-                                  low_wmark_pages(zone) -
+-                                  zone_page_state(zone, NR_ALLOC_BATCH));
++                      high_wmark_pages(zone) - low_wmark_pages(zone) -
++                      atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+       }
+ }
++static void wake_all_kswapds(unsigned int order,
++                           struct zonelist *zonelist,
++                           enum zone_type high_zoneidx,
++                           struct zone *preferred_zone)
++{
++      struct zoneref *z;
++      struct zone *zone;
++
++      for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
++              wakeup_kswapd(zone, order, zone_idx(preferred_zone));
++}
++
+ static inline int
+ gfp_to_alloc_flags(gfp_t gfp_mask)
+ {
+@@ -2522,12 +2505,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
+        * allowed per node queues are empty and that nodes are
+        * over allocated.
+        */
+-      if (gfp_thisnode_allocation(gfp_mask))
++      if (IS_ENABLED(CONFIG_NUMA) &&
++          (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+               goto nopage;
+ restart:
+-      prepare_slowpath(gfp_mask, order, zonelist,
+-                       high_zoneidx, preferred_zone);
++      if (!(gfp_mask & __GFP_NO_KSWAPD))
++              wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+       /*
+        * OK, we're below the kswapd watermark and have kicked background
+@@ -2711,7 +2695,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
+       struct page *page = NULL;
+       int migratetype = allocflags_to_migratetype(gfp_mask);
+       unsigned int cpuset_mems_cookie;
+-      int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
++      int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+       struct mem_cgroup *memcg = NULL;
+       gfp_mask &= gfp_allowed_mask;
+@@ -2752,12 +2736,29 @@ retry_cpuset:
+       if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+               alloc_flags |= ALLOC_CMA;
+ #endif
++retry:
+       /* First allocation attempt */
+       page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+                       zonelist, high_zoneidx, alloc_flags,
+                       preferred_zone, migratetype);
+       if (unlikely(!page)) {
+               /*
++               * The first pass makes sure allocations are spread
++               * fairly within the local node.  However, the local
++               * node might have free pages left after the fairness
++               * batches are exhausted, and remote zones haven't
++               * even been considered yet.  Try once more without
++               * fairness, and include remote zones now, before
++               * entering the slowpath and waking kswapd: prefer
++               * spilling to a remote zone over swapping locally.
++               */
++              if (alloc_flags & ALLOC_FAIR) {
++                      reset_alloc_batches(zonelist, high_zoneidx,
++                                          preferred_zone);
++                      alloc_flags &= ~ALLOC_FAIR;
++                      goto retry;
++              }
++              /*
+                * Runtime PM, block IO and its error handling path
+                * can deadlock because I/O on the device might not
+                * complete.
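
A heavily simplified sketch of the retry logic this patch introduces, using stub types rather than the real allocator: one fair pass restricted to local zones with remaining batch, then a batch reset and a second unfair pass over all zones before anything would fall through to the slowpath.

	#include <stddef.h>
	#include <stdio.h>

	struct zone_stub {
		int local;		/* zone belongs to the preferred node */
		long alloc_batch;	/* remaining fairness budget */
		long free_pages;
	};

	static struct zone_stub *scan_zonelist(struct zone_stub *zones, int n, int fair)
	{
		for (int i = 0; i < n; i++) {
			if (fair && (!zones[i].local || zones[i].alloc_batch <= 0))
				continue;	/* fair pass: only local zones with budget */
			if (zones[i].free_pages > 0)
				return &zones[i];
		}
		return NULL;
	}

	static struct zone_stub *alloc_page_stub(struct zone_stub *zones, int n)
	{
		struct zone_stub *z = scan_zonelist(zones, n, 1);	/* ALLOC_FAIR pass */

		if (!z) {
			for (int i = 0; i < n; i++)	/* reset_alloc_batches() analogue */
				if (zones[i].local)
					zones[i].alloc_batch = 1;
			z = scan_zonelist(zones, n, 0);	/* unfair pass, remote zones included */
		}
		return z;	/* NULL here corresponds to entering the slowpath */
	}

	int main(void)
	{
		struct zone_stub zones[] = {
			{ 1, 0, 100 },	/* local zone: free pages, exhausted batch */
			{ 0, 0, 100 },	/* remote zone with free pages */
		};

		/* the unfair retry succeeds even though the fair pass found nothing */
		printf("%s\n", alloc_page_stub(zones, 2) ? "allocated" : "slowpath");
		return 0;
	}
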
diff --git a/queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch b/queue-3.14/mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
new file mode 100644 (file)
index 0000000..3ae5ab3
--- /dev/null
@@ -0,0 +1,90 @@
+From 57e68e9cd65b4b8eb4045a1e0d0746458502554c Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 7 Apr 2014 15:37:50 -0700
+Subject: mm: try_to_unmap_cluster() should lock_page() before mlocking
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 57e68e9cd65b4b8eb4045a1e0d0746458502554c upstream.
+
+A BUG_ON(!PageLocked) was triggered in mlock_vma_page() by Sasha Levin
+fuzzing with trinity.  The call site try_to_unmap_cluster() does not lock
+the pages other than its check_page parameter (which is already locked).
+
+The BUG_ON in mlock_vma_page() is not documented and its purpose is
+somewhat unclear, but apparently it serializes against page migration,
+which could otherwise fail to transfer the PG_mlocked flag.  This would
+not be fatal, as the page would be eventually encountered again, but
+NR_MLOCK accounting would become distorted nevertheless.  This patch adds
+a comment to the BUG_ON in mlock_vma_page() and munlock_vma_page() to that
+effect.
+
+The call site try_to_unmap_cluster() is fixed so that for page !=
+check_page, trylock_page() is attempted (to avoid possible deadlocks as we
+already have check_page locked) and mlock_vma_page() is performed only
+upon success.  If the page lock cannot be obtained, the page is left
+without PG_mlocked, which is again not a problem in the whole unevictable
+memory design.
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Bob Liu <bob.liu@oracle.com>
+Reported-by: Sasha Levin <sasha.levin@oracle.com>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Michel Lespinasse <walken@google.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mlock.c |    2 ++
+ mm/rmap.c  |   14 ++++++++++++--
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
+  */
+ void mlock_vma_page(struct page *page)
+ {
++      /* Serialize with page migration */
+       BUG_ON(!PageLocked(page));
+       if (!TestSetPageMlocked(page)) {
+@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct pag
+       unsigned int nr_pages;
+       struct zone *zone = page_zone(page);
++      /* For try_to_munlock() and to serialize with page migration */
+       BUG_ON(!PageLocked(page));
+       /*
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1322,9 +1322,19 @@ static int try_to_unmap_cluster(unsigned
+               BUG_ON(!page || PageAnon(page));
+               if (locked_vma) {
+-                      mlock_vma_page(page);   /* no-op if already mlocked */
+-                      if (page == check_page)
++                      if (page == check_page) {
++                              /* we know we have check_page locked */
++                              mlock_vma_page(page);
+                               ret = SWAP_MLOCK;
++                      } else if (trylock_page(page)) {
++                              /*
++                               * If we can lock the page, perform mlock.
++                               * Otherwise leave the page alone, it will be
++                               * eventually encountered again later.
++                               */
++                              mlock_vma_page(page);
++                              unlock_page(page);
++                      }
+                       continue;       /* don't unmap */
+               }
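
For illustration, a minimal pthreads analogue of the locking pattern adopted above (generic code, not mm internals): the page whose lock the caller already holds is handled directly, while every other page is only touched if its lock can be taken without blocking.

	#include <pthread.h>
	#include <stdio.h>

	static void mlock_page_stub(void)
	{
		printf("page mlocked\n");
	}

	/*
	 * Mirrors the shape of the fix: mlock unconditionally only for the page
	 * whose lock the caller already holds; for any other page, try the lock
	 * and simply skip the mlock when it cannot be taken without blocking.
	 */
	static void maybe_mlock(pthread_mutex_t *page_lock, int is_check_page)
	{
		if (is_check_page) {
			mlock_page_stub();		/* lock already held by caller */
		} else if (pthread_mutex_trylock(page_lock) == 0) {
			mlock_page_stub();		/* got the lock, safe to mlock */
			pthread_mutex_unlock(page_lock);
		}
		/* on trylock failure: leave the page alone, it is revisited later */
	}

	int main(void)
	{
		pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

		maybe_mlock(&lock, 1);	/* the check_page case */
		maybe_mlock(&lock, 0);	/* another page in the cluster */
		return 0;
	}
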
diff --git a/queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch b/queue-3.14/mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
new file mode 100644 (file)
index 0000000..24c63b3
--- /dev/null
@@ -0,0 +1,73 @@
+From 0bf1457f0cfca7bc026a82323ad34bcf58ad035d Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Tue, 8 Apr 2014 16:04:10 -0700
+Subject: mm: vmscan: do not swap anon pages just because free+file is low
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 0bf1457f0cfca7bc026a82323ad34bcf58ad035d upstream.
+
+Page reclaim force-scans / swaps anonymous pages when file cache drops
+below the high watermark of a zone in order to prevent what little cache
+remains from thrashing.
+
+However, on bigger machines the high watermark value can be quite large
+and when the workload is dominated by a static anonymous/shmem set, the
+file set might just be a small window of used-once cache.  In such
+situations, the VM starts swapping heavily when instead it should be
+recycling the no longer used cache.
+
+This is a longer-standing problem, but it's more likely to trigger after
+commit 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy")
+because file pages can no longer accumulate in a single zone and are
+dispersed into smaller fractions among the available zones.
+
+To resolve this, do not force scan anon when file pages are low but
+instead rely on the scan/rotation ratios to make the right prediction.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Rafael Aquini <aquini@redhat.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Suleiman Souhlal <suleiman@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |   16 +---------------
+ 1 file changed, 1 insertion(+), 15 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1848,7 +1848,7 @@ static void get_scan_count(struct lruvec
+       struct zone *zone = lruvec_zone(lruvec);
+       unsigned long anon_prio, file_prio;
+       enum scan_balance scan_balance;
+-      unsigned long anon, file, free;
++      unsigned long anon, file;
+       bool force_scan = false;
+       unsigned long ap, fp;
+       enum lru_list lru;
+@@ -1902,20 +1902,6 @@ static void get_scan_count(struct lruvec
+               get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       /*
+-       * If it's foreseeable that reclaiming the file cache won't be
+-       * enough to get the zone back into a desirable shape, we have
+-       * to swap.  Better start now and leave the - probably heavily
+-       * thrashing - remaining file pages alone.
+-       */
+-      if (global_reclaim(sc)) {
+-              free = zone_page_state(zone, NR_FREE_PAGES);
+-              if (unlikely(file + free <= high_wmark_pages(zone))) {
+-                      scan_balance = SCAN_ANON;
+-                      goto out;
+-              }
+-      }
+-
+-      /*
+        * There is enough inactive page cache, do not reclaim
+        * anything from the anonymous working set right now.
+        */
diff --git a/queue-3.14/series b/queue-3.14/series
index 3342233b4ba1ec0a1945676bcdba8574ed62c54f..2a9ecc0c009673e95a366d08797d7e2eb48dec66 100644 (file)
@@ -130,3 +130,11 @@ usb-unbind-all-interfaces-before-rebinding-any.patch
 mtip32xx-set-queue-bounce-limit.patch
 mtip32xx-unmap-the-dma-segments-before-completing-the-io-request.patch
 mtip32xx-mtip_async_complete-bug-fixes.patch
+iser-target-match-frmr-descriptors-to-available-session-tags.patch
+iser-target-add-missing-se_cmd-put-for-write_pending-in-tx_comp_err.patch
+sh-fix-format-string-bug-in-stack-tracer.patch
+mm-page_alloc-spill-to-remote-nodes-before-waking-kswapd.patch
+mm-try_to_unmap_cluster-should-lock_page-before-mlocking.patch
+mm-hugetlb-fix-softlockup-when-a-large-number-of-hugepages-are-freed.patch
+mm-vmscan-do-not-swap-anon-pages-just-because-free-file-is-low.patch
+hung_task-check-the-value-of-sysctl_hung_task_timeout_sec.patch
diff --git a/queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch b/queue-3.14/sh-fix-format-string-bug-in-stack-tracer.patch
new file mode 100644 (file)
index 0000000..142ff0b
--- /dev/null
@@ -0,0 +1,40 @@
+From a0c32761e73c9999cbf592b702f284221fea8040 Mon Sep 17 00:00:00 2001
+From: Matt Fleming <matt.fleming@intel.com>
+Date: Thu, 3 Apr 2014 14:46:20 -0700
+Subject: sh: fix format string bug in stack tracer
+
+From: Matt Fleming <matt.fleming@intel.com>
+
+commit a0c32761e73c9999cbf592b702f284221fea8040 upstream.
+
+Kees reported the following error:
+
+   arch/sh/kernel/dumpstack.c: In function 'print_trace_address':
+   arch/sh/kernel/dumpstack.c:118:2: error: format not a string literal and no format arguments [-Werror=format-security]
+
+Use the "%s" format so that it's impossible to interpret 'data' as a
+format string.
+
+Signed-off-by: Matt Fleming <matt.fleming@intel.com>
+Reported-by: Kees Cook <keescook@chromium.org>
+Acked-by: Kees Cook <keescook@chromium.org>
+Cc: Paul Mundt <lethal@linux-sh.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/kernel/dumpstack.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/sh/kernel/dumpstack.c
++++ b/arch/sh/kernel/dumpstack.c
+@@ -115,7 +115,7 @@ static int print_trace_stack(void *data,
+  */
+ static void print_trace_address(void *data, unsigned long addr, int reliable)
+ {
+-      printk(data);
++      printk("%s", (char *)data);
+       printk_address(addr, reliable);
+ }
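
A stand-alone C illustration of this bug class, using ordinary printf rather than the kernel's printk: passing caller-controlled text as the format argument lets stray '%' directives be interpreted, whereas the "%s" form prints the text verbatim.

	#include <stdio.h>

	static void print_prefix(const char *data)
	{
		/* printf(data);     unsafe: any '%' directives in data would be interpreted */
		printf("%s", data);	/* safe: data is treated as plain text */
	}

	int main(void)
	{
		print_prefix("  [<dead%x>] ");	/* the stray %x is printed literally */
		printf("\n");
		return 0;
	}
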