From: Greg Kroah-Hartman Date: Sat, 13 Sep 2014 01:49:17 +0000 (-0700) Subject: 3.16-stable patches X-Git-Tag: v3.10.55~17 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8b77cbeab3fc641bfbc60da9b1200be4a35f28fb;p=thirdparty%2Fkernel%2Fstable-queue.git 3.16-stable patches added patches: __generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch bluetooth-fix-tracking-local-ssp-authentication-requirement.patch bluetooth-fix-using-uninitialized-variable-when-pairing.patch bluetooth-never-linger-on-process-exit.patch fix-copy_tree-regression.patch md-raid1-raid10-always-abort-recover-on-write-error.patch md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch rbd-rework-rbd_request_fn.patch rdma-iwcm-use-a-default-listen-backlog-if-needed.patch rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch xfs-don-t-dirty-buffers-beyond-eof.patch xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch xfs-don-t-zero-partial-page-cache-pages-during.patch xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch --- diff --git a/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch b/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch new file mode 100644 index 00000000000..741fd4cd333 --- /dev/null +++ b/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch @@ -0,0 +1,32 @@ +From 60bb45297f7551833346c5cebc6d483ea17ea5f2 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Fri, 8 Aug 2014 12:39:16 -0400 +Subject: 
__generic_file_write_iter(): fix handling of sync error after DIO + +From: Al Viro + +commit 60bb45297f7551833346c5cebc6d483ea17ea5f2 upstream. + +If DIO results in short write and sync write fails, we want to bugger off +whether the DIO part has written anything or not; the logics on the return +will take care of the right return value. + +Reported-by: Anton Altaparmakov +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2584,7 +2584,7 @@ ssize_t __generic_file_write_iter(struct + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ +- if (unlikely(status < 0) && !written) { ++ if (unlikely(status < 0)) { + err = status; + goto out; + } diff --git a/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch b/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch new file mode 100644 index 00000000000..325e002ede5 --- /dev/null +++ b/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch @@ -0,0 +1,51 @@ +From 32333edb82fb2009980eefc5518100068147ab82 Mon Sep 17 00:00:00 2001 +From: Vignesh Raman +Date: Tue, 22 Jul 2014 19:24:25 +0530 +Subject: Bluetooth: Avoid use of session socket after the session gets freed + +From: Vignesh Raman + +commit 32333edb82fb2009980eefc5518100068147ab82 upstream. + +The commits 08c30aca9e698faddebd34f81e1196295f9dc063 "Bluetooth: Remove +RFCOMM session refcnt" and 8ff52f7d04d9cc31f1e81dcf9a2ba6335ed34905 +"Bluetooth: Return RFCOMM session ptrs to avoid freed session" +allow rfcomm_recv_ua and rfcomm_session_close to delete the session +(and free the corresponding socket) and propagate NULL session pointer +to the upper callers. + +Additional fix is required to terminate the loop in rfcomm_process_rx +function to avoid use of freed 'sk' memory. 
+ +The issue is only reproducible with kernel option CONFIG_PAGE_POISONING +enabled making freed memory being changed and filled up with fixed char +value used to unmask use-after-free issues. + +Signed-off-by: Vignesh Raman +Signed-off-by: Vitaly Kuzmichev +Acked-by: Dean Jenkins +Signed-off-by: Marcel Holtmann +Signed-off-by: Greg Kroah-Hartman + +--- + net/bluetooth/rfcomm/core.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/net/bluetooth/rfcomm/core.c ++++ b/net/bluetooth/rfcomm/core.c +@@ -1909,10 +1909,13 @@ static struct rfcomm_session *rfcomm_pro + /* Get data directly from socket receive queue without copying it. */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); +- if (!skb_linearize(skb)) ++ if (!skb_linearize(skb)) { + s = rfcomm_recv_frame(s, skb); +- else ++ if (!s) ++ break; ++ } else { + kfree_skb(skb); ++ } + } + + if (s && (sk->sk_state == BT_CLOSED)) diff --git a/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch b/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch new file mode 100644 index 00000000000..b05fbb5619a --- /dev/null +++ b/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch @@ -0,0 +1,98 @@ +From 396e04f4bb9afefb0744715dc76d9abe18ee5fb0 Mon Sep 17 00:00:00 2001 +From: Chin-Ran Lo +Date: Tue, 1 Jul 2014 14:00:14 -0700 +Subject: Bluetooth: btmrvl: wait for HOST_SLEEP_ENABLE event in suspend + +From: Chin-Ran Lo + +commit 396e04f4bb9afefb0744715dc76d9abe18ee5fb0 upstream. + +After BT_CMD_HOST_SLEEP_ENABLE command finishes, driver should +wait until getting BT_EVENT_HOST_SLEEP_ENABLE event to complete +suspend procedure. +Without this patch the suspend handler would return success +earlier. By the time when the BT_EVENT_HOST_SLEEP_ENABLE event +comes in the controller driver could have already turned off the +bus clock. This causes kernel crash or system reboot eventually. 
+ +Signed-off-by: Chin-Ran Lo +Signed-off-by: Jeff CF Chen +Signed-off-by: Amitkumar Karwar +Signed-off-by: Bing Zhao +Signed-off-by: Marcel Holtmann +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/bluetooth/btmrvl_drv.h | 1 + + drivers/bluetooth/btmrvl_main.c | 25 ++++++++++++++++++++++++- + 2 files changed, 25 insertions(+), 1 deletion(-) + +--- a/drivers/bluetooth/btmrvl_drv.h ++++ b/drivers/bluetooth/btmrvl_drv.h +@@ -68,6 +68,7 @@ struct btmrvl_adapter { + u8 hs_state; + u8 wakeup_tries; + wait_queue_head_t cmd_wait_q; ++ wait_queue_head_t event_hs_wait_q; + u8 cmd_complete; + bool is_suspended; + }; +--- a/drivers/bluetooth/btmrvl_main.c ++++ b/drivers/bluetooth/btmrvl_main.c +@@ -114,6 +114,7 @@ int btmrvl_process_event(struct btmrvl_p + adapter->hs_state = HS_ACTIVATED; + if (adapter->psmode) + adapter->ps_state = PS_SLEEP; ++ wake_up_interruptible(&adapter->event_hs_wait_q); + BT_DBG("HS ACTIVATED!"); + } else { + BT_DBG("HS Enable failed"); +@@ -253,11 +254,31 @@ EXPORT_SYMBOL_GPL(btmrvl_enable_ps); + + int btmrvl_enable_hs(struct btmrvl_private *priv) + { ++ struct btmrvl_adapter *adapter = priv->adapter; + int ret; + + ret = btmrvl_send_sync_cmd(priv, BT_CMD_HOST_SLEEP_ENABLE, NULL, 0); +- if (ret) ++ if (ret) { + BT_ERR("Host sleep enable command failed\n"); ++ return ret; ++ } ++ ++ ret = wait_event_interruptible_timeout(adapter->event_hs_wait_q, ++ adapter->hs_state, ++ msecs_to_jiffies(WAIT_UNTIL_HS_STATE_CHANGED)); ++ if (ret < 0) { ++ BT_ERR("event_hs_wait_q terminated (%d): %d,%d,%d", ++ ret, adapter->hs_state, adapter->ps_state, ++ adapter->wakeup_tries); ++ } else if (!ret) { ++ BT_ERR("hs_enable timeout: %d,%d,%d", adapter->hs_state, ++ adapter->ps_state, adapter->wakeup_tries); ++ ret = -ETIMEDOUT; ++ } else { ++ BT_DBG("host sleep enabled: %d,%d,%d", adapter->hs_state, ++ adapter->ps_state, adapter->wakeup_tries); ++ ret = 0; ++ } + + return ret; + } +@@ -358,6 +379,7 @@ static void btmrvl_init_adapter(struct b + } + + 
init_waitqueue_head(&priv->adapter->cmd_wait_q); ++ init_waitqueue_head(&priv->adapter->event_hs_wait_q); + } + + static void btmrvl_free_adapter(struct btmrvl_private *priv) +@@ -666,6 +688,7 @@ int btmrvl_remove_card(struct btmrvl_pri + hdev = priv->btmrvl_dev.hcidev; + + wake_up_interruptible(&priv->adapter->cmd_wait_q); ++ wake_up_interruptible(&priv->adapter->event_hs_wait_q); + + kthread_stop(priv->main_thread.task); + diff --git a/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch b/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch new file mode 100644 index 00000000000..67ce1990163 --- /dev/null +++ b/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch @@ -0,0 +1,34 @@ +From 42bd6a56ed1ab4b2cb50f4d4e674874da9b47f46 Mon Sep 17 00:00:00 2001 +From: Marcel Holtmann +Date: Tue, 1 Jul 2014 14:11:19 +0200 +Subject: Bluetooth: Fix merge of advertising data and scan response data + +From: Marcel Holtmann + +commit 42bd6a56ed1ab4b2cb50f4d4e674874da9b47f46 upstream. + +The advertising data and scan response data are merged in the wrong +order. It should be advertising data first and then scan response data +and not the other way around. + +Signed-off-by: Marcel Holtmann +Signed-off-by: Johan Hedberg +Signed-off-by: Greg Kroah-Hartman + +--- + net/bluetooth/hci_event.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/bluetooth/hci_event.c ++++ b/net/bluetooth/hci_event.c +@@ -4177,8 +4177,8 @@ static void process_adv_report(struct hc + * sending a merged device found event. 
+ */ + mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, +- d->last_adv_addr_type, NULL, rssi, 0, 1, data, len, +- d->last_adv_data, d->last_adv_data_len); ++ d->last_adv_addr_type, NULL, rssi, 0, 1, ++ d->last_adv_data, d->last_adv_data_len, data, len); + clear_pending_adv_report(hdev); + } + diff --git a/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch b/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch new file mode 100644 index 00000000000..cf37079b5e8 --- /dev/null +++ b/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch @@ -0,0 +1,77 @@ +From 6c53823ae0e10e723131055e1e65dd6a328a228e Mon Sep 17 00:00:00 2001 +From: Johan Hedberg +Date: Fri, 11 Jul 2014 15:32:23 +0300 +Subject: Bluetooth: Fix tracking local SSP authentication requirement + +From: Johan Hedberg + +commit 6c53823ae0e10e723131055e1e65dd6a328a228e upstream. + +When we need to make the decision whether to perform just-works or real +user confirmation we need to know the exact local authentication +requirement that was passed to the controller. So far conn->auth_type +(the local requirement) wasn't in one case updated appropriately in fear +of the user confirmation being rejected later. + +The real problem however was not really that conn->auth_type couldn't +represent the true value but that we were checking the local MITM +requirement in an incorrect way. It's perfectly fine to let auth_type +follow what we tell the controller since we're still tracking the target +security level with conn->pending_sec_level. + +This patch updates the check for local MITM requirement in the +hci_user_confirm_request_evt function to use the locally requested +security level and ensures that auth_type always represents what we tell +the controller. 
All other code in hci_user_confirm_request_evt still +uses the auth_type instead of pending_sec_level for determining whether +to do just-works or not, since that's the only value that's in sync with +what the remote device knows. + +Signed-off-by: Johan Hedberg +Tested-by: Szymon Janc +Signed-off-by: Marcel Holtmann +Signed-off-by: Greg Kroah-Hartman + +--- + net/bluetooth/hci_event.c | 17 ++++++++--------- + 1 file changed, 8 insertions(+), 9 deletions(-) + +--- a/net/bluetooth/hci_event.c ++++ b/net/bluetooth/hci_event.c +@@ -3538,18 +3538,14 @@ static void hci_io_capa_request_evt(stru + + /* If we are initiators, there is no remote information yet */ + if (conn->remote_auth == 0xff) { +- cp.authentication = conn->auth_type; +- + /* Request MITM protection if our IO caps allow it + * except for the no-bonding case. +- * conn->auth_type is not updated here since +- * that might cause the user confirmation to be +- * rejected in case the remote doesn't have the +- * IO capabilities for MITM. + */ + if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && + cp.authentication != HCI_AT_NO_BONDING) +- cp.authentication |= 0x01; ++ conn->auth_type |= 0x01; ++ ++ cp.authentication = conn->auth_type; + } else { + conn->auth_type = hci_get_auth_req(conn); + cp.authentication = conn->auth_type; +@@ -3621,9 +3617,12 @@ static void hci_user_confirm_request_evt + rem_mitm = (conn->remote_auth & 0x01); + + /* If we require MITM but the remote device can't provide that +- * (it has NoInputNoOutput) then reject the confirmation request ++ * (it has NoInputNoOutput) then reject the confirmation ++ * request. We check the security level here since it doesn't ++ * necessarily match conn->auth_type. 
+ */ +- if (loc_mitm && conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { ++ if (conn->pending_sec_level > BT_SECURITY_MEDIUM && ++ conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) { + BT_DBG("Rejecting request: remote device can't provide MITM"); + hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY, + sizeof(ev->bdaddr), &ev->bdaddr); diff --git a/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch b/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch new file mode 100644 index 00000000000..10295fefc42 --- /dev/null +++ b/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch @@ -0,0 +1,35 @@ +From 9f743d7499bc2c4dc8c35af33bdb2a29bea663b9 Mon Sep 17 00:00:00 2001 +From: Johan Hedberg +Date: Thu, 17 Jul 2014 11:56:33 +0300 +Subject: Bluetooth: Fix using uninitialized variable when pairing + +From: Johan Hedberg + +commit 9f743d7499bc2c4dc8c35af33bdb2a29bea663b9 upstream. + +Commit 6c53823ae0e10e723131055e1e65dd6a328a228e reshuffled the way the +authentication requirement gets set in the hci_io_capa_request_evt() +function, but at the same time it failed to update an if-statement where +cp.authentication is used before it has been initialized. The correct +value the code should be looking for in this if-statement is +conn->auth_type. + +Signed-off-by: Johan Hedberg +Signed-off-by: Marcel Holtmann +Signed-off-by: Greg Kroah-Hartman + +--- + net/bluetooth/hci_event.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/bluetooth/hci_event.c ++++ b/net/bluetooth/hci_event.c +@@ -3542,7 +3542,7 @@ static void hci_io_capa_request_evt(stru + * except for the no-bonding case. 
+ */ + if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT && +- cp.authentication != HCI_AT_NO_BONDING) ++ conn->auth_type != HCI_AT_NO_BONDING) + conn->auth_type |= 0x01; + + cp.authentication = conn->auth_type; diff --git a/queue-3.16/bluetooth-never-linger-on-process-exit.patch b/queue-3.16/bluetooth-never-linger-on-process-exit.patch new file mode 100644 index 00000000000..ed1bc382f62 --- /dev/null +++ b/queue-3.16/bluetooth-never-linger-on-process-exit.patch @@ -0,0 +1,93 @@ +From 093facf3634da1b0c2cc7ed106f1983da901bbab Mon Sep 17 00:00:00 2001 +From: Vladimir Davydov +Date: Tue, 15 Jul 2014 12:25:28 +0400 +Subject: Bluetooth: never linger on process exit + +From: Vladimir Davydov + +commit 093facf3634da1b0c2cc7ed106f1983da901bbab upstream. + +If the current process is exiting, lingering on socket close will make +it unkillable, so we should avoid it. + +Reproducer: + + #include <sys/types.h> + #include <sys/socket.h> + + #define BTPROTO_L2CAP 0 + #define BTPROTO_SCO 2 + #define BTPROTO_RFCOMM 3 + + int main() + { + int fd; + struct linger ling; + + fd = socket(PF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM); + //or: fd = socket(PF_BLUETOOTH, SOCK_DGRAM, BTPROTO_L2CAP); + //or: fd = socket(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO); + + ling.l_onoff = 1; + ling.l_linger = 1000000000; + setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling)); + + return 0; + } + +Signed-off-by: Vladimir Davydov +Signed-off-by: Marcel Holtmann +Signed-off-by: Greg Kroah-Hartman + +--- + net/bluetooth/l2cap_sock.c | 3 ++- + net/bluetooth/rfcomm/sock.c | 3 ++- + net/bluetooth/sco.c | 6 ++++-- + 3 files changed, 8 insertions(+), 4 deletions(-) + +--- a/net/bluetooth/l2cap_sock.c ++++ b/net/bluetooth/l2cap_sock.c +@@ -1111,7 +1111,8 @@ static int l2cap_sock_shutdown(struct so + l2cap_chan_close(chan, 0); + lock_sock(sk); + +- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) ++ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && ++ !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, + 
sk->sk_lingertime); + } +--- a/net/bluetooth/rfcomm/sock.c ++++ b/net/bluetooth/rfcomm/sock.c +@@ -918,7 +918,8 @@ static int rfcomm_sock_shutdown(struct s + sk->sk_shutdown = SHUTDOWN_MASK; + __rfcomm_sock_close(sk); + +- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) ++ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && ++ !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + } + release_sock(sk); +--- a/net/bluetooth/sco.c ++++ b/net/bluetooth/sco.c +@@ -909,7 +909,8 @@ static int sco_sock_shutdown(struct sock + sco_sock_clear_timer(sk); + __sco_sock_close(sk); + +- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) ++ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && ++ !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, + sk->sk_lingertime); + } +@@ -929,7 +930,8 @@ static int sco_sock_release(struct socke + + sco_sock_close(sk); + +- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) { ++ if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && ++ !(current->flags & PF_EXITING)) { + lock_sock(sk); + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + release_sock(sk); diff --git a/queue-3.16/fix-copy_tree-regression.patch b/queue-3.16/fix-copy_tree-regression.patch new file mode 100644 index 00000000000..a00634b140c --- /dev/null +++ b/queue-3.16/fix-copy_tree-regression.patch @@ -0,0 +1,91 @@ +From 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Sun, 10 Aug 2014 03:44:55 -0400 +Subject: fix copy_tree() regression + +From: Al Viro + +commit 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 upstream. + +Since 3.14 we had copy_tree() get the shadowing wrong - if we had one +vfsmount shadowing another (i.e. if A is a slave of B, C is mounted +on A/foo, then D got mounted on B/foo creating D' on A/foo shadowed +by C), copy_tree() of A would make a copy of D' shadow the copy of +C, not the other way around. 
+ +It's easy to fix, fortunately - just make sure that mount follows +the one that shadows it in mnt_child as well as in mnt_hash, and when +copy_tree() decides to attach a new mount, check if the last child +it has added to the same parent should be shadowing the new one. +And if it should, just use the same logics commit_tree() has - put the +new mount into the hash and children lists right after the one that +should shadow it. + +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/namespace.c | 31 ++++++++++++++++++++++++------- + 1 file changed, 24 insertions(+), 7 deletions(-) + +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -779,6 +779,20 @@ static void attach_mnt(struct mount *mnt + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + } + ++static void attach_shadowed(struct mount *mnt, ++ struct mount *parent, ++ struct mount *shadows) ++{ ++ if (shadows) { ++ hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); ++ list_add(&mnt->mnt_child, &shadows->mnt_child); ++ } else { ++ hlist_add_head_rcu(&mnt->mnt_hash, ++ m_hash(&parent->mnt, mnt->mnt_mountpoint)); ++ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); ++ } ++} ++ + /* + * vfsmount lock must be held for write + */ +@@ -797,12 +811,7 @@ static void commit_tree(struct mount *mn + + list_splice(&head, n->list.prev); + +- if (shadows) +- hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); +- else +- hlist_add_head_rcu(&mnt->mnt_hash, +- m_hash(&parent->mnt, mnt->mnt_mountpoint)); +- list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); ++ attach_shadowed(mnt, parent, shadows); + touch_mnt_namespace(n); + } + +@@ -1513,6 +1522,7 @@ struct mount *copy_tree(struct mount *mn + continue; + + for (s = r; s; s = next_mnt(s, r)) { ++ struct mount *t = NULL; + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); +@@ -1534,7 +1544,14 @@ struct mount *copy_tree(struct mount *mn + goto out; + lock_mount_hash(); + list_add_tail(&q->mnt_list, 
&res->mnt_list); +- attach_mnt(q, parent, p->mnt_mp); ++ mnt_set_mountpoint(parent, p->mnt_mp, q); ++ if (!list_empty(&parent->mnt_mounts)) { ++ t = list_last_entry(&parent->mnt_mounts, ++ struct mount, mnt_child); ++ if (t->mnt_mp != p->mnt_mp) ++ t = NULL; ++ } ++ attach_shadowed(q, parent, t); + unlock_mount_hash(); + } + } diff --git a/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch b/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch new file mode 100644 index 00000000000..7373ad067ec --- /dev/null +++ b/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch @@ -0,0 +1,80 @@ +From 2446dba03f9dabe0b477a126cbeb377854785b47 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Thu, 31 Jul 2014 10:16:29 +1000 +Subject: md/raid1,raid10: always abort recover on write error. + +From: NeilBrown + +commit 2446dba03f9dabe0b477a126cbeb377854785b47 upstream. + +Currently we don't abort recovery on a write error if the write error +to the recovering device was triggered by normal IO (as opposed to +recovery IO). + +This means that for one bitmap region, the recovery might write to the +recovering device for a few sectors, then not bother for subsequent +sectors (as it never writes to failed devices). In this case +the bitmap bit will be cleared, but it really shouldn't. + +The result is that if the recovering device fails and is then re-added +(after fixing whatever hardware problem triggered the failure), +the second recovery won't redo the region it was in the middle of, +so some of the device will not be recovered properly. + +If we abort the recovery, the region being processed will be cancelled +(bit not cleared) and the whole region will be retried. + +As the bug can result in data corruption the patch is suitable for +-stable. For kernels prior to 3.11 there is a conflict in raid10.c +which will require care. 
+ +Original-from: jiao hui +Reported-and-tested-by: jiao hui +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid1.c | 8 ++++---- + drivers/md/raid10.c | 11 +++++------ + 2 files changed, 9 insertions(+), 10 deletions(-) + +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, s + mddev->degraded++; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); +- /* +- * if recovery is running, make sure it aborts. +- */ +- set_bit(MD_RECOVERY_INTR, &mddev->recovery); + } else + set_bit(Faulty, &rdev->flags); ++ /* ++ * if recovery is running, make sure it aborts. ++ */ ++ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, s + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } +- if (test_and_clear_bit(In_sync, &rdev->flags)) { ++ if (test_and_clear_bit(In_sync, &rdev->flags)) + mddev->degraded++; +- /* +- * if recovery is running, make sure it aborts. +- */ +- set_bit(MD_RECOVERY_INTR, &mddev->recovery); +- } ++ /* ++ * If recovery is running, make sure it aborts. 
++ */ ++ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(Blocked, &rdev->flags); + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); diff --git a/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch b/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch new file mode 100644 index 00000000000..446832914d5 --- /dev/null +++ b/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch @@ -0,0 +1,39 @@ +From b39685526f46976bcd13aa08c82480092befa46c Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Mon, 18 Aug 2014 13:59:50 +1000 +Subject: md/raid10: Fix memory leak when raid10 reshape completes. + +From: NeilBrown + +commit b39685526f46976bcd13aa08c82480092befa46c upstream. + +When a raid10 commences a resync/recovery/reshape it allocates +some buffer space. +When a resync/recovery completes the buffer space is freed. But not +when the reshape completes. +This can result in a small memory leak. + +There is a subtle side-effect of this bug. When a RAID10 is reshaped +to a larger array (more devices), the reshape is immediately followed +by a "resync" of the new space. This "resync" will use the buffer +space which was allocated for "reshape". This can cause problems +including a "BUG" in the SCSI layer. So this is suitable for -stable. 
+ +Fixes: 3ea7daa5d7fde47cd41f4d56c2deb949114da9d6 +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid10.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -2953,6 +2953,7 @@ static sector_t sync_request(struct mdde + */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); ++ close_sync(conf); + return 0; + } + diff --git a/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch b/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch new file mode 100644 index 00000000000..f7d6218db55 --- /dev/null +++ b/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch @@ -0,0 +1,40 @@ +From ce0b0a46955d1bb389684a2605dbcaa990ba0154 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Mon, 18 Aug 2014 13:56:38 +1000 +Subject: md/raid10: fix memory leak when reshaping a RAID10. + +From: NeilBrown + +commit ce0b0a46955d1bb389684a2605dbcaa990ba0154 upstream. + +raid10 reshape clears unwanted bits from a bio->bi_flags using +a method which, while clumsy, worked until 3.10 when BIO_OWNS_VEC +was added. +Since then it clears that bit but shouldn't. This results in a +memory leak. + +So change to use the approved method of clearing unwanted bits. + +As this causes a memory leak which can consume all of memory +the fix is suitable for -stable. 
+ +Fixes: a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd +Reported-by: mdraid.pkoch@dfgh.net (Peter Koch) +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid10.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -4410,7 +4410,7 @@ read_more: + read_bio->bi_private = r10_bio; + read_bio->bi_end_io = end_sync_read; + read_bio->bi_rw = READ; +- read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); ++ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); + read_bio->bi_flags |= 1 << BIO_UPTODATE; + read_bio->bi_vcnt = 0; + read_bio->bi_iter.bi_size = 0; diff --git a/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch b/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch new file mode 100644 index 00000000000..c23340c81aa --- /dev/null +++ b/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch @@ -0,0 +1,42 @@ +From a40687ff73a5b14909d6aa522f7d778b158911c5 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Wed, 13 Aug 2014 09:48:45 +1000 +Subject: md/raid5: avoid livelock caused by non-aligned writes. + +From: NeilBrown + +commit a40687ff73a5b14909d6aa522f7d778b158911c5 upstream. + +If a stripe in a raid6 array received a write to each data block while +the array is degraded, and if any of these writes to a missing device +are not page-aligned, then a live-lock happens. + +In this case the P and Q blocks need to be read so that the part of +the missing block which is *not* being updated by the write can be +constructed. Due to a logic error, these blocks are not loaded, so +the update cannot proceed and the stripe is 'handled' repeatedly in an +infinite loop. + +This bug is unlikely as most writes are page aligned. However as it +can lead to a livelock it is suitable for -stable. It was introduced +in 3.16. 
+ +Fixed: 67f455486d2ea20b2d94d6adf5b9b783d079e321 +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid5.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -2922,7 +2922,7 @@ static int fetch_block(struct stripe_hea + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && + !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || + (sh->raid_conf->level == 6 && s->failed && s->to_write && +- s->to_write < sh->raid_conf->raid_disks - 2 && ++ s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + /* we would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync diff --git a/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch b/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch new file mode 100644 index 00000000000..df717e126dc --- /dev/null +++ b/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch @@ -0,0 +1,48 @@ +From 9c4bdf697c39805078392d5ddbbba5ae5680e0dd Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Wed, 13 Aug 2014 09:57:07 +1000 +Subject: md/raid6: avoid data corruption during recovery of double-degraded RAID6 + +From: NeilBrown + +commit 9c4bdf697c39805078392d5ddbbba5ae5680e0dd upstream. + +During recovery of a double-degraded RAID6 it is possible for +some blocks not to be recovered properly, leading to corruption. + +If a write happens to one block in a stripe that would be written to a +missing device, and at the same time that stripe is recovering data +to the other missing device, then that recovered data may not be written. + +This patch skips, in the double-degraded case, an optimisation that is +only safe for single-degraded arrays. 
+ +Bug was introduced in 2.6.32 and fix is suitable for any kernel since +then. In an older kernel with separate handle_stripe5() and +handle_stripe6() functions the patch must change handle_stripe6(). + +Fixes: 6c0069c0ae9659e3a91b68eaed06a5c6c37f45c8 +Cc: Yuri Tikhonov +Cc: Dan Williams +Reported-by: "Manibalan P" +Tested-by: "Manibalan P" +Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1090423 +Signed-off-by: NeilBrown +Acked-by: Dan Williams +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid5.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -3817,6 +3817,8 @@ static void handle_stripe(struct stripe_ + set_bit(R5_Wantwrite, &dev->flags); + if (prexor) + continue; ++ if (s.failed > 1) ++ continue; + if (!test_bit(R5_Insync, &dev->flags) || + ((i == sh->pd_idx || i == sh->qd_idx) && + s.failed == 0)) diff --git a/queue-3.16/rbd-rework-rbd_request_fn.patch b/queue-3.16/rbd-rework-rbd_request_fn.patch new file mode 100644 index 00000000000..c4bf6183940 --- /dev/null +++ b/queue-3.16/rbd-rework-rbd_request_fn.patch @@ -0,0 +1,303 @@ +From bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Mon, 4 Aug 2014 18:04:39 +0400 +Subject: rbd: rework rbd_request_fn() + +From: Ilya Dryomov + +commit bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 upstream. + +While it was never a good idea to sleep in request_fn(), commit +34c6bc2c919a ("locking/mutexes: Add extra reschedule point") made it +a *bad* idea. mutex_lock() since 3.15 may reschedule *before* putting +task on the mutex wait queue, which for tasks in !TASK_RUNNING state +means block forever. request_fn() may be called with !TASK_RUNNING on +the way to schedule() in io_schedule(). + +Offload request handling to a workqueue, one per rbd device, to avoid +calling blocking primitives from rbd_request_fn(). 
+ +Fixes: http://tracker.ceph.com/issues/8818 + +Signed-off-by: Ilya Dryomov +Tested-by: Eric Eastman +Tested-by: Greg Wilson +Reviewed-by: Alex Elder +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/rbd.c | 194 +++++++++++++++++++++++++++++++--------------------- + 1 file changed, 118 insertions(+), 76 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + #include "rbd_types.h" + +@@ -332,7 +333,10 @@ struct rbd_device { + + char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ + ++ struct list_head rq_queue; /* incoming rq queue */ + spinlock_t lock; /* queue, flags, open_count */ ++ struct workqueue_struct *rq_wq; ++ struct work_struct rq_work; + + struct rbd_image_header header; + unsigned long flags; /* possibly lock protected */ +@@ -3183,102 +3187,129 @@ out: + return ret; + } + +-static void rbd_request_fn(struct request_queue *q) +- __releases(q->queue_lock) __acquires(q->queue_lock) ++static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) + { +- struct rbd_device *rbd_dev = q->queuedata; +- struct request *rq; ++ struct rbd_img_request *img_request; ++ u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; ++ u64 length = blk_rq_bytes(rq); ++ bool wr = rq_data_dir(rq) == WRITE; + int result; + +- while ((rq = blk_fetch_request(q))) { +- bool write_request = rq_data_dir(rq) == WRITE; +- struct rbd_img_request *img_request; +- u64 offset; +- u64 length; ++ /* Ignore/skip any zero-length requests */ + +- /* Ignore any non-FS requests that filter through. 
*/ ++ if (!length) { ++ dout("%s: zero-length request\n", __func__); ++ result = 0; ++ goto err_rq; ++ } + +- if (rq->cmd_type != REQ_TYPE_FS) { +- dout("%s: non-fs request type %d\n", __func__, +- (int) rq->cmd_type); +- __blk_end_request_all(rq, 0); +- continue; ++ /* Disallow writes to a read-only device */ ++ ++ if (wr) { ++ if (rbd_dev->mapping.read_only) { ++ result = -EROFS; ++ goto err_rq; + } ++ rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); ++ } + +- /* Ignore/skip any zero-length requests */ ++ /* ++ * Quit early if the mapped snapshot no longer exists. It's ++ * still possible the snapshot will have disappeared by the ++ * time our request arrives at the osd, but there's no sense in ++ * sending it if we already know. ++ */ ++ if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { ++ dout("request for non-existent snapshot"); ++ rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); ++ result = -ENXIO; ++ goto err_rq; ++ } + +- offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; +- length = (u64) blk_rq_bytes(rq); ++ if (offset && length > U64_MAX - offset + 1) { ++ rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, ++ length); ++ result = -EINVAL; ++ goto err_rq; /* Shouldn't happen */ ++ } + +- if (!length) { +- dout("%s: zero-length request\n", __func__); +- __blk_end_request_all(rq, 0); +- continue; +- } ++ if (offset + length > rbd_dev->mapping.size) { ++ rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, ++ length, rbd_dev->mapping.size); ++ result = -EIO; ++ goto err_rq; ++ } + +- spin_unlock_irq(q->queue_lock); ++ img_request = rbd_img_request_create(rbd_dev, offset, length, wr); ++ if (!img_request) { ++ result = -ENOMEM; ++ goto err_rq; ++ } ++ img_request->rq = rq; + +- /* Disallow writes to a read-only device */ ++ result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); ++ if (result) ++ goto err_img_request; + +- if (write_request) { +- result = -EROFS; +- if (rbd_dev->mapping.read_only) +- goto end_request; +- 
rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); +- } ++ result = rbd_img_request_submit(img_request); ++ if (result) ++ goto err_img_request; + +- /* +- * Quit early if the mapped snapshot no longer +- * exists. It's still possible the snapshot will +- * have disappeared by the time our request arrives +- * at the osd, but there's no sense in sending it if +- * we already know. +- */ +- if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { +- dout("request for non-existent snapshot"); +- rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); +- result = -ENXIO; +- goto end_request; +- } ++ return; + +- result = -EINVAL; +- if (offset && length > U64_MAX - offset + 1) { +- rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", +- offset, length); +- goto end_request; /* Shouldn't happen */ +- } ++err_img_request: ++ rbd_img_request_put(img_request); ++err_rq: ++ if (result) ++ rbd_warn(rbd_dev, "%s %llx at %llx result %d", ++ wr ? "write" : "read", length, offset, result); ++ blk_end_request_all(rq, result); ++} + +- result = -EIO; +- if (offset + length > rbd_dev->mapping.size) { +- rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", +- offset, length, rbd_dev->mapping.size); +- goto end_request; +- } ++static void rbd_request_workfn(struct work_struct *work) ++{ ++ struct rbd_device *rbd_dev = ++ container_of(work, struct rbd_device, rq_work); ++ struct request *rq, *next; ++ LIST_HEAD(requests); + +- result = -ENOMEM; +- img_request = rbd_img_request_create(rbd_dev, offset, length, +- write_request); +- if (!img_request) +- goto end_request; ++ spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ ++ list_splice_init(&rbd_dev->rq_queue, &requests); ++ spin_unlock_irq(&rbd_dev->lock); + +- img_request->rq = rq; ++ list_for_each_entry_safe(rq, next, &requests, queuelist) { ++ list_del_init(&rq->queuelist); ++ rbd_handle_request(rbd_dev, rq); ++ } ++} + +- result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, +- rq->bio); +- if (!result) +- result = 
rbd_img_request_submit(img_request); +- if (result) +- rbd_img_request_put(img_request); +-end_request: +- spin_lock_irq(q->queue_lock); +- if (result < 0) { +- rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", +- write_request ? "write" : "read", +- length, offset, result); ++/* ++ * Called with q->queue_lock held and interrupts disabled, possibly on ++ * the way to schedule(). Do not sleep here! ++ */ ++static void rbd_request_fn(struct request_queue *q) ++{ ++ struct rbd_device *rbd_dev = q->queuedata; ++ struct request *rq; ++ int queued = 0; ++ ++ rbd_assert(rbd_dev); + +- __blk_end_request_all(rq, result); ++ while ((rq = blk_fetch_request(q))) { ++ /* Ignore any non-FS requests that filter through. */ ++ if (rq->cmd_type != REQ_TYPE_FS) { ++ dout("%s: non-fs request type %d\n", __func__, ++ (int) rq->cmd_type); ++ __blk_end_request_all(rq, 0); ++ continue; + } ++ ++ list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); ++ queued++; + } ++ ++ if (queued) ++ queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work); + } + + /* +@@ -3848,6 +3879,8 @@ static struct rbd_device *rbd_dev_create + return NULL; + + spin_lock_init(&rbd_dev->lock); ++ INIT_LIST_HEAD(&rbd_dev->rq_queue); ++ INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); + rbd_dev->flags = 0; + atomic_set(&rbd_dev->parent_ref, 0); + INIT_LIST_HEAD(&rbd_dev->node); +@@ -5066,12 +5099,17 @@ static int rbd_dev_device_setup(struct r + ret = rbd_dev_mapping_set(rbd_dev); + if (ret) + goto err_out_disk; ++ + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); + ++ rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0); ++ if (!rbd_dev->rq_wq) ++ goto err_out_mapping; ++ + ret = rbd_bus_add_dev(rbd_dev); + if (ret) +- goto err_out_mapping; ++ goto err_out_workqueue; + + /* Everything's ready. Announce the disk to the world. 
*/ + +@@ -5083,6 +5121,9 @@ static int rbd_dev_device_setup(struct r + + return ret; + ++err_out_workqueue: ++ destroy_workqueue(rbd_dev->rq_wq); ++ rbd_dev->rq_wq = NULL; + err_out_mapping: + rbd_dev_mapping_clear(rbd_dev); + err_out_disk: +@@ -5314,6 +5355,7 @@ static void rbd_dev_device_release(struc + { + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + ++ destroy_workqueue(rbd_dev->rq_wq); + rbd_free_disk(rbd_dev); + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + rbd_dev_mapping_clear(rbd_dev); diff --git a/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch b/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch new file mode 100644 index 00000000000..8c778423a5d --- /dev/null +++ b/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch @@ -0,0 +1,87 @@ +From 2f0304d21867476394cd51a54e97f7273d112261 Mon Sep 17 00:00:00 2001 +From: Steve Wise +Date: Fri, 25 Jul 2014 09:11:33 -0500 +Subject: RDMA/iwcm: Use a default listen backlog if needed + +From: Steve Wise + +commit 2f0304d21867476394cd51a54e97f7273d112261 upstream. + +If the user creates a listening cm_id with backlog of 0 the IWCM ends +up not allowing any connection requests at all. The correct behavior +is for the IWCM to pick a default value if the user backlog parameter +is zero. + +Lustre from version 1.8.8 onward uses a backlog of 0, which breaks +iwarp support without this fix. 
+ +Signed-off-by: Steve Wise +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/core/iwcm.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +--- a/drivers/infiniband/core/iwcm.c ++++ b/drivers/infiniband/core/iwcm.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -65,6 +66,20 @@ struct iwcm_work { + struct list_head free_list; + }; + ++static unsigned int default_backlog = 256; ++ ++static struct ctl_table_header *iwcm_ctl_table_hdr; ++static struct ctl_table iwcm_ctl_table[] = { ++ { ++ .procname = "default_backlog", ++ .data = &default_backlog, ++ .maxlen = sizeof(default_backlog), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { } ++}; ++ + /* + * The following services provide a mechanism for pre-allocating iwcm_work + * elements. The design pre-allocates them based on the cm_id type: +@@ -425,6 +440,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id, + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + ++ if (!backlog) ++ backlog = default_backlog; ++ + ret = alloc_work_entries(cm_id_priv, backlog); + if (ret) + return ret; +@@ -1030,11 +1048,20 @@ static int __init iw_cm_init(void) + if (!iwcm_wq) + return -ENOMEM; + ++ iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", ++ iwcm_ctl_table); ++ if (!iwcm_ctl_table_hdr) { ++ pr_err("iw_cm: couldn't register sysctl paths\n"); ++ destroy_workqueue(iwcm_wq); ++ return -ENOMEM; ++ } ++ + return 0; + } + + static void __exit iw_cm_cleanup(void) + { ++ unregister_net_sysctl_table(iwcm_ctl_table_hdr); + destroy_workqueue(iwcm_wq); + } + diff --git a/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch b/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch new file mode 100644 index 00000000000..7bf9351e661 --- /dev/null +++ b/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch @@ -0,0 +1,34 @@ +From db1044d458a287c18c4d413adc4ad12e92e253b5 Mon Sep 17 
00:00:00 2001 +From: Doug Ledford +Date: Tue, 12 Aug 2014 19:20:11 -0400 +Subject: RDMA/uapi: Include socket.h in rdma_user_cm.h + +From: Doug Ledford + +commit db1044d458a287c18c4d413adc4ad12e92e253b5 upstream. + +added struct sockaddr_storage to rdma_user_cm.h without also adding an +include for linux/socket.h to make sure it is defined. Systemtap +needs the header files to build standalone and cannot rely on other +files to pre-include other headers, so add linux/socket.h to the list +of includes in this file. + +Fixes: ee7aed4528f ("RDMA/ucma: Support querying for AF_IB addresses") +Signed-off-by: Doug Ledford +Signed-off-by: Roland Dreier +Signed-off-by: Greg Kroah-Hartman + +--- + include/uapi/rdma/rdma_user_cm.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/include/uapi/rdma/rdma_user_cm.h ++++ b/include/uapi/rdma/rdma_user_cm.h +@@ -34,6 +34,7 @@ + #define RDMA_USER_CM_H + + #include ++#include + #include + #include + #include diff --git a/queue-3.16/series b/queue-3.16/series index 6a9d97e263f..c7732b41283 100644 --- a/queue-3.16/series +++ b/queue-3.16/series @@ -113,3 +113,24 @@ mnt-change-the-default-remount-atime-from-relatime-to-the-existing-value.patch mnt-add-tests-for-unprivileged-remount-cases-that-have-found-to-be-faulty.patch get-rid-of-propagate_umount-mistakenly-treating-slaves-as-busy.patch fix-ebusy-on-umount-from-mnt_shrinkable.patch +bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch +bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch +bluetooth-fix-tracking-local-ssp-authentication-requirement.patch +bluetooth-never-linger-on-process-exit.patch +bluetooth-fix-using-uninitialized-variable-when-pairing.patch +bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch +__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch +rbd-rework-rbd_request_fn.patch +fix-copy_tree-regression.patch +md-raid1-raid10-always-abort-recover-on-write-error.patch 
+md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch +md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch +md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch +md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch +rdma-iwcm-use-a-default-listen-backlog-if-needed.patch +rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch +xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch +xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch +xfs-don-t-dirty-buffers-beyond-eof.patch +xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch +xfs-don-t-zero-partial-page-cache-pages-during.patch diff --git a/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch b/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch new file mode 100644 index 00000000000..94340e41a8b --- /dev/null +++ b/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch @@ -0,0 +1,130 @@ +From 22e757a49cf010703fcb9c9b4ef793248c39b0c2 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Tue, 2 Sep 2014 12:12:51 +1000 +Subject: xfs: don't dirty buffers beyond EOF + +From: Dave Chinner + +commit 22e757a49cf010703fcb9c9b4ef793248c39b0c2 upstream. + +generic/263 is failing fsx at this point with a page spanning +EOF that cannot be invalidated. The operations are: + +1190 mapwrite 0x52c00 thru 0x5e569 (0xb96a bytes) +1191 mapread 0x5c000 thru 0x5d636 (0x1637 bytes) +1192 write 0x5b600 thru 0x771ff (0x1bc00 bytes) + +where 1190 extends EOF from 0x54000 to 0x5e569. When the direct IO +write attempts to invalidate the cached page over this range, it +fails with -EBUSY and so any attempt to do page invalidation fails. + +The real question is this: Why can't that page be invalidated after +it has been written to disk and cleaned? + +Well, there's data on the first two buffers in the page (1k block +size, 4k page), but the third buffer on the page (i.e. beyond EOF) +is failing drop_buffers because its bh->b_state == 0x3, which is +BH_Uptodate | BH_Dirty.
IOWs, there's dirty buffers beyond EOF. Say +what? + +OK, set_buffer_dirty() is called on all buffers from +__set_page_buffers_dirty(), regardless of whether the buffer is +beyond EOF or not, which means that when we get to ->writepage, +we have buffers marked dirty beyond EOF that we need to clean. +So, we need to implement our own .set_page_dirty method that +doesn't dirty buffers beyond EOF. + +This is messy because the buffer code is not meant to be shared +and it has interesting locking issues on the buffer dirty bits. +So just copy and paste it and then modify it to suit what we need. + +Note: the solutions the other filesystems and generic block code use +of marking the buffers clean in ->writepage does not work for XFS. +It still leaves dirty buffers beyond EOF and invalidations still +fail. Hence rather than play whack-a-mole, this patch simply +prevents those buffers from being dirtied in the first place. + +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 61 insertions(+) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -1753,11 +1753,72 @@ xfs_vm_readpages( + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); + } + ++/* ++ * This is basically a copy of __set_page_dirty_buffers() with one ++ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them ++ * dirty, we'll never be able to clean them because we don't write buffers ++ * beyond EOF, and that means we can't invalidate pages that span EOF ++ * that have been marked dirty. Further, the dirty state can leak into ++ * the file interior if the file is extended, resulting in all sorts of ++ * bad things happening as the state does not match the underlying data. ++ * ++ * XXX: this really indicates that bufferheads in XFS need to die. 
Warts like ++ * this only exist because of bufferheads and how the generic code manages them. ++ */ ++STATIC int ++xfs_vm_set_page_dirty( ++ struct page *page) ++{ ++ struct address_space *mapping = page->mapping; ++ struct inode *inode = mapping->host; ++ loff_t end_offset; ++ loff_t offset; ++ int newly_dirty; ++ ++ if (unlikely(!mapping)) ++ return !TestSetPageDirty(page); ++ ++ end_offset = i_size_read(inode); ++ offset = page_offset(page); ++ ++ spin_lock(&mapping->private_lock); ++ if (page_has_buffers(page)) { ++ struct buffer_head *head = page_buffers(page); ++ struct buffer_head *bh = head; ++ ++ do { ++ if (offset < end_offset) ++ set_buffer_dirty(bh); ++ bh = bh->b_this_page; ++ offset += 1 << inode->i_blkbits; ++ } while (bh != head); ++ } ++ newly_dirty = !TestSetPageDirty(page); ++ spin_unlock(&mapping->private_lock); ++ ++ if (newly_dirty) { ++ /* sigh - __set_page_dirty() is static, so copy it here, too */ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&mapping->tree_lock, flags); ++ if (page->mapping) { /* Race with truncate? 
*/ ++ WARN_ON_ONCE(!PageUptodate(page)); ++ account_page_dirtied(page, mapping); ++ radix_tree_tag_set(&mapping->page_tree, ++ page_index(page), PAGECACHE_TAG_DIRTY); ++ } ++ spin_unlock_irqrestore(&mapping->tree_lock, flags); ++ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); ++ } ++ return newly_dirty; ++} ++ + const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, ++ .set_page_dirty = xfs_vm_set_page_dirty, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, diff --git a/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch new file mode 100644 index 00000000000..e8070afa869 --- /dev/null +++ b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch @@ -0,0 +1,47 @@ +From 834ffca6f7e345a79f6f2e2d131b0dfba8a4b67a Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Tue, 2 Sep 2014 12:12:52 +1000 +Subject: xfs: don't zero partial page cache pages during O_DIRECT writes + +From: Dave Chinner + +commit 834ffca6f7e345a79f6f2e2d131b0dfba8a4b67a upstream. + +Similar to direct IO reads, direct IO writes are using +truncate_pagecache_range to invalidate the page cache. This is +incorrect due to the sub-block zeroing in the page cache that +truncate_pagecache_range() triggers. + +This patch fixes things by using invalidate_inode_pages2_range +instead. It preserves the page cache invalidation, but won't zero +any pages. 
+ +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -634,7 +634,15 @@ xfs_file_dio_aio_write( + pos, -1); + if (ret) + goto out; +- truncate_pagecache_range(VFS_I(ip), pos, -1); ++ /* ++ * Invalidate whole pages. This can return an error if ++ * we fail to invalidate a page, but this should never ++ * happen on XFS. Warn if it does fail. ++ */ ++ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, ++ pos >> PAGE_CACHE_SHIFT, -1); ++ WARN_ON_ONCE(ret); ++ ret = 0; + } + + /* diff --git a/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch new file mode 100644 index 00000000000..f110984aa57 --- /dev/null +++ b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch @@ -0,0 +1,59 @@ +From 85e584da3212140ee80fd047f9058bbee0bc00d5 Mon Sep 17 00:00:00 2001 +From: Chris Mason +Date: Tue, 2 Sep 2014 12:12:52 +1000 +Subject: xfs: don't zero partial page cache pages during + O_DIRECT writes + +From: Chris Mason + +commit 85e584da3212140ee80fd047f9058bbee0bc00d5 upstream. + +xfs is using truncate_pagecache_range to invalidate the page cache +during DIO reads. This is different from the other filesystems who +only invalidate pages during DIO writes. + +truncate_pagecache_range is meant to be used when we are freeing the +underlying data structs from disk, so it will zero any partial +ranges in the page. This means a DIO read can zero out part of the +page cache page, and it is possible the page will stay in cache. + +buffered reads will find an up to date page with zeros instead of +the data actually on disk. + +This patch fixes things by using invalidate_inode_pages2_range +instead. 
It preserves the page cache invalidation, but won't zero +any pages. + +[dchinner: catch error and warn if it fails. Comment.] + +Signed-off-by: Chris Mason +Reviewed-by: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -295,7 +295,16 @@ xfs_file_read_iter( + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } +- truncate_pagecache_range(VFS_I(ip), pos, -1); ++ ++ /* ++ * Invalidate whole pages. This can return an error if ++ * we fail to invalidate a page, but this should never ++ * happen on XFS. Warn if it does fail. ++ */ ++ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, ++ pos >> PAGE_CACHE_SHIFT, -1); ++ WARN_ON_ONCE(ret); ++ ret = 0; + } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } diff --git a/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch b/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch new file mode 100644 index 00000000000..2209e4e7f81 --- /dev/null +++ b/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch @@ -0,0 +1,159 @@ +From 67dc288c21064b31a98a53dc64f6b9714b819fd6 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 4 Aug 2014 12:43:06 +1000 +Subject: xfs: ensure verifiers are attached to recovered buffers + +From: Dave Chinner + +commit 67dc288c21064b31a98a53dc64f6b9714b819fd6 upstream. + +Crash testing of CRC enabled filesystems has resulted in a number of +reports of bad CRCs being detected after the filesystem was mounted. 
+Errors such as the following were being seen: + +XFS (sdb3): Mounting V5 Filesystem +XFS (sdb3): Starting recovery (logdev: internal) +XFS (sdb3): Metadata CRC error detected at xfs_agf_read_verify+0x5a/0x100 [xfs], block 0x1 +XFS (sdb3): Unmount and run xfs_repair +XFS (sdb3): First 64 bytes of corrupted metadata buffer: +ffff880136ffd600: 58 41 47 46 00 00 00 01 00 00 00 00 00 0f aa 40 XAGF...........@ +ffff880136ffd610: 00 02 6d 53 00 02 77 f8 00 00 00 00 00 00 00 01 ..mS..w......... +ffff880136ffd620: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 03 ................ +ffff880136ffd630: 00 00 00 04 00 08 81 d0 00 08 81 a7 00 00 00 00 ................ +XFS (sdb3): metadata I/O error: block 0x1 ("xfs_trans_read_buf_map") error 74 numblks 1 + +The errors were typically being seen in AGF, AGI and their related +btree block buffers some time after log recovery had run. Often it +wasn't until later subsequent mounts that the problem was +discovered. The common symptom was a buffer with the correct +contents, but a CRC and an LSN that matched an older version of the +contents. + +Some debug added to _xfs_buf_ioapply() indicated that buffers were +being written without verifiers attached to them from log recovery, +and Jan Kara isolated the cause to log recovery readahead and its +interactions with buffers that had a more recent LSN on disk than +the transaction being recovered. In this case, the buffer did not +get a verifier attached, and so when the second phase of log +recovery ran and recovered EFIs and unlinked inodes, the buffers +were modified and written without the verifier running. Hence they +had up to date contents, but stale LSNs and CRCs. + +Fix it by attaching verifiers to buffers we skip due to future LSN +values so they don't escape into the buffer cache without the +correct verifier attached. + +This patch is based on analysis and a patch from Jan Kara.
+ +Reported-by: Jan Kara +Reported-by: Fanael Linithien +Reported-by: Grozdan +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_log_recover.c | 51 ++++++++++++++++++++++++++++------------------- + 1 file changed, 31 insertions(+), 20 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -2125,6 +2125,17 @@ xlog_recover_validate_buf_type( + __uint16_t magic16; + __uint16_t magicda; + ++ /* ++ * We can only do post recovery validation on items on CRC enabled ++ * fielsystems as we need to know when the buffer was written to be able ++ * to determine if we should have replayed the item. If we replay old ++ * metadata over a newer buffer, then it will enter a temporarily ++ * inconsistent state resulting in verification failures. Hence for now ++ * just avoid the verification stage for non-crc filesystems ++ */ ++ if (!xfs_sb_version_hascrc(&mp->m_sb)) ++ return; ++ + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); +@@ -2162,8 +2173,6 @@ xlog_recover_validate_buf_type( + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: +- if (!xfs_sb_version_hascrc(&mp->m_sb)) +- break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); +@@ -2196,10 +2205,6 @@ xlog_recover_validate_buf_type( + #endif + break; + case XFS_BLFT_DINO_BUF: +- /* +- * we get here with inode allocation buffers, not buffers that +- * track unlinked list changes. 
+- */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); +@@ -2279,8 +2284,6 @@ xlog_recover_validate_buf_type( + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: +- if (!xfs_sb_version_hascrc(&mp->m_sb)) +- break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); +@@ -2387,16 +2390,7 @@ xlog_recover_do_reg_buffer( + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); + +- /* +- * We can only do post recovery validation on items on CRC enabled +- * fielsystems as we need to know when the buffer was written to be able +- * to determine if we should have replayed the item. If we replay old +- * metadata over a newer buffer, then it will enter a temporarily +- * inconsistent state resulting in verification failures. Hence for now +- * just avoid the verification stage for non-crc filesystems +- */ +- if (xfs_sb_version_hascrc(&mp->m_sb)) +- xlog_recover_validate_buf_type(mp, bp, buf_f); ++ xlog_recover_validate_buf_type(mp, bp, buf_f); + } + + /* +@@ -2504,12 +2498,29 @@ xlog_recover_buffer_pass2( + } + + /* +- * recover the buffer only if we get an LSN from it and it's less than ++ * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. ++ * ++ * Note that we have to be extremely careful of readahead here. ++ * Readahead does not attach verfiers to the buffers so if we don't ++ * actually do any replay after readahead because of the LSN we found ++ * in the buffer if more recent than that current transaction then we ++ * need to attach the verifier directly. Failure to do so can lead to ++ * future recovery actions (e.g. EFI and unlinked list recovery) can ++ * operate on the buffers and they won't get the verifier attached. This ++ * can lead to blocks on disk having the correct content but a stale ++ * CRC. 
++ * ++ * It is safe to assume these clean buffers are currently up to date. ++ * If the buffer is dirtied by a later transaction being replayed, then ++ * the verifier will be reset to match whatever recover turns that ++ * buffer into. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); +- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) ++ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { ++ xlog_recover_validate_buf_type(mp, bp, buf_f); + goto out_release; ++ } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); diff --git a/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch b/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch new file mode 100644 index 00000000000..b4bfcebc225 --- /dev/null +++ b/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch @@ -0,0 +1,110 @@ +From 5fd364fee81a7888af806e42ed8a91c845894f2d Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 4 Aug 2014 12:43:26 +1000 +Subject: xfs: quotacheck leaves dquot buffers without verifiers + +From: Dave Chinner + +commit 5fd364fee81a7888af806e42ed8a91c845894f2d upstream. + +When running xfs/305, I noticed that quotacheck was flushing dquot +buffers that did not have the xfs_dquot_buf_ops verifiers attached: + +XFS (vdb): _xfs_buf_ioapply: no ops on block 0x1dc8/0x1dc8 +ffff880052489000: 44 51 01 04 00 00 65 b8 00 00 00 00 00 00 00 00 DQ....e......... +ffff880052489010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +ffff880052489020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +ffff880052489030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
+CPU: 1 PID: 2376 Comm: mount Not tainted 3.16.0-rc2-dgc+ #306 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 + ffff88006fe38000 ffff88004a0ffae8 ffffffff81cf1cca 0000000000000001 + ffff88004a0ffb88 ffffffff814d50ca 000010004a0ffc70 0000000000000000 + ffff88006be56dc4 0000000000000021 0000000000001dc8 ffff88007c773d80 +Call Trace: + [] dump_stack+0x45/0x56 + [] _xfs_buf_ioapply+0x3ca/0x3d0 + [] ? wake_up_state+0x20/0x20 + [] ? xfs_bdstrat_cb+0x55/0xb0 + [] xfs_buf_iorequest+0x6b/0xd0 + [] xfs_bdstrat_cb+0x55/0xb0 + [] __xfs_buf_delwri_submit+0x15b/0x220 + [] ? xfs_buf_delwri_submit+0x30/0x90 + [] xfs_buf_delwri_submit+0x30/0x90 + [] xfs_qm_quotacheck+0x17d/0x3c0 + [] xfs_qm_mount_quotas+0x151/0x1e0 + [] xfs_mountfs+0x56c/0x7d0 + [] xfs_fs_fill_super+0x2c2/0x340 + [] mount_bdev+0x194/0x1d0 + [] ? xfs_finish_flags+0x170/0x170 + [] xfs_fs_mount+0x15/0x20 + [] mount_fs+0x39/0x1b0 + [] vfs_kern_mount+0x67/0x120 + [] do_mount+0x23e/0xad0 + [] ? __get_free_pages+0xe/0x50 + [] ? copy_mount_options+0x36/0x150 + [] SyS_mount+0x83/0xc0 + [] tracesys+0xdd/0xe2 + +This was caused by dquot buffer readahead not attaching a verifier +structure to the buffer when readahead was issued, resulting in the +followup read of the buffer finding a valid buffer and so not +attaching new verifiers to the buffer as part of the read. + +Also, when a verifier failure occurs, we then read the buffer +without verifiers. Attach the verifiers manually after this read so +that if the buffer is then written it will be verified that the +corruption has been repaired. + +Further, when flushing a dquot we don't ask for a verifier when +reading in the dquot buffer the dquot belongs to. Most of the time +this isn't an issue because the buffer is still cached, but when it +is not cached it will result in writing the dquot buffer without +having the verifier attached.
+ +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_dquot.c | 3 ++- + fs/xfs/xfs_qm.c | 8 +++++++- + 2 files changed, 9 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_dquot.c ++++ b/fs/xfs/xfs_dquot.c +@@ -974,7 +974,8 @@ xfs_qm_dqflush( + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, +- mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); ++ mp->m_quotainfo->qi_dqchunklen, 0, &bp, ++ &xfs_dquot_buf_ops); + if (error) + goto out_unlock; + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1005,6 +1005,12 @@ xfs_qm_dqiter_bufs( + if (error) + break; + ++ /* ++ * A corrupt buffer might not have a verifier attached, so ++ * make sure we have the correct one attached before writeback ++ * occurs. ++ */ ++ bp->b_ops = &xfs_dquot_buf_ops; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); +@@ -1090,7 +1096,7 @@ xfs_qm_dqiterate( + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen, +- NULL); ++ &xfs_dquot_buf_ops); + rablkno++; + } + }