3.16-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)
diff --git a/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch b/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch

new file mode 100644 (file)

index 0000000..741fd4c
--- /dev/null
+++ b/queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch
@@ -0,0 +1,32 @@
+From 60bb45297f7551833346c5cebc6d483ea17ea5f2 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Fri, 8 Aug 2014 12:39:16 -0400
+Subject: __generic_file_write_iter(): fix handling of sync error after DIO
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 60bb45297f7551833346c5cebc6d483ea17ea5f2 upstream.
+
+If DIO results in short write and sync write fails, we want to bugger off
+whether the DIO part has written anything or not; the logics on the return
+will take care of the right return value.
+
+Reported-by: Anton Altaparmakov <aia21@cam.ac.uk>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/filemap.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -2584,7 +2584,7 @@ ssize_t __generic_file_write_iter(struct
+                * that this differs from normal direct-io semantics, which
+                * will return -EFOO even if some bytes were written.
+                */
+-              if (unlikely(status < 0) && !written) {
++              if (unlikely(status < 0)) {
+                       err = status;
+                       goto out;
+               }
diff --git a/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch b/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch

new file mode 100644 (file)

index 0000000..325e002
--- /dev/null
+++ b/queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch
@@ -0,0 +1,51 @@
+From 32333edb82fb2009980eefc5518100068147ab82 Mon Sep 17 00:00:00 2001
+From: Vignesh Raman <Vignesh_Raman@mentor.com>
+Date: Tue, 22 Jul 2014 19:24:25 +0530
+Subject: Bluetooth: Avoid use of session socket after the session gets freed
+
+From: Vignesh Raman <Vignesh_Raman@mentor.com>
+
+commit 32333edb82fb2009980eefc5518100068147ab82 upstream.
+
+The commits 08c30aca9e698faddebd34f81e1196295f9dc063 "Bluetooth: Remove
+RFCOMM session refcnt" and 8ff52f7d04d9cc31f1e81dcf9a2ba6335ed34905
+"Bluetooth: Return RFCOMM session ptrs to avoid freed session"
+allow rfcomm_recv_ua and rfcomm_session_close to delete the session
+(and free the corresponding socket) and propagate NULL session pointer
+to the upper callers.
+
+Additional fix is required to terminate the loop in rfcomm_process_rx
+function to avoid use of freed 'sk' memory.
+
+The issue is only reproducible with the kernel option CONFIG_PAGE_POISONING
+enabled, which causes freed memory to be overwritten with a fixed character
+value that is used to unmask use-after-free issues.
+
+Signed-off-by: Vignesh Raman <Vignesh_Raman@mentor.com>
+Signed-off-by: Vitaly Kuzmichev <Vitaly_Kuzmichev@mentor.com>
+Acked-by: Dean Jenkins <Dean_Jenkins@mentor.com>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/bluetooth/rfcomm/core.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/net/bluetooth/rfcomm/core.c
++++ b/net/bluetooth/rfcomm/core.c
+@@ -1909,10 +1909,13 @@ static struct rfcomm_session *rfcomm_pro
+       /* Get data directly from socket receive queue without copying it. */
+       while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+               skb_orphan(skb);
+-              if (!skb_linearize(skb))
++              if (!skb_linearize(skb)) {
+                       s = rfcomm_recv_frame(s, skb);
+-              else
++                      if (!s)
++                              break;
++              } else {
+                       kfree_skb(skb);
++              }
+       }
+ 
+       if (s && (sk->sk_state == BT_CLOSED))
diff --git a/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch b/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch

new file mode 100644 (file)

index 0000000..b05fbb5
--- /dev/null
+++ b/queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch
@@ -0,0 +1,98 @@
+From 396e04f4bb9afefb0744715dc76d9abe18ee5fb0 Mon Sep 17 00:00:00 2001
+From: Chin-Ran Lo <crlo@marvell.com>
+Date: Tue, 1 Jul 2014 14:00:14 -0700
+Subject: Bluetooth: btmrvl: wait for HOST_SLEEP_ENABLE event in suspend
+
+From: Chin-Ran Lo <crlo@marvell.com>
+
+commit 396e04f4bb9afefb0744715dc76d9abe18ee5fb0 upstream.
+
+After BT_CMD_HOST_SLEEP_ENABLE command finishes, driver should
+wait until getting BT_EVENT_HOST_SLEEP_ENABLE event to complete
+suspend procedure.
+Without this patch the suspend handler would return success
+earlier. By the time when the BT_EVENT_HOST_SLEEP_ENABLE event
+comes in the controller driver could have already turned off the
+bus clock. This causes kernel crash or system reboot eventually.
+
+Signed-off-by: Chin-Ran Lo <crlo@marvell.com>
+Signed-off-by: Jeff CF Chen <jeffc@marvell.com>
+Signed-off-by: Amitkumar Karwar <akarwar@marvell.com>
+Signed-off-by: Bing Zhao <bzhao@marvell.com>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/bluetooth/btmrvl_drv.h  |    1 +
+ drivers/bluetooth/btmrvl_main.c |   25 ++++++++++++++++++++++++-
+ 2 files changed, 25 insertions(+), 1 deletion(-)
+
+--- a/drivers/bluetooth/btmrvl_drv.h
++++ b/drivers/bluetooth/btmrvl_drv.h
+@@ -68,6 +68,7 @@ struct btmrvl_adapter {
+       u8 hs_state;
+       u8 wakeup_tries;
+       wait_queue_head_t cmd_wait_q;
++      wait_queue_head_t event_hs_wait_q;
+       u8 cmd_complete;
+       bool is_suspended;
+ };
+--- a/drivers/bluetooth/btmrvl_main.c
++++ b/drivers/bluetooth/btmrvl_main.c
+@@ -114,6 +114,7 @@ int btmrvl_process_event(struct btmrvl_p
+                       adapter->hs_state = HS_ACTIVATED;
+                       if (adapter->psmode)
+                               adapter->ps_state = PS_SLEEP;
++                      wake_up_interruptible(&adapter->event_hs_wait_q);
+                       BT_DBG("HS ACTIVATED!");
+               } else {
+                       BT_DBG("HS Enable failed");
+@@ -253,11 +254,31 @@ EXPORT_SYMBOL_GPL(btmrvl_enable_ps);
+ 
+ int btmrvl_enable_hs(struct btmrvl_private *priv)
+ {
++      struct btmrvl_adapter *adapter = priv->adapter;
+       int ret;
+ 
+       ret = btmrvl_send_sync_cmd(priv, BT_CMD_HOST_SLEEP_ENABLE, NULL, 0);
+-      if (ret)
++      if (ret) {
+               BT_ERR("Host sleep enable command failed\n");
++              return ret;
++      }
++
++      ret = wait_event_interruptible_timeout(adapter->event_hs_wait_q,
++                                             adapter->hs_state,
++                      msecs_to_jiffies(WAIT_UNTIL_HS_STATE_CHANGED));
++      if (ret < 0) {
++              BT_ERR("event_hs_wait_q terminated (%d): %d,%d,%d",
++                     ret, adapter->hs_state, adapter->ps_state,
++                     adapter->wakeup_tries);
++      } else if (!ret) {
++              BT_ERR("hs_enable timeout: %d,%d,%d", adapter->hs_state,
++                     adapter->ps_state, adapter->wakeup_tries);
++              ret = -ETIMEDOUT;
++      } else {
++              BT_DBG("host sleep enabled: %d,%d,%d", adapter->hs_state,
++                     adapter->ps_state, adapter->wakeup_tries);
++              ret = 0;
++      }
+ 
+       return ret;
+ }
+@@ -358,6 +379,7 @@ static void btmrvl_init_adapter(struct b
+       }
+ 
+       init_waitqueue_head(&priv->adapter->cmd_wait_q);
++      init_waitqueue_head(&priv->adapter->event_hs_wait_q);
+ }
+ 
+ static void btmrvl_free_adapter(struct btmrvl_private *priv)
+@@ -666,6 +688,7 @@ int btmrvl_remove_card(struct btmrvl_pri
+       hdev = priv->btmrvl_dev.hcidev;
+ 
+       wake_up_interruptible(&priv->adapter->cmd_wait_q);
++      wake_up_interruptible(&priv->adapter->event_hs_wait_q);
+ 
+       kthread_stop(priv->main_thread.task);
+ 
diff --git a/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch b/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch

new file mode 100644 (file)

index 0000000..67ce199
--- /dev/null
+++ b/queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch
@@ -0,0 +1,34 @@
+From 42bd6a56ed1ab4b2cb50f4d4e674874da9b47f46 Mon Sep 17 00:00:00 2001
+From: Marcel Holtmann <marcel@holtmann.org>
+Date: Tue, 1 Jul 2014 14:11:19 +0200
+Subject: Bluetooth: Fix merge of advertising data and scan response data
+
+From: Marcel Holtmann <marcel@holtmann.org>
+
+commit 42bd6a56ed1ab4b2cb50f4d4e674874da9b47f46 upstream.
+
+The advertising data and scan response data are merged in the wrong
+order. It should be advertising data first and then scan response data
+and not the other way around.
+
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/bluetooth/hci_event.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -4177,8 +4177,8 @@ static void process_adv_report(struct hc
+        * sending a merged device found event.
+        */
+       mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
+-                        d->last_adv_addr_type, NULL, rssi, 0, 1, data, len,
+-                        d->last_adv_data, d->last_adv_data_len);
++                        d->last_adv_addr_type, NULL, rssi, 0, 1,
++                        d->last_adv_data, d->last_adv_data_len, data, len);
+       clear_pending_adv_report(hdev);
+ }
+ 
diff --git a/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch b/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch

new file mode 100644 (file)

index 0000000..cf37079
--- /dev/null
+++ b/queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch
@@ -0,0 +1,77 @@
+From 6c53823ae0e10e723131055e1e65dd6a328a228e Mon Sep 17 00:00:00 2001
+From: Johan Hedberg <johan.hedberg@intel.com>
+Date: Fri, 11 Jul 2014 15:32:23 +0300
+Subject: Bluetooth: Fix tracking local SSP authentication requirement
+
+From: Johan Hedberg <johan.hedberg@intel.com>
+
+commit 6c53823ae0e10e723131055e1e65dd6a328a228e upstream.
+
+When we need to make the decision whether to perform just-works or real
+user confirmation we need to know the exact local authentication
+requirement that was passed to the controller. So far conn->auth_type
+(the local requirement) wasn't in one case updated appropriately in fear
+of the user confirmation being rejected later.
+
+The real problem however was not really that conn->auth_type couldn't
+represent the true value but that we were checking the local MITM
+requirement in an incorrect way. It's perfectly fine to let auth_type
+follow what we tell the controller since we're still tracking the target
+security level with conn->pending_sec_level.
+
+This patch updates the check for local MITM requirement in the
+hci_user_confirm_request_evt function to use the locally requested
+security level and ensures that auth_type always represents what we tell
+the controller. All other code in hci_user_confirm_request_evt still
+uses the auth_type instead of pending_sec_level for determining whether
+to do just-works or not, since that's the only value that's in sync with
+what the remote device knows.
+
+Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
+Tested-by: Szymon Janc <szymon.janc@tieto.com>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/bluetooth/hci_event.c |   17 ++++++++---------
+ 1 file changed, 8 insertions(+), 9 deletions(-)
+
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -3538,18 +3538,14 @@ static void hci_io_capa_request_evt(stru
+ 
+               /* If we are initiators, there is no remote information yet */
+               if (conn->remote_auth == 0xff) {
+-                      cp.authentication = conn->auth_type;
+-
+                       /* Request MITM protection if our IO caps allow it
+                        * except for the no-bonding case.
+-                       * conn->auth_type is not updated here since
+-                       * that might cause the user confirmation to be
+-                       * rejected in case the remote doesn't have the
+-                       * IO capabilities for MITM.
+                        */
+                       if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
+                           cp.authentication != HCI_AT_NO_BONDING)
+-                              cp.authentication |= 0x01;
++                              conn->auth_type |= 0x01;
++
++                      cp.authentication = conn->auth_type;
+               } else {
+                       conn->auth_type = hci_get_auth_req(conn);
+                       cp.authentication = conn->auth_type;
+@@ -3621,9 +3617,12 @@ static void hci_user_confirm_request_evt
+       rem_mitm = (conn->remote_auth & 0x01);
+ 
+       /* If we require MITM but the remote device can't provide that
+-       * (it has NoInputNoOutput) then reject the confirmation request
++       * (it has NoInputNoOutput) then reject the confirmation
++       * request. We check the security level here since it doesn't
++       * necessarily match conn->auth_type.
+        */
+-      if (loc_mitm && conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
++      if (conn->pending_sec_level > BT_SECURITY_MEDIUM &&
++          conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
+               BT_DBG("Rejecting request: remote device can't provide MITM");
+               hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
+                            sizeof(ev->bdaddr), &ev->bdaddr);
diff --git a/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch b/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch

new file mode 100644 (file)

index 0000000..10295fe
--- /dev/null
+++ b/queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch
@@ -0,0 +1,35 @@
+From 9f743d7499bc2c4dc8c35af33bdb2a29bea663b9 Mon Sep 17 00:00:00 2001
+From: Johan Hedberg <johan.hedberg@intel.com>
+Date: Thu, 17 Jul 2014 11:56:33 +0300
+Subject: Bluetooth: Fix using uninitialized variable when pairing
+
+From: Johan Hedberg <johan.hedberg@intel.com>
+
+commit 9f743d7499bc2c4dc8c35af33bdb2a29bea663b9 upstream.
+
+Commit 6c53823ae0e10e723131055e1e65dd6a328a228e reshuffled the way the
+authentication requirement gets set in the hci_io_capa_request_evt()
+function, but at the same time it failed to update an if-statement where
+cp.authentication is used before it has been initialized. The correct
+value the code should be looking for in this if-statement is
+conn->auth_type.
+
+Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/bluetooth/hci_event.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -3542,7 +3542,7 @@ static void hci_io_capa_request_evt(stru
+                        * except for the no-bonding case.
+                        */
+                       if (conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
+-                          cp.authentication != HCI_AT_NO_BONDING)
++                          conn->auth_type != HCI_AT_NO_BONDING)
+                               conn->auth_type |= 0x01;
+ 
+                       cp.authentication = conn->auth_type;
diff --git a/queue-3.16/bluetooth-never-linger-on-process-exit.patch b/queue-3.16/bluetooth-never-linger-on-process-exit.patch

new file mode 100644 (file)

index 0000000..ed1bc38
--- /dev/null
+++ b/queue-3.16/bluetooth-never-linger-on-process-exit.patch
@@ -0,0 +1,93 @@
+From 093facf3634da1b0c2cc7ed106f1983da901bbab Mon Sep 17 00:00:00 2001
+From: Vladimir Davydov <vdavydov@parallels.com>
+Date: Tue, 15 Jul 2014 12:25:28 +0400
+Subject: Bluetooth: never linger on process exit
+
+From: Vladimir Davydov <vdavydov@parallels.com>
+
+commit 093facf3634da1b0c2cc7ed106f1983da901bbab upstream.
+
+If the current process is exiting, lingering on socket close will make
+it unkillable, so we should avoid it.
+
+Reproducer:
+
+  #include <sys/types.h>
+  #include <sys/socket.h>
+
+  #define BTPROTO_L2CAP   0
+  #define BTPROTO_SCO     2
+  #define BTPROTO_RFCOMM  3
+
+  int main()
+  {
+          int fd;
+          struct linger ling;
+
+          fd = socket(PF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM);
+          //or: fd = socket(PF_BLUETOOTH, SOCK_DGRAM, BTPROTO_L2CAP);
+          //or: fd = socket(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);
+
+          ling.l_onoff = 1;
+          ling.l_linger = 1000000000;
+          setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
+
+          return 0;
+  }
+
+Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
+Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/bluetooth/l2cap_sock.c  |    3 ++-
+ net/bluetooth/rfcomm/sock.c |    3 ++-
+ net/bluetooth/sco.c         |    6 ++++--
+ 3 files changed, 8 insertions(+), 4 deletions(-)
+
+--- a/net/bluetooth/l2cap_sock.c
++++ b/net/bluetooth/l2cap_sock.c
+@@ -1111,7 +1111,8 @@ static int l2cap_sock_shutdown(struct so
+               l2cap_chan_close(chan, 0);
+               lock_sock(sk);
+ 
+-              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
++              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
++                  !(current->flags & PF_EXITING))
+                       err = bt_sock_wait_state(sk, BT_CLOSED,
+                                                sk->sk_lingertime);
+       }
+--- a/net/bluetooth/rfcomm/sock.c
++++ b/net/bluetooth/rfcomm/sock.c
+@@ -918,7 +918,8 @@ static int rfcomm_sock_shutdown(struct s
+               sk->sk_shutdown = SHUTDOWN_MASK;
+               __rfcomm_sock_close(sk);
+ 
+-              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
++              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
++                  !(current->flags & PF_EXITING))
+                       err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+       }
+       release_sock(sk);
+--- a/net/bluetooth/sco.c
++++ b/net/bluetooth/sco.c
+@@ -909,7 +909,8 @@ static int sco_sock_shutdown(struct sock
+               sco_sock_clear_timer(sk);
+               __sco_sock_close(sk);
+ 
+-              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
++              if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
++                  !(current->flags & PF_EXITING))
+                       err = bt_sock_wait_state(sk, BT_CLOSED,
+                                                sk->sk_lingertime);
+       }
+@@ -929,7 +930,8 @@ static int sco_sock_release(struct socke
+ 
+       sco_sock_close(sk);
+ 
+-      if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) {
++      if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
++          !(current->flags & PF_EXITING)) {
+               lock_sock(sk);
+               err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+               release_sock(sk);
diff --git a/queue-3.16/fix-copy_tree-regression.patch b/queue-3.16/fix-copy_tree-regression.patch

new file mode 100644 (file)

index 0000000..a00634b
--- /dev/null
+++ b/queue-3.16/fix-copy_tree-regression.patch
@@ -0,0 +1,91 @@
+From 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Sun, 10 Aug 2014 03:44:55 -0400
+Subject: fix copy_tree() regression
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 upstream.
+
+Since 3.14 we had copy_tree() get the shadowing wrong - if we had one
+vfsmount shadowing another (i.e. if A is a slave of B, C is mounted
+on A/foo, then D got mounted on B/foo creating D' on A/foo shadowed
+by C), copy_tree() of A would make a copy of D' shadow the copy of
+C, not the other way around.
+
+It's easy to fix, fortunately - just make sure that mount follows
+the one that shadows it in mnt_child as well as in mnt_hash, and when
+copy_tree() decides to attach a new mount, check if the last child
+it has added to the same parent should be shadowing the new one.
+And if it should, just use the same logics commit_tree() has - put the
+new mount into the hash and children lists right after the one that
+should shadow it.
+
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/namespace.c |   31 ++++++++++++++++++++++++-------
+ 1 file changed, 24 insertions(+), 7 deletions(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -779,6 +779,20 @@ static void attach_mnt(struct mount *mnt
+       list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ }
+ 
++static void attach_shadowed(struct mount *mnt,
++                      struct mount *parent,
++                      struct mount *shadows)
++{
++      if (shadows) {
++              hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
++              list_add(&mnt->mnt_child, &shadows->mnt_child);
++      } else {
++              hlist_add_head_rcu(&mnt->mnt_hash,
++                              m_hash(&parent->mnt, mnt->mnt_mountpoint));
++              list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
++      }
++}
++
+ /*
+  * vfsmount lock must be held for write
+  */
+@@ -797,12 +811,7 @@ static void commit_tree(struct mount *mn
+ 
+       list_splice(&head, n->list.prev);
+ 
+-      if (shadows)
+-              hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+-      else
+-              hlist_add_head_rcu(&mnt->mnt_hash,
+-                              m_hash(&parent->mnt, mnt->mnt_mountpoint));
+-      list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
++      attach_shadowed(mnt, parent, shadows);
+       touch_mnt_namespace(n);
+ }
+ 
+@@ -1513,6 +1522,7 @@ struct mount *copy_tree(struct mount *mn
+                       continue;
+ 
+               for (s = r; s; s = next_mnt(s, r)) {
++                      struct mount *t = NULL;
+                       if (!(flag & CL_COPY_UNBINDABLE) &&
+                           IS_MNT_UNBINDABLE(s)) {
+                               s = skip_mnt_tree(s);
+@@ -1534,7 +1544,14 @@ struct mount *copy_tree(struct mount *mn
+                               goto out;
+                       lock_mount_hash();
+                       list_add_tail(&q->mnt_list, &res->mnt_list);
+-                      attach_mnt(q, parent, p->mnt_mp);
++                      mnt_set_mountpoint(parent, p->mnt_mp, q);
++                      if (!list_empty(&parent->mnt_mounts)) {
++                              t = list_last_entry(&parent->mnt_mounts,
++                                      struct mount, mnt_child);
++                              if (t->mnt_mp != p->mnt_mp)
++                                      t = NULL;
++                      }
++                      attach_shadowed(q, parent, t);
+                       unlock_mount_hash();
+               }
+       }
diff --git a/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch b/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch

new file mode 100644 (file)

index 0000000..7373ad0
--- /dev/null
+++ b/queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch
@@ -0,0 +1,80 @@
+From 2446dba03f9dabe0b477a126cbeb377854785b47 Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Thu, 31 Jul 2014 10:16:29 +1000
+Subject: md/raid1,raid10: always abort recover on write error.
+
+From: NeilBrown <neilb@suse.de>
+
+commit 2446dba03f9dabe0b477a126cbeb377854785b47 upstream.
+
+Currently we don't abort recovery on a write error if the write error
+to the recovering device was triggered by normal IO (as opposed to
+recovery IO).
+
+This means that for one bitmap region, the recovery might write to the
+recovering device for a few sectors, then not bother for subsequent
+sectors (as it never writes to failed devices).  In this case
+the bitmap bit will be cleared, but it really shouldn't.
+
+The result is that if the recovering device fails and is then re-added
+(after fixing whatever hardware problem triggered the failure),
+the second recovery won't redo the region it was in the middle of,
+so some of the device will not be recovered properly.
+
+If we abort the recovery, the region being processed will be cancelled
+(bit not cleared) and the whole region will be retried.
+
+As the bug can result in data corruption the patch is suitable for
+-stable.  For kernels prior to 3.11 there is a conflict in raid10.c
+which will require care.
+
+Original-from: jiao hui <jiaohui@bwstor.com.cn>
+Reported-and-tested-by: jiao hui <jiaohui@bwstor.com.cn>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid1.c  |    8 ++++----
+ drivers/md/raid10.c |   11 +++++------
+ 2 files changed, 9 insertions(+), 10 deletions(-)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -1501,12 +1501,12 @@ static void error(struct mddev *mddev, s
+               mddev->degraded++;
+               set_bit(Faulty, &rdev->flags);
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+-              /*
+-               * if recovery is running, make sure it aborts.
+-               */
+-              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       } else
+               set_bit(Faulty, &rdev->flags);
++      /*
++       * if recovery is running, make sure it aborts.
++       */
++      set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       printk(KERN_ALERT
+              "md/raid1:%s: Disk failure on %s, disabling device.\n"
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1684,13 +1684,12 @@ static void error(struct mddev *mddev, s
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+               return;
+       }
+-      if (test_and_clear_bit(In_sync, &rdev->flags)) {
++      if (test_and_clear_bit(In_sync, &rdev->flags))
+               mddev->degraded++;
+-                      /*
+-               * if recovery is running, make sure it aborts.
+-               */
+-              set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+-      }
++      /*
++       * If recovery is running, make sure it aborts.
++       */
++      set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       set_bit(Blocked, &rdev->flags);
+       set_bit(Faulty, &rdev->flags);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
diff --git a/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch b/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch

new file mode 100644 (file)

index 0000000..4468329
--- /dev/null
+++ b/queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch
@@ -0,0 +1,39 @@
+From b39685526f46976bcd13aa08c82480092befa46c Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Mon, 18 Aug 2014 13:59:50 +1000
+Subject: md/raid10: Fix memory leak when raid10 reshape completes.
+
+From: NeilBrown <neilb@suse.de>
+
+commit b39685526f46976bcd13aa08c82480092befa46c upstream.
+
+When a raid10 commences a resync/recovery/reshape it allocates
+some buffer space.
+When a resync/recovery completes the buffer space is freed.  But not
+when the reshape completes.
+This can result in a small memory leak.
+
+There is a subtle side-effect of this bug.  When a RAID10 is reshaped
+to a larger array (more devices), the reshape is immediately followed
+by a "resync" of the new space.  This "resync" will use the buffer
+space which was allocated for "reshape".  This can cause problems
+including a "BUG" in the SCSI layer.  So this is suitable for -stable.
+
+Fixes: 3ea7daa5d7fde47cd41f4d56c2deb949114da9d6
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid10.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -2953,6 +2953,7 @@ static sector_t sync_request(struct mdde
+                */
+               if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+                       end_reshape(conf);
++                      close_sync(conf);
+                       return 0;
+               }
+ 
diff --git a/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch b/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch

new file mode 100644 (file)

index 0000000..f7d6218
--- /dev/null
+++ b/queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch
@@ -0,0 +1,40 @@
+From ce0b0a46955d1bb389684a2605dbcaa990ba0154 Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Mon, 18 Aug 2014 13:56:38 +1000
+Subject: md/raid10: fix memory leak when reshaping a RAID10.
+
+From: NeilBrown <neilb@suse.de>
+
+commit ce0b0a46955d1bb389684a2605dbcaa990ba0154 upstream.
+
+raid10 reshape clears unwanted bits from a bio->bi_flags using
+a method which, while clumsy, worked until 3.10 when BIO_OWNS_VEC
+was added.
+Since then it clears that bit but shouldn't.  This results in a
+memory leak.
+
+So change to use the approved method of clearing unwanted bits.
+
+As this causes a memory leak which can consume all of memory
+the fix is suitable for -stable.
+
+Fixes: a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd
+Reported-by: mdraid.pkoch@dfgh.net (Peter Koch)
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid10.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -4410,7 +4410,7 @@ read_more:
+       read_bio->bi_private = r10_bio;
+       read_bio->bi_end_io = end_sync_read;
+       read_bio->bi_rw = READ;
+-      read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
++      read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
+       read_bio->bi_flags |= 1 << BIO_UPTODATE;
+       read_bio->bi_vcnt = 0;
+       read_bio->bi_iter.bi_size = 0;
diff --git a/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch b/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch

new file mode 100644 (file)

index 0000000..c23340c
--- /dev/null
+++ b/queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch
@@ -0,0 +1,42 @@
+From a40687ff73a5b14909d6aa522f7d778b158911c5 Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Wed, 13 Aug 2014 09:48:45 +1000
+Subject: md/raid5: avoid livelock caused by non-aligned writes.
+
+From: NeilBrown <neilb@suse.de>
+
+commit a40687ff73a5b14909d6aa522f7d778b158911c5 upstream.
+
+If a stripe in a raid6 array received a write to each data block while
+the array is degraded, and if any of these writes to a missing device
+are not page-aligned, then a live-lock happens.
+
+In this case the P and Q blocks need to be read so that the part of
+the missing block which is *not* being updated by the write can be
+constructed.  Due to a logic error, these blocks are not loaded, so
+the update cannot proceed and the stripe is 'handled' repeatedly in an
+infinite loop.
+
+This bug is unlikely as most writes are page aligned.  However as it
+can lead to a livelock it is suitable for -stable.  It was introduced
+in 3.16.
+
+Fixes: 67f455486d2ea20b2d94d6adf5b9b783d079e321
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid5.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -2922,7 +2922,7 @@ static int fetch_block(struct stripe_hea
+             (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
+             !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
+            (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+-            s->to_write < sh->raid_conf->raid_disks - 2 &&
++            s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 &&
+             (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+               /* we would like to get this block, possibly by computing it,
+                * otherwise read it if the backing disk is insync
diff --git a/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch b/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch

new file mode 100644 (file)

index 0000000..df717e1
--- /dev/null
+++ b/queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch
@@ -0,0 +1,48 @@
+From 9c4bdf697c39805078392d5ddbbba5ae5680e0dd Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Wed, 13 Aug 2014 09:57:07 +1000
+Subject: md/raid6: avoid data corruption during recovery of double-degraded RAID6
+
+From: NeilBrown <neilb@suse.de>
+
+commit 9c4bdf697c39805078392d5ddbbba5ae5680e0dd upstream.
+
+During recovery of a double-degraded RAID6 it is possible for
+some blocks not to be recovered properly, leading to corruption.
+
+If a write happens to one block in a stripe that would be written to a
+missing device, and at the same time that stripe is recovering data
+to the other missing device, then that recovered data may not be written.
+
+This patch skips, in the double-degraded case, an optimisation that is
+only safe for single-degraded arrays.
+
+Bug was introduced in 2.6.32 and fix is suitable for any kernel since
+then.  In an older kernel with separate handle_stripe5() and
+handle_stripe6() functions the patch must change handle_stripe6().
+
+Fixes: 6c0069c0ae9659e3a91b68eaed06a5c6c37f45c8
+Cc: Yuri Tikhonov <yur@emcraft.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Reported-by: "Manibalan P" <pmanibalan@amiindia.co.in>
+Tested-by: "Manibalan P" <pmanibalan@amiindia.co.in>
+Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1090423
+Signed-off-by: NeilBrown <neilb@suse.de>
+Acked-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid5.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -3817,6 +3817,8 @@ static void handle_stripe(struct stripe_
+                               set_bit(R5_Wantwrite, &dev->flags);
+                               if (prexor)
+                                       continue;
++                              if (s.failed > 1)
++                                      continue;
+                               if (!test_bit(R5_Insync, &dev->flags) ||
+                                   ((i == sh->pd_idx || i == sh->qd_idx)  &&
+                                    s.failed == 0))
diff --git a/queue-3.16/rbd-rework-rbd_request_fn.patch b/queue-3.16/rbd-rework-rbd_request_fn.patch

new file mode 100644 (file)

index 0000000..c4bf618
--- /dev/null
+++ b/queue-3.16/rbd-rework-rbd_request_fn.patch
@@ -0,0 +1,303 @@
+From bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <ilya.dryomov@inktank.com>
+Date: Mon, 4 Aug 2014 18:04:39 +0400
+Subject: rbd: rework rbd_request_fn()
+
+From: Ilya Dryomov <ilya.dryomov@inktank.com>
+
+commit bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 upstream.
+
+While it was never a good idea to sleep in request_fn(), commit
+34c6bc2c919a ("locking/mutexes: Add extra reschedule point") made it
+a *bad* idea.  mutex_lock() since 3.15 may reschedule *before* putting
+task on the mutex wait queue, which for tasks in !TASK_RUNNING state
+means block forever.  request_fn() may be called with !TASK_RUNNING on
+the way to schedule() in io_schedule().
+
+Offload request handling to a workqueue, one per rbd device, to avoid
+calling blocking primitives from rbd_request_fn().
+
+Fixes: http://tracker.ceph.com/issues/8818
+
+Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
+Tested-by: Eric Eastman <eric0e@aol.com>
+Tested-by: Greg Wilson <greg.wilson@keepertech.com>
+Reviewed-by: Alex Elder <elder@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/rbd.c |  194 +++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 118 insertions(+), 76 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -42,6 +42,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/slab.h>
+ #include <linux/idr.h>
++#include <linux/workqueue.h>
+ 
+ #include "rbd_types.h"
+ 
+@@ -332,7 +333,10 @@ struct rbd_device {
+ 
+       char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+ 
++      struct list_head        rq_queue;       /* incoming rq queue */
+       spinlock_t              lock;           /* queue, flags, open_count */
++      struct workqueue_struct *rq_wq;
++      struct work_struct      rq_work;
+ 
+       struct rbd_image_header header;
+       unsigned long           flags;          /* possibly lock protected */
+@@ -3183,102 +3187,129 @@ out:
+       return ret;
+ }
+ 
+-static void rbd_request_fn(struct request_queue *q)
+-              __releases(q->queue_lock) __acquires(q->queue_lock)
++static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+ {
+-      struct rbd_device *rbd_dev = q->queuedata;
+-      struct request *rq;
++      struct rbd_img_request *img_request;
++      u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
++      u64 length = blk_rq_bytes(rq);
++      bool wr = rq_data_dir(rq) == WRITE;
+       int result;
+ 
+-      while ((rq = blk_fetch_request(q))) {
+-              bool write_request = rq_data_dir(rq) == WRITE;
+-              struct rbd_img_request *img_request;
+-              u64 offset;
+-              u64 length;
++      /* Ignore/skip any zero-length requests */
+ 
+-              /* Ignore any non-FS requests that filter through. */
++      if (!length) {
++              dout("%s: zero-length request\n", __func__);
++              result = 0;
++              goto err_rq;
++      }
+ 
+-              if (rq->cmd_type != REQ_TYPE_FS) {
+-                      dout("%s: non-fs request type %d\n", __func__,
+-                              (int) rq->cmd_type);
+-                      __blk_end_request_all(rq, 0);
+-                      continue;
++      /* Disallow writes to a read-only device */
++
++      if (wr) {
++              if (rbd_dev->mapping.read_only) {
++                      result = -EROFS;
++                      goto err_rq;
+               }
++              rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
++      }
+ 
+-              /* Ignore/skip any zero-length requests */
++      /*
++       * Quit early if the mapped snapshot no longer exists.  It's
++       * still possible the snapshot will have disappeared by the
++       * time our request arrives at the osd, but there's no sense in
++       * sending it if we already know.
++       */
++      if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
++              dout("request for non-existent snapshot");
++              rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
++              result = -ENXIO;
++              goto err_rq;
++      }
+ 
+-              offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
+-              length = (u64) blk_rq_bytes(rq);
++      if (offset && length > U64_MAX - offset + 1) {
++              rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
++                       length);
++              result = -EINVAL;
++              goto err_rq;    /* Shouldn't happen */
++      }
+ 
+-              if (!length) {
+-                      dout("%s: zero-length request\n", __func__);
+-                      __blk_end_request_all(rq, 0);
+-                      continue;
+-              }
++      if (offset + length > rbd_dev->mapping.size) {
++              rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
++                       length, rbd_dev->mapping.size);
++              result = -EIO;
++              goto err_rq;
++      }
+ 
+-              spin_unlock_irq(q->queue_lock);
++      img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
++      if (!img_request) {
++              result = -ENOMEM;
++              goto err_rq;
++      }
++      img_request->rq = rq;
+ 
+-              /* Disallow writes to a read-only device */
++      result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
++      if (result)
++              goto err_img_request;
+ 
+-              if (write_request) {
+-                      result = -EROFS;
+-                      if (rbd_dev->mapping.read_only)
+-                              goto end_request;
+-                      rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+-              }
++      result = rbd_img_request_submit(img_request);
++      if (result)
++              goto err_img_request;
+ 
+-              /*
+-               * Quit early if the mapped snapshot no longer
+-               * exists.  It's still possible the snapshot will
+-               * have disappeared by the time our request arrives
+-               * at the osd, but there's no sense in sending it if
+-               * we already know.
+-               */
+-              if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+-                      dout("request for non-existent snapshot");
+-                      rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+-                      result = -ENXIO;
+-                      goto end_request;
+-              }
++      return;
+ 
+-              result = -EINVAL;
+-              if (offset && length > U64_MAX - offset + 1) {
+-                      rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
+-                              offset, length);
+-                      goto end_request;       /* Shouldn't happen */
+-              }
++err_img_request:
++      rbd_img_request_put(img_request);
++err_rq:
++      if (result)
++              rbd_warn(rbd_dev, "%s %llx at %llx result %d",
++                       wr ? "write" : "read", length, offset, result);
++      blk_end_request_all(rq, result);
++}
+ 
+-              result = -EIO;
+-              if (offset + length > rbd_dev->mapping.size) {
+-                      rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
+-                              offset, length, rbd_dev->mapping.size);
+-                      goto end_request;
+-              }
++static void rbd_request_workfn(struct work_struct *work)
++{
++      struct rbd_device *rbd_dev =
++          container_of(work, struct rbd_device, rq_work);
++      struct request *rq, *next;
++      LIST_HEAD(requests);
+ 
+-              result = -ENOMEM;
+-              img_request = rbd_img_request_create(rbd_dev, offset, length,
+-                                                      write_request);
+-              if (!img_request)
+-                      goto end_request;
++      spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
++      list_splice_init(&rbd_dev->rq_queue, &requests);
++      spin_unlock_irq(&rbd_dev->lock);
+ 
+-              img_request->rq = rq;
++      list_for_each_entry_safe(rq, next, &requests, queuelist) {
++              list_del_init(&rq->queuelist);
++              rbd_handle_request(rbd_dev, rq);
++      }
++}
+ 
+-              result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
+-                                              rq->bio);
+-              if (!result)
+-                      result = rbd_img_request_submit(img_request);
+-              if (result)
+-                      rbd_img_request_put(img_request);
+-end_request:
+-              spin_lock_irq(q->queue_lock);
+-              if (result < 0) {
+-                      rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
+-                              write_request ? "write" : "read",
+-                              length, offset, result);
++/*
++ * Called with q->queue_lock held and interrupts disabled, possibly on
++ * the way to schedule().  Do not sleep here!
++ */
++static void rbd_request_fn(struct request_queue *q)
++{
++      struct rbd_device *rbd_dev = q->queuedata;
++      struct request *rq;
++      int queued = 0;
++
++      rbd_assert(rbd_dev);
+ 
+-                      __blk_end_request_all(rq, result);
++      while ((rq = blk_fetch_request(q))) {
++              /* Ignore any non-FS requests that filter through. */
++              if (rq->cmd_type != REQ_TYPE_FS) {
++                      dout("%s: non-fs request type %d\n", __func__,
++                              (int) rq->cmd_type);
++                      __blk_end_request_all(rq, 0);
++                      continue;
+               }
++
++              list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
++              queued++;
+       }
++
++      if (queued)
++              queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
+ }
+ 
+ /*
+@@ -3848,6 +3879,8 @@ static struct rbd_device *rbd_dev_create
+               return NULL;
+ 
+       spin_lock_init(&rbd_dev->lock);
++      INIT_LIST_HEAD(&rbd_dev->rq_queue);
++      INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
+       rbd_dev->flags = 0;
+       atomic_set(&rbd_dev->parent_ref, 0);
+       INIT_LIST_HEAD(&rbd_dev->node);
+@@ -5066,12 +5099,17 @@ static int rbd_dev_device_setup(struct r
+       ret = rbd_dev_mapping_set(rbd_dev);
+       if (ret)
+               goto err_out_disk;
++
+       set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+       set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
+ 
++      rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0);
++      if (!rbd_dev->rq_wq)
++              goto err_out_mapping;
++
+       ret = rbd_bus_add_dev(rbd_dev);
+       if (ret)
+-              goto err_out_mapping;
++              goto err_out_workqueue;
+ 
+       /* Everything's ready.  Announce the disk to the world. */
+ 
+@@ -5083,6 +5121,9 @@ static int rbd_dev_device_setup(struct r
+ 
+       return ret;
+ 
++err_out_workqueue:
++      destroy_workqueue(rbd_dev->rq_wq);
++      rbd_dev->rq_wq = NULL;
+ err_out_mapping:
+       rbd_dev_mapping_clear(rbd_dev);
+ err_out_disk:
+@@ -5314,6 +5355,7 @@ static void rbd_dev_device_release(struc
+ {
+       struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ 
++      destroy_workqueue(rbd_dev->rq_wq);
+       rbd_free_disk(rbd_dev);
+       clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+       rbd_dev_mapping_clear(rbd_dev);
diff --git a/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch b/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch

new file mode 100644 (file)

index 0000000..8c77842
--- /dev/null
+++ b/queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch
@@ -0,0 +1,87 @@
+From 2f0304d21867476394cd51a54e97f7273d112261 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Fri, 25 Jul 2014 09:11:33 -0500
+Subject: RDMA/iwcm: Use a default listen backlog if needed
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit 2f0304d21867476394cd51a54e97f7273d112261 upstream.
+
+If the user creates a listening cm_id with backlog of 0 the IWCM ends
+up not allowing any connection requests at all.  The correct behavior
+is for the IWCM to pick a default value if the user backlog parameter
+is zero.
+
+Lustre from version 1.8.8 onward uses a backlog of 0, which breaks
+iwarp support without this fix.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/core/iwcm.c |   27 +++++++++++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+--- a/drivers/infiniband/core/iwcm.c
++++ b/drivers/infiniband/core/iwcm.c
+@@ -46,6 +46,7 @@
+ #include <linux/completion.h>
+ #include <linux/slab.h>
+ #include <linux/module.h>
++#include <linux/sysctl.h>
+ 
+ #include <rdma/iw_cm.h>
+ #include <rdma/ib_addr.h>
+@@ -65,6 +66,20 @@ struct iwcm_work {
+       struct list_head free_list;
+ };
+ 
++static unsigned int default_backlog = 256;
++
++static struct ctl_table_header *iwcm_ctl_table_hdr;
++static struct ctl_table iwcm_ctl_table[] = {
++      {
++              .procname       = "default_backlog",
++              .data           = &default_backlog,
++              .maxlen         = sizeof(default_backlog),
++              .mode           = 0644,
++              .proc_handler   = proc_dointvec,
++      },
++      { }
++};
++
+ /*
+  * The following services provide a mechanism for pre-allocating iwcm_work
+  * elements.  The design pre-allocates them  based on the cm_id type:
+@@ -425,6 +440,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id,
+ 
+       cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ 
++      if (!backlog)
++              backlog = default_backlog;
++
+       ret = alloc_work_entries(cm_id_priv, backlog);
+       if (ret)
+               return ret;
+@@ -1030,11 +1048,20 @@ static int __init iw_cm_init(void)
+       if (!iwcm_wq)
+               return -ENOMEM;
+ 
++      iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
++                                               iwcm_ctl_table);
++      if (!iwcm_ctl_table_hdr) {
++              pr_err("iw_cm: couldn't register sysctl paths\n");
++              destroy_workqueue(iwcm_wq);
++              return -ENOMEM;
++      }
++
+       return 0;
+ }
+ 
+ static void __exit iw_cm_cleanup(void)
+ {
++      unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+       destroy_workqueue(iwcm_wq);
+ }
+ 
diff --git a/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch b/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch

new file mode 100644 (file)

index 0000000..7bf9351
--- /dev/null
+++ b/queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch
@@ -0,0 +1,34 @@
+From db1044d458a287c18c4d413adc4ad12e92e253b5 Mon Sep 17 00:00:00 2001
+From: Doug Ledford <dledford@redhat.com>
+Date: Tue, 12 Aug 2014 19:20:11 -0400
+Subject: RDMA/uapi: Include socket.h in rdma_user_cm.h
+
+From: Doug Ledford <dledford@redhat.com>
+
+commit db1044d458a287c18c4d413adc4ad12e92e253b5 upstream.
+
+Commit ee7aed4528f added struct sockaddr_storage to rdma_user_cm.h
+without also adding an include for linux/socket.h to make sure it is
+defined.  Systemtap needs the header files to build standalone and
+cannot rely on other files to pre-include other headers, so add
+linux/socket.h to the list of includes in this file.
+
+Fixes: ee7aed4528f ("RDMA/ucma: Support querying for AF_IB addresses")
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/uapi/rdma/rdma_user_cm.h |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/uapi/rdma/rdma_user_cm.h
++++ b/include/uapi/rdma/rdma_user_cm.h
+@@ -34,6 +34,7 @@
+ #define RDMA_USER_CM_H
+ 
+ #include <linux/types.h>
++#include <linux/socket.h>
+ #include <linux/in6.h>
+ #include <rdma/ib_user_verbs.h>
+ #include <rdma/ib_user_sa.h>
diff --git a/queue-3.16/series b/queue-3.16/series

index 6a9d97e263f96cd19a35df62bf35c40538ac04ac..c7732b4128363b2de0fe645f47e1e70abd4aaf06 100644 (file)
--- a/queue-3.16/series
+++ b/queue-3.16/series
@@ -113,3 +113,24 @@ mnt-change-the-default-remount-atime-from-relatime-to-the-existing-value.patch
  mnt-add-tests-for-unprivileged-remount-cases-that-have-found-to-be-faulty.patch
  get-rid-of-propagate_umount-mistakenly-treating-slaves-as-busy.patch
  fix-ebusy-on-umount-from-mnt_shrinkable.patch
+bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch
+bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch
+bluetooth-fix-tracking-local-ssp-authentication-requirement.patch
+bluetooth-never-linger-on-process-exit.patch
+bluetooth-fix-using-uninitialized-variable-when-pairing.patch
+bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch
+__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch
+rbd-rework-rbd_request_fn.patch
+fix-copy_tree-regression.patch
+md-raid1-raid10-always-abort-recover-on-write-error.patch
+md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch
+md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch
+md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch
+md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch
+rdma-iwcm-use-a-default-listen-backlog-if-needed.patch
+rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch
+xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch
+xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch
+xfs-don-t-dirty-buffers-beyond-eof.patch
+xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch
+xfs-don-t-zero-partial-page-cache-pages-during.patch
diff --git a/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch b/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch

new file mode 100644 (file)

index 0000000..94340e4
--- /dev/null
+++ b/queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch
@@ -0,0 +1,130 @@
+From 22e757a49cf010703fcb9c9b4ef793248c39b0c2 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Tue, 2 Sep 2014 12:12:51 +1000
+Subject: xfs: don't dirty buffers beyond EOF
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 22e757a49cf010703fcb9c9b4ef793248c39b0c2 upstream.
+
+generic/263 is failing fsx at this point with a page spanning
+EOF that cannot be invalidated. The operations are:
+
+1190 mapwrite   0x52c00 thru    0x5e569 (0xb96a bytes)
+1191 mapread    0x5c000 thru    0x5d636 (0x1637 bytes)
+1192 write      0x5b600 thru    0x771ff (0x1bc00 bytes)
+
+where 1190 extends EOF from 0x54000 to 0x5e569. When the direct IO
+write attempts to invalidate the cached page over this range, it
+fails with -EBUSY and so any attempt to do page invalidation fails.
+
+The real question is this: Why can't that page be invalidated after
+it has been written to disk and cleaned?
+
+Well, there's data on the first two buffers in the page (1k block
+size, 4k page), but the third buffer on the page (i.e. beyond EOF)
+is failing drop_buffers because it's bh->b_state == 0x3, which is
+BH_Uptodate | BH_Dirty.  IOWs, there are dirty buffers beyond EOF. Say
+what?
+
+OK, set_buffer_dirty() is called on all buffers from
+__set_page_buffers_dirty(), regardless of whether the buffer is
+beyond EOF or not, which means that when we get to ->writepage,
+we have buffers marked dirty beyond EOF that we need to clean.
+So, we need to implement our own .set_page_dirty method that
+doesn't dirty buffers beyond EOF.
+
+This is messy because the buffer code is not meant to be shared
+and it has interesting locking issues on the buffer dirty bits.
+So just copy and paste it and then modify it to suit what we need.
+
+Note: the solution the other filesystems and generic block code use,
+marking the buffers clean in ->writepage, does not work for XFS.
+It still leaves dirty buffers beyond EOF and invalidations still
+fail. Hence rather than play whack-a-mole, this patch simply
+prevents those buffers from being dirtied in the first place.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c |   61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 61 insertions(+)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -1753,11 +1753,72 @@ xfs_vm_readpages(
+       return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+ }
+ 
++/*
++ * This is basically a copy of __set_page_dirty_buffers() with one
++ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
++ * dirty, we'll never be able to clean them because we don't write buffers
++ * beyond EOF, and that means we can't invalidate pages that span EOF
++ * that have been marked dirty. Further, the dirty state can leak into
++ * the file interior if the file is extended, resulting in all sorts of
++ * bad things happening as the state does not match the underlying data.
++ *
++ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
++ * this only exist because of bufferheads and how the generic code manages them.
++ */
++STATIC int
++xfs_vm_set_page_dirty(
++      struct page             *page)
++{
++      struct address_space    *mapping = page->mapping;
++      struct inode            *inode = mapping->host;
++      loff_t                  end_offset;
++      loff_t                  offset;
++      int                     newly_dirty;
++
++      if (unlikely(!mapping))
++              return !TestSetPageDirty(page);
++
++      end_offset = i_size_read(inode);
++      offset = page_offset(page);
++
++      spin_lock(&mapping->private_lock);
++      if (page_has_buffers(page)) {
++              struct buffer_head *head = page_buffers(page);
++              struct buffer_head *bh = head;
++
++              do {
++                      if (offset < end_offset)
++                              set_buffer_dirty(bh);
++                      bh = bh->b_this_page;
++                      offset += 1 << inode->i_blkbits;
++              } while (bh != head);
++      }
++      newly_dirty = !TestSetPageDirty(page);
++      spin_unlock(&mapping->private_lock);
++
++      if (newly_dirty) {
++              /* sigh - __set_page_dirty() is static, so copy it here, too */
++              unsigned long flags;
++
++              spin_lock_irqsave(&mapping->tree_lock, flags);
++              if (page->mapping) {    /* Race with truncate? */
++                      WARN_ON_ONCE(!PageUptodate(page));
++                      account_page_dirtied(page, mapping);
++                      radix_tree_tag_set(&mapping->page_tree,
++                                      page_index(page), PAGECACHE_TAG_DIRTY);
++              }
++              spin_unlock_irqrestore(&mapping->tree_lock, flags);
++              __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
++      }
++      return newly_dirty;
++}
++
+ const struct address_space_operations xfs_address_space_operations = {
+       .readpage               = xfs_vm_readpage,
+       .readpages              = xfs_vm_readpages,
+       .writepage              = xfs_vm_writepage,
+       .writepages             = xfs_vm_writepages,
++      .set_page_dirty         = xfs_vm_set_page_dirty,
+       .releasepage            = xfs_vm_releasepage,
+       .invalidatepage         = xfs_vm_invalidatepage,
+       .write_begin            = xfs_vm_write_begin,
diff --git a/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch

new file mode 100644 (file)

index 0000000..e8070af
--- /dev/null
+++ b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch
@@ -0,0 +1,47 @@
+From 834ffca6f7e345a79f6f2e2d131b0dfba8a4b67a Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Tue, 2 Sep 2014 12:12:52 +1000
+Subject: xfs: don't zero partial page cache pages during O_DIRECT writes
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 834ffca6f7e345a79f6f2e2d131b0dfba8a4b67a upstream.
+
+Similar to direct IO reads, direct IO writes are using
+truncate_pagecache_range to invalidate the page cache. This is
+incorrect due to the sub-block zeroing in the page cache that
+truncate_pagecache_range() triggers.
+
+This patch fixes things by using invalidate_inode_pages2_range
+instead.  It preserves the page cache invalidation, but won't zero
+any pages.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -634,7 +634,15 @@ xfs_file_dio_aio_write(
+                                                   pos, -1);
+               if (ret)
+                       goto out;
+-              truncate_pagecache_range(VFS_I(ip), pos, -1);
++              /*
++               * Invalidate whole pages. This can return an error if
++               * we fail to invalidate a page, but this should never
++               * happen on XFS. Warn if it does fail.
++               */
++              ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
++                                              pos >> PAGE_CACHE_SHIFT, -1);
++              WARN_ON_ONCE(ret);
++              ret = 0;
+       }
+ 
+       /*
diff --git a/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch

new file mode 100644 (file)

index 0000000..f110984
--- /dev/null
+++ b/queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch
@@ -0,0 +1,59 @@
+From 85e584da3212140ee80fd047f9058bbee0bc00d5 Mon Sep 17 00:00:00 2001
+From: Chris Mason <clm@fb.com>
+Date: Tue, 2 Sep 2014 12:12:52 +1000
+Subject: xfs: don't zero partial page cache pages during
+ O_DIRECT writes
+
+From: Chris Mason <clm@fb.com>
+
+commit 85e584da3212140ee80fd047f9058bbee0bc00d5 upstream.
+
+xfs is using truncate_pagecache_range to invalidate the page cache
+during DIO reads.  This is different from the other filesystems who
+only invalidate pages during DIO writes.
+
+truncate_pagecache_range is meant to be used when we are freeing the
+underlying data structs from disk, so it will zero any partial
+ranges in the page.  This means a DIO read can zero out part of the
+page cache page, and it is possible the page will stay in cache.
+
+buffered reads will find an up to date page with zeros instead of
+the data actually on disk.
+
+This patch fixes things by using invalidate_inode_pages2_range
+instead.  It preserves the page cache invalidation, but won't zero
+any pages.
+
+[dchinner: catch error and warn if it fails. Comment.]
+
+Signed-off-by: Chris Mason <clm@fb.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_file.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -295,7 +295,16 @@ xfs_file_read_iter(
+                               xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+                               return ret;
+                       }
+-                      truncate_pagecache_range(VFS_I(ip), pos, -1);
++
++                      /*
++                       * Invalidate whole pages. This can return an error if
++                       * we fail to invalidate a page, but this should never
++                       * happen on XFS. Warn if it does fail.
++                       */
++                      ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
++                                              pos >> PAGE_CACHE_SHIFT, -1);
++                      WARN_ON_ONCE(ret);
++                      ret = 0;
+               }
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+       }
diff --git a/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch b/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch

new file mode 100644 (file)

index 0000000..2209e4e
--- /dev/null
+++ b/queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch
@@ -0,0 +1,159 @@
+From 67dc288c21064b31a98a53dc64f6b9714b819fd6 Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Mon, 4 Aug 2014 12:43:06 +1000
+Subject: xfs: ensure verifiers are attached to recovered buffers
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 67dc288c21064b31a98a53dc64f6b9714b819fd6 upstream.
+
+Crash testing of CRC enabled filesystems has resulted in a number of
+reports of bad CRCs being detected after the filesystem was mounted.
+Errors such as the following were being seen:
+
+XFS (sdb3): Mounting V5 Filesystem
+XFS (sdb3): Starting recovery (logdev: internal)
+XFS (sdb3): Metadata CRC error detected at xfs_agf_read_verify+0x5a/0x100 [xfs], block 0x1
+XFS (sdb3): Unmount and run xfs_repair
+XFS (sdb3): First 64 bytes of corrupted metadata buffer:
+ffff880136ffd600: 58 41 47 46 00 00 00 01 00 00 00 00 00 0f aa 40  XAGF...........@
+ffff880136ffd610: 00 02 6d 53 00 02 77 f8 00 00 00 00 00 00 00 01  ..mS..w.........
+ffff880136ffd620: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 03  ................
+ffff880136ffd630: 00 00 00 04 00 08 81 d0 00 08 81 a7 00 00 00 00  ................
+XFS (sdb3): metadata I/O error: block 0x1 ("xfs_trans_read_buf_map") error 74 numblks 1
+
+The errors were typically being seen in AGF, AGI and their related
+btree block buffers some time after log recovery had run. Often it
+wasn't until later subsequent mounts that the problem was
+discovered. The common symptom was a buffer with the correct
+contents, but a CRC and an LSN that matched an older version of the
+contents.
+
+Some debug added to _xfs_buf_ioapply() indicated that buffers were
+being written without verifiers attached to them from log recovery,
+and Jan Kara isolated the cause to log recovery readahead and its
+interactions with buffers that had a more recent LSN on disk than
+the transaction being recovered. In this case, the buffer did not
+get a verifier attached, and so when the second phase of log
+recovery ran and recovered EFIs and unlinked inodes, the buffers
+were modified and written without the verifier running. Hence they
+had up to date contents, but stale LSNs and CRCs.
+
+Fix it by attaching verifiers to buffers we skip due to future LSN
+values so they don't escape into the buffer cache without the
+correct verifier attached.
+
+This patch is based on analysis and a patch from Jan Kara.
+
+Reported-by: Jan Kara <jack@suse.cz>
+Reported-by: Fanael Linithien <fanael4@gmail.com>
+Reported-by: Grozdan <neutrino8@gmail.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_log_recover.c |   51 ++++++++++++++++++++++++++++-------------------
+ 1 file changed, 31 insertions(+), 20 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -2125,6 +2125,17 @@ xlog_recover_validate_buf_type(
+       __uint16_t              magic16;
+       __uint16_t              magicda;
+ 
++      /*
++       * We can only do post recovery validation on items on CRC enabled
++       * fielsystems as we need to know when the buffer was written to be able
++       * to determine if we should have replayed the item. If we replay old
++       * metadata over a newer buffer, then it will enter a temporarily
++       * inconsistent state resulting in verification failures. Hence for now
++       * just avoid the verification stage for non-crc filesystems
++       */
++      if (!xfs_sb_version_hascrc(&mp->m_sb))
++              return;
++
+       magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+       magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+       magicda = be16_to_cpu(info->magic);
+@@ -2162,8 +2173,6 @@ xlog_recover_validate_buf_type(
+               bp->b_ops = &xfs_agf_buf_ops;
+               break;
+       case XFS_BLFT_AGFL_BUF:
+-              if (!xfs_sb_version_hascrc(&mp->m_sb))
+-                      break;
+               if (magic32 != XFS_AGFL_MAGIC) {
+                       xfs_warn(mp, "Bad AGFL block magic!");
+                       ASSERT(0);
+@@ -2196,10 +2205,6 @@ xlog_recover_validate_buf_type(
+ #endif
+               break;
+       case XFS_BLFT_DINO_BUF:
+-              /*
+-               * we get here with inode allocation buffers, not buffers that
+-               * track unlinked list changes.
+-               */
+               if (magic16 != XFS_DINODE_MAGIC) {
+                       xfs_warn(mp, "Bad INODE block magic!");
+                       ASSERT(0);
+@@ -2279,8 +2284,6 @@ xlog_recover_validate_buf_type(
+               bp->b_ops = &xfs_attr3_leaf_buf_ops;
+               break;
+       case XFS_BLFT_ATTR_RMT_BUF:
+-              if (!xfs_sb_version_hascrc(&mp->m_sb))
+-                      break;
+               if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+                       xfs_warn(mp, "Bad attr remote magic!");
+                       ASSERT(0);
+@@ -2387,16 +2390,7 @@ xlog_recover_do_reg_buffer(
+       /* Shouldn't be any more regions */
+       ASSERT(i == item->ri_total);
+ 
+-      /*
+-       * We can only do post recovery validation on items on CRC enabled
+-       * fielsystems as we need to know when the buffer was written to be able
+-       * to determine if we should have replayed the item. If we replay old
+-       * metadata over a newer buffer, then it will enter a temporarily
+-       * inconsistent state resulting in verification failures. Hence for now
+-       * just avoid the verification stage for non-crc filesystems
+-       */
+-      if (xfs_sb_version_hascrc(&mp->m_sb))
+-              xlog_recover_validate_buf_type(mp, bp, buf_f);
++      xlog_recover_validate_buf_type(mp, bp, buf_f);
+ }
+ 
+ /*
+@@ -2504,12 +2498,29 @@ xlog_recover_buffer_pass2(
+       }
+ 
+       /*
+-       * recover the buffer only if we get an LSN from it and it's less than
++       * Recover the buffer only if we get an LSN from it and it's less than
+        * the lsn of the transaction we are replaying.
++       *
++       * Note that we have to be extremely careful of readahead here.
++       * Readahead does not attach verfiers to the buffers so if we don't
++       * actually do any replay after readahead because of the LSN we found
++       * in the buffer if more recent than that current transaction then we
++       * need to attach the verifier directly. Failure to do so can lead to
++       * future recovery actions (e.g. EFI and unlinked list recovery) can
++       * operate on the buffers and they won't get the verifier attached. This
++       * can lead to blocks on disk having the correct content but a stale
++       * CRC.
++       *
++       * It is safe to assume these clean buffers are currently up to date.
++       * If the buffer is dirtied by a later transaction being replayed, then
++       * the verifier will be reset to match whatever recover turns that
++       * buffer into.
+        */
+       lsn = xlog_recover_get_buf_lsn(mp, bp);
+-      if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
++      if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
++              xlog_recover_validate_buf_type(mp, bp, buf_f);
+               goto out_release;
++      }
+ 
+       if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+               error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
diff --git a/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch b/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch

new file mode 100644 (file)

index 0000000..b4bfceb
--- /dev/null
+++ b/queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch
@@ -0,0 +1,110 @@
+From 5fd364fee81a7888af806e42ed8a91c845894f2d Mon Sep 17 00:00:00 2001
+From: Dave Chinner <dchinner@redhat.com>
+Date: Mon, 4 Aug 2014 12:43:26 +1000
+Subject: xfs: quotacheck leaves dquot buffers without verifiers
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 5fd364fee81a7888af806e42ed8a91c845894f2d upstream.
+
+When running xfs/305, I noticed that quotacheck was flushing dquot
+buffers that did not have the xfs_dquot_buf_ops verifiers attached:
+
+XFS (vdb): _xfs_buf_ioapply: no ops on block 0x1dc8/0x1dc8
+ffff880052489000: 44 51 01 04 00 00 65 b8 00 00 00 00 00 00 00 00  DQ....e.........
+ffff880052489010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+ffff880052489020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+ffff880052489030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+CPU: 1 PID: 2376 Comm: mount Not tainted 3.16.0-rc2-dgc+ #306
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+ ffff88006fe38000 ffff88004a0ffae8 ffffffff81cf1cca 0000000000000001
+ ffff88004a0ffb88 ffffffff814d50ca 000010004a0ffc70 0000000000000000
+ ffff88006be56dc4 0000000000000021 0000000000001dc8 ffff88007c773d80
+Call Trace:
+ [<ffffffff81cf1cca>] dump_stack+0x45/0x56
+ [<ffffffff814d50ca>] _xfs_buf_ioapply+0x3ca/0x3d0
+ [<ffffffff810db520>] ? wake_up_state+0x20/0x20
+ [<ffffffff814d51f5>] ? xfs_bdstrat_cb+0x55/0xb0
+ [<ffffffff814d513b>] xfs_buf_iorequest+0x6b/0xd0
+ [<ffffffff814d51f5>] xfs_bdstrat_cb+0x55/0xb0
+ [<ffffffff814d53ab>] __xfs_buf_delwri_submit+0x15b/0x220
+ [<ffffffff814d6040>] ? xfs_buf_delwri_submit+0x30/0x90
+ [<ffffffff814d6040>] xfs_buf_delwri_submit+0x30/0x90
+ [<ffffffff8150f89d>] xfs_qm_quotacheck+0x17d/0x3c0
+ [<ffffffff81510591>] xfs_qm_mount_quotas+0x151/0x1e0
+ [<ffffffff814ed01c>] xfs_mountfs+0x56c/0x7d0
+ [<ffffffff814f0f12>] xfs_fs_fill_super+0x2c2/0x340
+ [<ffffffff811c9fe4>] mount_bdev+0x194/0x1d0
+ [<ffffffff814f0c50>] ? xfs_finish_flags+0x170/0x170
+ [<ffffffff814ef0f5>] xfs_fs_mount+0x15/0x20
+ [<ffffffff811ca8c9>] mount_fs+0x39/0x1b0
+ [<ffffffff811e4d67>] vfs_kern_mount+0x67/0x120
+ [<ffffffff811e757e>] do_mount+0x23e/0xad0
+ [<ffffffff8117abde>] ? __get_free_pages+0xe/0x50
+ [<ffffffff811e71e6>] ? copy_mount_options+0x36/0x150
+ [<ffffffff811e8103>] SyS_mount+0x83/0xc0
+ [<ffffffff81cfd40b>] tracesys+0xdd/0xe2
+
+This was caused by dquot buffer readahead not attaching a verifier
+structure to the buffer when readahead was issued, resulting in the
+followup read of the buffer finding a valid buffer and so not
+attaching new verifiers to the buffer as part of the read.
+
+Also, when a verifier failure occurs, we then read the buffer
+without verifiers. Attach the verifiers manually after this read so
+that if the buffer is then written it will be verified that the
+corruption has been repaired.
+
+Further, when flushing a dquot we don't ask for a verifier when
+reading in the dquot buffer the dquot belongs to. Most of the time
+this isn't an issue because the buffer is still cached, but when it
+is not cached it will result in writing the dquot buffer without
+having the verifier attached.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_dquot.c |    3 ++-
+ fs/xfs/xfs_qm.c    |    8 +++++++-
+ 2 files changed, 9 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_dquot.c
++++ b/fs/xfs/xfs_dquot.c
+@@ -974,7 +974,8 @@ xfs_qm_dqflush(
+        * Get the buffer containing the on-disk dquot
+        */
+       error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+-                                 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
++                                 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
++                                 &xfs_dquot_buf_ops);
+       if (error)
+               goto out_unlock;
+ 
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1005,6 +1005,12 @@ xfs_qm_dqiter_bufs(
+               if (error)
+                       break;
+ 
++              /*
++               * A corrupt buffer might not have a verifier attached, so
++               * make sure we have the correct one attached before writeback
++               * occurs.
++               */
++              bp->b_ops = &xfs_dquot_buf_ops;
+               xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+               xfs_buf_delwri_queue(bp, buffer_list);
+               xfs_buf_relse(bp);
+@@ -1090,7 +1096,7 @@ xfs_qm_dqiterate(
+                                       xfs_buf_readahead(mp->m_ddev_targp,
+                                              XFS_FSB_TO_DADDR(mp, rablkno),
+                                              mp->m_quotainfo->qi_dqchunklen,
+-                                             NULL);
++                                             &xfs_dquot_buf_ops);
+                                       rablkno++;
+                               }
+                       }
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 13 Sep 2014 01:49:17 +0000 (18:49 -0700)
queue-3.16/__generic_file_write_iter-fix-handling-of-sync-error-after-dio.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-avoid-use-of-session-socket-after-the-session-gets-freed.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-btmrvl-wait-for-host_sleep_enable-event-in-suspend.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-fix-merge-of-advertising-data-and-scan-response-data.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-fix-tracking-local-ssp-authentication-requirement.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-fix-using-uninitialized-variable-when-pairing.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/bluetooth-never-linger-on-process-exit.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/fix-copy_tree-regression.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/md-raid1-raid10-always-abort-recover-on-write-error.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/md-raid10-fix-memory-leak-when-raid10-reshape-completes.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/md-raid10-fix-memory-leak-when-reshaping-a-raid10.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/md-raid5-avoid-livelock-caused-by-non-aligned-writes.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/md-raid6-avoid-data-corruption-during-recovery-of-double-degraded-raid6.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/rbd-rework-rbd_request_fn.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/rdma-iwcm-use-a-default-listen-backlog-if-needed.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/rdma-uapi-include-socket.h-in-rdma_user_cm.h.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/series		patch \| blob \| blame \| history
queue-3.16/xfs-don-t-dirty-buffers-beyond-eof.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during-o_direct-writes.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/xfs-don-t-zero-partial-page-cache-pages-during.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/xfs-ensure-verifiers-are-attached-to-recovered-buffers.patch	[new file with mode: 0644]	patch \| blob
queue-3.16/xfs-quotacheck-leaves-dquot-buffers-without-verifiers.patch	[new file with mode: 0644]	patch \| blob