--- /dev/null
+From 09dc9cd6528f5b52bcbd3292a6312e762c85260f Mon Sep 17 00:00:00 2001
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Date: Thu, 7 Jan 2016 16:44:10 -0500
+Subject: IB/qib: fix mcast detach when qp not attached
+
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+
+commit 09dc9cd6528f5b52bcbd3292a6312e762c85260f upstream.
+
+Detaching a QP from a multicast group it is not attached to produces the following trace:
+
+[1750924.419007] general protection fault: 0000 [#3] SMP
+[1750924.420364] Modules linked in: nfnetlink autofs4 rpcsec_gss_krb5 nfsv4
+dcdbas rfcomm bnep bluetooth nfsd auth_rpcgss nfs_acl dm_multipath nfs lockd
+scsi_dh sunrpc fscache radeon ttm drm_kms_helper drm serio_raw parport_pc
+ppdev i2c_algo_bit lpc_ich ipmi_si ib_mthca ib_qib dca lp parport ib_ipoib
+mac_hid ib_cm i3000_edac ib_sa ib_uverbs edac_core ib_umad ib_mad ib_core
+ib_addr tg3 ptp dm_mirror dm_region_hash dm_log psmouse pps_core
+[1750924.420364] CPU: 1 PID: 8401 Comm: python Tainted: G D
+3.13.0-39-generic #66-Ubuntu
+[1750924.420364] Hardware name: Dell Computer Corporation PowerEdge
+860/0XM089, BIOS A04 07/24/2007
+[1750924.420364] task: ffff8800366a9800 ti: ffff88007af1c000 task.ti:
+ffff88007af1c000
+[1750924.420364] RIP: 0010:[<ffffffffa0131d51>] [<ffffffffa0131d51>]
+qib_mcast_qp_free+0x11/0x50 [ib_qib]
+[1750924.420364] RSP: 0018:ffff88007af1dd70 EFLAGS: 00010246
+[1750924.420364] RAX: 0000000000000001 RBX: ffff88007b822688 RCX:
+000000000000000f
+[1750924.420364] RDX: ffff88007b822688 RSI: ffff8800366c15a0 RDI:
+6764697200000000
+[1750924.420364] RBP: ffff88007af1dd78 R08: 0000000000000001 R09:
+0000000000000000
+[1750924.420364] R10: 0000000000000011 R11: 0000000000000246 R12:
+ffff88007baa1d98
+[1750924.420364] R13: ffff88003ecab000 R14: ffff88007b822660 R15:
+0000000000000000
+[1750924.420364] FS: 00007ffff7fd8740(0000) GS:ffff88007fc80000(0000)
+knlGS:0000000000000000
+[1750924.420364] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[1750924.420364] CR2: 00007ffff597c750 CR3: 000000006860b000 CR4:
+00000000000007e0
+[1750924.420364] Stack:
+[1750924.420364] ffff88007b822688 ffff88007af1ddf0 ffffffffa0132429
+000000007af1de20
+[1750924.420364] ffff88007baa1dc8 ffff88007baa0000 ffff88007af1de70
+ffffffffa00cb313
+[1750924.420364] 00007fffffffde88 0000000000000000 0000000000000008
+ffff88003ecab000
+[1750924.420364] Call Trace:
+[1750924.420364] [<ffffffffa0132429>] qib_multicast_detach+0x1e9/0x350
+[ib_qib]
+[1750924.568035] [<ffffffffa00cb313>] ? ib_uverbs_modify_qp+0x323/0x3d0
+[ib_uverbs]
+[1750924.568035] [<ffffffffa0092d61>] ib_detach_mcast+0x31/0x50 [ib_core]
+[1750924.568035] [<ffffffffa00cc213>] ib_uverbs_detach_mcast+0x93/0x170
+[ib_uverbs]
+[1750924.568035] [<ffffffffa00c61f6>] ib_uverbs_write+0xc6/0x2c0 [ib_uverbs]
+[1750924.568035] [<ffffffff81312e68>] ? apparmor_file_permission+0x18/0x20
+[1750924.568035] [<ffffffff812d4cd3>] ? security_file_permission+0x23/0xa0
+[1750924.568035] [<ffffffff811bd214>] vfs_write+0xb4/0x1f0
+[1750924.568035] [<ffffffff811bdc49>] SyS_write+0x49/0xa0
+[1750924.568035] [<ffffffff8172f7ed>] system_call_fastpath+0x1a/0x1f
+[1750924.568035] Code: 66 2e 0f 1f 84 00 00 00 00 00 31 c0 5d c3 66 2e 0f 1f
+84 00 00 00 00 00 66 90 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb 48 8b 7f 10
+<f0> ff 8f 40 01 00 00 74 0e 48 89 df e8 8e f8 06 e1 5b 5d c3 0f
+[1750924.568035] RIP [<ffffffffa0131d51>] qib_mcast_qp_free+0x11/0x50
+[ib_qib]
+[1750924.568035] RSP <ffff88007af1dd70>
+[1750924.650439] ---[ end trace 73d5d4b3f8ad4851 ]
+
+The fix is to note the qib_mcast_qp that was found. If none is found, then
+return -EINVAL to indicate the error.
+
+Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Reported-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/qib/qib_verbs_mcast.c | 35 ++++++++++++----------------
+ 1 file changed, 15 insertions(+), 20 deletions(-)
+
+--- a/drivers/infiniband/hw/qib/qib_verbs_mcast.c
++++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
+@@ -286,15 +286,13 @@ int qib_multicast_detach(struct ib_qp *i
+ struct qib_ibdev *dev = to_idev(ibqp->device);
+ struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+ struct qib_mcast *mcast = NULL;
+- struct qib_mcast_qp *p, *tmp;
++ struct qib_mcast_qp *p, *tmp, *delp = NULL;
+ struct rb_node *n;
+ int last = 0;
+ int ret;
+
+- if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+- ret = -EINVAL;
+- goto bail;
+- }
++ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET)
++ return -EINVAL;
+
+ spin_lock_irq(&ibp->lock);
+
+@@ -303,8 +301,7 @@ int qib_multicast_detach(struct ib_qp *i
+ while (1) {
+ if (n == NULL) {
+ spin_unlock_irq(&ibp->lock);
+- ret = -EINVAL;
+- goto bail;
++ return -EINVAL;
+ }
+
+ mcast = rb_entry(n, struct qib_mcast, rb_node);
+@@ -328,6 +325,7 @@ int qib_multicast_detach(struct ib_qp *i
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
++ delp = p;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+@@ -338,15 +336,16 @@ int qib_multicast_detach(struct ib_qp *i
+ }
+
+ spin_unlock_irq(&ibp->lock);
++ /* QP not attached */
++ if (!delp)
++ return -EINVAL;
++ /*
++ * Wait for any list walkers to finish before freeing the
++ * list element.
++ */
++ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
++ qib_mcast_qp_free(delp);
+
+- if (p) {
+- /*
+- * Wait for any list walkers to finish before freeing the
+- * list element.
+- */
+- wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+- qib_mcast_qp_free(p);
+- }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+@@ -355,11 +354,7 @@ int qib_multicast_detach(struct ib_qp *i
+ dev->n_mcast_grps_allocated--;
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
+ }
+-
+- ret = 0;
+-
+-bail:
+- return ret;
++ return 0;
+ }
+
+ int qib_mcast_tree_empty(struct qib_ibport *ibp)
--- /dev/null
+From 67645d7619738e51c668ca69f097cb90b5470422 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Mon, 28 Dec 2015 13:18:34 +0300
+Subject: libceph: fix ceph_msg_revoke()
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 67645d7619738e51c668ca69f097cb90b5470422 upstream.
+
+There are a number of problems with revoking a "was sending" message:
+
+(1) We never make any attempt to revoke data - only kvecs contribute to
+con->out_skip. However, once the header (envelope) is written to the
+socket, our peer learns data_len and sets itself to expect at least
+data_len bytes to follow front or front+middle. If ceph_msg_revoke()
+is called while the messenger is sending message's data portion,
+anything we send after that call is counted by the OSD towards the now
+revoked message's data portion. The effects vary, the most common one
+is the eventual hang - higher layers get stuck waiting for the reply to
+the message that was sent out after ceph_msg_revoke() returned and
+treated by the OSD as a bunch of data bytes. This is what Matt ran
+into.
+
+(2) Flat out zeroing con->out_kvec_bytes worth of bytes to handle kvecs
+is wrong. If ceph_msg_revoke() is called before the tag is sent out or
+while the messenger is sending the header, we will get a connection
+reset, either due to a bad tag (0 is not a valid tag) or a bad header
+CRC, which kind of defeats the purpose of revoke. Currently the kernel
+client refuses to work with header CRCs disabled, but that will likely
+change in the future, making this even worse.
+
+(3) con->out_skip is not reset on connection reset, leading to one or
+more spurious connection resets if we happen to get a real one between
+the time con->out_skip is set in ceph_msg_revoke() and the time it's
+cleared in write_partial_skip().
+
+Fixing (1) and (3) is trivial. The idea behind fixing (2) is to never
+zero the tag or the header, i.e. send out tag+header regardless of when
+ceph_msg_revoke() is called. That way the header is always correct, no
+unnecessary resets are induced and revoke stands ready for disabled
+CRCs. Since ceph_msg_revoke() rips out con->out_msg, introduce a new
+"message out temp" and copy the header into it before sending.
+
+Reported-by: Matt Conner <matt.conner@keepertech.com>
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Tested-by: Matt Conner <matt.conner@keepertech.com>
+Reviewed-by: Sage Weil <sage@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/ceph/messenger.h | 2 -
+ net/ceph/messenger.c | 78 +++++++++++++++++++++++++++++++----------
+ 2 files changed, 60 insertions(+), 20 deletions(-)
+
+--- a/include/linux/ceph/messenger.h
++++ b/include/linux/ceph/messenger.h
+@@ -216,6 +216,7 @@ struct ceph_connection {
+ struct ceph_entity_addr actual_peer_addr;
+
+ /* message out temps */
++ struct ceph_msg_header out_hdr;
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+ bool out_msg_done;
+@@ -225,7 +226,6 @@ struct ceph_connection {
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+- bool out_kvec_is_msg; /* kvec refers to out_msg */
+ int out_more; /* there is more data after the kvecs */
+ __le64 out_temp_ack; /* for writing an ack */
+
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -665,6 +665,8 @@ static void reset_connection(struct ceph
+ }
+ con->in_seq = 0;
+ con->in_seq_acked = 0;
++
++ con->out_skip = 0;
+ }
+
+ /*
+@@ -764,6 +766,8 @@ static u32 get_global_seq(struct ceph_me
+
+ static void con_out_kvec_reset(struct ceph_connection *con)
+ {
++ BUG_ON(con->out_skip);
++
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_cur = &con->out_kvec[0];
+@@ -772,9 +776,9 @@ static void con_out_kvec_reset(struct ce
+ static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+ {
+- int index;
++ int index = con->out_kvec_left;
+
+- index = con->out_kvec_left;
++ BUG_ON(con->out_skip);
+ BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+
+ con->out_kvec[index].iov_len = size;
+@@ -783,6 +787,27 @@ static void con_out_kvec_add(struct ceph
+ con->out_kvec_bytes += size;
+ }
+
++/*
++ * Drop the last queued kvec of con->out_kvec instead of sending it.
++ * Return that kvec's iov_len, i.e. how many bytes would have been
++ * written for it, or 0 if con->out_kvec_bytes is not positive.
++ */
++static int con_out_kvec_skip(struct ceph_connection *con)
++{
++ int off = con->out_kvec_cur - con->out_kvec;
++ int skip = 0;
++
++ if (con->out_kvec_bytes > 0) {
++ skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
++ BUG_ON(con->out_kvec_bytes < skip);
++ BUG_ON(!con->out_kvec_left);
++ con->out_kvec_bytes -= skip;
++ con->out_kvec_left--;
++ }
++
++ return skip;
++}
++
+ #ifdef CONFIG_BLOCK
+
+ /*
+@@ -1184,7 +1209,6 @@ static void prepare_write_message_footer
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+- con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+@@ -1202,7 +1226,6 @@ static void prepare_write_message(struct
+ u32 crc;
+
+ con_out_kvec_reset(con);
+- con->out_kvec_is_msg = true;
+ con->out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+@@ -1242,18 +1265,19 @@ static void prepare_write_message(struct
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+- con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
++ con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+- /* fill in crc (except data pages), footer */
++ /* fill in hdr crc and finalize hdr */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ con->out_msg->hdr.crc = cpu_to_le32(crc);
+- con->out_msg->footer.flags = 0;
++ memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+
++ /* fill in front and middle crc, footer */
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ con->out_msg->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+@@ -1265,6 +1289,7 @@ static void prepare_write_message(struct
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
++ con->out_msg->footer.flags = 0;
+
+ /* is there a data payload? */
+ con->out_msg->footer.data_crc = 0;
+@@ -1459,7 +1484,6 @@ static int write_partial_kvec(struct cep
+ }
+ }
+ con->out_kvec_left = 0;
+- con->out_kvec_is_msg = false;
+ ret = 1;
+ out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+@@ -1551,6 +1575,7 @@ static int write_partial_skip(struct cep
+ {
+ int ret;
+
++ dout("%s %p %d left\n", __func__, con, con->out_skip);
+ while (con->out_skip > 0) {
+ size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
+
+@@ -2454,13 +2479,13 @@ more:
+
+ more_kvec:
+ /* kvec data queued? */
+- if (con->out_skip) {
+- ret = write_partial_skip(con);
++ if (con->out_kvec_left) {
++ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto out;
+ }
+- if (con->out_kvec_left) {
+- ret = write_partial_kvec(con);
++ if (con->out_skip) {
++ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto out;
+ }
+@@ -2974,16 +2999,31 @@ void ceph_msg_revoke(struct ceph_msg *ms
+ ceph_msg_put(msg);
+ }
+ if (con->out_msg == msg) {
+- dout("%s %p msg %p - was sending\n", __func__, con, msg);
+- con->out_msg = NULL;
+- if (con->out_kvec_is_msg) {
+- con->out_skip = con->out_kvec_bytes;
+- con->out_kvec_is_msg = false;
+- }
+- msg->hdr.seq = 0;
++ BUG_ON(con->out_skip);
++ /* footer */
++ if (con->out_msg_done) {
++ con->out_skip += con_out_kvec_skip(con);
++ } else {
++ BUG_ON(!msg->data_length);
++ if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
++ con->out_skip += sizeof(msg->footer);
++ else
++ con->out_skip += sizeof(msg->old_footer);
++ }
++ /* data, middle, front */
++ if (msg->data_length)
++ con->out_skip += msg->cursor.total_resid;
++ if (msg->middle)
++ con->out_skip += con_out_kvec_skip(con);
++ con->out_skip += con_out_kvec_skip(con);
+
++ dout("%s %p msg %p - was sending, will write %d skip %d\n",
++ __func__, con, msg, con->out_kvec_bytes, con->out_skip);
++ msg->hdr.seq = 0;
++ con->out_msg = NULL;
+ ceph_msg_put(msg);
+ }
++
+ mutex_unlock(&con->mutex);
+ }
+