--- /dev/null
+From bbe2e24262afd42a08e2fbc4fb1c134b94f64a57 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+Date: Mon, 19 Nov 2012 10:49:09 +0800
+Subject: ceph: call handle_cap_grant() for cap import message
+
+
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+
+If the client sends a cap message that requests a new max size while
+caps are being exported, the exporting MDS will drop the message quietly.
+So the client may wait forever for the reply that updates the max size.
+Calling handle_cap_grant() for the cap import message avoids this
+issue.
+
+Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 0e5e1774a92e6fe9c511585de8f078b4c4c68dbb)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/caps.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -2749,6 +2749,7 @@ static void handle_cap_import(struct cep
+
+ /* make sure we re-request max_size, if necessary */
+ spin_lock(&ci->i_ceph_lock);
++ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ spin_unlock(&ci->i_ceph_lock);
+ }
+@@ -2844,8 +2845,6 @@ void ceph_handle_caps(struct ceph_mds_se
+ case CEPH_CAP_OP_IMPORT:
+ handle_cap_import(mdsc, inode, h, session,
+ snaptrace, snaptrace_len);
+- ceph_check_caps(ceph_inode(inode), 0, session);
+- goto done_unlocked;
+ }
+
+ /* the rest require a cap */
+@@ -2862,6 +2861,7 @@ void ceph_handle_caps(struct ceph_mds_se
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
++ case CEPH_CAP_OP_IMPORT:
+ handle_cap_grant(inode, h, session, cap, msg->middle);
+ goto done_unlocked;
+
--- /dev/null
+From c9a3f6ab3490925ecc0714a5e4fd4c8b0a110bc4 Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Mon, 30 Jul 2012 16:21:17 -0700
+Subject: ceph: close old con before reopening on mds reconnect
+
+
+From: Sage Weil <sage@inktank.com>
+
+When we detect a mds session reset, close the old ceph_connection before
+reopening it. This ensures we clean up the old socket properly and keep
+the ceph_connection state correct.
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+(cherry picked from commit a53aab645c82f0146e35684b34692c69b5118121)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/mds_client.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -2528,6 +2528,7 @@ static void send_mds_reconnect(struct ce
+ session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+ session->s_seq = 0;
+
++ ceph_con_close(&session->s_con);
+ ceph_con_open(&session->s_con,
+ CEPH_ENTITY_TYPE_MDS, mds,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
--- /dev/null
+From 8677d84432bc48ae52d6fc07e4af459b8b6aaeb4 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+Date: Mon, 19 Nov 2012 10:49:07 +0800
+Subject: ceph: Don't add dirty inode to dirty list if caps is in migration
+
+
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+
+Add the dirty inode to the cap_dirty_migrating list instead; this avoids
+ceph_flush_dirty_caps() entering an infinite loop.
+
+Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 0685235ffd9dbdb9ccbda587f8a3c83ad1d5a921)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/caps.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -1349,11 +1349,15 @@ int __ceph_mark_dirty_caps(struct ceph_i
+ if (!ci->i_head_snapc)
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+- ci->i_head_snapc);
++ dout(" inode %p now dirty snapc %p auth cap %p\n",
++ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
++ if (ci->i_auth_cap)
++ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
++ else
++ list_add(&ci->i_dirty_item,
++ &mdsc->cap_dirty_migrating);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ ihold(inode);
--- /dev/null
+From f54e923eff7ca2a1711023d39dcd40889f6407a4 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Thu, 29 Nov 2012 08:37:03 -0600
+Subject: ceph: don't reference req after put
+
+
+From: Alex Elder <elder@inktank.com>
+
+In __unregister_request(), there is a call to list_del_init()
+referencing a request that was the subject of a call to
+ceph_osdc_put_request() on the previous line. This is not
+safe, because the request structure could have been freed
+by the time we reach the list_del_init().
+
+Fix this by reversing the order of these lines.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 7d5f24812bd182a2471cb69c1c2baf0648332e1f)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -871,9 +871,9 @@ static void __unregister_request(struct
+ req->r_osd = NULL;
+ }
+
++ list_del_init(&req->r_req_lru_item);
+ ceph_osdc_put_request(req);
+
+- list_del_init(&req->r_req_lru_item);
+ if (osdc->num_requests == 0) {
+ dout(" no requests, canceling timeout\n");
+ __cancel_osd_timeout(osdc);
--- /dev/null
+From 50c532cd7abb2054f5bb045244cd9a561b7e70ff Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+Date: Mon, 19 Nov 2012 10:49:04 +0800
+Subject: ceph: Don't update i_max_size when handling non-auth cap
+
+
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+
+The cap from non-auth mds doesn't have a meaningful max_size value.
+
+Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 5e62ad30157d0da04cf40c6d1a2f4bc840948b9c)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/caps.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ceph/caps.c
++++ b/fs/ceph/caps.c
+@@ -2388,7 +2388,7 @@ static void handle_cap_grant(struct inod
+ &atime);
+
+ /* max size increase? */
+- if (max_size != ci->i_max_size) {
++ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
--- /dev/null
+From 379ad3e7100d3c1deebb150af4dc38b9f4e90006 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+Date: Mon, 19 Nov 2012 10:49:08 +0800
+Subject: ceph: Fix __ceph_do_pending_vmtruncate
+
+
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+
+We should set i_truncate_pending to 0 after the page cache is truncated
+to i_truncate_size.
+
+Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit a85f50b6ef93fbbb2ae932ce9b2376509d172796)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/inode.c | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct
+ {
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 to;
+- int wrbuffer_refs, wake = 0;
++ int wrbuffer_refs, finish = 0;
+
+ retry:
+ spin_lock(&ci->i_ceph_lock);
+@@ -1498,15 +1498,18 @@ retry:
+ truncate_inode_pages(inode->i_mapping, to);
+
+ spin_lock(&ci->i_ceph_lock);
+- ci->i_truncate_pending--;
+- if (ci->i_truncate_pending == 0)
+- wake = 1;
++ if (to == ci->i_truncate_size) {
++ ci->i_truncate_pending = 0;
++ finish = 1;
++ }
+ spin_unlock(&ci->i_ceph_lock);
++ if (!finish)
++ goto retry;
+
+ if (wrbuffer_refs == 0)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+- if (wake)
+- wake_up_all(&ci->i_cap_wq);
++
++ wake_up_all(&ci->i_cap_wq);
+ }
+
+
--- /dev/null
+From c9c3fd311561a922ebbd999f3ad00b5f907000c2 Mon Sep 17 00:00:00 2001
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+Date: Mon, 19 Nov 2012 10:49:06 +0800
+Subject: ceph: Fix infinite loop in __wake_requests
+
+
+From: "Yan, Zheng" <zheng.z.yan@intel.com>
+
+__wake_requests() will enter an infinite loop if we use it to wake
+requests in the session->s_waiting list. __wake_requests() deletes
+requests from the list and __do_request() adds requests back to
+the list.
+
+Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit ed75ec2cd19b47efcd292b6e23f58e56f4c5bc34)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/mds_client.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -1886,9 +1886,14 @@ finish:
+ static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head)
+ {
+- struct ceph_mds_request *req, *nreq;
++ struct ceph_mds_request *req;
++ LIST_HEAD(tmp_list);
+
+- list_for_each_entry_safe(req, nreq, head, r_wait) {
++ list_splice_init(head, &tmp_list);
++
++ while (!list_empty(&tmp_list)) {
++ req = list_entry(tmp_list.next,
++ struct ceph_mds_request, r_wait);
+ list_del_init(&req->r_wait);
+ __do_request(mdsc, req);
+ }
--- /dev/null
+From 7dab35042aab340d087737d42c2fae34af0b5c78 Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Mon, 24 Sep 2012 21:01:02 -0700
+Subject: ceph: propagate layout error on osd request creation
+
+
+From: Sage Weil <sage@inktank.com>
+
+If we are creating an osd request and get an invalid layout, return
+an EINVAL to the caller. We switch up the return to have an error
+code instead of NULL implying -ENOMEM.
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 6816282dab3a72efe8c0d182c1bc2960d87f4322)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/addr.c | 8 ++++----
+ fs/ceph/file.c | 4 ++--
+ net/ceph/osd_client.c | 15 +++++++++------
+ 3 files changed, 15 insertions(+), 12 deletions(-)
+
+--- a/fs/ceph/addr.c
++++ b/fs/ceph/addr.c
+@@ -308,8 +308,8 @@ static int start_read(struct inode *inod
+ NULL, 0,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ NULL, false, 1, 0);
+- if (!req)
+- return -ENOMEM;
++ if (IS_ERR(req))
++ return PTR_ERR(req);
+
+ /* build page vector */
+ nr_pages = len >> PAGE_CACHE_SHIFT;
+@@ -831,8 +831,8 @@ get_more_pages:
+ ci->i_truncate_size,
+ &inode->i_mtime, true, 1, 0);
+
+- if (!req) {
+- rc = -ENOMEM;
++ if (IS_ERR(req)) {
++ rc = PTR_ERR(req);
+ unlock_page(page);
+ break;
+ }
+--- a/fs/ceph/file.c
++++ b/fs/ceph/file.c
+@@ -529,8 +529,8 @@ more:
+ do_sync,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &mtime, false, 2, page_align);
+- if (!req)
+- return -ENOMEM;
++ if (IS_ERR(req))
++ return PTR_ERR(req);
+
+ if (file->f_flags & O_DIRECT) {
+ pages = ceph_get_direct_page_vector(data, num_pages, false);
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -461,6 +461,7 @@ struct ceph_osd_request *ceph_osdc_new_r
+ {
+ struct ceph_osd_req_op ops[3];
+ struct ceph_osd_request *req;
++ int r;
+
+ ops[0].op = opcode;
+ ops[0].extent.truncate_seq = truncate_seq;
+@@ -479,10 +480,12 @@ struct ceph_osd_request *ceph_osdc_new_r
+ use_mempool,
+ GFP_NOFS, NULL, NULL);
+ if (!req)
+- return NULL;
++ return ERR_PTR(-ENOMEM);
+
+ /* calculate max write size */
+- calc_layout(osdc, vino, layout, off, plen, req, ops);
++ r = calc_layout(osdc, vino, layout, off, plen, req, ops);
++ if (r < 0)
++ return ERR_PTR(r);
+ req->r_file_layout = *layout; /* keep a copy */
+
+ /* in case it differs from natural (file) alignment that
+@@ -1925,8 +1928,8 @@ int ceph_osdc_readpages(struct ceph_osd_
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0, truncate_seq, truncate_size, NULL,
+ false, 1, page_align);
+- if (!req)
+- return -ENOMEM;
++ if (IS_ERR(req))
++ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+@@ -1968,8 +1971,8 @@ int ceph_osdc_writepages(struct ceph_osd
+ snapc, do_sync,
+ truncate_seq, truncate_size, mtime,
+ nofail, 1, page_align);
+- if (!req)
+- return -ENOMEM;
++ if (IS_ERR(req))
++ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
--- /dev/null
+From ecb6de0b8f805a901457390c4433a923411e139d Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Tue, 21 Aug 2012 15:55:25 -0700
+Subject: ceph: tolerate (and warn on) extraneous dentry from mds
+
+
+From: Sage Weil <sage@inktank.com>
+
+If the MDS gives us a dentry and we weren't prepared to handle it,
+WARN_ON_ONCE instead of crashing.
+
+Reported-by: Yan, Zheng <zheng.z.yan@intel.com>
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 6c5e50fa614fea5325a2973be06f7ec6f1055316)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/inode.c | 15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+--- a/fs/ceph/inode.c
++++ b/fs/ceph/inode.c
+@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *
+ if (rinfo->head->is_dentry) {
+ struct inode *dir = req->r_locked_dir;
+
+- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+- session, req->r_request_started, -1,
+- &req->r_caps_reservation);
+- if (err < 0)
+- return err;
++ if (dir) {
++ err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
++ session, req->r_request_started, -1,
++ &req->r_caps_reservation);
++ if (err < 0)
++ return err;
++ } else {
++ WARN_ON_ONCE(1);
++ }
+ }
+
+ /*
+@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *
+ * will have trouble splicing in the virtual snapdir later
+ */
+ if (rinfo->head->is_dentry && !req->r_aborted &&
++ req->r_locked_dir &&
+ (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+ fsc->mount_options->snapdir_name,
+ req->r_dentry->d_name.len))) {
--- /dev/null
+From 5cad941fcd34022b5c0b7475c88ff618f2db659f Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Wed, 26 Dec 2012 14:31:40 -0600
+Subject: libceph: always reset osds when kicking
+
+
+From: Alex Elder <elder@inktank.com>
+
+When ceph_osdc_handle_map() is called to process a new osd map,
+kick_requests() is called to ensure all affected requests are
+updated if necessary to reflect changes in the osd map. This
+happens in two cases: whenever an incremental map update is
+processed; and when a full map update (or the last one if there is
+more than one) gets processed.
+
+In the former case, the kick_requests() call is followed immediately
+by a call to reset_changed_osds() to ensure any connections to osds
+affected by the map change are reset. But for full map updates
+this isn't done.
+
+Both cases should be doing this osd reset.
+
+Rather than duplicating the reset_changed_osds() call, move it into
+the end of kick_requests().
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit e6d50f67a6b1a6252a616e6e629473b5c4277218)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -1306,7 +1306,7 @@ static void reset_changed_osds(struct ce
+ * Requeue requests whose mapping to an OSD has changed. If requests map to
+ * no osd, request a new map.
+ *
+- * Caller should hold map_sem for read and request_mutex.
++ * Caller should hold map_sem for read.
+ */
+ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
+ {
+@@ -1381,6 +1381,7 @@ static void kick_requests(struct ceph_os
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
++ reset_changed_osds(osdc);
+ }
+
+
+@@ -1437,7 +1438,6 @@ void ceph_osdc_handle_map(struct ceph_os
+ osdc->osdmap = newmap;
+ }
+ kick_requests(osdc, 0);
+- reset_changed_osds(osdc);
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
--- /dev/null
+From 3c27b4c0960d284d78b356be6a59ef6cff5a2274 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 7 Dec 2012 09:57:58 -0600
+Subject: libceph: avoid using freed osd in __kick_osd_requests()
+
+
+From: Alex Elder <elder@inktank.com>
+
+If an osd has no requests and no linger requests, __reset_osd()
+will just remove it with a call to __remove_osd(). That drops
+a reference to the osd, and therefore the osd may have been freed
+by the time __reset_osd() returns. That function offers no
+indication this may have occurred, and as a result the osd will
+continue to be used even when it's no longer valid.
+
+Change __reset_osd() so it returns an error (ENODEV) when it
+deletes the osd being reset. And change __kick_osd_requests() so it
+returns immediately (before referencing osd again) if __reset_osd()
+returns *any* error.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 685a7555ca69030739ddb57a47f0ea8ea80196a4)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -579,7 +579,7 @@ static void __kick_osd_requests(struct c
+
+ dout("__kick_osd_requests osd%d\n", osd->o_osd);
+ err = __reset_osd(osdc, osd);
+- if (err == -EAGAIN)
++ if (err)
+ return;
+
+ list_for_each_entry(req, &osd->o_requests, r_osd_item) {
+@@ -750,6 +750,7 @@ static int __reset_osd(struct ceph_osd_c
+ if (list_empty(&osd->o_requests) &&
+ list_empty(&osd->o_linger_requests)) {
+ __remove_osd(osdc, osd);
++ ret = -ENODEV;
+ } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+ &osd->o_con.peer_addr,
+ sizeof(osd->o_con.peer_addr)) == 0 &&
--- /dev/null
+From 072373ec146bbeeb0c17b67c285b39c41dc91765 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Mon, 17 Dec 2012 12:23:48 -0600
+Subject: libceph: don't use rb_init_node() in ceph_osdc_alloc_request()
+
+
+From: Alex Elder <elder@inktank.com>
+
+The red-black node in the ceph osd request structure is initialized
+in ceph_osdc_alloc_request() using rb_init_node(). We do need to
+initialize this, because in __unregister_request() we call
+RB_EMPTY_NODE(), which expects the node it's checking to have
+been initialized. But rb_init_node() is apparently overkill, and
+may in fact be on its way out. So use RB_CLEAR_NODE() instead.
+
+For a little more background, see this commit:
+ 4c199a93 "rbtree: empty nodes have no color"
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit a978fa20fb657548561dddbfb605fe43654f0825)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -221,6 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
++ RB_CLEAR_NODE(&req->r_node);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ INIT_LIST_HEAD(&req->r_linger_item);
+ INIT_LIST_HEAD(&req->r_linger_osd);
--- /dev/null
+From 86dac6809fe52100517c2ada77923b7e4e4632ab Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Mon, 29 Oct 2012 11:01:42 -0700
+Subject: libceph: fix osdmap decode error paths
+
+
+From: Sage Weil <sage@inktank.com>
+
+Ensure that we set the err value correctly so that we do not pass a 0
+value to ERR_PTR and confuse the calling code. (In particular,
+osd_client.c handle_map() will BUG(!newmap)).
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 0ed7285e0001b960c888e5455ae982025210ed3d)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osdmap.c | 31 ++++++++++++++++++++-----------
+ 1 file changed, 20 insertions(+), 11 deletions(-)
+
+--- a/net/ceph/osdmap.c
++++ b/net/ceph/osdmap.c
+@@ -613,10 +613,12 @@ struct ceph_osdmap *osdmap_decode(void *
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
++ err = -ENOMEM;
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ goto bad;
+ pi->id = ceph_decode_32(p);
++ err = -EINVAL;
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+@@ -632,8 +634,13 @@ struct ceph_osdmap *osdmap_decode(void *
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+
+- if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+- goto bad;
++ if (version >= 5) {
++ err = __decode_pool_names(p, end, map);
++ if (err < 0) {
++ dout("fail to decode pool names");
++ goto bad;
++ }
++ }
+
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+@@ -713,7 +720,7 @@ struct ceph_osdmap *osdmap_decode(void *
+ return map;
+
+ bad:
+- dout("osdmap_decode fail\n");
++ dout("osdmap_decode fail err %d\n", err);
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(err);
+ }
+@@ -807,6 +814,7 @@ struct ceph_osdmap *osdmap_apply_increme
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
++ err = -EINVAL;
+ goto bad;
+ }
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+@@ -823,8 +831,11 @@ struct ceph_osdmap *osdmap_apply_increme
+ if (err < 0)
+ goto bad;
+ }
+- if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+- goto bad;
++ if (version >= 5) {
++ err = __decode_pool_names(p, end, map);
++ if (err < 0)
++ goto bad;
++ }
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+@@ -900,15 +911,13 @@ struct ceph_osdmap *osdmap_apply_increme
+ (void) __remove_pg_mapping(&map->pg_temp, pgid);
+
+ /* insert */
+- if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
+- err = -EINVAL;
++ err = -EINVAL;
++ if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+ goto bad;
+- }
++ err = -ENOMEM;
+ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+- if (!pg) {
+- err = -ENOMEM;
++ if (!pg)
+ goto bad;
+- }
+ pg->pgid = pgid;
+ pg->len = pglen;
+ for (j = 0; j < pglen; j++)
--- /dev/null
+From ebbdceac00f878c2ef43961aa19f7f033c9fbeb3 Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Thu, 27 Dec 2012 20:27:04 -0600
+Subject: libceph: fix protocol feature mismatch failure path
+
+
+From: Sage Weil <sage@inktank.com>
+
+We should not set con->state to CLOSED here; that happens in
+ceph_fault() in the caller, where it first asserts that the state
+is not yet CLOSED. Avoids a BUG when the features don't match.
+
+Since fail_protocol() has become a trivial wrapper, replace
+calls to it with direct calls to reset_connection().
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 0fa6ebc600bc8e830551aee47a0e929e818a1868)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/messenger.c | 14 ++++----------
+ 1 file changed, 4 insertions(+), 10 deletions(-)
+
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -506,6 +506,7 @@ static void reset_connection(struct ceph
+ {
+ /* reset connection, out_queue, msg_ and connect_seq */
+ /* discard existing out_queue and msg_seq */
++ dout("reset_connection %p\n", con);
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+@@ -1502,13 +1503,6 @@ static int process_banner(struct ceph_co
+ return 0;
+ }
+
+-static void fail_protocol(struct ceph_connection *con)
+-{
+- reset_connection(con);
+- WARN_ON(con->state != CON_STATE_NEGOTIATING);
+- con->state = CON_STATE_CLOSED;
+-}
+-
+ static int process_connect(struct ceph_connection *con)
+ {
+ u64 sup_feat = con->msgr->supported_features;
+@@ -1526,7 +1520,7 @@ static int process_connect(struct ceph_c
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+- fail_protocol(con);
++ reset_connection(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+@@ -1537,7 +1531,7 @@ static int process_connect(struct ceph_c
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+- fail_protocol(con);
++ reset_connection(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+@@ -1627,7 +1621,7 @@ static int process_connect(struct ceph_c
+ ceph_pr_addr(&con->peer_addr.in_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+- fail_protocol(con);
++ reset_connection(con);
+ return -1;
+ }
+
--- /dev/null
+From e62bb27ca5fd05f8d490cfaf0d03d39956bb09b4 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Mon, 17 Dec 2012 12:23:48 -0600
+Subject: libceph: init event->node in ceph_osdc_create_event()
+
+
+From: Alex Elder <elder@inktank.com>
+
+The red-black node in the ceph osd event structure is not
+initialized in ceph_osdc_create_event(). Because this node can
+be the subject of a RB_EMPTY_NODE() call later on, we should ensure
+the node is initialized properly for that.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 3ee5234df68d253c415ba4f2db72ad250d9c21a9)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -1598,6 +1598,7 @@ int ceph_osdc_create_event(struct ceph_o
+ event->data = data;
+ event->osdc = osdc;
+ INIT_LIST_HEAD(&event->osd_node);
++ RB_CLEAR_NODE(&event->node);
+ kref_init(&event->kref); /* one ref for us */
+ kref_get(&event->kref); /* one ref for the caller */
+ init_completion(&event->completion);
--- /dev/null
+From 31a6ebb578117b21ebb31c7e6cf025c4351a5c56 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Thu, 6 Dec 2012 07:22:04 -0600
+Subject: libceph: init osd->o_node in create_osd()
+
+
+From: Alex Elder <elder@inktank.com>
+
+The red-black node in the ceph osd structure is not initialized
+in create_osd(). Because this node can be the subject of a
+RB_EMPTY_NODE() call later on, we should ensure the node is
+initialized properly for that. Add a call to RB_CLEAR_NODE() to
+initialize it.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit f407731d12214e7686819018f3a1e9d7b6f83a02)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -645,6 +645,7 @@ static struct ceph_osd *create_osd(struc
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ osd->o_osd = onum;
++ RB_CLEAR_NODE(&osd->o_node);
+ INIT_LIST_HEAD(&osd->o_requests);
+ INIT_LIST_HEAD(&osd->o_linger_requests);
+ INIT_LIST_HEAD(&osd->o_osd_lru);
--- /dev/null
+From fa20948861694bd5b450e667ccb70d1ca8b2374c Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Wed, 19 Dec 2012 15:52:36 -0600
+Subject: libceph: move linger requests sooner in kick_requests()
+
+
+From: Alex Elder <elder@inktank.com>
+
+The kick_requests() function is called by ceph_osdc_handle_map()
+when an osd map change has been indicated. Its purpose is to
+re-queue any request whose target osd is different from what it
+was when it was originally sent.
+
+It is structured as two loops, one for incomplete but registered
+requests, and a second for handling completed linger requests.
+As a special case, in the first loop if a request marked to linger
+has not yet completed, it is moved from the request list to the
+linger list. This is a quick and dirty way to have the second
+loop handle sending the request along with all the other linger
+requests.
+
+Because of the way it's done now, however, this quick and dirty
+solution can result in these incomplete linger requests never
+getting re-sent as desired. The problem lies in the fact that
+the second loop only arranges for a linger request to be sent
+if it appears its target osd has changed. This is the proper
+handling for *completed* linger requests (it avoids issuing
+the same linger request twice to the same osd).
+
+But although the linger requests added to the list in the first loop
+may have been sent, they have not yet completed, so they need to be
+re-sent regardless of whether their target osd has changed.
+
+The first required fix is we need to avoid calling __map_request()
+on any incomplete linger request. Otherwise the subsequent
+__map_request() call in the second loop will find the target osd
+has not changed and will therefore not re-send the request.
+
+Second, we need to be sure that a sent but incomplete linger request
+gets re-sent. If the target osd is the same with the new osd map as
+it was when the request was originally sent, this won't happen.
+This can be fixed through careful handling when we move these
+requests from the request list to the linger list, by unregistering
+the request *before* it is registered as a linger request. This
+works because a side-effect of unregistering the request is to make
+the request's r_osd pointer be NULL, and *that* will ensure the
+second loop actually re-sends the linger request.
+
+Processing of such a request is done at that point, so continue with
+the next one once it's been moved.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit ab60b16d3c31b9bd9fd5b39f97dc42c52a50b67d)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 30 +++++++++++++++++++-----------
+ 1 file changed, 19 insertions(+), 11 deletions(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -1320,6 +1320,24 @@ static void kick_requests(struct ceph_os
+ for (p = rb_first(&osdc->requests); p; ) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+ p = rb_next(p);
++
++ /*
++ * For linger requests that have not yet been
++ * registered, move them to the linger list; they'll
++ * be sent to the osd in the loop below. Unregister
++ * the request before re-registering it as a linger
++ * request to ensure the __map_request() below
++ * will decide it needs to be sent.
++ */
++ if (req->r_linger && list_empty(&req->r_linger_item)) {
++ dout("%p tid %llu restart on osd%d\n",
++ req, req->r_tid,
++ req->r_osd ? req->r_osd->o_osd : -1);
++ __unregister_request(osdc, req);
++ __register_linger_request(osdc, req);
++ continue;
++ }
++
+ err = __map_request(osdc, req, force_resend);
+ if (err < 0)
+ continue; /* error */
+@@ -1334,17 +1352,6 @@ static void kick_requests(struct ceph_os
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ }
+ }
+- if (req->r_linger && list_empty(&req->r_linger_item)) {
+- /*
+- * register as a linger so that we will
+- * re-submit below and get a new tid
+- */
+- dout("%p tid %llu restart on osd%d\n",
+- req, req->r_tid,
+- req->r_osd ? req->r_osd->o_osd : -1);
+- __register_linger_request(osdc, req);
+- __unregister_request(osdc, req);
+- }
+ }
+
+ list_for_each_entry_safe(req, nreq, &osdc->req_linger,
+@@ -1352,6 +1359,7 @@ static void kick_requests(struct ceph_os
+ dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
+
+ err = __map_request(osdc, req, force_resend);
++ dout("__map_request returned %d\n", err);
+ if (err == 0)
+ continue; /* no change and no osd was specified */
+ if (err < 0)
--- /dev/null
+From 0a00fe0504f0416d9e0daf5eed6c6368b88c0aa3 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Thu, 6 Dec 2012 07:22:04 -0600
+Subject: libceph: register request before unregister linger
+
+
+From: Alex Elder <elder@inktank.com>
+
+In kick_requests(), we need to register the request before we
+unregister the linger request. Otherwise the unregister will
+reset the request's osd pointer to NULL.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit c89ce05e0c5a01a256100ac6a6019f276bdd1ca6)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -1364,8 +1364,8 @@ static void kick_requests(struct ceph_os
+
+ dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+- __unregister_linger_request(osdc, req);
+ __register_request(osdc, req);
++ __unregister_linger_request(osdc, req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+
--- /dev/null
+From 55540e5c7745cc0dea6b0af54accd8a9e9a2670e Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Wed, 28 Nov 2012 12:28:24 -0800
+Subject: libceph: remove 'osdtimeout' option
+
+
+From: Sage Weil <sage@inktank.com>
+
+This would reset a connection with any OSD that had an outstanding
+request that was taking more than N seconds. The idea was that if the
+OSD was buggy, the client could compensate by resending the request.
+
+In reality, this only served to hide server bugs, and we haven't
+actually seen such a bug in quite a while. Moreover, the userspace
+client code never did this.
+
+More importantly, often the request is taking a long time because the
+OSD is trying to recover, or overloaded, and killing the connection
+and retrying would only make the situation worse by giving the OSD
+more work to do.
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 83aff95eb9d60aff5497e9f44a2ae906b86d8e88)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/super.c | 2 -
+ include/linux/ceph/libceph.h | 2 -
+ net/ceph/ceph_common.c | 3 --
+ net/ceph/osd_client.c | 47 +++----------------------------------------
+ 4 files changed, 5 insertions(+), 49 deletions(-)
+
+--- a/fs/ceph/super.c
++++ b/fs/ceph/super.c
+@@ -387,8 +387,6 @@ static int ceph_show_options(struct seq_
+ seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+ if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+ if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, ",osdkeepalivetimeout=%d",
+ opt->osd_keepalive_timeout);
+--- a/include/linux/ceph/libceph.h
++++ b/include/linux/ceph/libceph.h
+@@ -49,7 +49,6 @@ struct ceph_options {
+ struct ceph_entity_addr my_addr;
+ int mount_timeout;
+ int osd_idle_ttl;
+- int osd_timeout;
+ int osd_keepalive_timeout;
+
+ /*
+@@ -69,7 +68,6 @@ struct ceph_options {
+ * defaults
+ */
+ #define CEPH_MOUNT_TIMEOUT_DEFAULT 60
+-#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
+ #define CEPH_OSD_KEEPALIVE_DEFAULT 5
+ #define CEPH_OSD_IDLE_TTL_DEFAULT 60
+
+--- a/net/ceph/ceph_common.c
++++ b/net/ceph/ceph_common.c
+@@ -304,7 +304,6 @@ ceph_parse_options(char *options, const
+
+ /* start with defaults */
+ opt->flags = CEPH_OPT_DEFAULT;
+- opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+@@ -390,7 +389,7 @@ ceph_parse_options(char *options, const
+
+ /* misc */
+ case Opt_osdtimeout:
+- opt->osd_timeout = intval;
++ pr_warning("ignoring deprecated osdtimeout option\n");
+ break;
+ case Opt_osdkeepalivetimeout:
+ opt->osd_keepalive_timeout = intval;
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -606,14 +606,6 @@ static void __kick_osd_requests(struct c
+ }
+ }
+
+-static void kick_osd_requests(struct ceph_osd_client *osdc,
+- struct ceph_osd *kickosd)
+-{
+- mutex_lock(&osdc->request_mutex);
+- __kick_osd_requests(osdc, kickosd);
+- mutex_unlock(&osdc->request_mutex);
+-}
+-
+ /*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+@@ -627,7 +619,9 @@ static void osd_reset(struct ceph_connec
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ down_read(&osdc->map_sem);
+- kick_osd_requests(osdc, osd);
++ mutex_lock(&osdc->request_mutex);
++ __kick_osd_requests(osdc, osd);
++ mutex_unlock(&osdc->request_mutex);
+ send_queued(osdc);
+ up_read(&osdc->map_sem);
+ }
+@@ -1091,12 +1085,10 @@ static void handle_timeout(struct work_s
+ {
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+- struct ceph_osd_request *req, *last_req = NULL;
++ struct ceph_osd_request *req;
+ struct ceph_osd *osd;
+- unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+ unsigned long keepalive =
+ osdc->client->options->osd_keepalive_timeout * HZ;
+- unsigned long last_stamp = 0;
+ struct list_head slow_osds;
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+@@ -1106,37 +1098,6 @@ static void handle_timeout(struct work_s
+ mutex_lock(&osdc->request_mutex);
+
+ /*
+- * reset osds that appear to be _really_ unresponsive. this
+- * is a failsafe measure.. we really shouldn't be getting to
+- * this point if the system is working properly. the monitors
+- * should mark the osd as failed and we should find out about
+- * it from an updated osd map.
+- */
+- while (timeout && !list_empty(&osdc->req_lru)) {
+- req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+- r_req_lru_item);
+-
+- /* hasn't been long enough since we sent it? */
+- if (time_before(jiffies, req->r_stamp + timeout))
+- break;
+-
+- /* hasn't been long enough since it was acked? */
+- if (req->r_request->ack_stamp == 0 ||
+- time_before(jiffies, req->r_request->ack_stamp + timeout))
+- break;
+-
+- BUG_ON(req == last_req && req->r_stamp == last_stamp);
+- last_req = req;
+- last_stamp = req->r_stamp;
+-
+- osd = req->r_osd;
+- BUG_ON(!osd);
+- pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+- req->r_tid, osd->o_osd);
+- __kick_osd_requests(osdc, osd);
+- }
+-
+- /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
--- /dev/null
+From 4b2c444a99b137f460f6041ab82e84e4d7873203 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 14 Dec 2012 16:47:41 -0600
+Subject: libceph: report connection fault with warning
+
+
+From: Alex Elder <elder@inktank.com>
+
+When a connection's socket disconnects, or if there's a protocol
+error of some kind on the connection, a fault is signaled and
+the connection is reset (closed and reopened, basically). We
+currently get an error message on the log whenever this occurs.
+
+A ceph connection will attempt to reestablish a socket connection
+repeatedly if a fault occurs. This means that these error messages
+will get repeatedly added to the log, which is undesirable.
+
+Change the error message to be a warning, so they don't get
+logged by default.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 28362986f8743124b3a0fda20a8ed3e80309cce1)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/messenger.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -2365,7 +2365,7 @@ fault:
+ static void ceph_fault(struct ceph_connection *con)
+ __releases(con->mutex)
+ {
+- pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
++ pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
--- /dev/null
+From 8e4a805ffc7fb5406c157a9060f704376df327ef Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 7 Dec 2012 19:50:07 -0600
+Subject: libceph: socket can close in any connection state
+
+
+From: Alex Elder <elder@inktank.com>
+
+A connection's socket can close for any reason, independent of the
+state of the connection (and irrespective of the connection
+mutex). As a result, the connection can be in pretty much any state
+at the time its socket is closed.
+
+Handle those other cases at the top of con_work(). Pull this whole
+block of code into a separate function to reduce the clutter.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 7bb21d68c535ad8be38e14a715632ae398b37ac1)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 30 insertions(+), 17 deletions(-)
+
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -2258,6 +2258,35 @@ static void queue_con(struct ceph_connec
+ }
+ }
+
++static bool con_sock_closed(struct ceph_connection *con)
++{
++ if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags))
++ return false;
++
++#define CASE(x) \
++ case CON_STATE_ ## x: \
++ con->error_msg = "socket closed (con state " #x ")"; \
++ break;
++
++ switch (con->state) {
++ CASE(CLOSED);
++ CASE(PREOPEN);
++ CASE(CONNECTING);
++ CASE(NEGOTIATING);
++ CASE(OPEN);
++ CASE(STANDBY);
++ default:
++ pr_warning("%s con %p unrecognized state %lu\n",
++ __func__, con, con->state);
++ con->error_msg = "unrecognized con state";
++ BUG();
++ break;
++ }
++#undef CASE
++
++ return true;
++}
++
+ /*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+@@ -2269,24 +2298,8 @@ static void con_work(struct work_struct
+
+ mutex_lock(&con->mutex);
+ restart:
+- if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
+- switch (con->state) {
+- case CON_STATE_CONNECTING:
+- con->error_msg = "connection failed";
+- break;
+- case CON_STATE_NEGOTIATING:
+- con->error_msg = "negotiation failed";
+- break;
+- case CON_STATE_OPEN:
+- con->error_msg = "socket closed";
+- break;
+- default:
+- dout("unrecognized con state %d\n", (int)con->state);
+- con->error_msg = "unrecognized con state";
+- BUG();
+- }
++ if (con_sock_closed(con))
+ goto fault;
+- }
+
+ if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
+ dout("con_work %p backing off\n", con);
--- /dev/null
+From 44f4872cfb643f95afeeba70a5bc974ba038d77e Mon Sep 17 00:00:00 2001
+From: David Zafman <david.zafman@inktank.com>
+Date: Mon, 3 Dec 2012 19:14:05 -0800
+Subject: libceph: Unlock unprocessed pages in start_read() error path
+
+
+From: David Zafman <david.zafman@inktank.com>
+
+Function start_read() can get an error before processing all pages.
+It must not only release the remaining pages, but unlock them too.
+
+This fixes http://tracker.newdream.net/issues/3370
+
+Signed-off-by: David Zafman <david.zafman@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 8884d53dd63b1d9315b343564fcbe1ede004a99e)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ceph/addr.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/fs/ceph/addr.c
++++ b/fs/ceph/addr.c
+@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_
+ kfree(req->r_pages);
+ }
+
++static void ceph_unlock_page_vector(struct page **pages, int num_pages)
++{
++ int i;
++
++ for (i = 0; i < num_pages; i++)
++ unlock_page(pages[i]);
++}
++
+ /*
+ * start an async read(ahead) operation. return nr_pages we submitted
+ * a read for on success, or negative error code.
+@@ -347,6 +355,7 @@ static int start_read(struct inode *inod
+ return nr_pages;
+
+ out_pages:
++ ceph_unlock_page_vector(pages, nr_pages);
+ ceph_release_page_vector(pages, nr_pages);
+ out:
+ ceph_osdc_put_request(req);
--- /dev/null
+From d1b938deba18ca363fa37af3d55ec5fe8bb61f58 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Wed, 26 Dec 2012 10:43:57 -0600
+Subject: libceph: WARN, don't BUG on unexpected connection states
+
+
+From: Alex Elder <elder@inktank.com>
+
+A number of assertions in the ceph messenger are implemented with
+BUG_ON(), killing the system if connection's state doesn't match
+what's expected. At this point our state model is (evidently) not
+well understood enough for these assertions to trigger a BUG().
+Convert all BUG_ON(con->state...) calls to be WARN_ON(con->state...)
+so we learn about these issues without killing the machine.
+
+We now recognize that a connection fault can occur due to a socket
+closure at any time, regardless of the state of the connection. So
+there is really nothing we can assert about the state of the
+connection at that point so eliminate that assertion.
+
+Reported-by: Ugis <ugis22@gmail.com>
+Tested-by: Ugis <ugis22@gmail.com>
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 122070a2ffc91f87fe8e8493eb0ac61986c5557c)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/messenger.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/net/ceph/messenger.c
++++ b/net/ceph/messenger.c
+@@ -561,7 +561,7 @@ void ceph_con_open(struct ceph_connectio
+ mutex_lock(&con->mutex);
+ dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+
+- BUG_ON(con->state != CON_STATE_CLOSED);
++ WARN_ON(con->state != CON_STATE_CLOSED);
+ con->state = CON_STATE_PREOPEN;
+
+ con->peer_name.type = (__u8) entity_type;
+@@ -1505,7 +1505,7 @@ static int process_banner(struct ceph_co
+ static void fail_protocol(struct ceph_connection *con)
+ {
+ reset_connection(con);
+- BUG_ON(con->state != CON_STATE_NEGOTIATING);
++ WARN_ON(con->state != CON_STATE_NEGOTIATING);
+ con->state = CON_STATE_CLOSED;
+ }
+
+@@ -1631,7 +1631,7 @@ static int process_connect(struct ceph_c
+ return -1;
+ }
+
+- BUG_ON(con->state != CON_STATE_NEGOTIATING);
++ WARN_ON(con->state != CON_STATE_NEGOTIATING);
+ con->state = CON_STATE_OPEN;
+
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+@@ -2128,7 +2128,6 @@ more:
+ if (ret < 0)
+ goto out;
+
+- BUG_ON(con->state != CON_STATE_CONNECTING);
+ con->state = CON_STATE_NEGOTIATING;
+
+ /*
+@@ -2156,7 +2155,7 @@ more:
+ goto more;
+ }
+
+- BUG_ON(con->state != CON_STATE_OPEN);
++ WARN_ON(con->state != CON_STATE_OPEN);
+
+ if (con->in_base_pos < 0) {
+ /*
+@@ -2370,7 +2369,7 @@ static void ceph_fault(struct ceph_conne
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+- BUG_ON(con->state != CON_STATE_CONNECTING &&
++ WARN_ON(con->state != CON_STATE_CONNECTING &&
+ con->state != CON_STATE_NEGOTIATING &&
+ con->state != CON_STATE_OPEN);
+
--- /dev/null
+From f0dbbb5eedd159a834f2a864a4a451f14ce889c5 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 10 Aug 2012 13:12:07 -0700
+Subject: rbd: add read_only rbd map option
+
+
+From: Alex Elder <elder@inktank.com>
+
+Add the ability to map an rbd image read-only, by specifying either
+"read_only" or "ro" as an option on the rbd "command line." Also
+allow the inverse to be explicitly specified using "read_write" or
+"rw".
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+(based on commit cc0538b62c839c2df7b9f8378bb37e3b35faa608)
+---
+ drivers/block/rbd.c | 28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -71,7 +71,8 @@
+ #define DEV_NAME_LEN 32
+ #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
+
+-#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
++#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
++#define RBD_READ_ONLY_DEFAULT false
+
+ /*
+ * block device image metadata (in-memory version)
+@@ -95,6 +96,7 @@ struct rbd_image_header {
+
+ struct rbd_options {
+ int notify_timeout;
++ bool read_only;
+ };
+
+ /*
+@@ -180,7 +182,7 @@ struct rbd_device {
+ u64 snap_id; /* current snapshot id */
+ /* whether the snap_id this device reads from still exists */
+ bool snap_exists;
+- int read_only;
++ bool read_only;
+
+ struct list_head node;
+
+@@ -346,12 +348,21 @@ enum {
+ /* int args above */
+ Opt_last_string,
+ /* string args above */
++ Opt_read_only,
++ Opt_read_write,
++ /* Boolean args above */
++ Opt_last_bool,
+ };
+
+ static match_table_t rbdopt_tokens = {
+ {Opt_notify_timeout, "notify_timeout=%d"},
+ /* int args above */
+ /* string args above */
++ {Opt_read_only, "read_only"},
++ {Opt_read_only, "ro"}, /* Alternate spelling */
++ {Opt_read_write, "read_write"},
++ {Opt_read_write, "rw"}, /* Alternate spelling */
++ /* Boolean args above */
+ {-1, NULL}
+ };
+
+@@ -376,6 +387,8 @@ static int parse_rbd_opts_token(char *c,
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
++ } else if (token > Opt_last_string && token < Opt_last_bool) {
++ dout("got Boolean token %d\n", token);
+ } else {
+ dout("got token %d\n", token);
+ }
+@@ -384,6 +397,12 @@ static int parse_rbd_opts_token(char *c,
+ case Opt_notify_timeout:
+ rbdopt->notify_timeout = intval;
+ break;
++ case Opt_read_only:
++ rbdopt->read_only = true;
++ break;
++ case Opt_read_write:
++ rbdopt->read_only = false;
++ break;
+ default:
+ BUG_ON(token);
+ }
+@@ -407,6 +426,7 @@ static struct rbd_client *rbd_get_client
+ return ERR_PTR(-ENOMEM);
+
+ rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
++ rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+
+ opt = ceph_parse_options(options, mon_addr,
+ mon_addr + mon_addr_len,
+@@ -590,7 +610,7 @@ static int rbd_header_set_snap(struct rb
+ snapc->seq = 0;
+ dev->snap_id = CEPH_NOSNAP;
+ dev->snap_exists = false;
+- dev->read_only = 0;
++ dev->read_only = dev->rbd_client->rbd_opts->read_only;
+ if (size)
+ *size = header->image_size;
+ } else {
+@@ -599,7 +619,7 @@ static int rbd_header_set_snap(struct rb
+ goto done;
+ dev->snap_id = snapc->seq;
+ dev->snap_exists = true;
+- dev->read_only = 1;
++ dev->read_only = true; /* No choice for snapshots */
+ }
+
+ ret = 0;
--- /dev/null
+From 68205f80bfcf2cc6b697bd39b0f9d5c89e37b693 Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@inktank.com>
+Date: Mon, 24 Sep 2012 21:02:47 -0700
+Subject: rbd: BUG on invalid layout
+
+
+From: Sage Weil <sage@inktank.com>
+
+This shouldn't actually be possible because the layout struct is
+constructed from the RBD header and validated then.
+
+[elder@inktank.com: converted BUG() call to equivalent rbd_assert()]
+
+Signed-off-by: Sage Weil <sage@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(based on commit 6cae3717cddaf8e5e96e304733dca66e40d56f89)
+---
+ drivers/block/rbd.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -930,8 +930,9 @@ static int rbd_do_request(struct request
+ layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_pg_preferred = cpu_to_le32(-1);
+ layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+- ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
+- req, ops);
++ ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
++ req, ops);
++ BUG_ON(ret != 0);
+
+ ceph_osdc_build_request(req, ofs, &len,
+ ops,
--- /dev/null
+From 4e6bc65efc9ddd08a5328c3680c8e1679a592e00 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 16 Nov 2012 09:29:16 -0600
+Subject: rbd: do not allow remove of mounted-on image
+
+
+From: Alex Elder <elder@inktank.com>
+
+There is no check in rbd_remove() to see if anybody holds open the
+image being removed. That's not cool.
+
+Add a simple open count that goes up and down with opens and closes
+(releases) of the device, and don't allow an rbd image to be removed
+if the count is non-zero.
+
+Protect the updates of the open count value with ctl_mutex to ensure
+the underlying rbd device doesn't get removed while concurrently
+being opened.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(based on commit 42382b709bd1d143b9f0fa93e0a3a1f2f4210707)
+---
+ drivers/block/rbd.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -189,6 +189,7 @@ struct rbd_device {
+
+ /* sysfs related */
+ struct device dev;
++ unsigned long open_count;
+ };
+
+ static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+@@ -249,8 +250,11 @@ static int rbd_open(struct block_device
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
++ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ rbd_get_dev(rbd_dev);
+ set_device_ro(bdev, rbd_dev->read_only);
++ rbd_dev->open_count++;
++ mutex_unlock(&ctl_mutex);
+
+ return 0;
+ }
+@@ -259,7 +263,11 @@ static int rbd_release(struct gendisk *d
+ {
+ struct rbd_device *rbd_dev = disk->private_data;
+
++ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
++ BUG_ON(!rbd_dev->open_count);
++ rbd_dev->open_count--;
+ rbd_put_dev(rbd_dev);
++ mutex_unlock(&ctl_mutex);
+
+ return 0;
+ }
+@@ -2448,6 +2456,11 @@ static ssize_t rbd_remove(struct bus_typ
+ goto done;
+ }
+
++ if (rbd_dev->open_count) {
++ ret = -EBUSY;
++ goto done;
++ }
++
+ __rbd_remove_all_snaps(rbd_dev);
+ rbd_bus_del_dev(rbd_dev);
+
--- /dev/null
+From a85494c915b96289b51989aa718b56338f54c468 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 10 Aug 2012 13:12:07 -0700
+Subject: rbd: drop dev reference on error in rbd_open()
+
+
+From: Alex Elder <elder@inktank.com>
+
+If a read-only rbd device is opened for writing in rbd_open(), it
+returns without dropping the just-acquired device reference.
+
+Fix this by moving the read-only check before getting the reference.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+(cherry picked from commit 340c7a2b2c9a2da640af28a8c196356484ac8b50)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -250,13 +250,12 @@ static int rbd_open(struct block_device
+ {
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+
+- rbd_get_dev(rbd_dev);
+-
+- set_device_ro(bdev, rbd_dev->read_only);
+-
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
++ rbd_get_dev(rbd_dev);
++ set_device_ro(bdev, rbd_dev->read_only);
++
+ return 0;
+ }
+
--- /dev/null
+From a1d89f7052555740954faef81cd2817d6b7c8bae Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@dreamhost.com>
+Date: Mon, 5 Dec 2011 10:35:04 -0800
+Subject: rbd: expose the correct size of the device in sysfs
+
+
+From: Josh Durgin <josh.durgin@dreamhost.com>
+
+If an image was mapped to a snapshot, the size of the head version
+would be shown. Protect capacity with header_rwsem, since it may
+change.
+
+Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit a51aa0c042fa39946dd017d5f91a073300a71577)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1720,6 +1720,8 @@ static int __rbd_update_snaps(struct rbd
+ if (ret < 0)
+ return ret;
+
++ down_write(&rbd_dev->header_rwsem);
++
+ /* resized? */
+ if (rbd_dev->snap_id == CEPH_NOSNAP) {
+ sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
+@@ -1728,8 +1730,6 @@ static int __rbd_update_snaps(struct rbd
+ set_capacity(rbd_dev->disk, size);
+ }
+
+- down_write(&rbd_dev->header_rwsem);
+-
+ snap_seq = rbd_dev->header.snapc->seq;
+ if (rbd_dev->header.total_snaps &&
+ rbd_dev->header.snapc->snaps[0] == snap_seq)
+@@ -1844,8 +1844,13 @@ static ssize_t rbd_size_show(struct devi
+ struct device_attribute *attr, char *buf)
+ {
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
++ sector_t size;
++
++ down_read(&rbd_dev->header_rwsem);
++ size = get_capacity(rbd_dev->disk);
++ up_read(&rbd_dev->header_rwsem);
+
+- return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
++ return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
+ }
+
+ static ssize_t rbd_major_show(struct device *dev,
--- /dev/null
+From 2000a50aa1dbceae96b4bee06e8e1c15f9359e24 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Wed, 10 Oct 2012 21:19:13 -0700
+Subject: rbd: fix bug in rbd_dev_id_put()
+
+
+From: Alex Elder <elder@inktank.com>
+
+In rbd_dev_id_put(), there's a loop that's intended to determine
+the maximum device id in use. But it isn't doing that at all;
+the effect of how it's written is to simply use the just-put id
+number, which ignores the whole purpose of this function.
+
+Fix the bug.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+(cherry picked from commit b213e0b1a62637b2a9395a34349b13d73ca2b90a)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -2150,8 +2150,8 @@ static void rbd_id_put(struct rbd_device
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+- if (rbd_id > max_id)
+- max_id = rbd_id;
++ if (rbd_dev->id > max_id)
++ max_id = rbd_dev->id;
+ }
+ spin_unlock(&rbd_dev_list_lock);
+
--- /dev/null
+From cd9155deb01c86a31836e9a963c208666f5d5abc Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 10 Aug 2012 13:12:10 -0700
+Subject: rbd: kill create_snap sysfs entry
+
+
+From: Alex Elder <elder@inktank.com>
+
+Josh proposed the following change, and I don't think I could
+explain it any better than he did:
+
+ From: Josh Durgin <josh.durgin@inktank.com>
+ Date: Tue, 24 Jul 2012 14:22:11 -0700
+ To: ceph-devel <ceph-devel@vger.kernel.org>
+ Message-ID: <500F1203.9050605@inktank.com>
+ From: Josh Durgin <josh.durgin@inktank.com>
+
+
+ Right now the kernel still has one piece of rbd management
+ duplicated from the rbd command line tool: snapshot creation.
+ There's nothing special about snapshot creation that makes it
+ advantageous to do from the kernel, so I'd like to remove the
+ create_snap sysfs interface. That is,
+ /sys/bus/rbd/devices/<id>/create_snap
+ would be removed.
+
+ Does anyone rely on the sysfs interface for creating rbd
+ snapshots? If so, how hard would it be to replace with:
+
+ rbd snap create pool/image@snap
+
+ Is there any benefit to the sysfs interface that I'm missing?
+
+ Josh
+
+This patch implements this proposal, removing the code that
+implements the "create_snap" sysfs interface for rbd images.
+As a result, quite a lot of other supporting code goes away.
+
+[elder@inktank.com: commented out rbd_req_sync_exec() to avoid warning]
+
+Suggested-by: Josh Durgin <josh.durgin@inktank.com>
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+(based on commit 02cdb02ceab1f3dd9ac2bc899fc51f0e0e744782)
+---
+ Documentation/ABI/testing/sysfs-bus-rbd | 6 -
+ drivers/block/rbd.c | 165 --------------------------------
+ 2 files changed, 2 insertions(+), 169 deletions(-)
+
+--- a/Documentation/ABI/testing/sysfs-bus-rbd
++++ b/Documentation/ABI/testing/sysfs-bus-rbd
+@@ -51,12 +51,6 @@ current_snap
+
+ The current snapshot for which the device is mapped.
+
+-create_snap
+-
+- Create a snapshot:
+-
+- $ echo <snap-name> > /sys/bus/rbd/devices/<dev-id>/snap_create
+-
+ snap_*
+
+ A directory per each snapshot
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -201,10 +201,6 @@ static DEFINE_SPINLOCK(rbd_client_list_l
+
+ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
+ static void rbd_dev_release(struct device *dev);
+-static ssize_t rbd_snap_add(struct device *dev,
+- struct device_attribute *attr,
+- const char *buf,
+- size_t count);
+ static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
+ struct rbd_snap *snap);
+
+@@ -1307,71 +1303,7 @@ static int rbd_req_sync_unwatch(struct r
+ return ret;
+ }
+
+-struct rbd_notify_info {
+- struct rbd_device *dev;
+-};
+-
+-static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+-{
+- struct rbd_device *dev = (struct rbd_device *)data;
+- if (!dev)
+- return;
+-
+- dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
+- notify_id, (int)opcode);
+-}
+-
+-/*
+- * Request sync osd notify
+- */
+-static int rbd_req_sync_notify(struct rbd_device *dev,
+- const char *obj)
+-{
+- struct ceph_osd_req_op *ops;
+- struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
+- struct ceph_osd_event *event;
+- struct rbd_notify_info info;
+- int payload_len = sizeof(u32) + sizeof(u32);
+- int ret;
+-
+- ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
+- if (ret < 0)
+- return ret;
+-
+- info.dev = dev;
+-
+- ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
+- (void *)&info, &event);
+- if (ret < 0)
+- goto fail;
+-
+- ops[0].watch.ver = 1;
+- ops[0].watch.flag = 1;
+- ops[0].watch.cookie = event->cookie;
+- ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
+- ops[0].watch.timeout = 12;
+-
+- ret = rbd_req_sync_op(dev, NULL,
+- CEPH_NOSNAP,
+- 0,
+- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+- ops,
+- 1, obj, 0, 0, NULL, NULL, NULL);
+- if (ret < 0)
+- goto fail_event;
+-
+- ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
+- dout("ceph_osdc_wait_event returned %d\n", ret);
+- rbd_destroy_ops(ops);
+- return 0;
+-
+-fail_event:
+- ceph_osdc_cancel_event(event);
+-fail:
+- rbd_destroy_ops(ops);
+- return ret;
+-}
+-
++#if 0
+ /*
+ * Request sync osd read
+ */
+@@ -1411,6 +1343,7 @@ static int rbd_req_sync_exec(struct rbd_
+ dout("cls_exec returned %d\n", ret);
+ return ret;
+ }
++#endif
+
+ static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
+ {
+@@ -1645,57 +1578,6 @@ out_dh:
+ return rc;
+ }
+
+-/*
+- * create a snapshot
+- */
+-static int rbd_header_add_snap(struct rbd_device *dev,
+- const char *snap_name,
+- gfp_t gfp_flags)
+-{
+- int name_len = strlen(snap_name);
+- u64 new_snapid;
+- int ret;
+- void *data, *p, *e;
+- u64 ver;
+- struct ceph_mon_client *monc;
+-
+- /* we should create a snapshot only if we're pointing at the head */
+- if (dev->snap_id != CEPH_NOSNAP)
+- return -EINVAL;
+-
+- monc = &dev->rbd_client->client->monc;
+- ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
+- dout("created snapid=%lld\n", new_snapid);
+- if (ret < 0)
+- return ret;
+-
+- data = kmalloc(name_len + 16, gfp_flags);
+- if (!data)
+- return -ENOMEM;
+-
+- p = data;
+- e = data + name_len + 16;
+-
+- ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
+- ceph_encode_64_safe(&p, e, new_snapid, bad);
+-
+- ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+- data, p - data, &ver);
+-
+- kfree(data);
+-
+- if (ret < 0)
+- return ret;
+-
+- down_write(&dev->header_rwsem);
+- dev->header.snapc->seq = new_snapid;
+- up_write(&dev->header_rwsem);
+-
+- return 0;
+-bad:
+- return -ERANGE;
+-}
+-
+ static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
+ {
+ struct rbd_snap *snap;
+@@ -1923,7 +1805,6 @@ static DEVICE_ATTR(pool, S_IRUGO, rbd_po
+ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
+ static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
+ static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+-static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
+
+ static struct attribute *rbd_attrs[] = {
+ &dev_attr_size.attr,
+@@ -1933,7 +1814,6 @@ static struct attribute *rbd_attrs[] = {
+ &dev_attr_name.attr,
+ &dev_attr_current_snap.attr,
+ &dev_attr_refresh.attr,
+- &dev_attr_create_snap.attr,
+ NULL
+ };
+
+@@ -2563,47 +2443,6 @@ done:
+ return ret;
+ }
+
+-static ssize_t rbd_snap_add(struct device *dev,
+- struct device_attribute *attr,
+- const char *buf,
+- size_t count)
+-{
+- struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+- int ret;
+- char *name = kmalloc(count + 1, GFP_KERNEL);
+- if (!name)
+- return -ENOMEM;
+-
+- snprintf(name, count, "%s", buf);
+-
+- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+-
+- ret = rbd_header_add_snap(rbd_dev,
+- name, GFP_KERNEL);
+- if (ret < 0)
+- goto err_unlock;
+-
+- ret = __rbd_update_snaps(rbd_dev);
+- if (ret < 0)
+- goto err_unlock;
+-
+- /* shouldn't hold ctl_mutex when notifying.. notify might
+- trigger a watch callback that would need to get that mutex */
+- mutex_unlock(&ctl_mutex);
+-
+- /* make a best effort, don't error if failed */
+- rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
+-
+- ret = count;
+- kfree(name);
+- return ret;
+-
+-err_unlock:
+- mutex_unlock(&ctl_mutex);
+- kfree(name);
+- return ret;
+-}
+-
+ /*
+ * create control files in sysfs
+ * /sys/bus/rbd/...
--- /dev/null
+From 97b2f0e79f5f4171c5e658e46f6b247a98a51389 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Fri, 10 Aug 2012 13:12:07 -0700
+Subject: rbd: kill notify_timeout option
+
+
+From: Alex Elder <elder@inktank.com>
+
+The "notify_timeout" rbd device option is never used, so get rid of
+it.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+(cherry picked from commit 84d34dcc116e117a41c6fc8be13430529fc2d9e7)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 8 --------
+ 1 file changed, 8 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -71,7 +71,6 @@
+ #define DEV_NAME_LEN 32
+ #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
+
+-#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
+ #define RBD_READ_ONLY_DEFAULT false
+
+ /*
+@@ -95,7 +94,6 @@ struct rbd_image_header {
+ };
+
+ struct rbd_options {
+- int notify_timeout;
+ bool read_only;
+ };
+
+@@ -343,7 +341,6 @@ static struct rbd_client *__rbd_client_f
+ * mount options
+ */
+ enum {
+- Opt_notify_timeout,
+ Opt_last_int,
+ /* int args above */
+ Opt_last_string,
+@@ -355,7 +352,6 @@ enum {
+ };
+
+ static match_table_t rbdopt_tokens = {
+- {Opt_notify_timeout, "notify_timeout=%d"},
+ /* int args above */
+ /* string args above */
+ {Opt_read_only, "read_only"},
+@@ -394,9 +390,6 @@ static int parse_rbd_opts_token(char *c,
+ }
+
+ switch (token) {
+- case Opt_notify_timeout:
+- rbdopt->notify_timeout = intval;
+- break;
+ case Opt_read_only:
+ rbdopt->read_only = true;
+ break;
+@@ -425,7 +418,6 @@ static struct rbd_client *rbd_get_client
+ if (!rbd_opts)
+ return ERR_PTR(-ENOMEM);
+
+- rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
+ rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+
+ opt = ceph_parse_options(options, mon_addr,
--- /dev/null
+From 9aa61f22078fc2ccaf1a501b98659e85924a1c91 Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@inktank.com>
+Date: Mon, 21 Nov 2011 17:13:54 -0800
+Subject: rbd: only reset capacity when pointing to head
+
+
+From: Josh Durgin <josh.durgin@inktank.com>
+
+Snapshots cannot be resized, and the new capacity of head should not
+be reflected by the snapshot.
+
+Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 474ef7ce832d471148f63a9d07f67fc5564834f1)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1721,7 +1721,12 @@ static int __rbd_update_snaps(struct rbd
+ return ret;
+
+ /* resized? */
+- set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
++ if (rbd_dev->snap_id == CEPH_NOSNAP) {
++ sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
++
++ dout("setting size to %llu sectors", (unsigned long long) size);
++ set_capacity(rbd_dev->disk, size);
++ }
+
+ down_write(&rbd_dev->header_rwsem);
+
--- /dev/null
+From 38a10d2304baec3e84895517ba674783bc4e1b07 Mon Sep 17 00:00:00 2001
+From: Alex Elder <elder@inktank.com>
+Date: Thu, 6 Dec 2012 09:37:23 -0600
+Subject: rbd: remove linger unconditionally
+
+
+From: Alex Elder <elder@inktank.com>
+
+In __unregister_linger_request(), the request is being removed
+from the osd client's req_linger list only when the request
+has a non-null osd pointer. It should be done whether or not
+the request currently has an osd.
+
+This is most likely a non-issue because I believe the request
+will always have an osd when this function is called.
+
+Signed-off-by: Alex Elder <elder@inktank.com>
+Reviewed-by: Sage Weil <sage@inktank.com>
+(cherry picked from commit 61c74035626beb25a39b0273ccf7d75510bc36a1)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ceph/osd_client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ceph/osd_client.c
++++ b/net/ceph/osd_client.c
+@@ -905,8 +905,8 @@ static void __unregister_linger_request(
+ struct ceph_osd_request *req)
+ {
+ dout("__unregister_linger_request %p\n", req);
++ list_del_init(&req->r_linger_item);
+ if (req->r_osd) {
+- list_del_init(&req->r_linger_item);
+ list_del_init(&req->r_linger_osd);
+
+ if (list_empty(&req->r_osd->o_requests) &&
--- /dev/null
+From 42febe4f05ffcdb44903da51a438fe9a6a4fb96e Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@inktank.com>
+Date: Mon, 21 Nov 2011 18:14:25 -0800
+Subject: rbd: return errors for mapped but deleted snapshot
+
+
+From: Josh Durgin <josh.durgin@inktank.com>
+
+When a snapshot is deleted, the OSD will return ENOENT when reading
+from it. This is normally interpreted as a hole by rbd, which will
+return zeroes. To minimize the time in which this can happen, stop
+requests early when we are notified that our snapshot no longer
+exists.
+
+[elder@inktank.com: updated __rbd_init_snaps_header() logic]
+
+Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit e88a36ec961b8c1899c59c5e4ae35a318c0209d3)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+Conflicts:
+
+ drivers/block/rbd.c
+---
+ drivers/block/rbd.c | 32 ++++++++++++++++++++++++++++++--
+ 1 file changed, 30 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -174,9 +174,13 @@ struct rbd_device {
+
+ /* protects updating the header */
+ struct rw_semaphore header_rwsem;
++ /* name of the snapshot this device reads from */
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
++ /* id of the snapshot this device reads from */
+ u64 snap_id; /* current snapshot id */
+- int read_only;
++ /* whether the snap_id this device reads from still exists */
++ bool snap_exists;
++ int read_only;
+
+ struct list_head node;
+
+@@ -590,6 +594,7 @@ static int rbd_header_set_snap(struct rb
+ else
+ snapc->seq = 0;
+ dev->snap_id = CEPH_NOSNAP;
++ dev->snap_exists = false;
+ dev->read_only = 0;
+ if (size)
+ *size = header->image_size;
+@@ -598,6 +603,7 @@ static int rbd_header_set_snap(struct rb
+ if (ret < 0)
+ goto done;
+ dev->snap_id = snapc->seq;
++ dev->snap_exists = true;
+ dev->read_only = 1;
+ }
+
+@@ -1466,6 +1472,21 @@ static void rbd_rq_fn(struct request_que
+
+ spin_unlock_irq(q->queue_lock);
+
++ if (rbd_dev->snap_id != CEPH_NOSNAP) {
++ bool snap_exists;
++
++ down_read(&rbd_dev->header_rwsem);
++ snap_exists = rbd_dev->snap_exists;
++ up_read(&rbd_dev->header_rwsem);
++
++ if (!snap_exists) {
++ dout("request for non-existent snapshot");
++ spin_lock_irq(q->queue_lock);
++ __blk_end_request_all(rq, -ENXIO);
++ continue;
++ }
++ }
++
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * SECTOR_SIZE);
+@@ -2069,7 +2090,14 @@ static int __rbd_init_snaps_header(struc
+ cur_id = rbd_dev->header.snapc->snaps[i - 1];
+
+ if (!i || old_snap->id < cur_id) {
+- /* old_snap->id was skipped, thus was removed */
++ /*
++ * old_snap->id was skipped, thus was
++ * removed. If this rbd_dev is mapped to
++ * the removed snapshot, record that it no
++ * longer exists, to prevent further I/O.
++ */
++ if (rbd_dev->snap_id == old_snap->id)
++ rbd_dev->snap_exists = false;
+ __rbd_remove_snap_dev(rbd_dev, old_snap);
+ continue;
+ }
--- /dev/null
+From 9c916f5d870e8601b49052be9e451734a333ed23 Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@dreamhost.com>
+Date: Mon, 5 Dec 2011 18:10:44 -0800
+Subject: rbd: send header version when notifying
+
+
+From: Josh Durgin <josh.durgin@dreamhost.com>
+
+Previously the original header version was sent. Now, we update it
+when the header changes.
+
+Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit a71b891bc7d77a070e723c8c53d1dd73cf931555)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1195,7 +1195,7 @@ static int rbd_req_sync_notify_ack(struc
+ if (ret < 0)
+ return ret;
+
+- ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
++ ops[0].watch.ver = cpu_to_le64(ver);
+ ops[0].watch.cookie = notify_id;
+ ops[0].watch.flag = 0;
+
+@@ -1215,6 +1215,7 @@ static int rbd_req_sync_notify_ack(struc
+ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+ {
+ struct rbd_device *dev = (struct rbd_device *)data;
++ u64 hver;
+ int rc;
+
+ if (!dev)
+@@ -1224,12 +1225,13 @@ static void rbd_watch_cb(u64 ver, u64 no
+ notify_id, (int)opcode);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ rc = __rbd_update_snaps(dev);
++ hver = dev->header.obj_version;
+ mutex_unlock(&ctl_mutex);
+ if (rc)
+ pr_warning(RBD_DRV_NAME "%d got notification but failed to "
+ " update snaps: %d\n", dev->major, rc);
+
+- rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
++ rbd_req_sync_notify_ack(dev, hver, notify_id, dev->obj_md_name);
+ }
+
+ /*
+@@ -1740,6 +1742,7 @@ static int __rbd_update_snaps(struct rbd
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
++ rbd_dev->header.obj_version = h.obj_version;
+ rbd_dev->header.image_size = h.image_size;
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
--- /dev/null
+From 1b5cd1af449c8112b751e4820203829173ec47c6 Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@dreamhost.com>
+Date: Mon, 5 Dec 2011 10:41:28 -0800
+Subject: rbd: set image size when header is updated
+
+
+From: Josh Durgin <josh.durgin@dreamhost.com>
+
+The image may have been resized.
+
+Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit 93a24e084d67ba2fcb9a4c289135825b623ec864)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1741,6 +1741,7 @@ static int __rbd_update_snaps(struct rbd
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
++ rbd_dev->header.image_size = h.image_size;
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
+ rbd_dev->header.snap_names = h.snap_names;
--- /dev/null
+From aabc9ab77d676d758db56a08f4de31ff8c6b1bc7 Mon Sep 17 00:00:00 2001
+From: Josh Durgin <josh.durgin@dreamhost.com>
+Date: Mon, 5 Dec 2011 14:03:05 -0800
+Subject: rbd: use reference counting for the snap context
+
+
+From: Josh Durgin <josh.durgin@dreamhost.com>
+
+This prevents a race between requests with a given snap context and
+header updates that free it. The osd client was already expecting the
+snap context to be reference counted, since it get()s it in
+ceph_osdc_build_request and put()s it when the request completes.
+
+Also remove the second down_read()/up_read() on header_rwsem in
+rbd_do_request, which wasn't actually preventing this race or
+protecting any other data.
+
+Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
+Reviewed-by: Alex Elder <elder@inktank.com>
+(cherry picked from commit d1d25646543134d756a02ffe4e02073faa761f2c)
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c | 35 +++++++++++++++++------------------
+ 1 file changed, 17 insertions(+), 18 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -615,7 +615,7 @@ done:
+
+ static void rbd_header_free(struct rbd_image_header *header)
+ {
+- kfree(header->snapc);
++ ceph_put_snap_context(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+ }
+@@ -893,13 +893,10 @@ static int rbd_do_request(struct request
+
+ dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
+
+- down_read(&dev->header_rwsem);
+-
+ osdc = &dev->rbd_client->client->osdc;
+ req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+ false, GFP_NOIO, pages, bio);
+ if (!req) {
+- up_read(&dev->header_rwsem);
+ ret = -ENOMEM;
+ goto done_pages;
+ }
+@@ -934,7 +931,6 @@ static int rbd_do_request(struct request
+ snapc,
+ &mtime,
+ req->r_oid, req->r_oid_len);
+- up_read(&dev->header_rwsem);
+
+ if (linger_req) {
+ ceph_osdc_set_request_linger(osdc, req);
+@@ -1446,6 +1442,7 @@ static void rbd_rq_fn(struct request_que
+ u64 ofs;
+ int num_segs, cur_seg = 0;
+ struct rbd_req_coll *coll;
++ struct ceph_snap_context *snapc;
+
+ /* peek at request from block layer */
+ if (!rq)
+@@ -1472,21 +1469,20 @@ static void rbd_rq_fn(struct request_que
+
+ spin_unlock_irq(q->queue_lock);
+
+- if (rbd_dev->snap_id != CEPH_NOSNAP) {
+- bool snap_exists;
++ down_read(&rbd_dev->header_rwsem);
+
+- down_read(&rbd_dev->header_rwsem);
+- snap_exists = rbd_dev->snap_exists;
++ if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
+ up_read(&rbd_dev->header_rwsem);
+-
+- if (!snap_exists) {
+- dout("request for non-existent snapshot");
+- spin_lock_irq(q->queue_lock);
+- __blk_end_request_all(rq, -ENXIO);
+- continue;
+- }
++ dout("request for non-existent snapshot");
++ spin_lock_irq(q->queue_lock);
++ __blk_end_request_all(rq, -ENXIO);
++ continue;
+ }
+
++ snapc = ceph_get_snap_context(rbd_dev->header.snapc);
++
++ up_read(&rbd_dev->header_rwsem);
++
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * SECTOR_SIZE);
+@@ -1496,6 +1492,7 @@ static void rbd_rq_fn(struct request_que
+ if (!coll) {
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENOMEM);
++ ceph_put_snap_context(snapc);
+ continue;
+ }
+
+@@ -1519,7 +1516,7 @@ static void rbd_rq_fn(struct request_que
+ /* init OSD command: write or read */
+ if (do_write)
+ rbd_req_write(rq, rbd_dev,
+- rbd_dev->header.snapc,
++ snapc,
+ ofs,
+ op_size, bio,
+ coll, cur_seg);
+@@ -1542,6 +1539,8 @@ next_seg:
+ if (bp)
+ bio_pair_release(bp);
+ spin_lock_irq(q->queue_lock);
++
++ ceph_put_snap_context(snapc);
+ }
+ }
+
+@@ -1737,7 +1736,7 @@ static int __rbd_update_snaps(struct rbd
+ if head moves */
+ follow_seq = 1;
+
+- kfree(rbd_dev->header.snapc);
++ ceph_put_snap_context(rbd_dev->header.snapc);
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
0019-drm-i915-call-drm_handle_vblank-before-finish_page_f.patch
0020-drm-i915-Flush-the-pending-flips-on-the-CRTC-before-.patch
revert-drm-i915-no-lvds-quirk-for-zotac-zdbox-sd-id12-id13.patch
+ceph-close-old-con-before-reopening-on-mds-reconnect.patch
+rbd-return-errors-for-mapped-but-deleted-snapshot.patch
+rbd-only-reset-capacity-when-pointing-to-head.patch
+rbd-expose-the-correct-size-of-the-device-in-sysfs.patch
+rbd-set-image-size-when-header-is-updated.patch
+rbd-use-reference-counting-for-the-snap-context.patch
+rbd-send-header-version-when-notifying.patch
+ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch
+rbd-drop-dev-reference-on-error-in-rbd_open.patch
+ceph-propagate-layout-error-on-osd-request-creation.patch
+libceph-socket-can-close-in-any-connection-state.patch
+libceph-report-connection-fault-with-warning.patch
+libceph-init-osd-o_node-in-create_osd.patch
+libceph-init-event-node-in-ceph_osdc_create_event.patch
+libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch
+libceph-register-request-before-unregister-linger.patch
+libceph-move-linger-requests-sooner-in-kick_requests.patch
+libceph-always-reset-osds-when-kicking.patch
+libceph-warn-don-t-bug-on-unexpected-connection-states.patch
+libceph-fix-protocol-feature-mismatch-failure-path.patch
+libceph-fix-osdmap-decode-error-paths.patch
+libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch
+rbd-kill-create_snap-sysfs-entry.patch
+rbd-add-read_only-rbd-map-option.patch
+rbd-kill-notify_timeout-option.patch
+libceph-remove-osdtimeout-option.patch
+ceph-don-t-reference-req-after-put.patch
+rbd-remove-linger-unconditionally.patch
+rbd-bug-on-invalid-layout.patch
+rbd-fix-bug-in-rbd_dev_id_put.patch
+rbd-do-not-allow-remove-of-mounted-on-image.patch
+ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch
+ceph-fix-infinite-loop-in-__wake_requests.patch
+ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch
+ceph-fix-__ceph_do_pending_vmtruncate.patch
+ceph-call-handle_cap_grant-for-cap-import-message.patch
+libceph-unlock-unprocessed-pages-in-start_read-error-path.patch