From: Greg Kroah-Hartman Date: Tue, 15 Jan 2013 18:13:22 +0000 (-0800) Subject: 3.4-stable patches X-Git-Tag: v3.7.3~14 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=a90bfb00b971aa2d8d5259ac876337d10b529196;p=thirdparty%2Fkernel%2Fstable-queue.git 3.4-stable patches added patches: ceph-call-handle_cap_grant-for-cap-import-message.patch ceph-close-old-con-before-reopening-on-mds-reconnect.patch ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch ceph-don-t-reference-req-after-put.patch ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch ceph-fix-__ceph_do_pending_vmtruncate.patch ceph-fix-infinite-loop-in-__wake_requests.patch ceph-propagate-layout-error-on-osd-request-creation.patch ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch libceph-always-reset-osds-when-kicking.patch libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch libceph-fix-osdmap-decode-error-paths.patch libceph-fix-protocol-feature-mismatch-failure-path.patch libceph-init-event-node-in-ceph_osdc_create_event.patch libceph-init-osd-o_node-in-create_osd.patch libceph-move-linger-requests-sooner-in-kick_requests.patch libceph-register-request-before-unregister-linger.patch libceph-remove-osdtimeout-option.patch libceph-report-connection-fault-with-warning.patch libceph-socket-can-close-in-any-connection-state.patch libceph-unlock-unprocessed-pages-in-start_read-error-path.patch libceph-warn-don-t-bug-on-unexpected-connection-states.patch rbd-add-read_only-rbd-map-option.patch rbd-bug-on-invalid-layout.patch rbd-do-not-allow-remove-of-mounted-on-image.patch rbd-drop-dev-reference-on-error-in-rbd_open.patch rbd-expose-the-correct-size-of-the-device-in-sysfs.patch rbd-fix-bug-in-rbd_dev_id_put.patch rbd-kill-create_snap-sysfs-entry.patch rbd-kill-notify_timeout-option.patch rbd-only-reset-capacity-when-pointing-to-head.patch rbd-remove-linger-unconditionally.patch 
rbd-return-errors-for-mapped-but-deleted-snapshot.patch rbd-send-header-version-when-notifying.patch rbd-set-image-size-when-header-is-updated.patch rbd-use-reference-counting-for-the-snap-context.patch --- diff --git a/queue-3.4/ceph-call-handle_cap_grant-for-cap-import-message.patch b/queue-3.4/ceph-call-handle_cap_grant-for-cap-import-message.patch new file mode 100644 index 00000000000..fe02695dbc3 --- /dev/null +++ b/queue-3.4/ceph-call-handle_cap_grant-for-cap-import-message.patch @@ -0,0 +1,49 @@ +From bbe2e24262afd42a08e2fbc4fb1c134b94f64a57 Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Mon, 19 Nov 2012 10:49:09 +0800 +Subject: ceph: call handle_cap_grant() for cap import message + + +From: "Yan, Zheng" + +If client sends cap message that requests new max size during +exporting caps, the exporting MDS will drop the message quietly. +So the client may wait for the reply that updates the max size +forever. call handle_cap_grant() for cap import message can +avoid this issue. 
+ +Signed-off-by: Yan, Zheng +Signed-off-by: Sage Weil +(cherry picked from commit 0e5e1774a92e6fe9c511585de8f078b4c4c68dbb) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/caps.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ceph/caps.c ++++ b/fs/ceph/caps.c +@@ -2749,6 +2749,7 @@ static void handle_cap_import(struct cep + + /* make sure we re-request max_size, if necessary */ + spin_lock(&ci->i_ceph_lock); ++ ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + } +@@ -2844,8 +2845,6 @@ void ceph_handle_caps(struct ceph_mds_se + case CEPH_CAP_OP_IMPORT: + handle_cap_import(mdsc, inode, h, session, + snaptrace, snaptrace_len); +- ceph_check_caps(ceph_inode(inode), 0, session); +- goto done_unlocked; + } + + /* the rest require a cap */ +@@ -2862,6 +2861,7 @@ void ceph_handle_caps(struct ceph_mds_se + switch (op) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: ++ case CEPH_CAP_OP_IMPORT: + handle_cap_grant(inode, h, session, cap, msg->middle); + goto done_unlocked; + diff --git a/queue-3.4/ceph-close-old-con-before-reopening-on-mds-reconnect.patch b/queue-3.4/ceph-close-old-con-before-reopening-on-mds-reconnect.patch new file mode 100644 index 00000000000..9211984a145 --- /dev/null +++ b/queue-3.4/ceph-close-old-con-before-reopening-on-mds-reconnect.patch @@ -0,0 +1,31 @@ +From c9a3f6ab3490925ecc0714a5e4fd4c8b0a110bc4 Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Mon, 30 Jul 2012 16:21:17 -0700 +Subject: ceph: close old con before reopening on mds reconnect + + +From: Sage Weil + +When we detect a mds session reset, close the old ceph_connection before +reopening it. This ensures we clean up the old socket properly and keep +the ceph_connection state correct. 
+ +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +Reviewed-by: Yehuda Sadeh +(cherry picked from commit a53aab645c82f0146e35684b34692c69b5118121) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/mds_client.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/ceph/mds_client.c ++++ b/fs/ceph/mds_client.c +@@ -2528,6 +2528,7 @@ static void send_mds_reconnect(struct ce + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; + ++ ceph_con_close(&session->s_con); + ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); diff --git a/queue-3.4/ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch b/queue-3.4/ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch new file mode 100644 index 00000000000..af31f903620 --- /dev/null +++ b/queue-3.4/ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch @@ -0,0 +1,40 @@ +From 8677d84432bc48ae52d6fc07e4af459b8b6aaeb4 Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Mon, 19 Nov 2012 10:49:07 +0800 +Subject: ceph: Don't add dirty inode to dirty list if caps is in migration + + +From: "Yan, Zheng" + +Add dirty inode to cap_dirty_migrating list instead, this can avoid +ceph_flush_dirty_caps() entering infinite loop. 
+ +Signed-off-by: Yan, Zheng +Signed-off-by: Sage Weil +(cherry picked from commit 0685235ffd9dbdb9ccbda587f8a3c83ad1d5a921) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/caps.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/ceph/caps.c ++++ b/fs/ceph/caps.c +@@ -1349,11 +1349,15 @@ int __ceph_mark_dirty_caps(struct ceph_i + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); +- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, +- ci->i_head_snapc); ++ dout(" inode %p now dirty snapc %p auth cap %p\n", ++ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + BUG_ON(!list_empty(&ci->i_dirty_item)); + spin_lock(&mdsc->cap_dirty_lock); +- list_add(&ci->i_dirty_item, &mdsc->cap_dirty); ++ if (ci->i_auth_cap) ++ list_add(&ci->i_dirty_item, &mdsc->cap_dirty); ++ else ++ list_add(&ci->i_dirty_item, ++ &mdsc->cap_dirty_migrating); + spin_unlock(&mdsc->cap_dirty_lock); + if (ci->i_flushing_caps == 0) { + ihold(inode); diff --git a/queue-3.4/ceph-don-t-reference-req-after-put.patch b/queue-3.4/ceph-don-t-reference-req-after-put.patch new file mode 100644 index 00000000000..859b7a0bc0e --- /dev/null +++ b/queue-3.4/ceph-don-t-reference-req-after-put.patch @@ -0,0 +1,37 @@ +From f54e923eff7ca2a1711023d39dcd40889f6407a4 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 29 Nov 2012 08:37:03 -0600 +Subject: ceph: don't reference req after put + + +From: Alex Elder + +In __unregister_request(), there is a call to list_del_init() +referencing a request that was the subject of a call to +ceph_osdc_put_request() on the previous line. This is not +safe, because the request structure could have been freed +by the time we reach the list_del_init(). + +Fix this by reversing the order of these lines. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 7d5f24812bd182a2471cb69c1c2baf0648332e1f) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -871,9 +871,9 @@ static void __unregister_request(struct + req->r_osd = NULL; + } + ++ list_del_init(&req->r_req_lru_item); + ceph_osdc_put_request(req); + +- list_del_init(&req->r_req_lru_item); + if (osdc->num_requests == 0) { + dout(" no requests, canceling timeout\n"); + __cancel_osd_timeout(osdc); diff --git a/queue-3.4/ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch b/queue-3.4/ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch new file mode 100644 index 00000000000..c11687f84f5 --- /dev/null +++ b/queue-3.4/ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch @@ -0,0 +1,29 @@ +From 50c532cd7abb2054f5bb045244cd9a561b7e70ff Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Mon, 19 Nov 2012 10:49:04 +0800 +Subject: ceph: Don't update i_max_size when handling non-auth cap + + +From: "Yan, Zheng" + +The cap from non-auth mds doesn't have a meaningful max_size value. + +Signed-off-by: Yan, Zheng +Signed-off-by: Sage Weil +(cherry picked from commit 5e62ad30157d0da04cf40c6d1a2f4bc840948b9c) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/caps.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ceph/caps.c ++++ b/fs/ceph/caps.c +@@ -2388,7 +2388,7 @@ static void handle_cap_grant(struct inod + &atime); + + /* max size increase? 
*/ +- if (max_size != ci->i_max_size) { ++ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { diff --git a/queue-3.4/ceph-fix-__ceph_do_pending_vmtruncate.patch b/queue-3.4/ceph-fix-__ceph_do_pending_vmtruncate.patch new file mode 100644 index 00000000000..8e33a12f512 --- /dev/null +++ b/queue-3.4/ceph-fix-__ceph_do_pending_vmtruncate.patch @@ -0,0 +1,54 @@ +From 379ad3e7100d3c1deebb150af4dc38b9f4e90006 Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Mon, 19 Nov 2012 10:49:08 +0800 +Subject: ceph: Fix __ceph_do_pending_vmtruncate + + +From: "Yan, Zheng" + +we should set i_truncate_pending to 0 after page cache is truncated +to i_truncate_size + +Signed-off-by: Yan, Zheng +Signed-off-by: Sage Weil +(cherry picked from commit a85f50b6ef93fbbb2ae932ce9b2376509d172796) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/inode.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +--- a/fs/ceph/inode.c ++++ b/fs/ceph/inode.c +@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct + { + struct ceph_inode_info *ci = ceph_inode(inode); + u64 to; +- int wrbuffer_refs, wake = 0; ++ int wrbuffer_refs, finish = 0; + + retry: + spin_lock(&ci->i_ceph_lock); +@@ -1498,15 +1498,18 @@ retry: + truncate_inode_pages(inode->i_mapping, to); + + spin_lock(&ci->i_ceph_lock); +- ci->i_truncate_pending--; +- if (ci->i_truncate_pending == 0) +- wake = 1; ++ if (to == ci->i_truncate_size) { ++ ci->i_truncate_pending = 0; ++ finish = 1; ++ } + spin_unlock(&ci->i_ceph_lock); ++ if (!finish) ++ goto retry; + + if (wrbuffer_refs == 0) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); +- if (wake) +- wake_up_all(&ci->i_cap_wq); ++ ++ wake_up_all(&ci->i_cap_wq); + } + + diff --git a/queue-3.4/ceph-fix-infinite-loop-in-__wake_requests.patch b/queue-3.4/ceph-fix-infinite-loop-in-__wake_requests.patch new file mode 100644 index 
00000000000..a18d6185950 --- /dev/null +++ b/queue-3.4/ceph-fix-infinite-loop-in-__wake_requests.patch @@ -0,0 +1,40 @@ +From c9c3fd311561a922ebbd999f3ad00b5f907000c2 Mon Sep 17 00:00:00 2001 +From: "Yan, Zheng" +Date: Mon, 19 Nov 2012 10:49:06 +0800 +Subject: ceph: Fix infinite loop in __wake_requests + + +From: "Yan, Zheng" + +__wake_requests() will enter infinite loop if we use it to wake +requests in the session->s_waiting list. __wake_requests() deletes +requests from the list and __do_request() adds requests back to +the list. + +Signed-off-by: Yan, Zheng +Signed-off-by: Sage Weil +(cherry picked from commit ed75ec2cd19b47efcd292b6e23f58e56f4c5bc34) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/mds_client.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/fs/ceph/mds_client.c ++++ b/fs/ceph/mds_client.c +@@ -1886,9 +1886,14 @@ finish: + static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head) + { +- struct ceph_mds_request *req, *nreq; ++ struct ceph_mds_request *req; ++ LIST_HEAD(tmp_list); + +- list_for_each_entry_safe(req, nreq, head, r_wait) { ++ list_splice_init(head, &tmp_list); ++ ++ while (!list_empty(&tmp_list)) { ++ req = list_entry(tmp_list.next, ++ struct ceph_mds_request, r_wait); + list_del_init(&req->r_wait); + __do_request(mdsc, req); + } diff --git a/queue-3.4/ceph-propagate-layout-error-on-osd-request-creation.patch b/queue-3.4/ceph-propagate-layout-error-on-osd-request-creation.patch new file mode 100644 index 00000000000..23b7eca1253 --- /dev/null +++ b/queue-3.4/ceph-propagate-layout-error-on-osd-request-creation.patch @@ -0,0 +1,106 @@ +From 7dab35042aab340d087737d42c2fae34af0b5c78 Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Mon, 24 Sep 2012 21:01:02 -0700 +Subject: ceph: propagate layout error on osd request creation + + +From: Sage Weil + +If we are creating an osd request and get an invalid layout, return +an EINVAL to the caller. 
We switch up the return to have an error +code instead of NULL implying -ENOMEM. + +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(cherry picked from commit 6816282dab3a72efe8c0d182c1bc2960d87f4322) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/addr.c | 8 ++++---- + fs/ceph/file.c | 4 ++-- + net/ceph/osd_client.c | 15 +++++++++------ + 3 files changed, 15 insertions(+), 12 deletions(-) + +--- a/fs/ceph/addr.c ++++ b/fs/ceph/addr.c +@@ -308,8 +308,8 @@ static int start_read(struct inode *inod + NULL, 0, + ci->i_truncate_seq, ci->i_truncate_size, + NULL, false, 1, 0); +- if (!req) +- return -ENOMEM; ++ if (IS_ERR(req)) ++ return PTR_ERR(req); + + /* build page vector */ + nr_pages = len >> PAGE_CACHE_SHIFT; +@@ -831,8 +831,8 @@ get_more_pages: + ci->i_truncate_size, + &inode->i_mtime, true, 1, 0); + +- if (!req) { +- rc = -ENOMEM; ++ if (IS_ERR(req)) { ++ rc = PTR_ERR(req); + unlock_page(page); + break; + } +--- a/fs/ceph/file.c ++++ b/fs/ceph/file.c +@@ -529,8 +529,8 @@ more: + do_sync, + ci->i_truncate_seq, ci->i_truncate_size, + &mtime, false, 2, page_align); +- if (!req) +- return -ENOMEM; ++ if (IS_ERR(req)) ++ return PTR_ERR(req); + + if (file->f_flags & O_DIRECT) { + pages = ceph_get_direct_page_vector(data, num_pages, false); +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -461,6 +461,7 @@ struct ceph_osd_request *ceph_osdc_new_r + { + struct ceph_osd_req_op ops[3]; + struct ceph_osd_request *req; ++ int r; + + ops[0].op = opcode; + ops[0].extent.truncate_seq = truncate_seq; +@@ -479,10 +480,12 @@ struct ceph_osd_request *ceph_osdc_new_r + use_mempool, + GFP_NOFS, NULL, NULL); + if (!req) +- return NULL; ++ return ERR_PTR(-ENOMEM); + + /* calculate max write size */ +- calc_layout(osdc, vino, layout, off, plen, req, ops); ++ r = calc_layout(osdc, vino, layout, off, plen, req, ops); ++ if (r < 0) ++ return ERR_PTR(r); + req->r_file_layout = *layout; /* keep a copy */ + + /* in case it differs from natural (file) alignment that +@@ 
-1925,8 +1928,8 @@ int ceph_osdc_readpages(struct ceph_osd_ + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, 0, truncate_seq, truncate_size, NULL, + false, 1, page_align); +- if (!req) +- return -ENOMEM; ++ if (IS_ERR(req)) ++ return PTR_ERR(req); + + /* it may be a short read due to an object boundary */ + req->r_pages = pages; +@@ -1968,8 +1971,8 @@ int ceph_osdc_writepages(struct ceph_osd + snapc, do_sync, + truncate_seq, truncate_size, mtime, + nofail, 1, page_align); +- if (!req) +- return -ENOMEM; ++ if (IS_ERR(req)) ++ return PTR_ERR(req); + + /* it may be a short write due to an object boundary */ + req->r_pages = pages; diff --git a/queue-3.4/ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch b/queue-3.4/ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch new file mode 100644 index 00000000000..42894630ea2 --- /dev/null +++ b/queue-3.4/ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch @@ -0,0 +1,51 @@ +From ecb6de0b8f805a901457390c4433a923411e139d Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Tue, 21 Aug 2012 15:55:25 -0700 +Subject: ceph: tolerate (and warn on) extraneous dentry from mds + + +From: Sage Weil + +If the MDS gives us a dentry and we weren't prepared to handle it, +WARN_ON_ONCE instead of crashing. 
+ +Reported-by: Yan, Zheng +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(cherry picked from commit 6c5e50fa614fea5325a2973be06f7ec6f1055316) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/inode.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +--- a/fs/ceph/inode.c ++++ b/fs/ceph/inode.c +@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block * + if (rinfo->head->is_dentry) { + struct inode *dir = req->r_locked_dir; + +- err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, +- session, req->r_request_started, -1, +- &req->r_caps_reservation); +- if (err < 0) +- return err; ++ if (dir) { ++ err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, ++ session, req->r_request_started, -1, ++ &req->r_caps_reservation); ++ if (err < 0) ++ return err; ++ } else { ++ WARN_ON_ONCE(1); ++ } + } + + /* +@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block * + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && ++ req->r_locked_dir && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + fsc->mount_options->snapdir_name, + req->r_dentry->d_name.len))) { diff --git a/queue-3.4/libceph-always-reset-osds-when-kicking.patch b/queue-3.4/libceph-always-reset-osds-when-kicking.patch new file mode 100644 index 00000000000..f02cbb04083 --- /dev/null +++ b/queue-3.4/libceph-always-reset-osds-when-kicking.patch @@ -0,0 +1,60 @@ +From 5cad941fcd34022b5c0b7475c88ff618f2db659f Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Wed, 26 Dec 2012 14:31:40 -0600 +Subject: libceph: always reset osds when kicking + + +From: Alex Elder + +When ceph_osdc_handle_map() is called to process a new osd map, +kick_requests() is called to ensure all affected requests are +updated if necessary to reflect changes in the osd map. This +happens in two cases: whenever an incremental map update is +processed; and when a full map update (or the last one if there is +more than one) gets processed. 
+ +In the former case, the kick_requests() call is followed immediately +by a call to reset_changed_osds() to ensure any connections to osds +affected by the map change are reset. But for full map updates +this isn't done. + +Both cases should be doing this osd reset. + +Rather than duplicating the reset_changed_osds() call, move it into +the end of kick_requests(). + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit e6d50f67a6b1a6252a616e6e629473b5c4277218) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -1306,7 +1306,7 @@ static void reset_changed_osds(struct ce + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. + * +- * Caller should hold map_sem for read and request_mutex. ++ * Caller should hold map_sem for read. + */ + static void kick_requests(struct ceph_osd_client *osdc, int force_resend) + { +@@ -1381,6 +1381,7 @@ static void kick_requests(struct ceph_os + dout("%d requests for down osds, need new map\n", needmap); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } ++ reset_changed_osds(osdc); + } + + +@@ -1437,7 +1438,6 @@ void ceph_osdc_handle_map(struct ceph_os + osdc->osdmap = newmap; + } + kick_requests(osdc, 0); +- reset_changed_osds(osdc); + } else { + dout("ignoring incremental map %u len %d\n", + epoch, maplen); diff --git a/queue-3.4/libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch b/queue-3.4/libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch new file mode 100644 index 00000000000..62ff00c32b3 --- /dev/null +++ b/queue-3.4/libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch @@ -0,0 +1,47 @@ +From 3c27b4c0960d284d78b356be6a59ef6cff5a2274 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 7 Dec 2012 09:57:58 -0600 +Subject: libceph: avoid using freed osd in __kick_osd_requests() + + +From: 
Alex Elder + +If an osd has no requests and no linger requests, __reset_osd() +will just remove it with a call to __remove_osd(). That drops +a reference to the osd, and therefore the osd may have been freed +by the time __reset_osd() returns. That function offers no +indication this may have occurred, and as a result the osd will +continue to be used even when it's no longer valid. + +Change __reset_osd() so it returns an error (ENODEV) when it +deletes the osd being reset. And change __kick_osd_requests() so it +returns immediately (before referencing osd again) if __reset_osd() +returns *any* error. + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 685a7555ca69030739ddb57a47f0ea8ea80196a4) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -579,7 +579,7 @@ static void __kick_osd_requests(struct c + + dout("__kick_osd_requests osd%d\n", osd->o_osd); + err = __reset_osd(osdc, osd); +- if (err == -EAGAIN) ++ if (err) + return; + + list_for_each_entry(req, &osd->o_requests, r_osd_item) { +@@ -750,6 +750,7 @@ static int __reset_osd(struct ceph_osd_c + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { + __remove_osd(osdc, osd); ++ ret = -ENODEV; + } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], + &osd->o_con.peer_addr, + sizeof(osd->o_con.peer_addr)) == 0 && diff --git a/queue-3.4/libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch b/queue-3.4/libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch new file mode 100644 index 00000000000..652e0656ea3 --- /dev/null +++ b/queue-3.4/libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch @@ -0,0 +1,36 @@ +From 072373ec146bbeeb0c17b67c285b39c41dc91765 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Mon, 17 Dec 2012 12:23:48 -0600 +Subject: libceph: don't use rb_init_node() in 
ceph_osdc_alloc_request() + + +From: Alex Elder + +The red-black node in the ceph osd request structure is initialized +in ceph_osdc_alloc_request() using rb_init_node(). We do need to +initialize this, because in __unregister_request() we call +RB_EMPTY_NODE(), which expects the node it's checking to have +been initialized. But rb_init_node() is apparently overkill, and +may in fact be on its way out. So use RB_CLEAR_NODE() instead. + +For a little more background, see this commit: + 4c199a93 "rbtree: empty nodes have no color" + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit a978fa20fb657548561dddbfb605fe43654f0825) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -221,6 +221,7 @@ struct ceph_osd_request *ceph_osdc_alloc + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); ++ RB_CLEAR_NODE(&req->r_node); + INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd); diff --git a/queue-3.4/libceph-fix-osdmap-decode-error-paths.patch b/queue-3.4/libceph-fix-osdmap-decode-error-paths.patch new file mode 100644 index 00000000000..cbacf9dd44d --- /dev/null +++ b/queue-3.4/libceph-fix-osdmap-decode-error-paths.patch @@ -0,0 +1,102 @@ +From 86dac6809fe52100517c2ada77923b7e4e4632ab Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Mon, 29 Oct 2012 11:01:42 -0700 +Subject: libceph: fix osdmap decode error paths + + +From: Sage Weil + +Ensure that we set the err value correctly so that we do not pass a 0 +value to ERR_PTR and confuse the calling code. (In particular, +osd_client.c handle_map() will BUG(!newmap)). 
+ +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(cherry picked from commit 0ed7285e0001b960c888e5455ae982025210ed3d) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osdmap.c | 31 ++++++++++++++++++++----------- + 1 file changed, 20 insertions(+), 11 deletions(-) + +--- a/net/ceph/osdmap.c ++++ b/net/ceph/osdmap.c +@@ -613,10 +613,12 @@ struct ceph_osdmap *osdmap_decode(void * + ceph_decode_32_safe(p, end, max, bad); + while (max--) { + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); ++ err = -ENOMEM; + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + goto bad; + pi->id = ceph_decode_32(p); ++ err = -EINVAL; + ev = ceph_decode_8(p); /* encoding version */ + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", +@@ -632,8 +634,13 @@ struct ceph_osdmap *osdmap_decode(void * + __insert_pg_pool(&map->pg_pools, pi); + } + +- if (version >= 5 && __decode_pool_names(p, end, map) < 0) +- goto bad; ++ if (version >= 5) { ++ err = __decode_pool_names(p, end, map); ++ if (err < 0) { ++ dout("fail to decode pool names"); ++ goto bad; ++ } ++ } + + ceph_decode_32_safe(p, end, map->pool_max, bad); + +@@ -713,7 +720,7 @@ struct ceph_osdmap *osdmap_decode(void * + return map; + + bad: +- dout("osdmap_decode fail\n"); ++ dout("osdmap_decode fail err %d\n", err); + ceph_osdmap_destroy(map); + return ERR_PTR(err); + } +@@ -807,6 +814,7 @@ struct ceph_osdmap *osdmap_apply_increme + if (ev > CEPH_PG_POOL_VERSION) { + pr_warning("got unknown v %d > %d of ceph_pg_pool\n", + ev, CEPH_PG_POOL_VERSION); ++ err = -EINVAL; + goto bad; + } + pi = __lookup_pg_pool(&map->pg_pools, pool); +@@ -823,8 +831,11 @@ struct ceph_osdmap *osdmap_apply_increme + if (err < 0) + goto bad; + } +- if (version >= 5 && __decode_pool_names(p, end, map) < 0) +- goto bad; ++ if (version >= 5) { ++ err = __decode_pool_names(p, end, map); ++ if (err < 0) ++ goto bad; ++ } + + /* old_pool */ + ceph_decode_32_safe(p, end, len, bad); +@@ -900,15 +911,13 @@ struct 
ceph_osdmap *osdmap_apply_increme + (void) __remove_pg_mapping(&map->pg_temp, pgid); + + /* insert */ +- if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) { +- err = -EINVAL; ++ err = -EINVAL; ++ if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + goto bad; +- } ++ err = -ENOMEM; + pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); +- if (!pg) { +- err = -ENOMEM; ++ if (!pg) + goto bad; +- } + pg->pgid = pgid; + pg->len = pglen; + for (j = 0; j < pglen; j++) diff --git a/queue-3.4/libceph-fix-protocol-feature-mismatch-failure-path.patch b/queue-3.4/libceph-fix-protocol-feature-mismatch-failure-path.patch new file mode 100644 index 00000000000..59d2b1391d3 --- /dev/null +++ b/queue-3.4/libceph-fix-protocol-feature-mismatch-failure-path.patch @@ -0,0 +1,74 @@ +From ebbdceac00f878c2ef43961aa19f7f033c9fbeb3 Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Thu, 27 Dec 2012 20:27:04 -0600 +Subject: libceph: fix protocol feature mismatch failure path + + +From: Sage Weil + +We should not set con->state to CLOSED here; that happens in +ceph_fault() in the caller, where it first asserts that the state +is not yet CLOSED. Avoids a BUG when the features don't match. + +Since the fail_protocol() has become a trivial wrapper, replace +calls to it with direct calls to reset_connection(). 
+ +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(cherry picked from commit 0fa6ebc600bc8e830551aee47a0e929e818a1868) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/messenger.c | 14 ++++---------- + 1 file changed, 4 insertions(+), 10 deletions(-) + +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -506,6 +506,7 @@ static void reset_connection(struct ceph + { + /* reset connection, out_queue, msg_ and connect_seq */ + /* discard existing out_queue and msg_seq */ ++ dout("reset_connection %p\n", con); + ceph_msg_remove_list(&con->out_queue); + ceph_msg_remove_list(&con->out_sent); + +@@ -1502,13 +1503,6 @@ static int process_banner(struct ceph_co + return 0; + } + +-static void fail_protocol(struct ceph_connection *con) +-{ +- reset_connection(con); +- WARN_ON(con->state != CON_STATE_NEGOTIATING); +- con->state = CON_STATE_CLOSED; +-} +- + static int process_connect(struct ceph_connection *con) + { + u64 sup_feat = con->msgr->supported_features; +@@ -1526,7 +1520,7 @@ static int process_connect(struct ceph_c + ceph_pr_addr(&con->peer_addr.in_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; +- fail_protocol(con); ++ reset_connection(con); + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: +@@ -1537,7 +1531,7 @@ static int process_connect(struct ceph_c + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; +- fail_protocol(con); ++ reset_connection(con); + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: +@@ -1627,7 +1621,7 @@ static int process_connect(struct ceph_c + ceph_pr_addr(&con->peer_addr.in_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; +- fail_protocol(con); ++ reset_connection(con); + return -1; + } + diff --git a/queue-3.4/libceph-init-event-node-in-ceph_osdc_create_event.patch 
b/queue-3.4/libceph-init-event-node-in-ceph_osdc_create_event.patch new file mode 100644 index 00000000000..e540e9cca58 --- /dev/null +++ b/queue-3.4/libceph-init-event-node-in-ceph_osdc_create_event.patch @@ -0,0 +1,31 @@ +From e62bb27ca5fd05f8d490cfaf0d03d39956bb09b4 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Mon, 17 Dec 2012 12:23:48 -0600 +Subject: libceph: init event->node in ceph_osdc_create_event() + + +From: Alex Elder + +The red-black node in the ceph osd event structure is not +initialized in ceph_osdc_create_event(). Because this node can +be the subject of a RB_EMPTY_NODE() call later on, we should ensure +the node is initialized properly for that. + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 3ee5234df68d253c415ba4f2db72ad250d9c21a9) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -1598,6 +1598,7 @@ int ceph_osdc_create_event(struct ceph_o + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); ++ RB_CLEAR_NODE(&event->node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + init_completion(&event->completion); diff --git a/queue-3.4/libceph-init-osd-o_node-in-create_osd.patch b/queue-3.4/libceph-init-osd-o_node-in-create_osd.patch new file mode 100644 index 00000000000..afa87f9328e --- /dev/null +++ b/queue-3.4/libceph-init-osd-o_node-in-create_osd.patch @@ -0,0 +1,32 @@ +From 31a6ebb578117b21ebb31c7e6cf025c4351a5c56 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 6 Dec 2012 07:22:04 -0600 +Subject: libceph: init osd->o_node in create_osd() + + +From: Alex Elder + +The red-black node in the ceph osd structure is not initialized +in create_osd(). Because this node can be the subject of a +RB_EMPTY_NODE() call later on, we should ensure the node is +initialized properly for that. 
Add a call to RB_CLEAR_NODE() to +initialize it. + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit f407731d12214e7686819018f3a1e9d7b6f83a02) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -645,6 +645,7 @@ static struct ceph_osd *create_osd(struc + atomic_set(&osd->o_ref, 1); + osd->o_osdc = osdc; + osd->o_osd = onum; ++ RB_CLEAR_NODE(&osd->o_node); + INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); + INIT_LIST_HEAD(&osd->o_osd_lru); diff --git a/queue-3.4/libceph-move-linger-requests-sooner-in-kick_requests.patch b/queue-3.4/libceph-move-linger-requests-sooner-in-kick_requests.patch new file mode 100644 index 00000000000..6caa0a6c033 --- /dev/null +++ b/queue-3.4/libceph-move-linger-requests-sooner-in-kick_requests.patch @@ -0,0 +1,112 @@ +From fa20948861694bd5b450e667ccb70d1ca8b2374c Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Wed, 19 Dec 2012 15:52:36 -0600 +Subject: libceph: move linger requests sooner in kick_requests() + + +From: Alex Elder + +The kick_requests() function is called by ceph_osdc_handle_map() +when an osd map change has been indicated. Its purpose is to +re-queue any request whose target osd is different from what it +was when it was originally sent. + +It is structured as two loops, one for incomplete but registered +requests, and a second for handling completed linger requests. +As a special case, in the first loop if a request marked to linger +has not yet completed, it is moved from the request list to the +linger list. This is a quick and dirty way to have the second +loop handle sending the request along with all the other linger +requests. + +Because of the way it's done now, however, this quick and dirty +solution can result in these incomplete linger requests never +getting re-sent as desired. 
The problem lies in the fact that +the second loop only arranges for a linger request to be sent +if it appears its target osd has changed. This is the proper +handling for *completed* linger requests (it avoids issuing +the same linger request twice to the same osd). + +But although the linger requests added to the list in the first loop +may have been sent, they have not yet completed, so they need to be +re-sent regardless of whether their target osd has changed. + +The first required fix is we need to avoid calling __map_request() +on any incomplete linger request. Otherwise the subsequent +__map_request() call in the second loop will find the target osd +has not changed and will therefore not re-send the request. + +Second, we need to be sure that a sent but incomplete linger request +gets re-sent. If the target osd is the same with the new osd map as +it was when the request was originally sent, this won't happen. +This can be fixed through careful handling when we move these +requests from the request list to the linger list, by unregistering +the request *before* it is registered as a linger request. This +works because a side-effect of unregistering the request is to make +the request's r_osd pointer be NULL, and *that* will ensure the +second loop actually re-sends the linger request. + +Processing of such a request is done at that point, so continue with +the next one once it's been moved. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit ab60b16d3c31b9bd9fd5b39f97dc42c52a50b67d) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 30 +++++++++++++++++++----------- + 1 file changed, 19 insertions(+), 11 deletions(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -1320,6 +1320,24 @@ static void kick_requests(struct ceph_os + for (p = rb_first(&osdc->requests); p; ) { + req = rb_entry(p, struct ceph_osd_request, r_node); + p = rb_next(p); ++ ++ /* ++ * For linger requests that have not yet been ++ * registered, move them to the linger list; they'll ++ * be sent to the osd in the loop below. Unregister ++ * the request before re-registering it as a linger ++ * request to ensure the __map_request() below ++ * will decide it needs to be sent. ++ */ ++ if (req->r_linger && list_empty(&req->r_linger_item)) { ++ dout("%p tid %llu restart on osd%d\n", ++ req, req->r_tid, ++ req->r_osd ? req->r_osd->o_osd : -1); ++ __unregister_request(osdc, req); ++ __register_linger_request(osdc, req); ++ continue; ++ } ++ + err = __map_request(osdc, req, force_resend); + if (err < 0) + continue; /* error */ +@@ -1334,17 +1352,6 @@ static void kick_requests(struct ceph_os + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } +- if (req->r_linger && list_empty(&req->r_linger_item)) { +- /* +- * register as a linger so that we will +- * re-submit below and get a new tid +- */ +- dout("%p tid %llu restart on osd%d\n", +- req, req->r_tid, +- req->r_osd ? 
req->r_osd->o_osd : -1); +- __register_linger_request(osdc, req); +- __unregister_request(osdc, req); +- } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, +@@ -1352,6 +1359,7 @@ static void kick_requests(struct ceph_os + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); + + err = __map_request(osdc, req, force_resend); ++ dout("__map_request returned %d\n", err); + if (err == 0) + continue; /* no change and no osd was specified */ + if (err < 0) diff --git a/queue-3.4/libceph-register-request-before-unregister-linger.patch b/queue-3.4/libceph-register-request-before-unregister-linger.patch new file mode 100644 index 00000000000..c5a82c38c6c --- /dev/null +++ b/queue-3.4/libceph-register-request-before-unregister-linger.patch @@ -0,0 +1,32 @@ +From 0a00fe0504f0416d9e0daf5eed6c6368b88c0aa3 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 6 Dec 2012 07:22:04 -0600 +Subject: libceph: register request before unregister linger + + +From: Alex Elder + +In kick_requests(), we need to register the request before we +unregister the linger request. Otherwise the unregister will +reset the request's osd pointer to NULL. + +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit c89ce05e0c5a01a256100ac6a6019f276bdd1ca6) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -1364,8 +1364,8 @@ static void kick_requests(struct ceph_os + + dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, + req->r_osd ? 
req->r_osd->o_osd : -1); +- __unregister_linger_request(osdc, req); + __register_request(osdc, req); ++ __unregister_linger_request(osdc, req); + } + mutex_unlock(&osdc->request_mutex); + diff --git a/queue-3.4/libceph-remove-osdtimeout-option.patch b/queue-3.4/libceph-remove-osdtimeout-option.patch new file mode 100644 index 00000000000..7c29368d1bb --- /dev/null +++ b/queue-3.4/libceph-remove-osdtimeout-option.patch @@ -0,0 +1,160 @@ +From 55540e5c7745cc0dea6b0af54accd8a9e9a2670e Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Wed, 28 Nov 2012 12:28:24 -0800 +Subject: libceph: remove 'osdtimeout' option + + +From: Sage Weil + +This would reset a connection with any OSD that had an outstanding +request that was taking more than N seconds. The idea was that if the +OSD was buggy, the client could compensate by resending the request. + +In reality, this only served to hide server bugs, and we haven't +actually seen such a bug in quite a while. Moreover, the userspace +client code never did this. + +More importantly, often the request is taking a long time because the +OSD is trying to recover, or overloaded, and killing the connection +and retrying would only make the situation worse by giving the OSD +more work to do. 
+ +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(cherry picked from commit 83aff95eb9d60aff5497e9f44a2ae906b86d8e88) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/super.c | 2 - + include/linux/ceph/libceph.h | 2 - + net/ceph/ceph_common.c | 3 -- + net/ceph/osd_client.c | 47 +++---------------------------------------- + 4 files changed, 5 insertions(+), 49 deletions(-) + +--- a/fs/ceph/super.c ++++ b/fs/ceph/super.c +@@ -387,8 +387,6 @@ static int ceph_show_options(struct seq_ + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); +- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) +- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); +--- a/include/linux/ceph/libceph.h ++++ b/include/linux/ceph/libceph.h +@@ -49,7 +49,6 @@ struct ceph_options { + struct ceph_entity_addr my_addr; + int mount_timeout; + int osd_idle_ttl; +- int osd_timeout; + int osd_keepalive_timeout; + + /* +@@ -69,7 +68,6 @@ struct ceph_options { + * defaults + */ + #define CEPH_MOUNT_TIMEOUT_DEFAULT 60 +-#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ + #define CEPH_OSD_KEEPALIVE_DEFAULT 5 + #define CEPH_OSD_IDLE_TTL_DEFAULT 60 + +--- a/net/ceph/ceph_common.c ++++ b/net/ceph/ceph_common.c +@@ -304,7 +304,6 @@ ceph_parse_options(char *options, const + + /* start with defaults */ + opt->flags = CEPH_OPT_DEFAULT; +- opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ +@@ -390,7 +389,7 @@ ceph_parse_options(char *options, const + + /* misc */ + case Opt_osdtimeout: +- opt->osd_timeout = intval; ++ pr_warning("ignoring deprecated osdtimeout option\n"); + break; + 
case Opt_osdkeepalivetimeout: + opt->osd_keepalive_timeout = intval; +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -606,14 +606,6 @@ static void __kick_osd_requests(struct c + } + } + +-static void kick_osd_requests(struct ceph_osd_client *osdc, +- struct ceph_osd *kickosd) +-{ +- mutex_lock(&osdc->request_mutex); +- __kick_osd_requests(osdc, kickosd); +- mutex_unlock(&osdc->request_mutex); +-} +- + /* + * If the osd connection drops, we need to resubmit all requests. + */ +@@ -627,7 +619,9 @@ static void osd_reset(struct ceph_connec + dout("osd_reset osd%d\n", osd->o_osd); + osdc = osd->o_osdc; + down_read(&osdc->map_sem); +- kick_osd_requests(osdc, osd); ++ mutex_lock(&osdc->request_mutex); ++ __kick_osd_requests(osdc, osd); ++ mutex_unlock(&osdc->request_mutex); + send_queued(osdc); + up_read(&osdc->map_sem); + } +@@ -1091,12 +1085,10 @@ static void handle_timeout(struct work_s + { + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, timeout_work.work); +- struct ceph_osd_request *req, *last_req = NULL; ++ struct ceph_osd_request *req; + struct ceph_osd *osd; +- unsigned long timeout = osdc->client->options->osd_timeout * HZ; + unsigned long keepalive = + osdc->client->options->osd_keepalive_timeout * HZ; +- unsigned long last_stamp = 0; + struct list_head slow_osds; + dout("timeout\n"); + down_read(&osdc->map_sem); +@@ -1106,37 +1098,6 @@ static void handle_timeout(struct work_s + mutex_lock(&osdc->request_mutex); + + /* +- * reset osds that appear to be _really_ unresponsive. this +- * is a failsafe measure.. we really shouldn't be getting to +- * this point if the system is working properly. the monitors +- * should mark the osd as failed and we should find out about +- * it from an updated osd map. +- */ +- while (timeout && !list_empty(&osdc->req_lru)) { +- req = list_entry(osdc->req_lru.next, struct ceph_osd_request, +- r_req_lru_item); +- +- /* hasn't been long enough since we sent it? 
*/ +- if (time_before(jiffies, req->r_stamp + timeout)) +- break; +- +- /* hasn't been long enough since it was acked? */ +- if (req->r_request->ack_stamp == 0 || +- time_before(jiffies, req->r_request->ack_stamp + timeout)) +- break; +- +- BUG_ON(req == last_req && req->r_stamp == last_stamp); +- last_req = req; +- last_stamp = req->r_stamp; +- +- osd = req->r_osd; +- BUG_ON(!osd); +- pr_warning(" tid %llu timed out on osd%d, will reset osd\n", +- req->r_tid, osd->o_osd); +- __kick_osd_requests(osdc, osd); +- } +- +- /* + * ping osds that are a bit slow. this ensures that if there + * is a break in the TCP connection we will notice, and reopen + * a connection with that osd (from the fault callback). diff --git a/queue-3.4/libceph-report-connection-fault-with-warning.patch b/queue-3.4/libceph-report-connection-fault-with-warning.patch new file mode 100644 index 00000000000..614c26bb698 --- /dev/null +++ b/queue-3.4/libceph-report-connection-fault-with-warning.patch @@ -0,0 +1,39 @@ +From 4b2c444a99b137f460f6041ab82e84e4d7873203 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 14 Dec 2012 16:47:41 -0600 +Subject: libceph: report connection fault with warning + + +From: Alex Elder + +When a connection's socket disconnects, or if there's a protocol +error of some kind on the connection, a fault is signaled and +the connection is reset (closed and reopened, basically). We +currently get an error message on the log whenever this occurs. + +A ceph connection will attempt to reestablish a socket connection +repeatedly if a fault occurs. This means that these error messages +will get repeatedly added to the log, which is undesirable. + +Change the error message to be a warning, so they don't get +logged by default. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 28362986f8743124b3a0fda20a8ed3e80309cce1) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/messenger.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -2365,7 +2365,7 @@ fault: + static void ceph_fault(struct ceph_connection *con) + __releases(con->mutex) + { +- pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ++ pr_warning("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + dout("fault %p state %lu to peer %s\n", + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); diff --git a/queue-3.4/libceph-socket-can-close-in-any-connection-state.patch b/queue-3.4/libceph-socket-can-close-in-any-connection-state.patch new file mode 100644 index 00000000000..e959843158c --- /dev/null +++ b/queue-3.4/libceph-socket-can-close-in-any-connection-state.patch @@ -0,0 +1,88 @@ +From 8e4a805ffc7fb5406c157a9060f704376df327ef Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 7 Dec 2012 19:50:07 -0600 +Subject: libceph: socket can close in any connection state + + +From: Alex Elder + +A connection's socket can close for any reason, independent of the +state of the connection (and irrespective of the connection +mutex). As a result, the connection can be in pretty much any state +at the time its socket is closed. + +Handle those other cases at the top of con_work(). Pull this whole +block of code into a separate function to reduce the clutter.
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 7bb21d68c535ad8be38e14a715632ae398b37ac1) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/messenger.c | 47 ++++++++++++++++++++++++++++++----------------- + 1 file changed, 30 insertions(+), 17 deletions(-) + +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -2258,6 +2258,35 @@ static void queue_con(struct ceph_connec + } + } + ++static bool con_sock_closed(struct ceph_connection *con) ++{ ++ if (!test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) ++ return false; ++ ++#define CASE(x) \ ++ case CON_STATE_ ## x: \ ++ con->error_msg = "socket closed (con state " #x ")"; \ ++ break; ++ ++ switch (con->state) { ++ CASE(CLOSED); ++ CASE(PREOPEN); ++ CASE(CONNECTING); ++ CASE(NEGOTIATING); ++ CASE(OPEN); ++ CASE(STANDBY); ++ default: ++ pr_warning("%s con %p unrecognized state %lu\n", ++ __func__, con, con->state); ++ con->error_msg = "unrecognized con state"; ++ BUG(); ++ break; ++ } ++#undef CASE ++ ++ return true; ++} ++ + /* + * Do some work on a connection. Drop a connection ref when we're done. 
+ */ +@@ -2269,24 +2298,8 @@ static void con_work(struct work_struct + + mutex_lock(&con->mutex); + restart: +- if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) { +- switch (con->state) { +- case CON_STATE_CONNECTING: +- con->error_msg = "connection failed"; +- break; +- case CON_STATE_NEGOTIATING: +- con->error_msg = "negotiation failed"; +- break; +- case CON_STATE_OPEN: +- con->error_msg = "socket closed"; +- break; +- default: +- dout("unrecognized con state %d\n", (int)con->state); +- con->error_msg = "unrecognized con state"; +- BUG(); +- } ++ if (con_sock_closed(con)) + goto fault; +- } + + if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) { + dout("con_work %p backing off\n", con); diff --git a/queue-3.4/libceph-unlock-unprocessed-pages-in-start_read-error-path.patch b/queue-3.4/libceph-unlock-unprocessed-pages-in-start_read-error-path.patch new file mode 100644 index 00000000000..bd3f1843e04 --- /dev/null +++ b/queue-3.4/libceph-unlock-unprocessed-pages-in-start_read-error-path.patch @@ -0,0 +1,46 @@ +From 44f4872cfb643f95afeeba70a5bc974ba038d77e Mon Sep 17 00:00:00 2001 +From: David Zafman +Date: Mon, 3 Dec 2012 19:14:05 -0800 +Subject: libceph: Unlock unprocessed pages in start_read() error path + + +From: David Zafman + +Function start_read() can get an error before processing all pages. +It must not only release the remaining pages, but unlock them too. 
+ +This fixes http://tracker.newdream.net/issues/3370 + +Signed-off-by: David Zafman +Reviewed-by: Alex Elder +(cherry picked from commit 8884d53dd63b1d9315b343564fcbe1ede004a99e) +Signed-off-by: Greg Kroah-Hartman +--- + fs/ceph/addr.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/ceph/addr.c ++++ b/fs/ceph/addr.c +@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_ + kfree(req->r_pages); + } + ++static void ceph_unlock_page_vector(struct page **pages, int num_pages) ++{ ++ int i; ++ ++ for (i = 0; i < num_pages; i++) ++ unlock_page(pages[i]); ++} ++ + /* + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. +@@ -347,6 +355,7 @@ static int start_read(struct inode *inod + return nr_pages; + + out_pages: ++ ceph_unlock_page_vector(pages, nr_pages); + ceph_release_page_vector(pages, nr_pages); + out: + ceph_osdc_put_request(req); diff --git a/queue-3.4/libceph-warn-don-t-bug-on-unexpected-connection-states.patch b/queue-3.4/libceph-warn-don-t-bug-on-unexpected-connection-states.patch new file mode 100644 index 00000000000..90261e6fb4a --- /dev/null +++ b/queue-3.4/libceph-warn-don-t-bug-on-unexpected-connection-states.patch @@ -0,0 +1,85 @@ +From d1b938deba18ca363fa37af3d55ec5fe8bb61f58 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Wed, 26 Dec 2012 10:43:57 -0600 +Subject: libceph: WARN, don't BUG on unexpected connection states + + +From: Alex Elder + +A number of assertions in the ceph messenger are implemented with +BUG_ON(), killing the system if connection's state doesn't match +what's expected. At this point our state model is (evidently) not +well understood enough for these assertions to trigger a BUG(). +Convert all BUG_ON(con->state...) calls to be WARN_ON(con->state...) +so we learn about these issues without killing the machine. + +We now recognize that a connection fault can occur due to a socket +closure at any time, regardless of the state of the connection. 
So +there is really nothing we can assert about the state of the +connection at that point so eliminate that assertion. + +Reported-by: Ugis +Tested-by: Ugis +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 122070a2ffc91f87fe8e8493eb0ac61986c5557c) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/messenger.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +--- a/net/ceph/messenger.c ++++ b/net/ceph/messenger.c +@@ -561,7 +561,7 @@ void ceph_con_open(struct ceph_connectio + mutex_lock(&con->mutex); + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); + +- BUG_ON(con->state != CON_STATE_CLOSED); ++ WARN_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; +@@ -1505,7 +1505,7 @@ static int process_banner(struct ceph_co + static void fail_protocol(struct ceph_connection *con) + { + reset_connection(con); +- BUG_ON(con->state != CON_STATE_NEGOTIATING); ++ WARN_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_CLOSED; + } + +@@ -1631,7 +1631,7 @@ static int process_connect(struct ceph_c + return -1; + } + +- BUG_ON(con->state != CON_STATE_NEGOTIATING); ++ WARN_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); +@@ -2128,7 +2128,6 @@ more: + if (ret < 0) + goto out; + +- BUG_ON(con->state != CON_STATE_CONNECTING); + con->state = CON_STATE_NEGOTIATING; + + /* +@@ -2156,7 +2155,7 @@ more: + goto more; + } + +- BUG_ON(con->state != CON_STATE_OPEN); ++ WARN_ON(con->state != CON_STATE_OPEN); + + if (con->in_base_pos < 0) { + /* +@@ -2370,7 +2369,7 @@ static void ceph_fault(struct ceph_conne + dout("fault %p state %lu to peer %s\n", + con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + +- BUG_ON(con->state != CON_STATE_CONNECTING && ++ WARN_ON(con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != 
CON_STATE_OPEN); + diff --git a/queue-3.4/rbd-add-read_only-rbd-map-option.patch b/queue-3.4/rbd-add-read_only-rbd-map-option.patch new file mode 100644 index 00000000000..6f68a7f5b9c --- /dev/null +++ b/queue-3.4/rbd-add-read_only-rbd-map-option.patch @@ -0,0 +1,119 @@ +From f0dbbb5eedd159a834f2a864a4a451f14ce889c5 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 10 Aug 2012 13:12:07 -0700 +Subject: rbd: add read_only rbd map option + + +From: Alex Elder + +Add the ability to map an rbd image read-only, by specifying either +"read_only" or "ro" as an option on the rbd "command line." Also +allow the inverse to be explicitly specified using "read_write" or +"rw". + +Signed-off-by: Alex Elder +Reviewed-by: Yehuda Sadeh +(based on commit cc0538b62c839c2df7b9f8378bb37e3b35faa608) +--- + drivers/block/rbd.c | 28 ++++++++++++++++++++++++---- + 1 file changed, 24 insertions(+), 4 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -71,7 +71,8 @@ + #define DEV_NAME_LEN 32 + #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) + +-#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 ++#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 ++#define RBD_READ_ONLY_DEFAULT false + + /* + * block device image metadata (in-memory version) +@@ -95,6 +96,7 @@ struct rbd_image_header { + + struct rbd_options { + int notify_timeout; ++ bool read_only; + }; + + /* +@@ -180,7 +182,7 @@ struct rbd_device { + u64 snap_id; /* current snapshot id */ + /* whether the snap_id this device reads from still exists */ + bool snap_exists; +- int read_only; ++ bool read_only; + + struct list_head node; + +@@ -346,12 +348,21 @@ enum { + /* int args above */ + Opt_last_string, + /* string args above */ ++ Opt_read_only, ++ Opt_read_write, ++ /* Boolean args above */ ++ Opt_last_bool, + }; + + static match_table_t rbdopt_tokens = { + {Opt_notify_timeout, "notify_timeout=%d"}, + /* int args above */ + /* string args above */ ++ {Opt_read_only, "read_only"}, ++ {Opt_read_only, "ro"}, /* Alternate 
spelling */ ++ {Opt_read_write, "read_write"}, ++ {Opt_read_write, "rw"}, /* Alternate spelling */ ++ /* Boolean args above */ + {-1, NULL} + }; + +@@ -376,6 +387,8 @@ static int parse_rbd_opts_token(char *c, + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); ++ } else if (token > Opt_last_string && token < Opt_last_bool) { ++ dout("got Boolean token %d\n", token); + } else { + dout("got token %d\n", token); + } +@@ -384,6 +397,12 @@ static int parse_rbd_opts_token(char *c, + case Opt_notify_timeout: + rbdopt->notify_timeout = intval; + break; ++ case Opt_read_only: ++ rbdopt->read_only = true; ++ break; ++ case Opt_read_write: ++ rbdopt->read_only = false; ++ break; + default: + BUG_ON(token); + } +@@ -407,6 +426,7 @@ static struct rbd_client *rbd_get_client + return ERR_PTR(-ENOMEM); + + rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; ++ rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + + opt = ceph_parse_options(options, mon_addr, + mon_addr + mon_addr_len, +@@ -590,7 +610,7 @@ static int rbd_header_set_snap(struct rb + snapc->seq = 0; + dev->snap_id = CEPH_NOSNAP; + dev->snap_exists = false; +- dev->read_only = 0; ++ dev->read_only = dev->rbd_client->rbd_opts->read_only; + if (size) + *size = header->image_size; + } else { +@@ -599,7 +619,7 @@ static int rbd_header_set_snap(struct rb + goto done; + dev->snap_id = snapc->seq; + dev->snap_exists = true; +- dev->read_only = 1; ++ dev->read_only = true; /* No choice for snapshots */ + } + + ret = 0; diff --git a/queue-3.4/rbd-bug-on-invalid-layout.patch b/queue-3.4/rbd-bug-on-invalid-layout.patch new file mode 100644 index 00000000000..33b085d048e --- /dev/null +++ b/queue-3.4/rbd-bug-on-invalid-layout.patch @@ -0,0 +1,34 @@ +From 68205f80bfcf2cc6b697bd39b0f9d5c89e37b693 Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Mon, 24 Sep 2012 21:02:47 -0700 +Subject: rbd: BUG on invalid layout + + +From: Sage Weil + +This shouldn't actually 
be possible because the layout struct is +constructed from the RBD header and validated then. + +[elder@inktank.com: converted BUG() call to equivalent rbd_assert()] + +Signed-off-by: Sage Weil +Reviewed-by: Alex Elder +(based on commit 6cae3717cddaf8e5e96e304733dca66e40d56f89) +--- + drivers/block/rbd.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -930,8 +930,9 @@ static int rbd_do_request(struct request + layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_pg_preferred = cpu_to_le32(-1); + layout->fl_pg_pool = cpu_to_le32(dev->poolid); +- ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, +- req, ops); ++ ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, ++ req, ops); ++ BUG_ON(ret != 0); + + ceph_osdc_build_request(req, ofs, &len, + ops, diff --git a/queue-3.4/rbd-do-not-allow-remove-of-mounted-on-image.patch b/queue-3.4/rbd-do-not-allow-remove-of-mounted-on-image.patch new file mode 100644 index 00000000000..f1cae30faf4 --- /dev/null +++ b/queue-3.4/rbd-do-not-allow-remove-of-mounted-on-image.patch @@ -0,0 +1,72 @@ +From 4e6bc65efc9ddd08a5328c3680c8e1679a592e00 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 16 Nov 2012 09:29:16 -0600 +Subject: rbd: do not allow remove of mounted-on image + + +From: Alex Elder + +There is no check in rbd_remove() to see if anybody holds open the +image being removed. That's not cool. + +Add a simple open count that goes up and down with opens and closes +(releases) of the device, and don't allow an rbd image to be removed +if the count is non-zero. + +Protect the updates of the open count value with ctl_mutex to ensure +the underlying rbd device doesn't get removed while concurrently +being opened. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(based on commit 42382b709bd1d143b9f0fa93e0a3a1f2f4210707) +--- + drivers/block/rbd.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -189,6 +189,7 @@ struct rbd_device { + + /* sysfs related */ + struct device dev; ++ unsigned long open_count; + }; + + static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +@@ -249,8 +250,11 @@ static int rbd_open(struct block_device + if ((mode & FMODE_WRITE) && rbd_dev->read_only) + return -EROFS; + ++ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rbd_get_dev(rbd_dev); + set_device_ro(bdev, rbd_dev->read_only); ++ rbd_dev->open_count++; ++ mutex_unlock(&ctl_mutex); + + return 0; + } +@@ -259,7 +263,11 @@ static int rbd_release(struct gendisk *d + { + struct rbd_device *rbd_dev = disk->private_data; + ++ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); ++ BUG_ON(!rbd_dev->open_count); ++ rbd_dev->open_count--; + rbd_put_dev(rbd_dev); ++ mutex_unlock(&ctl_mutex); + + return 0; + } +@@ -2448,6 +2456,11 @@ static ssize_t rbd_remove(struct bus_typ + goto done; + } + ++ if (rbd_dev->open_count) { ++ ret = -EBUSY; ++ goto done; ++ } ++ + __rbd_remove_all_snaps(rbd_dev); + rbd_bus_del_dev(rbd_dev); + diff --git a/queue-3.4/rbd-drop-dev-reference-on-error-in-rbd_open.patch b/queue-3.4/rbd-drop-dev-reference-on-error-in-rbd_open.patch new file mode 100644 index 00000000000..b126b43a327 --- /dev/null +++ b/queue-3.4/rbd-drop-dev-reference-on-error-in-rbd_open.patch @@ -0,0 +1,41 @@ +From a85494c915b96289b51989aa718b56338f54c468 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 10 Aug 2012 13:12:07 -0700 +Subject: rbd: drop dev reference on error in rbd_open() + + +From: Alex Elder + +If a read-only rbd device is opened for writing in rbd_open(), it +returns without dropping the just-acquired device reference. + +Fix this by moving the read-only check before getting the reference. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Yehuda Sadeh +Reviewed-by: Josh Durgin +(cherry picked from commit 340c7a2b2c9a2da640af28a8c196356484ac8b50) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -250,13 +250,12 @@ static int rbd_open(struct block_device + { + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + +- rbd_get_dev(rbd_dev); +- +- set_device_ro(bdev, rbd_dev->read_only); +- + if ((mode & FMODE_WRITE) && rbd_dev->read_only) + return -EROFS; + ++ rbd_get_dev(rbd_dev); ++ set_device_ro(bdev, rbd_dev->read_only); ++ + return 0; + } + diff --git a/queue-3.4/rbd-expose-the-correct-size-of-the-device-in-sysfs.patch b/queue-3.4/rbd-expose-the-correct-size-of-the-device-in-sysfs.patch new file mode 100644 index 00000000000..4ef41cc8de2 --- /dev/null +++ b/queue-3.4/rbd-expose-the-correct-size-of-the-device-in-sysfs.patch @@ -0,0 +1,55 @@ +From a1d89f7052555740954faef81cd2817d6b7c8bae Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 5 Dec 2011 10:35:04 -0800 +Subject: rbd: expose the correct size of the device in sysfs + + +From: Josh Durgin + +If an image was mapped to a snapshot, the size of the head version +would be shown. Protect capacity with header_rwsem, since it may +change. + +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit a51aa0c042fa39946dd017d5f91a073300a71577) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -1720,6 +1720,8 @@ static int __rbd_update_snaps(struct rbd + if (ret < 0) + return ret; + ++ down_write(&rbd_dev->header_rwsem); ++ + /* resized? 
*/ + if (rbd_dev->snap_id == CEPH_NOSNAP) { + sector_t size = (sector_t) h.image_size / SECTOR_SIZE; +@@ -1728,8 +1730,6 @@ static int __rbd_update_snaps(struct rbd + set_capacity(rbd_dev->disk, size); + } + +- down_write(&rbd_dev->header_rwsem); +- + snap_seq = rbd_dev->header.snapc->seq; + if (rbd_dev->header.total_snaps && + rbd_dev->header.snapc->snaps[0] == snap_seq) +@@ -1844,8 +1844,13 @@ static ssize_t rbd_size_show(struct devi + struct device_attribute *attr, char *buf) + { + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); ++ sector_t size; ++ ++ down_read(&rbd_dev->header_rwsem); ++ size = get_capacity(rbd_dev->disk); ++ up_read(&rbd_dev->header_rwsem); + +- return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); ++ return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); + } + + static ssize_t rbd_major_show(struct device *dev, diff --git a/queue-3.4/rbd-fix-bug-in-rbd_dev_id_put.patch b/queue-3.4/rbd-fix-bug-in-rbd_dev_id_put.patch new file mode 100644 index 00000000000..ae076fa789e --- /dev/null +++ b/queue-3.4/rbd-fix-bug-in-rbd_dev_id_put.patch @@ -0,0 +1,36 @@ +From 2000a50aa1dbceae96b4bee06e8e1c15f9359e24 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Wed, 10 Oct 2012 21:19:13 -0700 +Subject: rbd: fix bug in rbd_dev_id_put() + + +From: Alex Elder + +In rbd_dev_id_put(), there's a loop that's intended to determine +the maximum device id in use. But it isn't doing that at all; +the effect of how it's written is to simply use the just-put id +number, which ignores the whole purpose of this function. + +Fix the bug.
+ +Signed-off-by: Alex Elder +Reviewed-by: Josh Durgin +(cherry picked from commit b213e0b1a62637b2a9395a34349b13d73ca2b90a) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -2150,8 +2150,8 @@ static void rbd_id_put(struct rbd_device + struct rbd_device *rbd_dev; + + rbd_dev = list_entry(tmp, struct rbd_device, node); +- if (rbd_id > max_id) +- max_id = rbd_id; ++ if (rbd_dev->id > max_id) ++ max_id = rbd_dev->id; + } + spin_unlock(&rbd_dev_list_lock); + diff --git a/queue-3.4/rbd-kill-create_snap-sysfs-entry.patch b/queue-3.4/rbd-kill-create_snap-sysfs-entry.patch new file mode 100644 index 00000000000..e1ab829b471 --- /dev/null +++ b/queue-3.4/rbd-kill-create_snap-sysfs-entry.patch @@ -0,0 +1,281 @@ +From cd9155deb01c86a31836e9a963c208666f5d5abc Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 10 Aug 2012 13:12:10 -0700 +Subject: rbd: kill create_snap sysfs entry + + +From: Alex Elder + +Josh proposed the following change, and I don't think I could +explain it any better than he did: + + From: Josh Durgin + Date: Tue, 24 Jul 2012 14:22:11 -0700 + To: ceph-devel + Message-ID: <500F1203.9050605@inktank.com> + From: Josh Durgin + + + Right now the kernel still has one piece of rbd management + duplicated from the rbd command line tool: snapshot creation. + There's nothing special about snapshot creation that makes it + advantageous to do from the kernel, so I'd like to remove the + create_snap sysfs interface. That is, + /sys/bus/rbd/devices//create_snap + would be removed. + + Does anyone rely on the sysfs interface for creating rbd + snapshots? If so, how hard would it be to replace with: + + rbd snap create pool/image@snap + + Is there any benefit to the sysfs interface that I'm missing? + + Josh + +This patch implements this proposal, removing the code that +implements the "snap_create" sysfs interface for rbd images. 
+As a result, quite a lot of other supporting code goes away. + +[elder@inktank.com: commented out rbd_req_sync_exec() to avoid warning] + +Suggested-by: Josh Durgin +Signed-off-by: Alex Elder +Reviewed-by: Josh Durgin +(based on commit 02cdb02ceab1f3dd9ac2bc899fc51f0e0e744782) +--- + Documentation/ABI/testing/sysfs-bus-rbd | 6 - + drivers/block/rbd.c | 165 -------------------------------- + 2 files changed, 2 insertions(+), 169 deletions(-) + +--- a/Documentation/ABI/testing/sysfs-bus-rbd ++++ b/Documentation/ABI/testing/sysfs-bus-rbd +@@ -51,12 +51,6 @@ current_snap + + The current snapshot for which the device is mapped. + +-create_snap +- +- Create a snapshot: +- +- $ echo > /sys/bus/rbd/devices//snap_create +- + snap_* + + A directory per each snapshot +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -201,10 +201,6 @@ static DEFINE_SPINLOCK(rbd_client_list_l + + static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); + static void rbd_dev_release(struct device *dev); +-static ssize_t rbd_snap_add(struct device *dev, +- struct device_attribute *attr, +- const char *buf, +- size_t count); + static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, + struct rbd_snap *snap); + +@@ -1307,71 +1303,7 @@ static int rbd_req_sync_unwatch(struct r + return ret; + } + +-struct rbd_notify_info { +- struct rbd_device *dev; +-}; +- +-static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +-{ +- struct rbd_device *dev = (struct rbd_device *)data; +- if (!dev) +- return; +- +- dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, +- notify_id, (int)opcode); +-} +- +-/* +- * Request sync osd notify +- */ +-static int rbd_req_sync_notify(struct rbd_device *dev, +- const char *obj) +-{ +- struct ceph_osd_req_op *ops; +- struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; +- struct ceph_osd_event *event; +- struct rbd_notify_info info; +- int payload_len = sizeof(u32) + sizeof(u32); +- int ret; +- +- ret = 
rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); +- if (ret < 0) +- return ret; +- +- info.dev = dev; +- +- ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, +- (void *)&info, &event); +- if (ret < 0) +- goto fail; +- +- ops[0].watch.ver = 1; +- ops[0].watch.flag = 1; +- ops[0].watch.cookie = event->cookie; +- ops[0].watch.prot_ver = RADOS_NOTIFY_VER; +- ops[0].watch.timeout = 12; +- +- ret = rbd_req_sync_op(dev, NULL, +- CEPH_NOSNAP, +- 0, +- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, +- ops, +- 1, obj, 0, 0, NULL, NULL, NULL); +- if (ret < 0) +- goto fail_event; +- +- ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); +- dout("ceph_osdc_wait_event returned %d\n", ret); +- rbd_destroy_ops(ops); +- return 0; +- +-fail_event: +- ceph_osdc_cancel_event(event); +-fail: +- rbd_destroy_ops(ops); +- return ret; +-} +- ++#if 0 + /* + * Request sync osd read + */ +@@ -1411,6 +1343,7 @@ static int rbd_req_sync_exec(struct rbd_ + dout("cls_exec returned %d\n", ret); + return ret; + } ++#endif + + static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) + { +@@ -1645,57 +1578,6 @@ out_dh: + return rc; + } + +-/* +- * create a snapshot +- */ +-static int rbd_header_add_snap(struct rbd_device *dev, +- const char *snap_name, +- gfp_t gfp_flags) +-{ +- int name_len = strlen(snap_name); +- u64 new_snapid; +- int ret; +- void *data, *p, *e; +- u64 ver; +- struct ceph_mon_client *monc; +- +- /* we should create a snapshot only if we're pointing at the head */ +- if (dev->snap_id != CEPH_NOSNAP) +- return -EINVAL; +- +- monc = &dev->rbd_client->client->monc; +- ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); +- dout("created snapid=%lld\n", new_snapid); +- if (ret < 0) +- return ret; +- +- data = kmalloc(name_len + 16, gfp_flags); +- if (!data) +- return -ENOMEM; +- +- p = data; +- e = data + name_len + 16; +- +- ceph_encode_string_safe(&p, e, snap_name, name_len, bad); +- ceph_encode_64_safe(&p, e, new_snapid, bad); +- +- ret = 
rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", +- data, p - data, &ver); +- +- kfree(data); +- +- if (ret < 0) +- return ret; +- +- down_write(&dev->header_rwsem); +- dev->header.snapc->seq = new_snapid; +- up_write(&dev->header_rwsem); +- +- return 0; +-bad: +- return -ERANGE; +-} +- + static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) + { + struct rbd_snap *snap; +@@ -1923,7 +1805,6 @@ static DEVICE_ATTR(pool, S_IRUGO, rbd_po + static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); + static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); + static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); +-static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); + + static struct attribute *rbd_attrs[] = { + &dev_attr_size.attr, +@@ -1933,7 +1814,6 @@ static struct attribute *rbd_attrs[] = { + &dev_attr_name.attr, + &dev_attr_current_snap.attr, + &dev_attr_refresh.attr, +- &dev_attr_create_snap.attr, + NULL + }; + +@@ -2563,47 +2443,6 @@ done: + return ret; + } + +-static ssize_t rbd_snap_add(struct device *dev, +- struct device_attribute *attr, +- const char *buf, +- size_t count) +-{ +- struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); +- int ret; +- char *name = kmalloc(count + 1, GFP_KERNEL); +- if (!name) +- return -ENOMEM; +- +- snprintf(name, count, "%s", buf); +- +- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); +- +- ret = rbd_header_add_snap(rbd_dev, +- name, GFP_KERNEL); +- if (ret < 0) +- goto err_unlock; +- +- ret = __rbd_update_snaps(rbd_dev); +- if (ret < 0) +- goto err_unlock; +- +- /* shouldn't hold ctl_mutex when notifying.. 
notify might +- trigger a watch callback that would need to get that mutex */ +- mutex_unlock(&ctl_mutex); +- +- /* make a best effort, don't error if failed */ +- rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); +- +- ret = count; +- kfree(name); +- return ret; +- +-err_unlock: +- mutex_unlock(&ctl_mutex); +- kfree(name); +- return ret; +-} +- + /* + * create control files in sysfs + * /sys/bus/rbd/... diff --git a/queue-3.4/rbd-kill-notify_timeout-option.patch b/queue-3.4/rbd-kill-notify_timeout-option.patch new file mode 100644 index 00000000000..64b4c7ee7b9 --- /dev/null +++ b/queue-3.4/rbd-kill-notify_timeout-option.patch @@ -0,0 +1,71 @@ +From 97b2f0e79f5f4171c5e658e46f6b247a98a51389 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Fri, 10 Aug 2012 13:12:07 -0700 +Subject: rbd: kill notify_timeout option + + +From: Alex Elder + +The "notify_timeout" rbd device option is never used, so get rid of +it. + +Signed-off-by: Alex Elder +Reviewed-by: Yehuda Sadeh +(cherry picked from commit 84d34dcc116e117a41c6fc8be13430529fc2d9e7) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 8 -------- + 1 file changed, 8 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -71,7 +71,6 @@ + #define DEV_NAME_LEN 32 + #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) + +-#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 + #define RBD_READ_ONLY_DEFAULT false + + /* +@@ -95,7 +94,6 @@ struct rbd_image_header { + }; + + struct rbd_options { +- int notify_timeout; + bool read_only; + }; + +@@ -343,7 +341,6 @@ static struct rbd_client *__rbd_client_f + * mount options + */ + enum { +- Opt_notify_timeout, + Opt_last_int, + /* int args above */ + Opt_last_string, +@@ -355,7 +352,6 @@ enum { + }; + + static match_table_t rbdopt_tokens = { +- {Opt_notify_timeout, "notify_timeout=%d"}, + /* int args above */ + /* string args above */ + {Opt_read_only, "read_only"}, +@@ -394,9 +390,6 @@ static int parse_rbd_opts_token(char *c, + } + + switch (token) { +- 
case Opt_notify_timeout: +- rbdopt->notify_timeout = intval; +- break; + case Opt_read_only: + rbdopt->read_only = true; + break; +@@ -425,7 +418,6 @@ static struct rbd_client *rbd_get_client + if (!rbd_opts) + return ERR_PTR(-ENOMEM); + +- rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + + opt = ceph_parse_options(options, mon_addr, diff --git a/queue-3.4/rbd-only-reset-capacity-when-pointing-to-head.patch b/queue-3.4/rbd-only-reset-capacity-when-pointing-to-head.patch new file mode 100644 index 00000000000..d8dc903ddd5 --- /dev/null +++ b/queue-3.4/rbd-only-reset-capacity-when-pointing-to-head.patch @@ -0,0 +1,35 @@ +From 9aa61f22078fc2ccaf1a501b98659e85924a1c91 Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 21 Nov 2011 17:13:54 -0800 +Subject: rbd: only reset capacity when pointing to head + + +From: Josh Durgin + +Snapshots cannot be resized, and the new capacity of head should not +be reflected by the snapshot. + +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit 474ef7ce832d471148f63a9d07f67fc5564834f1) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -1721,7 +1721,12 @@ static int __rbd_update_snaps(struct rbd + return ret; + + /* resized? 
*/ +- set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); ++ if (rbd_dev->snap_id == CEPH_NOSNAP) { ++ sector_t size = (sector_t) h.image_size / SECTOR_SIZE; ++ ++ dout("setting size to %llu sectors", (unsigned long long) size); ++ set_capacity(rbd_dev->disk, size); ++ } + + down_write(&rbd_dev->header_rwsem); + diff --git a/queue-3.4/rbd-remove-linger-unconditionally.patch b/queue-3.4/rbd-remove-linger-unconditionally.patch new file mode 100644 index 00000000000..0dbc5e936e1 --- /dev/null +++ b/queue-3.4/rbd-remove-linger-unconditionally.patch @@ -0,0 +1,36 @@ +From 38a10d2304baec3e84895517ba674783bc4e1b07 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 6 Dec 2012 09:37:23 -0600 +Subject: rbd: remove linger unconditionally + + +From: Alex Elder + +In __unregister_linger_request(), the request is being removed +from the osd client's req_linger list only when the request +has a non-null osd pointer. It should be done whether or not +the request currently has an osd. + +This is most likely a non-issue because I believe the request +will always have an osd when this function is called. 
+ +Signed-off-by: Alex Elder +Reviewed-by: Sage Weil +(cherry picked from commit 61c74035626beb25a39b0273ccf7d75510bc36a1) +Signed-off-by: Greg Kroah-Hartman +--- + net/ceph/osd_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ceph/osd_client.c ++++ b/net/ceph/osd_client.c +@@ -905,8 +905,8 @@ static void __unregister_linger_request( + struct ceph_osd_request *req) + { + dout("__unregister_linger_request %p\n", req); ++ list_del_init(&req->r_linger_item); + if (req->r_osd) { +- list_del_init(&req->r_linger_item); + list_del_init(&req->r_linger_osd); + + if (list_empty(&req->r_osd->o_requests) && diff --git a/queue-3.4/rbd-return-errors-for-mapped-but-deleted-snapshot.patch b/queue-3.4/rbd-return-errors-for-mapped-but-deleted-snapshot.patch new file mode 100644 index 00000000000..c63a073fe26 --- /dev/null +++ b/queue-3.4/rbd-return-errors-for-mapped-but-deleted-snapshot.patch @@ -0,0 +1,99 @@ +From 42febe4f05ffcdb44903da51a438fe9a6a4fb96e Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 21 Nov 2011 18:14:25 -0800 +Subject: rbd: return errors for mapped but deleted snapshot + + +From: Josh Durgin + +When a snapshot is deleted, the OSD will return ENOENT when reading +from it. This is normally interpreted as a hole by rbd, which will +return zeroes. To minimize the time in which this can happen, stop +requests early when we are notified that our snapshot no longer +exists. 
+ +[elder@inktank.com: updated __rbd_init_snaps_header() logic] + +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit e88a36ec961b8c1899c59c5e4ae35a318c0209d3) +Signed-off-by: Greg Kroah-Hartman + +Conflicts: + + drivers/block/rbd.c +--- + drivers/block/rbd.c | 32 ++++++++++++++++++++++++++++++-- + 1 file changed, 30 insertions(+), 2 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -174,9 +174,13 @@ struct rbd_device { + + /* protects updating the header */ + struct rw_semaphore header_rwsem; ++ /* name of the snapshot this device reads from */ + char snap_name[RBD_MAX_SNAP_NAME_LEN]; ++ /* id of the snapshot this device reads from */ + u64 snap_id; /* current snapshot id */ +- int read_only; ++ /* whether the snap_id this device reads from still exists */ ++ bool snap_exists; ++ int read_only; + + struct list_head node; + +@@ -590,6 +594,7 @@ static int rbd_header_set_snap(struct rb + else + snapc->seq = 0; + dev->snap_id = CEPH_NOSNAP; ++ dev->snap_exists = false; + dev->read_only = 0; + if (size) + *size = header->image_size; +@@ -598,6 +603,7 @@ static int rbd_header_set_snap(struct rb + if (ret < 0) + goto done; + dev->snap_id = snapc->seq; ++ dev->snap_exists = true; + dev->read_only = 1; + } + +@@ -1466,6 +1472,21 @@ static void rbd_rq_fn(struct request_que + + spin_unlock_irq(q->queue_lock); + ++ if (rbd_dev->snap_id != CEPH_NOSNAP) { ++ bool snap_exists; ++ ++ down_read(&rbd_dev->header_rwsem); ++ snap_exists = rbd_dev->snap_exists; ++ up_read(&rbd_dev->header_rwsem); ++ ++ if (!snap_exists) { ++ dout("request for non-existent snapshot"); ++ spin_lock_irq(q->queue_lock); ++ __blk_end_request_all(rq, -ENXIO); ++ continue; ++ } ++ } ++ + dout("%s 0x%x bytes at 0x%llx\n", + do_write ? 
"write" : "read", + size, blk_rq_pos(rq) * SECTOR_SIZE); +@@ -2069,7 +2090,14 @@ static int __rbd_init_snaps_header(struc + cur_id = rbd_dev->header.snapc->snaps[i - 1]; + + if (!i || old_snap->id < cur_id) { +- /* old_snap->id was skipped, thus was removed */ ++ /* ++ * old_snap->id was skipped, thus was ++ * removed. If this rbd_dev is mapped to ++ * the removed snapshot, record that it no ++ * longer exists, to prevent further I/O. ++ */ ++ if (rbd_dev->snap_id == old_snap->id) ++ rbd_dev->snap_exists = false; + __rbd_remove_snap_dev(rbd_dev, old_snap); + continue; + } diff --git a/queue-3.4/rbd-send-header-version-when-notifying.patch b/queue-3.4/rbd-send-header-version-when-notifying.patch new file mode 100644 index 00000000000..1b271eb896a --- /dev/null +++ b/queue-3.4/rbd-send-header-version-when-notifying.patch @@ -0,0 +1,61 @@ +From 9c916f5d870e8601b49052be9e451734a333ed23 Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 5 Dec 2011 18:10:44 -0800 +Subject: rbd: send header version when notifying + + +From: Josh Durgin + +Previously the original header version was sent. Now, we update it +when the header changes. 
+ +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit a71b891bc7d77a070e723c8c53d1dd73cf931555) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -1195,7 +1195,7 @@ static int rbd_req_sync_notify_ack(struc + if (ret < 0) + return ret; + +- ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); ++ ops[0].watch.ver = cpu_to_le64(ver); + ops[0].watch.cookie = notify_id; + ops[0].watch.flag = 0; + +@@ -1215,6 +1215,7 @@ static int rbd_req_sync_notify_ack(struc + static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) + { + struct rbd_device *dev = (struct rbd_device *)data; ++ u64 hver; + int rc; + + if (!dev) +@@ -1224,12 +1225,13 @@ static void rbd_watch_cb(u64 ver, u64 no + notify_id, (int)opcode); + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + rc = __rbd_update_snaps(dev); ++ hver = dev->header.obj_version; + mutex_unlock(&ctl_mutex); + if (rc) + pr_warning(RBD_DRV_NAME "%d got notification but failed to " + " update snaps: %d\n", dev->major, rc); + +- rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); ++ rbd_req_sync_notify_ack(dev, hver, notify_id, dev->obj_md_name); + } + + /* +@@ -1740,6 +1742,7 @@ static int __rbd_update_snaps(struct rbd + kfree(rbd_dev->header.snap_names); + kfree(rbd_dev->header.snap_sizes); + ++ rbd_dev->header.obj_version = h.obj_version; + rbd_dev->header.image_size = h.image_size; + rbd_dev->header.total_snaps = h.total_snaps; + rbd_dev->header.snapc = h.snapc; diff --git a/queue-3.4/rbd-set-image-size-when-header-is-updated.patch b/queue-3.4/rbd-set-image-size-when-header-is-updated.patch new file mode 100644 index 00000000000..60b783dadb5 --- /dev/null +++ b/queue-3.4/rbd-set-image-size-when-header-is-updated.patch @@ -0,0 +1,28 @@ +From 1b5cd1af449c8112b751e4820203829173ec47c6 Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 5 
Dec 2011 10:41:28 -0800 +Subject: rbd: set image size when header is updated + + +From: Josh Durgin + +The image may have been resized. + +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit 93a24e084d67ba2fcb9a4c289135825b623ec864) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -1741,6 +1741,7 @@ static int __rbd_update_snaps(struct rbd + kfree(rbd_dev->header.snap_names); + kfree(rbd_dev->header.snap_sizes); + ++ rbd_dev->header.image_size = h.image_size; + rbd_dev->header.total_snaps = h.total_snaps; + rbd_dev->header.snapc = h.snapc; + rbd_dev->header.snap_names = h.snap_names; diff --git a/queue-3.4/rbd-use-reference-counting-for-the-snap-context.patch b/queue-3.4/rbd-use-reference-counting-for-the-snap-context.patch new file mode 100644 index 00000000000..5ece2411eec --- /dev/null +++ b/queue-3.4/rbd-use-reference-counting-for-the-snap-context.patch @@ -0,0 +1,133 @@ +From aabc9ab77d676d758db56a08f4de31ff8c6b1bc7 Mon Sep 17 00:00:00 2001 +From: Josh Durgin +Date: Mon, 5 Dec 2011 14:03:05 -0800 +Subject: rbd: use reference counting for the snap context + + +From: Josh Durgin + +This prevents a race between requests with a given snap context and +header updates that free it. The osd client was already expecting the +snap context to be reference counted, since it get()s it in +ceph_osdc_build_request and put()s it when the request completes. + +Also remove the second down_read()/up_read() on header_rwsem in +rbd_do_request, which wasn't actually preventing this race or +protecting any other data. 
+ +Signed-off-by: Josh Durgin +Reviewed-by: Alex Elder +(cherry picked from commit d1d25646543134d756a02ffe4e02073faa761f2c) +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/rbd.c | 35 +++++++++++++++++------------------ + 1 file changed, 17 insertions(+), 18 deletions(-) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -615,7 +615,7 @@ done: + + static void rbd_header_free(struct rbd_image_header *header) + { +- kfree(header->snapc); ++ ceph_put_snap_context(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); + } +@@ -893,13 +893,10 @@ static int rbd_do_request(struct request + + dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); + +- down_read(&dev->header_rwsem); +- + osdc = &dev->rbd_client->client->osdc; + req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + false, GFP_NOIO, pages, bio); + if (!req) { +- up_read(&dev->header_rwsem); + ret = -ENOMEM; + goto done_pages; + } +@@ -934,7 +931,6 @@ static int rbd_do_request(struct request + snapc, + &mtime, + req->r_oid, req->r_oid_len); +- up_read(&dev->header_rwsem); + + if (linger_req) { + ceph_osdc_set_request_linger(osdc, req); +@@ -1446,6 +1442,7 @@ static void rbd_rq_fn(struct request_que + u64 ofs; + int num_segs, cur_seg = 0; + struct rbd_req_coll *coll; ++ struct ceph_snap_context *snapc; + + /* peek at request from block layer */ + if (!rq) +@@ -1472,21 +1469,20 @@ static void rbd_rq_fn(struct request_que + + spin_unlock_irq(q->queue_lock); + +- if (rbd_dev->snap_id != CEPH_NOSNAP) { +- bool snap_exists; ++ down_read(&rbd_dev->header_rwsem); + +- down_read(&rbd_dev->header_rwsem); +- snap_exists = rbd_dev->snap_exists; ++ if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { + up_read(&rbd_dev->header_rwsem); +- +- if (!snap_exists) { +- dout("request for non-existent snapshot"); +- spin_lock_irq(q->queue_lock); +- __blk_end_request_all(rq, -ENXIO); +- continue; +- } ++ dout("request for non-existent snapshot"); ++ 
spin_lock_irq(q->queue_lock); ++ __blk_end_request_all(rq, -ENXIO); ++ continue; + } + ++ snapc = ceph_get_snap_context(rbd_dev->header.snapc); ++ ++ up_read(&rbd_dev->header_rwsem); ++ + dout("%s 0x%x bytes at 0x%llx\n", + do_write ? "write" : "read", + size, blk_rq_pos(rq) * SECTOR_SIZE); +@@ -1496,6 +1492,7 @@ static void rbd_rq_fn(struct request_que + if (!coll) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENOMEM); ++ ceph_put_snap_context(snapc); + continue; + } + +@@ -1519,7 +1516,7 @@ static void rbd_rq_fn(struct request_que + /* init OSD command: write or read */ + if (do_write) + rbd_req_write(rq, rbd_dev, +- rbd_dev->header.snapc, ++ snapc, + ofs, + op_size, bio, + coll, cur_seg); +@@ -1542,6 +1539,8 @@ next_seg: + if (bp) + bio_pair_release(bp); + spin_lock_irq(q->queue_lock); ++ ++ ceph_put_snap_context(snapc); + } + } + +@@ -1737,7 +1736,7 @@ static int __rbd_update_snaps(struct rbd + if head moves */ + follow_seq = 1; + +- kfree(rbd_dev->header.snapc); ++ ceph_put_snap_context(rbd_dev->header.snapc); + kfree(rbd_dev->header.snap_names); + kfree(rbd_dev->header.snap_sizes); + diff --git a/queue-3.4/series b/queue-3.4/series index a29f832d981..11d2f33175c 100644 --- a/queue-3.4/series +++ b/queue-3.4/series @@ -129,3 +129,40 @@ kvm-ppc-44x-fix-dcr-read-write.patch 0019-drm-i915-call-drm_handle_vblank-before-finish_page_f.patch 0020-drm-i915-Flush-the-pending-flips-on-the-CRTC-before-.patch revert-drm-i915-no-lvds-quirk-for-zotac-zdbox-sd-id12-id13.patch +ceph-close-old-con-before-reopening-on-mds-reconnect.patch +rbd-return-errors-for-mapped-but-deleted-snapshot.patch +rbd-only-reset-capacity-when-pointing-to-head.patch +rbd-expose-the-correct-size-of-the-device-in-sysfs.patch +rbd-set-image-size-when-header-is-updated.patch +rbd-use-reference-counting-for-the-snap-context.patch +rbd-send-header-version-when-notifying.patch +ceph-tolerate-and-warn-on-extraneous-dentry-from-mds.patch +rbd-drop-dev-reference-on-error-in-rbd_open.patch 
+ceph-propagate-layout-error-on-osd-request-creation.patch +libceph-socket-can-close-in-any-connection-state.patch +libceph-report-connection-fault-with-warning.patch +libceph-init-osd-o_node-in-create_osd.patch +libceph-init-event-node-in-ceph_osdc_create_event.patch +libceph-don-t-use-rb_init_node-in-ceph_osdc_alloc_request.patch +libceph-register-request-before-unregister-linger.patch +libceph-move-linger-requests-sooner-in-kick_requests.patch +libceph-always-reset-osds-when-kicking.patch +libceph-warn-don-t-bug-on-unexpected-connection-states.patch +libceph-fix-protocol-feature-mismatch-failure-path.patch +libceph-fix-osdmap-decode-error-paths.patch +libceph-avoid-using-freed-osd-in-__kick_osd_requests.patch +rbd-kill-create_snap-sysfs-entry.patch +rbd-add-read_only-rbd-map-option.patch +rbd-kill-notify_timeout-option.patch +libceph-remove-osdtimeout-option.patch +ceph-don-t-reference-req-after-put.patch +rbd-remove-linger-unconditionally.patch +rbd-bug-on-invalid-layout.patch +rbd-fix-bug-in-rbd_dev_id_put.patch +rbd-do-not-allow-remove-of-mounted-on-image.patch +ceph-don-t-update-i_max_size-when-handling-non-auth-cap.patch +ceph-fix-infinite-loop-in-__wake_requests.patch +ceph-don-t-add-dirty-inode-to-dirty-list-if-caps-is-in-migration.patch +ceph-fix-__ceph_do_pending_vmtruncate.patch +ceph-call-handle_cap_grant-for-cap-import-message.patch +libceph-unlock-unprocessed-pages-in-start_read-error-path.patch