Stable-tree patch (applied in commit 8b77cbea by Greg Kroah-Hartman):
1 | From bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 Mon Sep 17 00:00:00 2001 |
2 | From: Ilya Dryomov <ilya.dryomov@inktank.com> | |
3 | Date: Mon, 4 Aug 2014 18:04:39 +0400 | |
4 | Subject: rbd: rework rbd_request_fn() | |
5 | ||
6 | From: Ilya Dryomov <ilya.dryomov@inktank.com> | |
7 | ||
8 | commit bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 upstream. | |
9 | ||
10 | While it was never a good idea to sleep in request_fn(), commit | |
11 | 34c6bc2c919a ("locking/mutexes: Add extra reschedule point") made it | |
12 | a *bad* idea. mutex_lock() since 3.15 may reschedule *before* putting | |
13 | task on the mutex wait queue, which for tasks in !TASK_RUNNING state | |
14 | means block forever. request_fn() may be called with !TASK_RUNNING on | |
15 | the way to schedule() in io_schedule(). | |
16 | ||
17 | Offload request handling to a workqueue, one per rbd device, to avoid | |
18 | calling blocking primitives from rbd_request_fn(). | |
19 | ||
20 | Fixes: http://tracker.ceph.com/issues/8818 | |
21 | ||
22 | Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> | |
23 | Tested-by: Eric Eastman <eric0e@aol.com> | |
24 | Tested-by: Greg Wilson <greg.wilson@keepertech.com> | |
25 | Reviewed-by: Alex Elder <elder@linaro.org> | |
26 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
27 | ||
28 | --- | |
29 | drivers/block/rbd.c | 194 +++++++++++++++++++++++++++++++--------------------- | |
30 | 1 file changed, 118 insertions(+), 76 deletions(-) | |
31 | ||
32 | --- a/drivers/block/rbd.c | |
33 | +++ b/drivers/block/rbd.c | |
34 | @@ -42,6 +42,7 @@ | |
35 | #include <linux/blkdev.h> | |
36 | #include <linux/slab.h> | |
37 | #include <linux/idr.h> | |
38 | +#include <linux/workqueue.h> | |
39 | ||
40 | #include "rbd_types.h" | |
41 | ||
42 | @@ -332,7 +333,10 @@ struct rbd_device { | |
43 | ||
44 | char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ | |
45 | ||
46 | + struct list_head rq_queue; /* incoming rq queue */ | |
47 | spinlock_t lock; /* queue, flags, open_count */ | |
48 | + struct workqueue_struct *rq_wq; | |
49 | + struct work_struct rq_work; | |
50 | ||
51 | struct rbd_image_header header; | |
52 | unsigned long flags; /* possibly lock protected */ | |
53 | @@ -3183,102 +3187,129 @@ out: | |
54 | return ret; | |
55 | } | |
56 | ||
57 | -static void rbd_request_fn(struct request_queue *q) | |
58 | - __releases(q->queue_lock) __acquires(q->queue_lock) | |
59 | +static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) | |
60 | { | |
61 | - struct rbd_device *rbd_dev = q->queuedata; | |
62 | - struct request *rq; | |
63 | + struct rbd_img_request *img_request; | |
64 | + u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; | |
65 | + u64 length = blk_rq_bytes(rq); | |
66 | + bool wr = rq_data_dir(rq) == WRITE; | |
67 | int result; | |
68 | ||
69 | - while ((rq = blk_fetch_request(q))) { | |
70 | - bool write_request = rq_data_dir(rq) == WRITE; | |
71 | - struct rbd_img_request *img_request; | |
72 | - u64 offset; | |
73 | - u64 length; | |
74 | + /* Ignore/skip any zero-length requests */ | |
75 | ||
76 | - /* Ignore any non-FS requests that filter through. */ | |
77 | + if (!length) { | |
78 | + dout("%s: zero-length request\n", __func__); | |
79 | + result = 0; | |
80 | + goto err_rq; | |
81 | + } | |
82 | ||
83 | - if (rq->cmd_type != REQ_TYPE_FS) { | |
84 | - dout("%s: non-fs request type %d\n", __func__, | |
85 | - (int) rq->cmd_type); | |
86 | - __blk_end_request_all(rq, 0); | |
87 | - continue; | |
88 | + /* Disallow writes to a read-only device */ | |
89 | + | |
90 | + if (wr) { | |
91 | + if (rbd_dev->mapping.read_only) { | |
92 | + result = -EROFS; | |
93 | + goto err_rq; | |
94 | } | |
95 | + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); | |
96 | + } | |
97 | ||
98 | - /* Ignore/skip any zero-length requests */ | |
99 | + /* | |
100 | + * Quit early if the mapped snapshot no longer exists. It's | |
101 | + * still possible the snapshot will have disappeared by the | |
102 | + * time our request arrives at the osd, but there's no sense in | |
103 | + * sending it if we already know. | |
104 | + */ | |
105 | + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { | |
106 | + dout("request for non-existent snapshot"); | |
107 | + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); | |
108 | + result = -ENXIO; | |
109 | + goto err_rq; | |
110 | + } | |
111 | ||
112 | - offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; | |
113 | - length = (u64) blk_rq_bytes(rq); | |
114 | + if (offset && length > U64_MAX - offset + 1) { | |
115 | + rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, | |
116 | + length); | |
117 | + result = -EINVAL; | |
118 | + goto err_rq; /* Shouldn't happen */ | |
119 | + } | |
120 | ||
121 | - if (!length) { | |
122 | - dout("%s: zero-length request\n", __func__); | |
123 | - __blk_end_request_all(rq, 0); | |
124 | - continue; | |
125 | - } | |
126 | + if (offset + length > rbd_dev->mapping.size) { | |
127 | + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, | |
128 | + length, rbd_dev->mapping.size); | |
129 | + result = -EIO; | |
130 | + goto err_rq; | |
131 | + } | |
132 | ||
133 | - spin_unlock_irq(q->queue_lock); | |
134 | + img_request = rbd_img_request_create(rbd_dev, offset, length, wr); | |
135 | + if (!img_request) { | |
136 | + result = -ENOMEM; | |
137 | + goto err_rq; | |
138 | + } | |
139 | + img_request->rq = rq; | |
140 | ||
141 | - /* Disallow writes to a read-only device */ | |
142 | + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); | |
143 | + if (result) | |
144 | + goto err_img_request; | |
145 | ||
146 | - if (write_request) { | |
147 | - result = -EROFS; | |
148 | - if (rbd_dev->mapping.read_only) | |
149 | - goto end_request; | |
150 | - rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); | |
151 | - } | |
152 | + result = rbd_img_request_submit(img_request); | |
153 | + if (result) | |
154 | + goto err_img_request; | |
155 | ||
156 | - /* | |
157 | - * Quit early if the mapped snapshot no longer | |
158 | - * exists. It's still possible the snapshot will | |
159 | - * have disappeared by the time our request arrives | |
160 | - * at the osd, but there's no sense in sending it if | |
161 | - * we already know. | |
162 | - */ | |
163 | - if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { | |
164 | - dout("request for non-existent snapshot"); | |
165 | - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); | |
166 | - result = -ENXIO; | |
167 | - goto end_request; | |
168 | - } | |
169 | + return; | |
170 | ||
171 | - result = -EINVAL; | |
172 | - if (offset && length > U64_MAX - offset + 1) { | |
173 | - rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n", | |
174 | - offset, length); | |
175 | - goto end_request; /* Shouldn't happen */ | |
176 | - } | |
177 | +err_img_request: | |
178 | + rbd_img_request_put(img_request); | |
179 | +err_rq: | |
180 | + if (result) | |
181 | + rbd_warn(rbd_dev, "%s %llx at %llx result %d", | |
182 | + wr ? "write" : "read", length, offset, result); | |
183 | + blk_end_request_all(rq, result); | |
184 | +} | |
185 | ||
186 | - result = -EIO; | |
187 | - if (offset + length > rbd_dev->mapping.size) { | |
188 | - rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n", | |
189 | - offset, length, rbd_dev->mapping.size); | |
190 | - goto end_request; | |
191 | - } | |
192 | +static void rbd_request_workfn(struct work_struct *work) | |
193 | +{ | |
194 | + struct rbd_device *rbd_dev = | |
195 | + container_of(work, struct rbd_device, rq_work); | |
196 | + struct request *rq, *next; | |
197 | + LIST_HEAD(requests); | |
198 | ||
199 | - result = -ENOMEM; | |
200 | - img_request = rbd_img_request_create(rbd_dev, offset, length, | |
201 | - write_request); | |
202 | - if (!img_request) | |
203 | - goto end_request; | |
204 | + spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ | |
205 | + list_splice_init(&rbd_dev->rq_queue, &requests); | |
206 | + spin_unlock_irq(&rbd_dev->lock); | |
207 | ||
208 | - img_request->rq = rq; | |
209 | + list_for_each_entry_safe(rq, next, &requests, queuelist) { | |
210 | + list_del_init(&rq->queuelist); | |
211 | + rbd_handle_request(rbd_dev, rq); | |
212 | + } | |
213 | +} | |
214 | ||
215 | - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, | |
216 | - rq->bio); | |
217 | - if (!result) | |
218 | - result = rbd_img_request_submit(img_request); | |
219 | - if (result) | |
220 | - rbd_img_request_put(img_request); | |
221 | -end_request: | |
222 | - spin_lock_irq(q->queue_lock); | |
223 | - if (result < 0) { | |
224 | - rbd_warn(rbd_dev, "%s %llx at %llx result %d\n", | |
225 | - write_request ? "write" : "read", | |
226 | - length, offset, result); | |
227 | +/* | |
228 | + * Called with q->queue_lock held and interrupts disabled, possibly on | |
229 | + * the way to schedule(). Do not sleep here! | |
230 | + */ | |
231 | +static void rbd_request_fn(struct request_queue *q) | |
232 | +{ | |
233 | + struct rbd_device *rbd_dev = q->queuedata; | |
234 | + struct request *rq; | |
235 | + int queued = 0; | |
236 | + | |
237 | + rbd_assert(rbd_dev); | |
238 | ||
239 | - __blk_end_request_all(rq, result); | |
240 | + while ((rq = blk_fetch_request(q))) { | |
241 | + /* Ignore any non-FS requests that filter through. */ | |
242 | + if (rq->cmd_type != REQ_TYPE_FS) { | |
243 | + dout("%s: non-fs request type %d\n", __func__, | |
244 | + (int) rq->cmd_type); | |
245 | + __blk_end_request_all(rq, 0); | |
246 | + continue; | |
247 | } | |
248 | + | |
249 | + list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); | |
250 | + queued++; | |
251 | } | |
252 | + | |
253 | + if (queued) | |
254 | + queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work); | |
255 | } | |
256 | ||
257 | /* | |
258 | @@ -3848,6 +3879,8 @@ static struct rbd_device *rbd_dev_create | |
259 | return NULL; | |
260 | ||
261 | spin_lock_init(&rbd_dev->lock); | |
262 | + INIT_LIST_HEAD(&rbd_dev->rq_queue); | |
263 | + INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); | |
264 | rbd_dev->flags = 0; | |
265 | atomic_set(&rbd_dev->parent_ref, 0); | |
266 | INIT_LIST_HEAD(&rbd_dev->node); | |
267 | @@ -5066,12 +5099,17 @@ static int rbd_dev_device_setup(struct r | |
268 | ret = rbd_dev_mapping_set(rbd_dev); | |
269 | if (ret) | |
270 | goto err_out_disk; | |
271 | + | |
272 | set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); | |
273 | set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); | |
274 | ||
275 | + rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0); | |
276 | + if (!rbd_dev->rq_wq) | |
277 | + goto err_out_mapping; | |
278 | + | |
279 | ret = rbd_bus_add_dev(rbd_dev); | |
280 | if (ret) | |
281 | - goto err_out_mapping; | |
282 | + goto err_out_workqueue; | |
283 | ||
284 | /* Everything's ready. Announce the disk to the world. */ | |
285 | ||
286 | @@ -5083,6 +5121,9 @@ static int rbd_dev_device_setup(struct r | |
287 | ||
288 | return ret; | |
289 | ||
290 | +err_out_workqueue: | |
291 | + destroy_workqueue(rbd_dev->rq_wq); | |
292 | + rbd_dev->rq_wq = NULL; | |
293 | err_out_mapping: | |
294 | rbd_dev_mapping_clear(rbd_dev); | |
295 | err_out_disk: | |
296 | @@ -5314,6 +5355,7 @@ static void rbd_dev_device_release(struc | |
297 | { | |
298 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | |
299 | ||
300 | + destroy_workqueue(rbd_dev->rq_wq); | |
301 | rbd_free_disk(rbd_dev); | |
302 | clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); | |
303 | rbd_dev_mapping_clear(rbd_dev); |