From bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <ilya.dryomov@inktank.com>
Date: Mon, 4 Aug 2014 18:04:39 +0400
Subject: rbd: rework rbd_request_fn()

From: Ilya Dryomov <ilya.dryomov@inktank.com>

commit bc1ecc65a259fa9333dc8bd6a4ba0cf03b7d4bf8 upstream.

While it was never a good idea to sleep in request_fn(), commit
34c6bc2c919a ("locking/mutexes: Add extra reschedule point") made it
a *bad* idea. mutex_lock() since 3.15 may reschedule *before* putting
task on the mutex wait queue, which for tasks in !TASK_RUNNING state
means block forever. request_fn() may be called with !TASK_RUNNING on
the way to schedule() in io_schedule().

Offload request handling to a workqueue, one per rbd device, to avoid
calling blocking primitives from rbd_request_fn().

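For illustration, a minimal sketch of this offload pattern, using the
hypothetical names mydev, mydev_request_fn and mydev_workfn rather than
the rbd code itself (the real implementation is in the diff below):
request_fn runs under q->queue_lock with interrupts off and must not
sleep, so it only queues requests; the work function runs in process
context and may block.

	#include <linux/blkdev.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/workqueue.h>

	/* Per-device state: pending requests plus a dedicated workqueue. */
	struct mydev {
		spinlock_t lock;		/* also used as the queue_lock */
		struct list_head rq_queue;	/* requests waiting for the worker */
		struct workqueue_struct *rq_wq;
		struct work_struct rq_work;
	};

	/* Runs in process context via the workqueue; blocking is allowed here. */
	static void mydev_workfn(struct work_struct *work)
	{
		struct mydev *dev = container_of(work, struct mydev, rq_work);
		struct request *rq, *next;
		LIST_HEAD(requests);

		spin_lock_irq(&dev->lock);
		list_splice_init(&dev->rq_queue, &requests);
		spin_unlock_irq(&dev->lock);

		list_for_each_entry_safe(rq, next, &requests, queuelist) {
			list_del_init(&rq->queuelist);
			/* build and submit the actual I/O; may call blocking primitives */
		}
	}

	/* request_fn: called with q->queue_lock held and irqs off, must not sleep.
	 * q->queue_lock is dev->lock here (as in rbd), so rq_queue is already
	 * protected and no extra locking is needed before queuing the work. */
	static void mydev_request_fn(struct request_queue *q)
	{
		struct mydev *dev = q->queuedata;
		struct request *rq;

		while ((rq = blk_fetch_request(q)))
			list_add_tail(&rq->queuelist, &dev->rq_queue);

		queue_work(dev->rq_wq, &dev->rq_work);
	}

Setup pairs INIT_LIST_HEAD()/INIT_WORK() with alloc_workqueue(), and
teardown uses destroy_workqueue(), as the diff below does for rbd.
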
Fixes: http://tracker.ceph.com/issues/8818

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Tested-by: Eric Eastman <eric0e@aol.com>
Tested-by: Greg Wilson <greg.wilson@keepertech.com>
Reviewed-by: Alex Elder <elder@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 drivers/block/rbd.c | 194 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 118 insertions(+), 76 deletions(-)

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -42,6 +42,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include "rbd_types.h"
 
@@ -332,7 +333,10 @@ struct rbd_device {
 
 	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 
+	struct list_head rq_queue; /* incoming rq queue */
 	spinlock_t lock; /* queue, flags, open_count */
+	struct workqueue_struct *rq_wq;
+	struct work_struct rq_work;
 
 	struct rbd_image_header header;
 	unsigned long flags; /* possibly lock protected */
@@ -3183,102 +3187,129 @@ out:
 	return ret;
 }
 
-static void rbd_request_fn(struct request_queue *q)
-		__releases(q->queue_lock) __acquires(q->queue_lock)
+static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 {
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
+	struct rbd_img_request *img_request;
+	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+	u64 length = blk_rq_bytes(rq);
+	bool wr = rq_data_dir(rq) == WRITE;
 	int result;
 
-	while ((rq = blk_fetch_request(q))) {
-		bool write_request = rq_data_dir(rq) == WRITE;
-		struct rbd_img_request *img_request;
-		u64 offset;
-		u64 length;
+	/* Ignore/skip any zero-length requests */
 
-		/* Ignore any non-FS requests that filter through. */
+	if (!length) {
+		dout("%s: zero-length request\n", __func__);
+		result = 0;
+		goto err_rq;
+	}
 
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
+	/* Disallow writes to a read-only device */
+
+	if (wr) {
+		if (rbd_dev->mapping.read_only) {
+			result = -EROFS;
+			goto err_rq;
 		}
+		rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+	}
 
-		/* Ignore/skip any zero-length requests */
+	/*
+	 * Quit early if the mapped snapshot no longer exists. It's
+	 * still possible the snapshot will have disappeared by the
+	 * time our request arrives at the osd, but there's no sense in
+	 * sending it if we already know.
+	 */
+	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+		dout("request for non-existent snapshot");
+		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+		result = -ENXIO;
+		goto err_rq;
+	}
 
-		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
-		length = (u64) blk_rq_bytes(rq);
+	if (offset && length > U64_MAX - offset + 1) {
+		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
+		result = -EINVAL;
+		goto err_rq; /* Shouldn't happen */
+	}
 
-		if (!length) {
-			dout("%s: zero-length request\n", __func__);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
+	if (offset + length > rbd_dev->mapping.size) {
+		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, rbd_dev->mapping.size);
+		result = -EIO;
+		goto err_rq;
+	}
 
-		spin_unlock_irq(q->queue_lock);
+	img_request = rbd_img_request_create(rbd_dev, offset, length, wr);
+	if (!img_request) {
+		result = -ENOMEM;
+		goto err_rq;
+	}
+	img_request->rq = rq;
 
-		/* Disallow writes to a read-only device */
+	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio);
+	if (result)
+		goto err_img_request;
 
-		if (write_request) {
-			result = -EROFS;
-			if (rbd_dev->mapping.read_only)
-				goto end_request;
-			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
-		}
+	result = rbd_img_request_submit(img_request);
+	if (result)
+		goto err_img_request;
 
-		/*
-		 * Quit early if the mapped snapshot no longer
-		 * exists. It's still possible the snapshot will
-		 * have disappeared by the time our request arrives
-		 * at the osd, but there's no sense in sending it if
-		 * we already know.
-		 */
-		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
-			dout("request for non-existent snapshot");
-			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
-			result = -ENXIO;
-			goto end_request;
-		}
+	return;
 
-		result = -EINVAL;
-		if (offset && length > U64_MAX - offset + 1) {
-			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
-				offset, length);
-			goto end_request; /* Shouldn't happen */
-		}
+err_img_request:
+	rbd_img_request_put(img_request);
+err_rq:
+	if (result)
+		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 wr ? "write" : "read", length, offset, result);
+	blk_end_request_all(rq, result);
+}
 
-		result = -EIO;
-		if (offset + length > rbd_dev->mapping.size) {
-			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
-				offset, length, rbd_dev->mapping.size);
-			goto end_request;
-		}
+static void rbd_request_workfn(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev =
+	    container_of(work, struct rbd_device, rq_work);
+	struct request *rq, *next;
+	LIST_HEAD(requests);
 
-		result = -ENOMEM;
-		img_request = rbd_img_request_create(rbd_dev, offset, length,
-							write_request);
-		if (!img_request)
-			goto end_request;
+	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
+	list_splice_init(&rbd_dev->rq_queue, &requests);
+	spin_unlock_irq(&rbd_dev->lock);
 
-		img_request->rq = rq;
+	list_for_each_entry_safe(rq, next, &requests, queuelist) {
+		list_del_init(&rq->queuelist);
+		rbd_handle_request(rbd_dev, rq);
+	}
+}
 
-		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
-						rq->bio);
-		if (!result)
-			result = rbd_img_request_submit(img_request);
-		if (result)
-			rbd_img_request_put(img_request);
-end_request:
-		spin_lock_irq(q->queue_lock);
-		if (result < 0) {
-			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
-				write_request ? "write" : "read",
-				length, offset, result);
+/*
+ * Called with q->queue_lock held and interrupts disabled, possibly on
+ * the way to schedule(). Do not sleep here!
+ */
+static void rbd_request_fn(struct request_queue *q)
+{
+	struct rbd_device *rbd_dev = q->queuedata;
+	struct request *rq;
+	int queued = 0;
+
+	rbd_assert(rbd_dev);
 
-			__blk_end_request_all(rq, result);
+	while ((rq = blk_fetch_request(q))) {
+		/* Ignore any non-FS requests that filter through. */
+		if (rq->cmd_type != REQ_TYPE_FS) {
+			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
+			__blk_end_request_all(rq, 0);
+			continue;
 		}
+
+		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
+		queued++;
 	}
+
+	if (queued)
+		queue_work(rbd_dev->rq_wq, &rbd_dev->rq_work);
 }
 
 /*
@@ -3848,6 +3879,8 @@ static struct rbd_device *rbd_dev_create
 		return NULL;
 
 	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->rq_queue);
+	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
@@ -5066,12 +5099,17 @@ static int rbd_dev_device_setup(struct r
 	ret = rbd_dev_mapping_set(rbd_dev);
 	if (ret)
 		goto err_out_disk;
+
 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 	set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
 
+	rbd_dev->rq_wq = alloc_workqueue(rbd_dev->disk->disk_name, 0, 0);
+	if (!rbd_dev->rq_wq)
+		goto err_out_mapping;
+
 	ret = rbd_bus_add_dev(rbd_dev);
 	if (ret)
-		goto err_out_mapping;
+		goto err_out_workqueue;
 
 	/* Everything's ready. Announce the disk to the world. */
 
@@ -5083,6 +5121,9 @@ static int rbd_dev_device_setup(struct r
 
 	return ret;
 
+err_out_workqueue:
+	destroy_workqueue(rbd_dev->rq_wq);
+	rbd_dev->rq_wq = NULL;
 err_out_mapping:
 	rbd_dev_mapping_clear(rbd_dev);
 err_out_disk:
@@ -5314,6 +5355,7 @@ static void rbd_dev_device_release(struc
 {
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 
+	destroy_workqueue(rbd_dev->rq_wq);
 	rbd_free_disk(rbd_dev);
 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 	rbd_dev_mapping_clear(rbd_dev);