1 From: Jens Axboe <jens.axboe@oracle.com>
2 Subject: Implement rq affinity
5 This is a combined patch squashed from linux-2.6.git; it merges the following commits:
6 700e1be34289bde0359c15d6507d4cf90e5a5a7d
7 9f6dd15ebf6591fb2aff8aa774b1b9f4f8d8535d
8 962b69a665ed7e8aa3d8b9b9b318f9133501f866
10 Signed-off-by: Hannes Reinecke <hare@suse.de>
13 block/as-iosched.c | 6 +-
14 block/blk-core.c | 54 ++++++++++----------
16 block/blk-settings.c | 2
17 block/blk-softirq.c | 126 +++++++++++++++++++++++++++++++++++------------
18 block/blk-sysfs.c | 31 +++++++++++
20 block/cfq-iosched.c | 2
22 include/linux/bio.h | 11 ++++
23 include/linux/blkdev.h | 8 ++
24 include/linux/elevator.h | 8 +-
25 12 files changed, 196 insertions(+), 67 deletions(-)
27 --- a/block/as-iosched.c
28 +++ b/block/as-iosched.c
29 @@ -462,7 +462,7 @@ static void as_antic_stop(struct as_data
30 del_timer(&ad->antic_timer);
31 ad->antic_status = ANTIC_FINISHED;
32 /* see as_work_handler */
33 - kblockd_schedule_work(&ad->antic_work);
34 + kblockd_schedule_work(ad->q, &ad->antic_work);
38 @@ -483,7 +483,7 @@ static void as_antic_timeout(unsigned lo
39 aic = ad->io_context->aic;
41 ad->antic_status = ANTIC_FINISHED;
42 - kblockd_schedule_work(&ad->antic_work);
43 + kblockd_schedule_work(q, &ad->antic_work);
45 if (aic->ttime_samples == 0) {
46 /* process anticipated on has exited or timed out*/
47 @@ -844,7 +844,7 @@ static void as_completed_request(struct
48 if (ad->changed_batch && ad->nr_dispatched == 1) {
49 ad->current_batch_expires = jiffies +
50 ad->batch_expire[ad->batch_data_dir];
51 - kblockd_schedule_work(&ad->antic_work);
52 + kblockd_schedule_work(q, &ad->antic_work);
53 ad->changed_batch = 0;
55 if (ad->batch_data_dir == REQ_SYNC)
56 --- a/block/blk-core.c
57 +++ b/block/blk-core.c
58 @@ -109,7 +109,7 @@ void blk_rq_init(struct request_queue *q
59 memset(rq, 0, sizeof(*rq));
61 INIT_LIST_HEAD(&rq->queuelist);
62 - INIT_LIST_HEAD(&rq->donelist);
65 rq->sector = rq->hard_sector = (sector_t) -1;
66 INIT_HLIST_NODE(&rq->hash);
67 @@ -304,7 +304,7 @@ void blk_unplug_timeout(unsigned long da
68 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
69 q->rq.count[READ] + q->rq.count[WRITE]);
71 - kblockd_schedule_work(&q->unplug_work);
72 + kblockd_schedule_work(q, &q->unplug_work);
75 void blk_unplug(struct request_queue *q)
76 @@ -321,6 +321,21 @@ void blk_unplug(struct request_queue *q)
78 EXPORT_SYMBOL(blk_unplug);
80 +static void blk_invoke_request_fn(struct request_queue *q)
83 + * one level of recursion is ok and is much faster than kicking
84 + * the unplug handling
86 + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
88 + queue_flag_clear(QUEUE_FLAG_REENTER, q);
90 + queue_flag_set(QUEUE_FLAG_PLUGGED, q);
91 + kblockd_schedule_work(q, &q->unplug_work);
96 * blk_start_queue - restart a previously stopped queue
97 * @q: The &struct request_queue in question
98 @@ -335,18 +350,7 @@ void blk_start_queue(struct request_queu
99 WARN_ON(!irqs_disabled());
101 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
104 - * one level of recursion is ok and is much faster than kicking
105 - * the unplug handling
107 - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
109 - queue_flag_clear(QUEUE_FLAG_REENTER, q);
111 - blk_plug_device(q);
112 - kblockd_schedule_work(&q->unplug_work);
114 + blk_invoke_request_fn(q);
116 EXPORT_SYMBOL(blk_start_queue);
118 @@ -404,15 +408,8 @@ void __blk_run_queue(struct request_queu
119 * Only recurse once to avoid overrunning the stack, let the unplug
120 * handling reinvoke the handler shortly if we already got there.
122 - if (!elv_queue_empty(q)) {
123 - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
125 - queue_flag_clear(QUEUE_FLAG_REENTER, q);
127 - blk_plug_device(q);
128 - kblockd_schedule_work(&q->unplug_work);
131 + if (!elv_queue_empty(q))
132 + blk_invoke_request_fn(q);
134 EXPORT_SYMBOL(__blk_run_queue);
136 @@ -1062,6 +1059,7 @@ EXPORT_SYMBOL(blk_put_request);
138 void init_request_from_bio(struct request *req, struct bio *bio)
140 + req->cpu = bio->bi_comp_cpu;
141 req->cmd_type = REQ_TYPE_FS;
144 @@ -1142,6 +1140,8 @@ static int __make_request(struct request
146 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
147 req->ioprio = ioprio_best(req->ioprio, prio);
148 + if (!blk_rq_cpu_valid(req))
149 + req->cpu = bio->bi_comp_cpu;
150 drive_stat_acct(req, 0);
151 if (!attempt_back_merge(q, req))
152 elv_merged_request(q, req, el_ret);
153 @@ -1169,6 +1169,8 @@ static int __make_request(struct request
154 req->sector = req->hard_sector = bio->bi_sector;
155 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
156 req->ioprio = ioprio_best(req->ioprio, prio);
157 + if (!blk_rq_cpu_valid(req))
158 + req->cpu = bio->bi_comp_cpu;
159 drive_stat_acct(req, 0);
160 if (!attempt_front_merge(q, req))
161 elv_merged_request(q, req, el_ret);
162 @@ -1204,13 +1206,15 @@ get_rq:
163 init_request_from_bio(req, bio);
165 spin_lock_irq(q->queue_lock);
166 + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
167 + bio_flagged(bio, BIO_CPU_AFFINE))
168 + req->cpu = blk_cpu_to_group(smp_processor_id());
169 if (elv_queue_empty(q))
174 __generic_unplug_device(q);
176 spin_unlock_irq(q->queue_lock);
179 @@ -1958,7 +1962,7 @@ void blk_rq_bio_prep(struct request_queu
180 rq->rq_disk = bio->bi_bdev->bd_disk;
183 -int kblockd_schedule_work(struct work_struct *work)
184 +int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
186 return queue_work(kblockd_workqueue, work);
190 @@ -59,4 +59,16 @@ static inline int queue_congestion_off_t
192 #endif /* BLK_DEV_INTEGRITY */
194 +static inline int blk_cpu_to_group(int cpu)
196 +#ifdef CONFIG_SCHED_MC
197 + cpumask_t mask = cpu_coregroup_map(cpu);
198 + return first_cpu(mask);
199 +#elif defined(CONFIG_SCHED_SMT)
200 + return first_cpu(per_cpu(cpu_sibling_map, cpu));
207 --- a/block/blk-merge.c
208 +++ b/block/blk-merge.c
209 @@ -413,6 +413,8 @@ static int attempt_merge(struct request_
212 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
213 + if (blk_rq_cpu_valid(next))
214 + req->cpu = next->cpu;
216 __blk_put_request(q, next);
218 --- a/block/blk-settings.c
219 +++ b/block/blk-settings.c
220 @@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(stru
222 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
224 -static int __init blk_settings_init(void)
225 +int __init blk_settings_init(void)
227 blk_max_low_pfn = max_low_pfn - 1;
228 blk_max_pfn = max_pfn - 1;
229 --- a/block/blk-softirq.c
230 +++ b/block/blk-softirq.c
233 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
236 + * Softirq action handler - move entries to local list and loop over them
237 + * while passing them to the queue registered handler.
239 +static void blk_done_softirq(struct softirq_action *h)
241 + struct list_head *cpu_list, local_list;
243 + local_irq_disable();
244 + cpu_list = &__get_cpu_var(blk_cpu_done);
245 + list_replace_init(cpu_list, &local_list);
246 + local_irq_enable();
248 + while (!list_empty(&local_list)) {
249 + struct request *rq;
251 + rq = list_entry(local_list.next, struct request, csd.list);
252 + list_del_init(&rq->csd.list);
253 + rq->q->softirq_done_fn(rq);
257 +#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
258 +static void trigger_softirq(void *data)
260 + struct request *rq = data;
261 + unsigned long flags;
262 + struct list_head *list;
264 + local_irq_save(flags);
265 + list = &__get_cpu_var(blk_cpu_done);
266 + list_add_tail(&rq->csd.list, list);
268 + if (list->next == &rq->csd.list)
269 + raise_softirq_irqoff(BLOCK_SOFTIRQ);
271 + local_irq_restore(flags);
275 + * Set up and invoke a run of 'trigger_softirq' on the given cpu.
277 +static int raise_blk_irq(int cpu, struct request *rq)
279 + if (cpu_online(cpu)) {
280 + struct call_single_data *data = &rq->csd;
282 + data->func = trigger_softirq;
286 + __smp_call_function_single(cpu, data);
292 +#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
293 +static int raise_blk_irq(int cpu, struct request *rq)
299 static int __cpuinit blk_cpu_notify(struct notifier_block *self,
300 unsigned long action, void *hcpu)
302 @@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(stru
307 -static struct notifier_block blk_cpu_notifier __cpuinitdata = {
308 +static struct notifier_block __cpuinitdata blk_cpu_notifier = {
309 .notifier_call = blk_cpu_notify,
313 - * splice the completion data to a local structure and hand off to
314 - * process_completion_queue() to complete the requests
316 -static void blk_done_softirq(struct softirq_action *h)
318 - struct list_head *cpu_list, local_list;
320 - local_irq_disable();
321 - cpu_list = &__get_cpu_var(blk_cpu_done);
322 - list_replace_init(cpu_list, &local_list);
323 - local_irq_enable();
325 - while (!list_empty(&local_list)) {
326 - struct request *rq;
328 - rq = list_entry(local_list.next, struct request, donelist);
329 - list_del_init(&rq->donelist);
330 - rq->q->softirq_done_fn(rq);
335 * blk_complete_request - end I/O on a request
336 * @req: the request being processed
337 @@ -71,25 +112,48 @@ static void blk_done_softirq(struct soft
338 * through a softirq handler. The user must have registered a completion
339 * callback through blk_queue_softirq_done().
342 void blk_complete_request(struct request *req)
344 - struct list_head *cpu_list;
345 + struct request_queue *q = req->q;
347 + int ccpu, cpu, group_cpu;
349 - BUG_ON(!req->q->softirq_done_fn);
350 + BUG_ON(!q->softirq_done_fn);
352 local_irq_save(flags);
353 + cpu = smp_processor_id();
354 + group_cpu = blk_cpu_to_group(cpu);
356 - cpu_list = &__get_cpu_var(blk_cpu_done);
357 - list_add_tail(&req->donelist, cpu_list);
358 - raise_softirq_irqoff(BLOCK_SOFTIRQ);
360 + * Select completion CPU
362 + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
367 + if (ccpu == cpu || ccpu == group_cpu) {
368 + struct list_head *list;
370 + list = &__get_cpu_var(blk_cpu_done);
371 + list_add_tail(&req->csd.list, list);
374 + * if the list only contains our just added request,
375 + * signal a raise of the softirq. If there are already
376 + * entries there, someone already raised the irq but it
379 + if (list->next == &req->csd.list)
380 + raise_softirq_irqoff(BLOCK_SOFTIRQ);
381 + } else if (raise_blk_irq(ccpu, req))
384 local_irq_restore(flags);
386 EXPORT_SYMBOL(blk_complete_request);
388 -int __init blk_softirq_init(void)
389 +__init int blk_softirq_init(void)
393 --- a/block/blk-sysfs.c
394 +++ b/block/blk-sysfs.c
395 @@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(stru
399 +static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
401 + unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
403 + return queue_var_show(set != 0, page);
407 +queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
409 + ssize_t ret = -EINVAL;
410 +#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
413 + ret = queue_var_store(&val, page, count);
414 + spin_lock_irq(q->queue_lock);
416 + queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
418 + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
419 + spin_unlock_irq(q->queue_lock);
424 static struct queue_sysfs_entry queue_requests_entry = {
425 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
426 @@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_no
427 .store = queue_nomerges_store,
430 +static struct queue_sysfs_entry queue_rq_affinity_entry = {
431 + .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
432 + .show = queue_rq_affinity_show,
433 + .store = queue_rq_affinity_store,
436 static struct attribute *default_attrs[] = {
437 &queue_requests_entry.attr,
438 &queue_ra_entry.attr,
439 @@ -205,6 +235,7 @@ static struct attribute *default_attrs[]
440 &queue_iosched_entry.attr,
441 &queue_hw_sector_size_entry.attr,
442 &queue_nomerges_entry.attr,
443 + &queue_rq_affinity_entry.attr,
447 --- a/block/cfq-iosched.c
448 +++ b/block/cfq-iosched.c
449 @@ -252,7 +252,7 @@ static inline void cfq_schedule_dispatch
451 if (cfqd->busy_queues) {
452 cfq_log(cfqd, "schedule dispatch");
453 - kblockd_schedule_work(&cfqd->unplug_work);
454 + kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
460 @@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
462 memset(bio, 0, sizeof(*bio));
463 bio->bi_flags = 1 << BIO_UPTODATE;
464 + bio->bi_comp_cpu = -1;
465 atomic_set(&bio->bi_cnt, 1);
468 --- a/include/linux/bio.h
469 +++ b/include/linux/bio.h
470 @@ -88,6 +88,8 @@ struct bio {
472 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
474 + unsigned int bi_comp_cpu; /* completion CPU */
476 struct bio_vec *bi_io_vec; /* the actual vec list */
478 bio_end_io_t *bi_end_io;
479 @@ -112,6 +114,7 @@ struct bio {
480 #define BIO_BOUNCED 5 /* bio is a bounce bio */
481 #define BIO_USER_MAPPED 6 /* contains user pages */
482 #define BIO_EOPNOTSUPP 7 /* not supported */
483 +#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */
484 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
487 @@ -350,6 +353,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp
488 extern unsigned int bvec_nr_vecs(unsigned short idx);
491 + * Allow queuer to specify a completion CPU for this bio
493 +static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
495 + bio->bi_comp_cpu = cpu;
499 * bio_set is used to allow other portions of the IO system to
500 * allocate their own private memory pools for bio and iovec structures.
501 * These memory pools in turn all allocate from the bio_slab
502 --- a/include/linux/blkdev.h
503 +++ b/include/linux/blkdev.h
505 #include <linux/module.h>
506 #include <linux/stringify.h>
507 #include <linux/bsg.h>
508 +#include <linux/smp.h>
510 #include <asm/scatterlist.h>
512 @@ -139,7 +140,8 @@ enum rq_flag_bits {
515 struct list_head queuelist;
516 - struct list_head donelist;
517 + struct call_single_data csd;
520 struct request_queue *q;
522 @@ -420,6 +422,7 @@ struct request_queue
523 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
524 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
525 #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */
526 +#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */
528 static inline int queue_is_locked(struct request_queue *q)
530 @@ -542,6 +545,7 @@ enum {
531 #define blk_pm_request(rq) \
532 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
534 +#define blk_rq_cpu_valid(rq) ((rq)->cpu != -1)
535 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
536 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
537 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
538 @@ -913,7 +917,7 @@ static inline void put_dev_sector(Sector
542 -int kblockd_schedule_work(struct work_struct *work);
543 +int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
544 void kblockd_flush_work(struct work_struct *work);
546 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
547 --- a/include/linux/elevator.h
548 +++ b/include/linux/elevator.h
549 @@ -173,15 +173,15 @@ enum {
550 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
553 - * Hack to reuse the donelist list_head as the fifo time holder while
554 + * Hack to reuse the csd.list list_head as the fifo time holder while
555 * the request is in the io scheduler. Saves an unsigned long in rq.
557 -#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next)
558 -#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp))
559 +#define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next)
560 +#define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp))
561 #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
562 #define rq_fifo_clear(rq) do { \
563 list_del_init(&(rq)->queuelist); \
564 - INIT_LIST_HEAD(&(rq)->donelist); \
565 + INIT_LIST_HEAD(&(rq)->csd.list); \