// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>

#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-ioprio.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);

static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
		blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
		struct list_head *list);
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			 struct io_comp_batch *iob, unsigned int flags);

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
	struct mq_inflight *mi = priv;

	if (rq->part && blk_do_io_stat(rq) &&
	    (!mi->part->bd_partno || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
		struct block_device *part)
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
		unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
	inflight[0] = mi.inflight[0];
	inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	mutex_unlock(&q->mq_freeze_lock);
}

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
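
/*
 * Illustrative sketch (not part of blk-mq itself): a driver that wants to
 * change queue data structures typically brackets the update with a
 * freeze/unfreeze pair, so no request is in flight while the change is made.
 * The helper and the particular limit update below are hypothetical; only
 * blk_mq_freeze_queue() and blk_mq_unfreeze_queue() are APIs from this file.
 *
 *	static void example_update_limits(struct request_queue *q,
 *					  unsigned int new_max_sectors)
 *	{
 *		blk_mq_freeze_queue(q);
 *		blk_queue_max_hw_sectors(q, new_max_sectors);
 *		blk_mq_unfreeze_queue(q);
 *	}
 */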

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (!q->quiesce_depth++)
		blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
	spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set.  This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
	if (set->flags & BLK_MQ_F_BLOCKING)
		synchronize_srcu(set->srcu);
	else
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent that the struct request end_io()
 * callback function is invoked. Once this function is returned, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);
	/* nothing to wait for non-mq queues */
	if (queue_is_mq(q))
		blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers queue into the state before quiescing
 * which is done by blk_mq_quiesce_queue.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	unsigned long flags;
	bool run_queue = false;

	spin_lock_irqsave(&q->queue_lock, flags);
	if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
		;
	} else if (!--q->quiesce_depth) {
		blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
		run_queue = true;
	}
	spin_unlock_irqrestore(&q->queue_lock, flags);

	/* dispatch requests which are inserted during quiescing */
	if (run_queue)
		blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
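
/*
 * Illustrative sketch (not part of blk-mq): a driver pauses dispatch around
 * an error-recovery step by quiescing the queue and resumes it afterwards.
 * example_recover() and example_reset_controller() are hypothetical; the
 * quiesce helpers are the real APIs defined above.
 *
 *	static void example_recover(struct request_queue *q)
 *	{
 *		blk_mq_quiesce_queue(q);	    waits for running ->queue_rq()
 *		example_reset_controller(q->queuedata);
 *		blk_mq_unquiesce_queue(q);	    reruns the hardware queues
 *	}
 */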

void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_quiesce_queue_nowait(q);
	}
	blk_mq_wait_quiesce_done(set);
	mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);

void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_unquiesce_queue(q);
	}
	mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	rq->q = q;
	rq->__sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->tag = BLK_MQ_NO_TAG;
	rq->internal_tag = BLK_MQ_NO_TAG;
	rq->start_time_ns = ktime_get_ns();
	rq->part = NULL;
	blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;

#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	if (blk_queue_rq_alloc_time(rq->q))
		rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
	else
		rq->alloc_time_ns = 0;
#endif
}

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		struct blk_mq_tags *tags, unsigned int tag)
{
	struct blk_mq_ctx *ctx = data->ctx;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	struct request_queue *q = data->q;
	struct request *rq = tags->static_rqs[tag];

	rq->q = q;
	rq->mq_ctx = ctx;
	rq->mq_hctx = hctx;
	rq->cmd_flags = data->cmd_flags;

	if (data->flags & BLK_MQ_REQ_PM)
		data->rq_flags |= RQF_PM;
	if (blk_queue_io_stat(q))
		data->rq_flags |= RQF_IO_STAT;
	rq->rq_flags = data->rq_flags;

	if (data->rq_flags & RQF_SCHED_TAGS) {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	} else {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	}
	rq->timeout = 0;

	rq->part = NULL;
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	rq->end_io = NULL;
	rq->end_io_data = NULL;

	blk_crypto_rq_set_defaults(rq);
	INIT_LIST_HEAD(&rq->queuelist);
	/* tag was already set */
	WRITE_ONCE(rq->deadline, 0);
	req_ref_set(rq, 1);

	if (rq->rq_flags & RQF_USE_SCHED) {
		struct elevator_queue *e = data->q->elevator;

		INIT_HLIST_NODE(&rq->hash);
		RB_CLEAR_NODE(&rq->rb_node);

		if (e->type->ops.prepare_request)
			e->type->ops.prepare_request(rq);
	}

	return rq;
}

static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
{
	unsigned int tag, tag_offset;
	struct blk_mq_tags *tags;
	struct request *rq;
	unsigned long tag_mask;
	int i, nr = 0;

	tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
	if (unlikely(!tag_mask))
		return NULL;

	tags = blk_mq_tags_from_data(data);
	for (i = 0; tag_mask; i++) {
		if (!(tag_mask & (1UL << i)))
			continue;
		tag = tag_offset + i;
		prefetch(tags->static_rqs[tag]);
		tag_mask &= ~(1UL << i);
		rq = blk_mq_rq_ctx_init(data, tags, tag);
		rq_list_add(data->cached_rq, rq);
		nr++;
	}
	/* caller already holds a reference, add for remainder */
	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
	data->nr_tags -= nr;

	return rq_list_pop(data->cached_rq);
}

static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
	struct request_queue *q = data->q;
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int tag;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (q->elevator) {
		/*
		 * All requests use scheduler tags when an I/O scheduler is
		 * enabled for the queue.
		 */
		data->rq_flags |= RQF_SCHED_TAGS;

		/*
		 * Flush/passthrough requests are special and go directly to
		 * the dispatch list.
		 */
		if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
		    !blk_op_is_passthrough(data->cmd_flags)) {
			struct elevator_mq_ops *ops = &q->elevator->type->ops;

			WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);

			data->rq_flags |= RQF_USE_SCHED;
			if (ops->limit_depth)
				ops->limit_depth(data->cmd_flags, data);
		}
	}

retry:
	data->ctx = blk_mq_get_ctx(q);
	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
	if (!(data->rq_flags & RQF_SCHED_TAGS))
		blk_mq_tag_busy(data->hctx);

	if (data->flags & BLK_MQ_REQ_RESERVED)
		data->rq_flags |= RQF_RESV;

	/*
	 * Try batched alloc if we want more than 1 tag.
	 */
	if (data->nr_tags > 1) {
		rq = __blk_mq_alloc_requests_batch(data);
		if (rq) {
			blk_mq_rq_time_init(rq, alloc_time_ns);
			return rq;
		}
		data->nr_tags = 1;
	}

	/*
	 * Waiting allocations only fail because of an inactive hctx.  In that
	 * case just retry the hctx assignment and tag allocation as CPU hotplug
	 * should have migrated us to an online CPU by now.
	 */
	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_NO_TAG) {
		if (data->flags & BLK_MQ_REQ_NOWAIT)
			return NULL;
		/*
		 * Give up the CPU and sleep for a random short time to
		 * ensure that threads using a realtime scheduling class
		 * are migrated off the CPU, and thus off the hctx that
		 * is going away.
		 */
		msleep(3);
		goto retry;
	}

	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
	blk_mq_rq_time_init(rq, alloc_time_ns);
	return rq;
}

static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
					    struct blk_plug *plug,
					    blk_opf_t opf,
					    blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= opf,
		.nr_tags	= plug->nr_ios,
		.cached_rq	= &plug->cached_rq,
	};
	struct request *rq;

	if (blk_queue_enter(q, flags))
		return NULL;

	plug->nr_ios = 1;

	rq = __blk_mq_alloc_requests(&data);
	if (unlikely(!rq))
		blk_queue_exit(q);
	return rq;
}

static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
						   blk_opf_t opf,
						   blk_mq_req_flags_t flags)
{
	struct blk_plug *plug = current->plug;
	struct request *rq;

	if (!plug)
		return NULL;

	if (rq_list_empty(plug->cached_rq)) {
		if (plug->nr_ios == 1)
			return NULL;
		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
		if (!rq)
			return NULL;
	} else {
		rq = rq_list_peek(&plug->cached_rq);
		if (!rq || rq->q != q)
			return NULL;

		if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
			return NULL;
		if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
			return NULL;

		plug->cached_rq = rq_list_next(rq);
		blk_mq_rq_time_init(rq, 0);
	}

	rq->cmd_flags = opf;
	INIT_LIST_HEAD(&rq->queuelist);
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
				     blk_mq_req_flags_t flags)
{
	struct request *rq;

	rq = blk_mq_alloc_cached_request(q, opf, flags);
	if (!rq) {
		struct blk_mq_alloc_data data = {
			.q		= q,
			.flags		= flags,
			.cmd_flags	= opf,
			.nr_tags	= 1,
		};
		int ret;

		ret = blk_queue_enter(q, flags);
		if (ret)
			return ERR_PTR(ret);

		rq = __blk_mq_alloc_requests(&data);
		if (!rq)
			goto out_queue_exit;
	}
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
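
/*
 * Illustrative sketch (not part of blk-mq): a passthrough user typically
 * allocates a request with blk_mq_alloc_request(), attaches its payload and
 * frees it again when done.  The opf value and the error handling shown here
 * are just one plausible pattern, not taken from this file.
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);	    may be -EWOULDBLOCK under pressure
 *	...
 *	blk_mq_free_request(rq);
 */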

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= opf,
		.nr_tags	= 1,
	};
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int cpu;
	unsigned int tag;
	int ret;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
	    WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	ret = -EXDEV;
	data.hctx = xa_load(&q->hctx_table, hctx_idx);
	if (!blk_mq_hw_queue_mapped(data.hctx))
		goto out_queue_exit;
	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
	if (cpu >= nr_cpu_ids)
		goto out_queue_exit;
	data.ctx = __blk_mq_get_ctx(q, cpu);

	if (q->elevator)
		data.rq_flags |= RQF_SCHED_TAGS;
	else
		blk_mq_tag_busy(data.hctx);

	if (flags & BLK_MQ_REQ_RESERVED)
		data.rq_flags |= RQF_RESV;

	ret = -EWOULDBLOCK;
	tag = blk_mq_get_tag(&data);
	if (tag == BLK_MQ_NO_TAG)
		goto out_queue_exit;
	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
	blk_mq_rq_time_init(rq, alloc_time_ns);
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;

out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void blk_mq_finish_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	if (rq->rq_flags & RQF_USE_SCHED) {
		q->elevator->type->ops.finish_request(rq);
		/*
		 * For postflush request that may need to be
		 * completed twice, we should clear this flag
		 * to avoid double finish_request() on the rq.
		 */
		rq->rq_flags &= ~RQF_USE_SCHED;
	}
}

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_crypto_free_request(rq);
	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;

	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		__blk_mq_dec_active_requests(hctx);

	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	if (sched_tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_finish_request(rq);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->disk->bdi);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (req_ref_put_and_test(rq))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
	struct request *rq;

	while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
		blk_mq_free_request(rq);
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
		rq->q->disk ? rq->q->disk->disk_name : "?",
		(__force unsigned long long) rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
	       (unsigned long long)blk_rq_pos(rq),
	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
	printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
	       rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, blk_status_t error)
{
	if (unlikely(error)) {
		bio->bi_status = error;
	} else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
		/*
		 * Partial zone append completions cannot be supported as the
		 * BIO fragments may end up not being written sequentially.
		 */
		if (bio->bi_iter.bi_size != nbytes)
			bio->bi_status = BLK_STS_IOERR;
		else
			bio->bi_iter.bi_sector = rq->__sector;
	}

	bio_advance(bio, nbytes);

	if (unlikely(rq->rq_flags & RQF_QUIET))
		bio_set_flag(bio, BIO_QUIET);
	/* don't actually finish bio if it's part of flush sequence */
	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
		bio_endio(bio);
}

static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
	if (req->part && blk_do_io_stat(req)) {
		const int sgrp = op_stat_group(req_op(req));

		part_stat_lock();
		part_stat_add(req->part, sectors[sgrp], bytes >> 9);
		part_stat_unlock();
	}
}

static void blk_print_req_error(struct request *req, blk_status_t status)
{
	printk_ratelimited(KERN_ERR
		"%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
		"phys_seg %u prio class %u\n",
		blk_status_to_str(status),
		req->q->disk ? req->q->disk->disk_name : "?",
		blk_rq_pos(req), (__force u32)req_op(req),
		blk_op_str(req_op(req)),
		(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
		req->nr_phys_segments,
		IOPRIO_PRIO_CLASS(req->ioprio));
}

/*
 * Fully end IO on a request. Does not support partial completions, or
 * errors.
 */
static void blk_complete_request(struct request *req)
{
	const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
	int total_bytes = blk_rq_bytes(req);
	struct bio *bio = req->bio;

	trace_block_rq_complete(req, BLK_STS_OK, total_bytes);

	if (!bio)
		return;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
		req->q->integrity.profile->complete_fn(req, total_bytes);
#endif

	/*
	 * Upper layers may call blk_crypto_evict_key() anytime after the last
	 * bio_endio().  Therefore, the keyslot must be released before that.
	 */
	blk_crypto_rq_put_keyslot(req);

	blk_account_io_completion(req, total_bytes);

	do {
		struct bio *next = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);

		if (req_op(req) == REQ_OP_ZONE_APPEND)
			bio->bi_iter.bi_sector = req->__sector;

		if (!is_flush)
			bio_endio(bio);
		bio = next;
	} while (bio);

	/*
	 * Reset counters so that the request stacking driver
	 * can find how many bytes remain in the request
	 * later.
	 */
	if (!req->end_io) {
		req->bio = NULL;
		req->__data_len = 0;
	}
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *	The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *	except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
		unsigned int nr_bytes)
{
	int total_bytes;

	trace_block_rq_complete(req, error, nr_bytes);

	if (!req->bio)
		return false;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
	    error == BLK_STS_OK)
		req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif

	/*
	 * Upper layers may call blk_crypto_evict_key() anytime after the last
	 * bio_endio().  Therefore, the keyslot must be released before that.
	 */
	if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
		__blk_crypto_rq_put_keyslot(req);

	if (unlikely(error && !blk_rq_is_passthrough(req) &&
		     !(req->rq_flags & RQF_QUIET)) &&
		     !test_bit(GD_DEAD, &req->q->disk->state)) {
		blk_print_req_error(req, error);
		trace_block_rq_error(req, error, nr_bytes);
	}

	blk_account_io_completion(req, nr_bytes);

	total_bytes = 0;
	while (req->bio) {
		struct bio *bio = req->bio;
		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

		if (bio_bytes == bio->bi_iter.bi_size)
			req->bio = bio->bi_next;

		/* Completion has already been traced */
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
		req_bio_endio(req, bio, bio_bytes, error);

		total_bytes += bio_bytes;
		nr_bytes -= bio_bytes;

		if (!nr_bytes)
			break;
	}

	/*
	 * completely done
	 */
	if (!req->bio) {
		/*
		 * Reset counters so that the request stacking driver
		 * can find how many bytes remain in the request
		 * later.
		 */
		req->__data_len = 0;
		return false;
	}

	req->__data_len -= total_bytes;

	/* update sector only for requests with clear definition of sector */
	if (!blk_rq_is_passthrough(req))
		req->__sector += total_bytes >> 9;

	/* mixed attributes always follow the first bio */
	if (req->rq_flags & RQF_MIXED_MERGE) {
		req->cmd_flags &= ~REQ_FAILFAST_MASK;
		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
	}

	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
		/*
		 * If total number of sectors is less than the first segment
		 * size, something has gone terribly wrong.
		 */
		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
			blk_dump_rq_flags(req, "request botched");
			req->__data_len = blk_rq_cur_bytes(req);
		}

		/* recalculate the number of segments */
		req->nr_phys_segments = blk_recalc_rq_segments(req);
	}

	return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
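
/*
 * Illustrative sketch (not part of blk-mq): a driver that completes a request
 * piecewise calls blk_update_request() with the number of bytes that finished
 * and only ends the request once it returns %false.  done_bytes is a
 * hypothetical per-command byte count.
 *
 *	if (!blk_update_request(rq, BLK_STS_OK, done_bytes))
 *		__blk_mq_end_request(rq, BLK_STS_OK);
 */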

static inline void blk_account_io_done(struct request *req, u64 now)
{
	trace_block_io_done(req);

	/*
	 * Account IO completion.  flush_rq isn't accounted as a
	 * normal IO on queueing nor completion.  Accounting the
	 * containing request is enough.
	 */
	if (blk_do_io_stat(req) && req->part &&
	    !(req->rq_flags & RQF_FLUSH_SEQ)) {
		const int sgrp = op_stat_group(req_op(req));

		part_stat_lock();
		update_io_ticks(req->part, jiffies, true);
		part_stat_inc(req->part, ios[sgrp]);
		part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
		part_stat_unlock();
	}
}

static inline void blk_account_io_start(struct request *req)
{
	trace_block_io_start(req);

	if (blk_do_io_stat(req)) {
		/*
		 * All non-passthrough requests are created from a bio with one
		 * exception: when a flush command that is part of a flush sequence
		 * generated by the state machine in blk-flush.c is cloned onto the
		 * lower device by dm-multipath we can get here without a bio.
		 */
		if (req->bio)
			req->part = req->bio->bi_bdev;
		else
			req->part = req->q->disk->part0;

		part_stat_lock();
		update_io_ticks(req->part, jiffies, false);
		part_stat_unlock();
	}
}

static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
	if (rq->rq_flags & RQF_STATS)
		blk_stat_add(rq, now);

	blk_mq_sched_completed_request(rq, now);
	blk_account_io_done(rq, now);
}

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_mq_need_time_stamp(rq))
		__blk_mq_end_request_acct(rq, ktime_get_ns());

	blk_mq_finish_request(rq);

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
			blk_mq_free_request(rq);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

#define TAG_COMP_BATCH		32

static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
					  int *tag_array, int nr_tags)
{
	struct request_queue *q = hctx->queue;

	/*
	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
	 * update hctx->nr_active in batch
	 */
	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
		__blk_mq_sub_active_requests(hctx, nr_tags);

	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}

void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
	int tags[TAG_COMP_BATCH], nr_tags = 0;
	struct blk_mq_hw_ctx *cur_hctx = NULL;
	struct request *rq;
	u64 now = 0;

	if (iob->need_ts)
		now = ktime_get_ns();

	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
		prefetch(rq->bio);
		prefetch(rq->rq_next);

		blk_complete_request(rq);
		if (iob->need_ts)
			__blk_mq_end_request_acct(rq, now);

		blk_mq_finish_request(rq);

		rq_qos_done(rq->q, rq);

		/*
		 * If end_io handler returns NONE, then it still has
		 * ownership of the request.
		 */
		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
			continue;

		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		if (!req_ref_put_and_test(rq))
			continue;

		blk_crypto_free_request(rq);
		blk_pm_mark_last_busy(rq);

		if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
			if (cur_hctx)
				blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
			nr_tags = 0;
			cur_hctx = rq->mq_hctx;
		}
		tags[nr_tags++] = rq->tag;
	}

	if (nr_tags)
		blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);

static void blk_complete_reqs(struct llist_head *list)
{
	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
	struct request *rq, *next;

	llist_for_each_entry_safe(rq, next, entry, ipi_list)
		rq->q->mq_ops->complete(rq);
}

static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}

static int blk_softirq_cpu_dead(unsigned int cpu)
{
	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
	return 0;
}

static void __blk_mq_complete_request_remote(void *data)
{
	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
}

static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
	int cpu = raw_smp_processor_id();

	if (!IS_ENABLED(CONFIG_SMP) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
		return false;
	/*
	 * With force threaded interrupts enabled, raising softirq from an SMP
	 * function call will always result in waking the ksoftirqd thread.
	 * This is probably worse than completing the request on a different
	 * cache domain.
	 */
	if (force_irqthreads())
		return false;

	/* same CPU or cache domain?  Complete locally */
	if (cpu == rq->mq_ctx->cpu ||
	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
		return false;

	/* don't try to IPI to an offline CPU */
	return cpu_online(rq->mq_ctx->cpu);
}

static void blk_mq_complete_send_ipi(struct request *rq)
{
	unsigned int cpu;

	cpu = rq->mq_ctx->cpu;
	if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
		smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
}

static void blk_mq_raise_softirq(struct request *rq)
{
	struct llist_head *list;

	preempt_disable();
	list = this_cpu_ptr(&blk_cpu_done);
	if (llist_add(&rq->ipi_list, list))
		raise_softirq(BLOCK_SOFTIRQ);
	preempt_enable();
}

bool blk_mq_complete_request_remote(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

	/*
	 * For request which hctx has only one ctx mapping,
	 * or a polled request, always complete locally,
	 * it's pointless to redirect the completion.
	 */
	if ((rq->mq_hctx->nr_ctx == 1 &&
	     rq->mq_ctx->cpu == raw_smp_processor_id()) ||
	     rq->cmd_flags & REQ_POLLED)
		return false;

	if (blk_mq_complete_need_ipi(rq)) {
		blk_mq_complete_send_ipi(rq);
		return true;
	}

	if (rq->q->nr_hw_queues == 1) {
		blk_mq_raise_softirq(rq);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Complete a request by scheduling the ->complete_rq operation.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (!blk_mq_complete_request_remote(rq))
		rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
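
/*
 * Illustrative sketch (not part of blk-mq): a driver's interrupt handler hands
 * a finished request back with blk_mq_complete_request(); the block layer then
 * invokes the driver's ->complete() callback (locally, via IPI, or via softirq
 * as decided above), which typically ends in blk_mq_end_request().
 * example_irq() and example_next_completed() are hypothetical.
 *
 *	static irqreturn_t example_irq(int irq, void *data)
 *	{
 *		struct request *rq = example_next_completed(data);
 *
 *		blk_mq_complete_request(rq);
 *		return IRQ_HANDLED;
 *	}
 */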

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 */
void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		rq->io_start_time_ns = ktime_get_ns();
		rq->stats_sectors = blk_rq_sectors(rq);
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
		q->integrity.profile->prepare_fn(rq);
#endif
	if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
		WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);
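
/*
 * Illustrative sketch (not part of blk-mq): the usual ->queue_rq() pattern in
 * a driver is to start the request before handing it to hardware and to report
 * resource shortage with BLK_STS_*RESOURCE so blk-mq requeues it later.
 * example_queue_rq() and example_submit() are hypothetical.
 *
 *	static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					     const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (!example_submit(hctx->queue->queuedata, rq))
 *			return BLK_STS_DEV_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 */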

/*
 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
 * queues. This is important for md arrays to benefit from merging
 * requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	if (plug->multiple_queues)
		return BLK_MAX_REQUEST_COUNT * 2;
	return BLK_MAX_REQUEST_COUNT;
}

static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
	struct request *last = rq_list_peek(&plug->mq_list);

	if (!plug->rq_count) {
		trace_block_plug(rq->q);
	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
		   (!blk_queue_nomerges(rq->q) &&
		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
		blk_mq_flush_plug_list(plug, false);
		last = NULL;
		trace_block_plug(rq->q);
	}

	if (!plug->multiple_queues && last && last->q != rq->q)
		plug->multiple_queues = true;
	/*
	 * Any request allocated from sched tags can't be issued to
	 * ->queue_rqs() directly
	 */
	if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
		plug->has_elevator = true;
	rq->rq_next = NULL;
	rq_list_add(&plug->mq_list, rq);
	plug->rq_count++;
}

/**
 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
 * @rq:		request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared request at the back of the I/O scheduler queue
 *    for execution.  Don't wait for completion.
 *
 * Note:
 *    This function will invoke @done directly if the queue is dead.
 */
void blk_execute_rq_nowait(struct request *rq, bool at_head)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	blk_account_io_start(rq);

	/*
	 * As plugging can be enabled for passthrough requests on a zoned
	 * device, directly accessing the plug instead of using blk_mq_plug()
	 * should not have any consequences.
	 */
	if (current->plug && !at_head) {
		blk_add_rq_to_plug(current->plug, rq);
		return;
	}

	blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);

struct blk_rq_wait {
	struct completion done;
	blk_status_t ret;
};

static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
{
	struct blk_rq_wait *wait = rq->end_io_data;

	wait->ret = ret;
	complete(&wait->done);
	return RQ_END_IO_NONE;
}

bool blk_rq_is_poll(struct request *rq)
{
	if (!rq->mq_hctx)
		return false;
	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
		return false;
	return true;
}
EXPORT_SYMBOL_GPL(blk_rq_is_poll);

static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
	do {
		blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
		cond_resched();
	} while (!completion_done(wait));
}

/**
 * blk_execute_rq - insert a request into queue for execution
 * @rq:		request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared request at the back of the I/O scheduler queue
 *    for execution and wait for completion.
 * Return: The blk_status_t result provided to blk_mq_end_request().
 */
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	struct blk_rq_wait wait = {
		.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
	};

	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	rq->end_io_data = &wait;
	rq->end_io = blk_end_sync_rq;

	blk_account_io_start(rq);
	blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
	blk_mq_run_hw_queue(hctx, false);

	if (blk_rq_is_poll(rq)) {
		blk_rq_poll_completion(rq, &wait.done);
	} else {
		/*
		 * Prevent hang_check timer from firing at us during very long
		 * I/O
		 */
		unsigned long hang_check = sysctl_hung_task_timeout_secs;

		if (hang_check)
			while (!wait_for_completion_io_timeout(&wait.done,
					hang_check * (HZ/2)))
				;
		else
			wait_for_completion_io(&wait.done);
	}

	return wait.ret;
}
EXPORT_SYMBOL(blk_execute_rq);
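
/*
 * Illustrative sketch (not part of blk-mq): synchronous passthrough I/O
 * combines blk_mq_alloc_request() with blk_execute_rq() and inspects the
 * returned blk_status_t.  The opf value is just an example.
 *
 *	struct request *rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, 0);
 *	blk_status_t status;
 *
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	status = blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 *	return blk_status_to_errno(status);
 */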

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_put_driver_tag(rq);

	trace_block_rq_requeue(rq);
	rq_qos_requeue(q, rq);

	if (blk_mq_request_started(rq)) {
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		rq->rq_flags &= ~RQF_TIMED_OUT;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_add_tail(&rq->queuelist, &q->requeue_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	LIST_HEAD(flush_list);
	struct request *rq;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	list_splice_init(&q->flush_list, &flush_list);
	spin_unlock_irq(&q->requeue_lock);

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		/*
		 * If RQF_DONTPREP is set, the request has been started by the
		 * driver already and might have driver-specific data allocated
		 * already.  Insert it into the hctx dispatch list to avoid
		 * block layer merges for the request.
		 */
		if (rq->rq_flags & RQF_DONTPREP) {
			list_del_init(&rq->queuelist);
			blk_mq_request_bypass_insert(rq, 0);
		} else {
			list_del_init(&rq->queuelist);
			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
		}
	}

	while (!list_empty(&flush_list)) {
		rq = list_entry(flush_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_insert_request(rq, 0);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
	/*
	 * If we find a request that isn't idle we know the queue is busy
	 * as it's checked in the iter.
	 * Return false to stop the iteration.
	 */
	if (blk_mq_request_started(rq)) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

struct blk_expired_data {
	bool has_timedout_rq;
	unsigned long next;
	unsigned long timeout_start;
};

static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(expired->timeout_start, deadline))
		return true;

	if (expired->next == 0)
		expired->next = deadline;
	else if (time_after(expired->next, deadline))
		expired->next = deadline;
	return false;
}

void blk_mq_put_rq_ref(struct request *rq)
{
	if (is_flush_rq(rq)) {
		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
			blk_mq_free_request(rq);
	} else if (req_ref_put_and_test(rq)) {
		__blk_mq_free_request(rq);
	}
}

static bool blk_mq_check_expired(struct request *rq, void *priv)
{
	struct blk_expired_data *expired = priv;

	/*
	 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
	 * be reallocated underneath the timeout handler's processing, then
	 * the expire check is reliable. If the request is not expired, then
	 * it was completed and reallocated as a new request after returning
	 * from blk_mq_check_expired().
	 */
	if (blk_mq_req_expired(rq, expired)) {
		expired->has_timedout_rq = true;
		return false;
	}
	return true;
}

static bool blk_mq_handle_expired(struct request *rq, void *priv)
{
	struct blk_expired_data *expired = priv;

	if (blk_mq_req_expired(rq, expired))
		blk_mq_rq_timed_out(rq);
	return true;
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	struct blk_expired_data expired = {
		.timeout_start = jiffies,
	};
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	/* check if there is any timed-out request */
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
	if (expired.has_timedout_rq) {
		/*
		 * Before walking tags, we must ensure any submit started
		 * before the current time has finished. Since the submit
		 * uses srcu or rcu, wait for a synchronization point to
		 * ensure all running submits have finished
		 */
		blk_mq_wait_quiesce_done(q->tag_set);

		expired.next = 0;
		blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
	}

	if (expired.next != 0) {
		mod_timer(&q->timeout, expired.next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ? start->index_hw[hctx->type] : 0;
	struct dispatch_rq_data data = {
		.hctx = hctx,
		.rq   = NULL,
	};

	__sbitmap_for_each_set(&hctx->ctx_map, off,
			       dispatch_rq_from_ctx, &data);

	return data.rq;
}

static bool __blk_mq_alloc_driver_tag(struct request *rq)
{
	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
	int tag;

	blk_mq_tag_busy(rq->mq_hctx);

	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
		bt = &rq->mq_hctx->tags->breserved_tags;
		tag_offset = 0;
	} else {
		if (!hctx_may_queue(rq->mq_hctx, bt))
			return false;
	}

	tag = __sbitmap_queue_get(bt);
	if (tag == BLK_MQ_NO_TAG)
		return false;

	rq->tag = tag + tag_offset;
	return true;
}

bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
		return false;

	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
		rq->rq_flags |= RQF_MQ_INFLIGHT;
		__blk_mq_inc_active_requests(hctx);
	}
	hctx->tags->rqs[rq->tag] = rq;
	return true;
}

static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				int flags, void *key)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		struct sbitmap_queue *sbq;

		list_del_init(&wait->entry);
		sbq = &hctx->tags->bitmap_tags;
		atomic_dec(&sbq->ws_active);
	}
	spin_unlock(&hctx->dispatch_wait_lock);

	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
				 struct request *rq)
{
	struct sbitmap_queue *sbq;
	struct wait_queue_head *wq;
	wait_queue_entry_t *wait;
	bool ret;

	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
	    !(blk_mq_is_shared_tags(hctx->flags))) {
		blk_mq_sched_mark_restart_hctx(hctx);

		/*
		 * It's possible that a tag was freed in the window between the
		 * allocation failure and adding the hardware queue to the wait
		 * queue.
		 *
		 * Don't clear RESTART here, someone else could have set it.
		 * At most this will cost an extra queue run.
		 */
		return blk_mq_get_driver_tag(rq);
	}

	wait = &hctx->dispatch_wait;
	if (!list_empty_careful(&wait->entry))
		return false;

	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
		sbq = &hctx->tags->breserved_tags;
	else
		sbq = &hctx->tags->bitmap_tags;
	wq = &bt_wait_ptr(sbq, hctx)->wait;

	spin_lock_irq(&wq->lock);
	spin_lock(&hctx->dispatch_wait_lock);
	if (!list_empty(&wait->entry)) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	atomic_inc(&sbq->ws_active);
	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	__add_wait_queue(wq, wait);

	/*
	 * It's possible that a tag was freed in the window between the
	 * allocation failure and adding the hardware queue to the wait
	 * queue.
	 */
	ret = blk_mq_get_driver_tag(rq);
	if (!ret) {
		spin_unlock(&hctx->dispatch_wait_lock);
		spin_unlock_irq(&wq->lock);
		return false;
	}

	/*
	 * We got a tag, remove ourselves from the wait queue to ensure
	 * someone else gets the wakeup.
	 */
	list_del_init(&wait->entry);
	atomic_dec(&sbq->ws_active);
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
 * - EWMA is one simple way to compute running average value
 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
 * - take 4 as factor for avoiding to get too small(0) result, and this
 *   factor doesn't matter because EWMA decreases exponentially
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
	unsigned int ewma;

	ewma = hctx->dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

	hctx->dispatch_busy = ewma;
}
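
/*
 * Worked example of the update above (illustrative only): with weight 8 and
 * factor 4, one busy sample moves ewma from 0 to (0*7 + 16)/8 = 2, a second
 * busy sample to (2*7 + 16)/8 = 3, while idle samples decay it as
 * ewma = ewma*7/8, e.g. 3 -> 2 -> 1 -> 0.
 */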
1911 #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
1913 static void blk_mq_handle_dev_resource(struct request
*rq
,
1914 struct list_head
*list
)
1916 list_add(&rq
->queuelist
, list
);
1917 __blk_mq_requeue_request(rq
);
1920 static void blk_mq_handle_zone_resource(struct request
*rq
,
1921 struct list_head
*zone_list
)
1924 * If we end up here it is because we cannot dispatch a request to a
1925 * specific zone due to LLD level zone-write locking or other zone
1926 * related resource not being available. In this case, set the request
1927 * aside in zone_list for retrying it later.
1929 list_add(&rq
->queuelist
, zone_list
);
1930 __blk_mq_requeue_request(rq
);
1933 enum prep_dispatch
{
1935 PREP_DISPATCH_NO_TAG
,
1936 PREP_DISPATCH_NO_BUDGET
,
1939 static enum prep_dispatch
blk_mq_prep_dispatch_rq(struct request
*rq
,
1942 struct blk_mq_hw_ctx
*hctx
= rq
->mq_hctx
;
1943 int budget_token
= -1;
1946 budget_token
= blk_mq_get_dispatch_budget(rq
->q
);
1947 if (budget_token
< 0) {
1948 blk_mq_put_driver_tag(rq
);
1949 return PREP_DISPATCH_NO_BUDGET
;
1951 blk_mq_set_rq_budget_token(rq
, budget_token
);
1954 if (!blk_mq_get_driver_tag(rq
)) {
1956 * The initial allocation attempt failed, so we need to
1957 * rerun the hardware queue when a tag is freed. The
1958 * waitqueue takes care of that. If the queue is run
1959 * before we add this entry back on the dispatch list,
1960 * we'll re-run it below.
1962 if (!blk_mq_mark_tag_wait(hctx
, rq
)) {
1964 * All budgets not got from this function will be put
1965 * together during handling partial dispatch
1968 blk_mq_put_dispatch_budget(rq
->q
, budget_token
);
1969 return PREP_DISPATCH_NO_TAG
;
1973 return PREP_DISPATCH_OK
;
1976 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
1977 static void blk_mq_release_budgets(struct request_queue
*q
,
1978 struct list_head
*list
)
1982 list_for_each_entry(rq
, list
, queuelist
) {
1983 int budget_token
= blk_mq_get_rq_budget_token(rq
);
1985 if (budget_token
>= 0)
1986 blk_mq_put_dispatch_budget(q
, budget_token
);
1991 * blk_mq_commit_rqs will notify driver using bd->last that there is no
1992 * more requests. (See comment in struct blk_mq_ops for commit_rqs for
1994 * Attention, we should explicitly call this in unusual cases:
1995 * 1) did not queue everything initially scheduled to queue
1996 * 2) the last attempt to queue a request failed
1998 static void blk_mq_commit_rqs(struct blk_mq_hw_ctx
*hctx
, int queued
,
2001 if (hctx
->queue
->mq_ops
->commit_rqs
&& queued
) {
2002 trace_block_unplug(hctx
->queue
, queued
, !from_schedule
);
2003 hctx
->queue
->mq_ops
->commit_rqs(hctx
);
2008 * Returns true if we did some work AND can potentially do more.
2010 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx
*hctx
, struct list_head
*list
,
2011 unsigned int nr_budgets
)
2013 enum prep_dispatch prep
;
2014 struct request_queue
*q
= hctx
->queue
;
2017 blk_status_t ret
= BLK_STS_OK
;
2018 LIST_HEAD(zone_list
);
2019 bool needs_resource
= false;
2021 if (list_empty(list
))
2025 * Now process all the entries, sending them to the driver.
2029 struct blk_mq_queue_data bd
;
2031 rq
= list_first_entry(list
, struct request
, queuelist
);
2033 WARN_ON_ONCE(hctx
!= rq
->mq_hctx
);
2034 prep
= blk_mq_prep_dispatch_rq(rq
, !nr_budgets
);
2035 if (prep
!= PREP_DISPATCH_OK
)
2038 list_del_init(&rq
->queuelist
);
2041 bd
.last
= list_empty(list
);
2044 * once the request is queued to lld, no need to cover the
2049 ret
= q
->mq_ops
->queue_rq(hctx
, &bd
);
2054 case BLK_STS_RESOURCE
:
2055 needs_resource
= true;
2057 case BLK_STS_DEV_RESOURCE
:
2058 blk_mq_handle_dev_resource(rq
, list
);
2060 case BLK_STS_ZONE_RESOURCE
:
2062 * Move the request to zone_list and keep going through
2063 * the dispatch list to find more requests the drive can
2066 blk_mq_handle_zone_resource(rq
, &zone_list
);
2067 needs_resource
= true;
2070 blk_mq_end_request(rq
, ret
);
2072 } while (!list_empty(list
));
2074 if (!list_empty(&zone_list
))
2075 list_splice_tail_init(&zone_list
, list
);
2077 /* If we didn't flush the entire list, we could have told the driver
2078 * there was more coming, but that turned out to be a lie.
2080 if (!list_empty(list
) || ret
!= BLK_STS_OK
)
2081 blk_mq_commit_rqs(hctx
, queued
, false);
2084 * Any items that need requeuing? Stuff them into hctx->dispatch,
2085 * that is where we will continue on next queue run.
2087 if (!list_empty(list
)) {
2089 /* For non-shared tags, the RESTART check will suffice */
2090 bool no_tag
= prep
== PREP_DISPATCH_NO_TAG
&&
2091 ((hctx
->flags
& BLK_MQ_F_TAG_QUEUE_SHARED
) ||
2092 blk_mq_is_shared_tags(hctx
->flags
));
2095 blk_mq_release_budgets(q
, list
);
2097 spin_lock(&hctx
->lock
);
2098 list_splice_tail_init(list
, &hctx
->dispatch
);
2099 spin_unlock(&hctx
->lock
);
2102 * Order adding requests to hctx->dispatch and checking
2103 * SCHED_RESTART flag. The pair of this smp_mb() is the one
2104 * in blk_mq_sched_restart(). Avoid restart code path to
2105 * miss the new added requests to hctx->dispatch, meantime
2106 * SCHED_RESTART is observed here.
2111 * If SCHED_RESTART was set by the caller of this function and
2112 * it is no longer set that means that it was cleared by another
2113 * thread and hence that a queue rerun is needed.
2115 * If 'no_tag' is set, that means that we failed getting
2116 * a driver tag with an I/O scheduler attached. If our dispatch
2117 * waitqueue is no longer active, ensure that we run the queue
2118 * AFTER adding our entries back to the list.
2120 * If no I/O scheduler has been configured it is possible that
2121 * the hardware queue got stopped and restarted before requests
2122 * were pushed back onto the dispatch list. Rerun the queue to
2123 * avoid starvation. Notes:
2124 * - blk_mq_run_hw_queue() checks whether or not a queue has
2125 * been stopped before rerunning a queue.
2126 * - Some but not all block drivers stop a queue before
2127 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
2130 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
2131 * bit is set, run queue after a delay to avoid IO stalls
2132 * that could otherwise occur if the queue is idle. We'll do
2133 * similar if we couldn't get budget or couldn't lock a zone
2134 * and SCHED_RESTART is set.
2136 needs_restart
= blk_mq_sched_needs_restart(hctx
);
2137 if (prep
== PREP_DISPATCH_NO_BUDGET
)
2138 needs_resource
= true;
2139 if (!needs_restart
||
2140 (no_tag
&& list_empty_careful(&hctx
->dispatch_wait
.entry
)))
2141 blk_mq_run_hw_queue(hctx
, true);
2142 else if (needs_resource
)
2143 blk_mq_delay_run_hw_queue(hctx
, BLK_MQ_RESOURCE_DELAY
);
2145 blk_mq_update_dispatch_busy(hctx
, true);
2149 blk_mq_update_dispatch_busy(hctx
, false);
2153 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx
*hctx
)
2155 int cpu
= cpumask_first_and(hctx
->cpumask
, cpu_online_mask
);
2157 if (cpu
>= nr_cpu_ids
)
2158 cpu
= cpumask_first(hctx
->cpumask
);
2163 * It'd be great if the workqueue API had a way to pass
2164 * in a mask and had some smarts for more clever placement.
2165 * For now we just round-robin here, switching for every
2166 * BLK_MQ_CPU_WORK_BATCH queued items.
2168 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx
*hctx
)
2171 int next_cpu
= hctx
->next_cpu
;
2173 if (hctx
->queue
->nr_hw_queues
== 1)
2174 return WORK_CPU_UNBOUND
;
2176 if (--hctx
->next_cpu_batch
<= 0) {
2178 next_cpu
= cpumask_next_and(next_cpu
, hctx
->cpumask
,
2180 if (next_cpu
>= nr_cpu_ids
)
2181 next_cpu
= blk_mq_first_mapped_cpu(hctx
);
2182 hctx
->next_cpu_batch
= BLK_MQ_CPU_WORK_BATCH
;
2186 * Do unbound schedule if we can't find a online CPU for this hctx,
2187 * and it should only happen in the path of handling CPU DEAD.
2189 if (!cpu_online(next_cpu
)) {
2196 * Make sure to re-select CPU next time once after CPUs
2197 * in hctx->cpumask become online again.
2199 hctx
->next_cpu
= next_cpu
;
2200 hctx
->next_cpu_batch
= 1;
2201 return WORK_CPU_UNBOUND
;
2204 hctx
->next_cpu
= next_cpu
;
2209 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
2210 * @hctx: Pointer to the hardware queue to run.
2211 * @msecs: Milliseconds of delay to wait before running the queue.
2213 * Run a hardware queue asynchronously with a delay of @msecs.
2215 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx
*hctx
, unsigned long msecs
)
2217 if (unlikely(blk_mq_hctx_stopped(hctx
)))
2219 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx
), &hctx
->run_work
,
2220 msecs_to_jiffies(msecs
));
2222 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue
);
/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Check if the request queue is not in a quiesced state and if there are
 * pending requests to be sent. If this is true, run the queue to send requests
 * to hardware.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	bool need_run;

	/*
	 * We can't run the queue inline with interrupts disabled.
	 */
	WARN_ON_ONCE(!async && in_interrupt());

	might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);

	/*
	 * When the queue is quiesced, we may be switching io scheduler, or
	 * updating nr_hw_queues, or other things, and we can't run the queue
	 * any more; even __blk_mq_hctx_has_pending() can't be called safely.
	 *
	 * And the queue will be rerun in blk_mq_unquiesce_queue() if it is
	 * quiesced.
	 */
	__blk_mq_run_dispatch_ops(hctx->queue, false,
		need_run = !blk_queue_quiesced(hctx->queue) &&
		blk_mq_hctx_has_pending(hctx));

	if (!need_run)
		return;

	if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
		blk_mq_delay_run_hw_queue(hctx, 0);
		return;
	}

	blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_sched_dispatch_requests(hctx));
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
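
/*
 * Illustrative example (hypothetical "mydrv" names, not part of the blk-mq
 * API): drivers usually call the run helpers from a context where new work
 * may have become dispatchable, e.g. a completion handler that just freed
 * device resources:
 *
 *	static void mydrv_complete(struct mydrv *drv, struct request *rq)
 *	{
 *		blk_mq_end_request(rq, BLK_STS_OK);
 *		blk_mq_run_hw_queues(drv->queue, true);
 *	}
 *
 * Passing async == true defers the actual dispatch to kblockd, which is the
 * safe choice from atomic or interrupt context, matching the WARN_ON_ONCE()
 * above.
 */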
/*
 * Return the preferred queue to dispatch from (if any) for non-mq aware IO
 * schedulers.
 */
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	/*
	 * If the IO scheduler does not respect hardware queues when
	 * dispatching, we just don't bother with multiple HW queues and
	 * dispatch from hctx for the current CPU since running multiple queues
	 * just causes lock contention inside the scheduler and pointless cache
	 * bouncing.
	 */
	struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];

	if (!blk_mq_hctx_stopped(hctx))
		return hctx;
	return NULL;
}
/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queues asynchronously.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	unsigned long i;

	sq_hctx = NULL;
	if (blk_queue_sq_sched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by the IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	unsigned long i;

	sq_hctx = NULL;
	if (blk_queue_sq_sched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * If there is already a run_work pending, leave the
		 * pending delay untouched. Otherwise, a hctx can stall
		 * if another hctx is re-delaying the other's work
		 * before the work executes.
		 */
		if (delayed_work_pending(&hctx->run_work))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by the IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_delay_run_hw_queue(hctx, msecs);
	}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
/*
 * This function is often used by a driver for pausing .queue_rq() when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
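
/*
 * Illustrative example (hypothetical "mydrv" names): pairing
 * blk_mq_stop_hw_queue() with a later restart when ->queue_rq() runs out of
 * device resources:
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct mydrv *drv = hctx->queue->queuedata;
 *
 *		if (!mydrv_has_room(drv)) {
 *			blk_mq_stop_hw_queue(hctx);
 *			return BLK_STS_RESOURCE;
 *		}
 *		...
 *	}
 *
 * Once room frees up (typically from the completion handler) the driver calls
 * blk_mq_start_stopped_hw_queues(drv->queue, true) so dispatch can resume.
 */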
/*
 * This function is often used by a driver for pausing .queue_rq() when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (!blk_mq_hctx_stopped(hctx))
		return;

	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_stopped_hw_queue(hctx, async ||
					(hctx->flags & BLK_MQ_F_BLOCKING));
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
static void blk_mq_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx =
		container_of(work, struct blk_mq_hw_ctx, run_work.work);

	blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_sched_dispatch_requests(hctx));
}
/**
 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
 * @rq: Pointer to request to be inserted.
 * @flags: BLK_MQ_INSERT_*
 *
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	spin_lock(&hctx->lock);
	if (flags & BLK_MQ_INSERT_AT_HEAD)
		list_add(&rq->queuelist, &hctx->dispatch);
	else
		list_add_tail(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
}
static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
		struct blk_mq_ctx *ctx, struct list_head *list,
		bool run_queue_async)
{
	struct request *rq;
	enum hctx_type type = hctx->type;

	/*
	 * Try to issue requests directly if the hw queue isn't busy to save an
	 * extra enqueue & dequeue to the sw queue.
	 */
	if (!hctx->dispatch_busy && !run_queue_async) {
		blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_try_issue_list_directly(hctx, list));
		if (list_empty(list))
			goto out;
	}

	/*
	 * preemption doesn't flush the plug list, so it's possible ctx->cpu is
	 * offline now
	 */
	list_for_each_entry(rq, list, queuelist) {
		BUG_ON(rq->mq_ctx != ctx);
		trace_block_rq_insert(rq);
		if (rq->cmd_flags & REQ_NOWAIT)
			run_queue_async = true;
	}

	spin_lock(&ctx->lock);
	list_splice_tail_init(list, &ctx->rq_lists[type]);
	blk_mq_hctx_mark_pending(hctx, ctx);
	spin_unlock(&ctx->lock);
out:
	blk_mq_run_hw_queue(hctx, run_queue_async);
}
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (blk_rq_is_passthrough(rq)) {
		/*
		 * Passthrough requests have to be added to hctx->dispatch
		 * directly. The device may be in a situation where it can't
		 * handle FS requests and always returns BLK_STS_RESOURCE for
		 * them, which gets them added to hctx->dispatch.
		 *
		 * If a passthrough request is required to unblock the queues
		 * and it is added to the scheduler queue, there is no chance to
		 * dispatch it given we prioritize requests in hctx->dispatch.
		 */
		blk_mq_request_bypass_insert(rq, flags);
	} else if (req_op(rq) == REQ_OP_FLUSH) {
		/*
		 * Firstly, a normal IO request is inserted into the scheduler
		 * queue or sw queue, while a flush request is added to the
		 * dispatch queue (hctx->dispatch) directly. There is at most
		 * one in-flight flush request per hw queue, so it doesn't
		 * matter whether the flush request goes to the tail or the
		 * front of the dispatch queue.
		 *
		 * Secondly, in case of NCQ, a flush request is a non-NCQ
		 * command, and queueing it will fail while any normal IO
		 * request (NCQ command) is in flight. Adding the flush rq to
		 * the front of hctx->dispatch adds less extra latency to the
		 * flush rq (because of S_SCHED_RESTART) than adding it to the
		 * tail; the chance of flush merging increases and fewer flush
		 * requests are issued to the controller. It is observed that
		 * ~10% of the time is saved in blktests block/004 on a disk
		 * attached to an AHCI/NCQ drive when adding the flush rq to
		 * the front of hctx->dispatch.
		 *
		 * Simply queue the flush rq to the front of hctx->dispatch so
		 * that intensive flush workloads can benefit in case of NCQ HW.
		 */
		blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
	} else if (q->elevator) {
		LIST_HEAD(list);

		WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);

		list_add(&rq->queuelist, &list);
		q->elevator->type->ops.insert_requests(hctx, &list, flags);
	} else {
		trace_block_rq_insert(rq);

		spin_lock(&ctx->lock);
		if (flags & BLK_MQ_INSERT_AT_HEAD)
			list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
		else
			list_add_tail(&rq->queuelist,
				      &ctx->rq_lists[hctx->type]);
		blk_mq_hctx_mark_pending(hctx, ctx);
		spin_unlock(&ctx->lock);
	}
}
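
/*
 * Summary of the insertion targets chosen above (derived from the code, for
 * quick reference):
 *
 *	passthrough rq		-> hctx->dispatch (tail, or head if requested)
 *	REQ_OP_FLUSH		-> head of hctx->dispatch
 *	queue has an elevator	-> elevator ->insert_requests()
 *	otherwise		-> ctx->rq_lists[hctx->type] (sw queue)
 */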
2557 static void blk_mq_bio_to_request(struct request
*rq
, struct bio
*bio
,
2558 unsigned int nr_segs
)
2562 if (bio
->bi_opf
& REQ_RAHEAD
)
2563 rq
->cmd_flags
|= REQ_FAILFAST_MASK
;
2565 rq
->__sector
= bio
->bi_iter
.bi_sector
;
2566 blk_rq_bio_prep(rq
, bio
, nr_segs
);
2568 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2569 err
= blk_crypto_rq_bio_prep(rq
, bio
, GFP_NOIO
);
2572 blk_account_io_start(rq
);
2575 static blk_status_t
__blk_mq_issue_directly(struct blk_mq_hw_ctx
*hctx
,
2576 struct request
*rq
, bool last
)
2578 struct request_queue
*q
= rq
->q
;
2579 struct blk_mq_queue_data bd
= {
2586 * For OK queue, we are done. For error, caller may kill it.
2587 * Any other error (busy), just add it to our list as we
2588 * previously would have done.
2590 ret
= q
->mq_ops
->queue_rq(hctx
, &bd
);
2593 blk_mq_update_dispatch_busy(hctx
, false);
2595 case BLK_STS_RESOURCE
:
2596 case BLK_STS_DEV_RESOURCE
:
2597 blk_mq_update_dispatch_busy(hctx
, true);
2598 __blk_mq_requeue_request(rq
);
2601 blk_mq_update_dispatch_busy(hctx
, false);
static bool blk_mq_get_budget_and_tag(struct request *rq)
{
	int budget_token;

	budget_token = blk_mq_get_dispatch_budget(rq->q);
	if (budget_token < 0)
		return false;
	blk_mq_set_rq_budget_token(rq, budget_token);
	if (!blk_mq_get_driver_tag(rq)) {
		blk_mq_put_dispatch_budget(rq->q, budget_token);
		return false;
	}
	return true;
}
2624 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2625 * @hctx: Pointer of the associated hardware queue.
2626 * @rq: Pointer to request to be sent.
2628 * If the device has enough resources to accept a new request now, send the
2629 * request directly to device driver. Else, insert at hctx->dispatch queue, so
2630 * we can try send it another time in the future. Requests inserted at this
2631 * queue have higher priority.
2633 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx
*hctx
,
2638 if (blk_mq_hctx_stopped(hctx
) || blk_queue_quiesced(rq
->q
)) {
2639 blk_mq_insert_request(rq
, 0);
2643 if ((rq
->rq_flags
& RQF_USE_SCHED
) || !blk_mq_get_budget_and_tag(rq
)) {
2644 blk_mq_insert_request(rq
, 0);
2645 blk_mq_run_hw_queue(hctx
, rq
->cmd_flags
& REQ_NOWAIT
);
2649 ret
= __blk_mq_issue_directly(hctx
, rq
, true);
2653 case BLK_STS_RESOURCE
:
2654 case BLK_STS_DEV_RESOURCE
:
2655 blk_mq_request_bypass_insert(rq
, 0);
2656 blk_mq_run_hw_queue(hctx
, false);
2659 blk_mq_end_request(rq
, ret
);
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
		blk_mq_insert_request(rq, 0);
		return BLK_STS_OK;
	}

	if (!blk_mq_get_budget_and_tag(rq))
		return BLK_STS_RESOURCE;
	return __blk_mq_issue_directly(hctx, rq, last);
}
2678 static void blk_mq_plug_issue_direct(struct blk_plug
*plug
)
2680 struct blk_mq_hw_ctx
*hctx
= NULL
;
2683 blk_status_t ret
= BLK_STS_OK
;
2685 while ((rq
= rq_list_pop(&plug
->mq_list
))) {
2686 bool last
= rq_list_empty(plug
->mq_list
);
2688 if (hctx
!= rq
->mq_hctx
) {
2690 blk_mq_commit_rqs(hctx
, queued
, false);
2696 ret
= blk_mq_request_issue_directly(rq
, last
);
2701 case BLK_STS_RESOURCE
:
2702 case BLK_STS_DEV_RESOURCE
:
2703 blk_mq_request_bypass_insert(rq
, 0);
2704 blk_mq_run_hw_queue(hctx
, false);
2707 blk_mq_end_request(rq
, ret
);
2713 if (ret
!= BLK_STS_OK
)
2714 blk_mq_commit_rqs(hctx
, queued
, false);
static void __blk_mq_flush_plug_list(struct request_queue *q,
				     struct blk_plug *plug)
{
	if (blk_queue_quiesced(q))
		return;
	q->mq_ops->queue_rqs(&plug->mq_list);
}
2725 static void blk_mq_dispatch_plug_list(struct blk_plug
*plug
, bool from_sched
)
2727 struct blk_mq_hw_ctx
*this_hctx
= NULL
;
2728 struct blk_mq_ctx
*this_ctx
= NULL
;
2729 struct request
*requeue_list
= NULL
;
2730 struct request
**requeue_lastp
= &requeue_list
;
2731 unsigned int depth
= 0;
2732 bool is_passthrough
= false;
2736 struct request
*rq
= rq_list_pop(&plug
->mq_list
);
2739 this_hctx
= rq
->mq_hctx
;
2740 this_ctx
= rq
->mq_ctx
;
2741 is_passthrough
= blk_rq_is_passthrough(rq
);
2742 } else if (this_hctx
!= rq
->mq_hctx
|| this_ctx
!= rq
->mq_ctx
||
2743 is_passthrough
!= blk_rq_is_passthrough(rq
)) {
2744 rq_list_add_tail(&requeue_lastp
, rq
);
2747 list_add(&rq
->queuelist
, &list
);
2749 } while (!rq_list_empty(plug
->mq_list
));
2751 plug
->mq_list
= requeue_list
;
2752 trace_block_unplug(this_hctx
->queue
, depth
, !from_sched
);
2754 percpu_ref_get(&this_hctx
->queue
->q_usage_counter
);
2755 /* passthrough requests should never be issued to the I/O scheduler */
2756 if (is_passthrough
) {
2757 spin_lock(&this_hctx
->lock
);
2758 list_splice_tail_init(&list
, &this_hctx
->dispatch
);
2759 spin_unlock(&this_hctx
->lock
);
2760 blk_mq_run_hw_queue(this_hctx
, from_sched
);
2761 } else if (this_hctx
->queue
->elevator
) {
2762 this_hctx
->queue
->elevator
->type
->ops
.insert_requests(this_hctx
,
2764 blk_mq_run_hw_queue(this_hctx
, from_sched
);
2766 blk_mq_insert_requests(this_hctx
, this_ctx
, &list
, from_sched
);
2768 percpu_ref_put(&this_hctx
->queue
->q_usage_counter
);
2771 void blk_mq_flush_plug_list(struct blk_plug
*plug
, bool from_schedule
)
2776 * We may have been called recursively midway through handling
2777 * plug->mq_list via a schedule() in the driver's queue_rq() callback.
2778 * To avoid mq_list changing under our feet, clear rq_count early and
2779 * bail out specifically if rq_count is 0 rather than checking
2780 * whether the mq_list is empty.
2782 if (plug
->rq_count
== 0)
2786 if (!plug
->multiple_queues
&& !plug
->has_elevator
&& !from_schedule
) {
2787 struct request_queue
*q
;
2789 rq
= rq_list_peek(&plug
->mq_list
);
2793 * Peek first request and see if we have a ->queue_rqs() hook.
2794 * If we do, we can dispatch the whole plug list in one go. We
2795 * already know at this point that all requests belong to the
2796 * same queue, caller must ensure that's the case.
2798 * Since we pass off the full list to the driver at this point,
2799 * we do not increment the active request count for the queue.
2800 * Bypass shared tags for now because of that.
2802 if (q
->mq_ops
->queue_rqs
&&
2803 !(rq
->mq_hctx
->flags
& BLK_MQ_F_TAG_QUEUE_SHARED
)) {
2804 blk_mq_run_dispatch_ops(q
,
2805 __blk_mq_flush_plug_list(q
, plug
));
2806 if (rq_list_empty(plug
->mq_list
))
2810 blk_mq_run_dispatch_ops(q
,
2811 blk_mq_plug_issue_direct(plug
));
2812 if (rq_list_empty(plug
->mq_list
))
2817 blk_mq_dispatch_plug_list(plug
, from_schedule
);
2818 } while (!rq_list_empty(plug
->mq_list
));
2821 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx
*hctx
,
2822 struct list_head
*list
)
2825 blk_status_t ret
= BLK_STS_OK
;
2827 while (!list_empty(list
)) {
2828 struct request
*rq
= list_first_entry(list
, struct request
,
2831 list_del_init(&rq
->queuelist
);
2832 ret
= blk_mq_request_issue_directly(rq
, list_empty(list
));
2837 case BLK_STS_RESOURCE
:
2838 case BLK_STS_DEV_RESOURCE
:
2839 blk_mq_request_bypass_insert(rq
, 0);
2840 if (list_empty(list
))
2841 blk_mq_run_hw_queue(hctx
, false);
2844 blk_mq_end_request(rq
, ret
);
2850 if (ret
!= BLK_STS_OK
)
2851 blk_mq_commit_rqs(hctx
, queued
, false);
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
				     struct bio *bio, unsigned int nr_segs)
{
	if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
		if (blk_attempt_plug_merge(q, bio, nr_segs))
			return true;
		if (blk_mq_sched_bio_merge(q, bio, nr_segs))
			return true;
	}

	return false;
}
2866 static struct request
*blk_mq_get_new_requests(struct request_queue
*q
,
2867 struct blk_plug
*plug
,
2871 struct blk_mq_alloc_data data
= {
2874 .cmd_flags
= bio
->bi_opf
,
2878 if (unlikely(bio_queue_enter(bio
)))
2881 if (blk_mq_attempt_bio_merge(q
, bio
, nsegs
))
2884 rq_qos_throttle(q
, bio
);
2887 data
.nr_tags
= plug
->nr_ios
;
2889 data
.cached_rq
= &plug
->cached_rq
;
2892 rq
= __blk_mq_alloc_requests(&data
);
2895 rq_qos_cleanup(q
, bio
);
2896 if (bio
->bi_opf
& REQ_NOWAIT
)
2897 bio_wouldblock_error(bio
);
2903 static inline struct request
*blk_mq_get_cached_request(struct request_queue
*q
,
2904 struct blk_plug
*plug
, struct bio
**bio
, unsigned int nsegs
)
2907 enum hctx_type type
, hctx_type
;
2911 rq
= rq_list_peek(&plug
->cached_rq
);
2912 if (!rq
|| rq
->q
!= q
)
2915 if (blk_mq_attempt_bio_merge(q
, *bio
, nsegs
)) {
2920 type
= blk_mq_get_hctx_type((*bio
)->bi_opf
);
2921 hctx_type
= rq
->mq_hctx
->type
;
2922 if (type
!= hctx_type
&&
2923 !(type
== HCTX_TYPE_READ
&& hctx_type
== HCTX_TYPE_DEFAULT
))
2925 if (op_is_flush(rq
->cmd_flags
) != op_is_flush((*bio
)->bi_opf
))
2929 * If any qos ->throttle() end up blocking, we will have flushed the
2930 * plug and hence killed the cached_rq list as well. Pop this entry
2931 * before we throttle.
2933 plug
->cached_rq
= rq_list_next(rq
);
2934 rq_qos_throttle(q
, *bio
);
2936 blk_mq_rq_time_init(rq
, 0);
2937 rq
->cmd_flags
= (*bio
)->bi_opf
;
2938 INIT_LIST_HEAD(&rq
->queuelist
);
static void bio_set_ioprio(struct bio *bio)
{
	/* Nobody set the ioprio so far? Initialize it based on the task's nice value */
	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
		bio->bi_ioprio = get_current_ioprio();
	blkcg_set_ioprio(bio);
}
2951 * blk_mq_submit_bio - Create and send a request to block device.
2952 * @bio: Bio pointer.
2954 * Builds up a request structure from @q and @bio and send to the device. The
2955 * request may not be queued directly to hardware if:
2956 * * This request can be merged with another one
2957 * * We want to place request at plug queue for possible future merging
2958 * * There is an IO scheduler active at this queue
2960 * It will not queue the request if there is an error with the bio, or at the
2963 void blk_mq_submit_bio(struct bio
*bio
)
2965 struct request_queue
*q
= bdev_get_queue(bio
->bi_bdev
);
2966 struct blk_plug
*plug
= blk_mq_plug(bio
);
2967 const int is_sync
= op_is_sync(bio
->bi_opf
);
2968 struct blk_mq_hw_ctx
*hctx
;
2970 unsigned int nr_segs
= 1;
2973 bio
= blk_queue_bounce(bio
, q
);
2974 if (bio_may_exceed_limits(bio
, &q
->limits
)) {
2975 bio
= __bio_split_to_limits(bio
, &q
->limits
, &nr_segs
);
2980 if (!bio_integrity_prep(bio
))
2983 bio_set_ioprio(bio
);
2985 rq
= blk_mq_get_cached_request(q
, plug
, &bio
, nr_segs
);
2989 rq
= blk_mq_get_new_requests(q
, plug
, bio
, nr_segs
);
2994 trace_block_getrq(bio
);
2996 rq_qos_track(q
, rq
, bio
);
2998 blk_mq_bio_to_request(rq
, bio
, nr_segs
);
3000 ret
= blk_crypto_rq_get_keyslot(rq
);
3001 if (ret
!= BLK_STS_OK
) {
3002 bio
->bi_status
= ret
;
3004 blk_mq_free_request(rq
);
3008 if (op_is_flush(bio
->bi_opf
) && blk_insert_flush(rq
))
3012 blk_add_rq_to_plug(plug
, rq
);
3017 if ((rq
->rq_flags
& RQF_USE_SCHED
) ||
3018 (hctx
->dispatch_busy
&& (q
->nr_hw_queues
== 1 || !is_sync
))) {
3019 blk_mq_insert_request(rq
, 0);
3020 blk_mq_run_hw_queue(hctx
, true);
3022 blk_mq_run_dispatch_ops(q
, blk_mq_try_issue_directly(hctx
, rq
));
3026 #ifdef CONFIG_BLK_MQ_STACKING
3028 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
3029 * @rq: the request being queued
3031 blk_status_t
blk_insert_cloned_request(struct request
*rq
)
3033 struct request_queue
*q
= rq
->q
;
3034 unsigned int max_sectors
= blk_queue_get_max_sectors(q
, req_op(rq
));
3035 unsigned int max_segments
= blk_rq_get_max_segments(rq
);
3038 if (blk_rq_sectors(rq
) > max_sectors
) {
3040 * SCSI device does not have a good way to return if
3041 * Write Same/Zero is actually supported. If a device rejects
3042 * a non-read/write command (discard, write same,etc.) the
3043 * low-level device driver will set the relevant queue limit to
3044 * 0 to prevent blk-lib from issuing more of the offending
3045 * operations. Commands queued prior to the queue limit being
3046 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
3047 * errors being propagated to upper layers.
3049 if (max_sectors
== 0)
3050 return BLK_STS_NOTSUPP
;
3052 printk(KERN_ERR
"%s: over max size limit. (%u > %u)\n",
3053 __func__
, blk_rq_sectors(rq
), max_sectors
);
3054 return BLK_STS_IOERR
;
3058 * The queue settings related to segment counting may differ from the
3061 rq
->nr_phys_segments
= blk_recalc_rq_segments(rq
);
3062 if (rq
->nr_phys_segments
> max_segments
) {
3063 printk(KERN_ERR
"%s: over max segments limit. (%u > %u)\n",
3064 __func__
, rq
->nr_phys_segments
, max_segments
);
3065 return BLK_STS_IOERR
;
3068 if (q
->disk
&& should_fail_request(q
->disk
->part0
, blk_rq_bytes(rq
)))
3069 return BLK_STS_IOERR
;
3071 ret
= blk_crypto_rq_get_keyslot(rq
);
3072 if (ret
!= BLK_STS_OK
)
3075 blk_account_io_start(rq
);
3078 * Since we have a scheduler attached on the top device,
3079 * bypass a potential scheduler on the bottom device for
3082 blk_mq_run_dispatch_ops(q
,
3083 ret
= blk_mq_request_issue_directly(rq
, true));
3085 blk_account_io_done(rq
, ktime_get_ns());
3088 EXPORT_SYMBOL_GPL(blk_insert_cloned_request
);
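
/*
 * Note: blk_insert_cloned_request() is the submission path used by
 * request-based stacking drivers (e.g. dm-rq / request-based multipath). It
 * re-checks the sector and segment limits of the lower queue, because they
 * may differ from those of the top-level queue the bios were prepared for,
 * and then issues the clone directly so that any scheduler on the bottom
 * device is bypassed, as the comment above explains.
 */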
/**
 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
 * @rq: the clone request to be cleaned up
 *
 * Description:
 *     Free all bios in @rq for a cloned request.
 */
void blk_rq_unprep_clone(struct request *rq)
{
	struct bio *bio;

	while ((bio = rq->bio) != NULL) {
		rq->bio = bio->bi_next;

		bio_put(bio);
	}
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3110 * blk_rq_prep_clone - Helper function to setup clone request
3111 * @rq: the request to be setup
3112 * @rq_src: original request to be cloned
3113 * @bs: bio_set that bios for clone are allocated from
3114 * @gfp_mask: memory allocation mask for bio
3115 * @bio_ctr: setup function to be called for each clone bio.
3116 * Returns %0 for success, non %0 for failure.
3117 * @data: private data to be passed to @bio_ctr
3120 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
3121 * Also, pages which the original bios are pointing to are not copied
3122 * and the cloned bios just point same pages.
3123 * So cloned bios must be completed before original bios, which means
3124 * the caller must complete @rq before @rq_src.
3126 int blk_rq_prep_clone(struct request
*rq
, struct request
*rq_src
,
3127 struct bio_set
*bs
, gfp_t gfp_mask
,
3128 int (*bio_ctr
)(struct bio
*, struct bio
*, void *),
3131 struct bio
*bio
, *bio_src
;
3136 __rq_for_each_bio(bio_src
, rq_src
) {
3137 bio
= bio_alloc_clone(rq
->q
->disk
->part0
, bio_src
, gfp_mask
,
3142 if (bio_ctr
&& bio_ctr(bio
, bio_src
, data
))
3146 rq
->biotail
->bi_next
= bio
;
3149 rq
->bio
= rq
->biotail
= bio
;
3154 /* Copy attributes of the original request to the clone request. */
3155 rq
->__sector
= blk_rq_pos(rq_src
);
3156 rq
->__data_len
= blk_rq_bytes(rq_src
);
3157 if (rq_src
->rq_flags
& RQF_SPECIAL_PAYLOAD
) {
3158 rq
->rq_flags
|= RQF_SPECIAL_PAYLOAD
;
3159 rq
->special_vec
= rq_src
->special_vec
;
3161 rq
->nr_phys_segments
= rq_src
->nr_phys_segments
;
3162 rq
->ioprio
= rq_src
->ioprio
;
3164 if (rq
->bio
&& blk_crypto_rq_bio_prep(rq
, rq
->bio
, gfp_mask
) < 0)
3172 blk_rq_unprep_clone(rq
);
3176 EXPORT_SYMBOL_GPL(blk_rq_prep_clone
);
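
/*
 * Typical usage sketch (illustrative only; "md" and "clone" are hypothetical
 * names): a request-based stacking driver clones the incoming request onto
 * the lower device, submits the clone, and frees the clone's bios again from
 * the clone's completion path:
 *
 *	if (blk_rq_prep_clone(clone, rq, &md->bs, GFP_ATOMIC, NULL, NULL))
 *		goto requeue;
 *	status = blk_insert_cloned_request(clone);
 *	...
 *	blk_rq_unprep_clone(clone);	(from the clone's completion path)
 *
 * As the description above notes, the clone must complete before the original
 * request because both point at the same bio pages.
 */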
3177 #endif /* CONFIG_BLK_MQ_STACKING */
/*
 * Steal bios from a request and add them to a bio list.
 * The request must not have been partially completed before.
 */
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
	if (rq->bio) {
		if (list->tail)
			list->tail->bi_next = rq->bio;
		else
			list->head = rq->bio;
		list->tail = rq->biotail;

		rq->bio = NULL;
		rq->biotail = NULL;
	}

	rq->__data_len = 0;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);

static size_t order_to_size(unsigned int order)
{
	return (size_t)PAGE_SIZE << order;
}
3205 /* called before freeing request pool in @tags */
3206 static void blk_mq_clear_rq_mapping(struct blk_mq_tags
*drv_tags
,
3207 struct blk_mq_tags
*tags
)
3210 unsigned long flags
;
3213 * There is no need to clear mapping if driver tags is not initialized
3214 * or the mapping belongs to the driver tags.
3216 if (!drv_tags
|| drv_tags
== tags
)
3219 list_for_each_entry(page
, &tags
->page_list
, lru
) {
3220 unsigned long start
= (unsigned long)page_address(page
);
3221 unsigned long end
= start
+ order_to_size(page
->private);
3224 for (i
= 0; i
< drv_tags
->nr_tags
; i
++) {
3225 struct request
*rq
= drv_tags
->rqs
[i
];
3226 unsigned long rq_addr
= (unsigned long)rq
;
3228 if (rq_addr
>= start
&& rq_addr
< end
) {
3229 WARN_ON_ONCE(req_ref_read(rq
) != 0);
3230 cmpxchg(&drv_tags
->rqs
[i
], rq
, NULL
);
3236 * Wait until all pending iteration is done.
3238 * Request reference is cleared and it is guaranteed to be observed
3239 * after the ->lock is released.
3241 spin_lock_irqsave(&drv_tags
->lock
, flags
);
3242 spin_unlock_irqrestore(&drv_tags
->lock
, flags
);
3245 void blk_mq_free_rqs(struct blk_mq_tag_set
*set
, struct blk_mq_tags
*tags
,
3246 unsigned int hctx_idx
)
3248 struct blk_mq_tags
*drv_tags
;
3251 if (list_empty(&tags
->page_list
))
3254 if (blk_mq_is_shared_tags(set
->flags
))
3255 drv_tags
= set
->shared_tags
;
3257 drv_tags
= set
->tags
[hctx_idx
];
3259 if (tags
->static_rqs
&& set
->ops
->exit_request
) {
3262 for (i
= 0; i
< tags
->nr_tags
; i
++) {
3263 struct request
*rq
= tags
->static_rqs
[i
];
3267 set
->ops
->exit_request(set
, rq
, hctx_idx
);
3268 tags
->static_rqs
[i
] = NULL
;
3272 blk_mq_clear_rq_mapping(drv_tags
, tags
);
3274 while (!list_empty(&tags
->page_list
)) {
3275 page
= list_first_entry(&tags
->page_list
, struct page
, lru
);
3276 list_del_init(&page
->lru
);
3278 * Remove kmemleak object previously allocated in
3279 * blk_mq_alloc_rqs().
3281 kmemleak_free(page_address(page
));
3282 __free_pages(page
, page
->private);
3286 void blk_mq_free_rq_map(struct blk_mq_tags
*tags
)
3290 kfree(tags
->static_rqs
);
3291 tags
->static_rqs
= NULL
;
3293 blk_mq_free_tags(tags
);
3296 static enum hctx_type
hctx_idx_to_type(struct blk_mq_tag_set
*set
,
3297 unsigned int hctx_idx
)
3301 for (i
= 0; i
< set
->nr_maps
; i
++) {
3302 unsigned int start
= set
->map
[i
].queue_offset
;
3303 unsigned int end
= start
+ set
->map
[i
].nr_queues
;
3305 if (hctx_idx
>= start
&& hctx_idx
< end
)
3309 if (i
>= set
->nr_maps
)
3310 i
= HCTX_TYPE_DEFAULT
;
3315 static int blk_mq_get_hctx_node(struct blk_mq_tag_set
*set
,
3316 unsigned int hctx_idx
)
3318 enum hctx_type type
= hctx_idx_to_type(set
, hctx_idx
);
3320 return blk_mq_hw_queue_to_node(&set
->map
[type
], hctx_idx
);
3323 static struct blk_mq_tags
*blk_mq_alloc_rq_map(struct blk_mq_tag_set
*set
,
3324 unsigned int hctx_idx
,
3325 unsigned int nr_tags
,
3326 unsigned int reserved_tags
)
3328 int node
= blk_mq_get_hctx_node(set
, hctx_idx
);
3329 struct blk_mq_tags
*tags
;
3331 if (node
== NUMA_NO_NODE
)
3332 node
= set
->numa_node
;
3334 tags
= blk_mq_init_tags(nr_tags
, reserved_tags
, node
,
3335 BLK_MQ_FLAG_TO_ALLOC_POLICY(set
->flags
));
3339 tags
->rqs
= kcalloc_node(nr_tags
, sizeof(struct request
*),
3340 GFP_NOIO
| __GFP_NOWARN
| __GFP_NORETRY
,
3345 tags
->static_rqs
= kcalloc_node(nr_tags
, sizeof(struct request
*),
3346 GFP_NOIO
| __GFP_NOWARN
| __GFP_NORETRY
,
3348 if (!tags
->static_rqs
)
3356 blk_mq_free_tags(tags
);
3360 static int blk_mq_init_request(struct blk_mq_tag_set
*set
, struct request
*rq
,
3361 unsigned int hctx_idx
, int node
)
3365 if (set
->ops
->init_request
) {
3366 ret
= set
->ops
->init_request(set
, rq
, hctx_idx
, node
);
3371 WRITE_ONCE(rq
->state
, MQ_RQ_IDLE
);
3375 static int blk_mq_alloc_rqs(struct blk_mq_tag_set
*set
,
3376 struct blk_mq_tags
*tags
,
3377 unsigned int hctx_idx
, unsigned int depth
)
3379 unsigned int i
, j
, entries_per_page
, max_order
= 4;
3380 int node
= blk_mq_get_hctx_node(set
, hctx_idx
);
3381 size_t rq_size
, left
;
3383 if (node
== NUMA_NO_NODE
)
3384 node
= set
->numa_node
;
3386 INIT_LIST_HEAD(&tags
->page_list
);
3389 * rq_size is the size of the request plus driver payload, rounded
3390 * to the cacheline size
3392 rq_size
= round_up(sizeof(struct request
) + set
->cmd_size
,
3394 left
= rq_size
* depth
;
3396 for (i
= 0; i
< depth
; ) {
3397 int this_order
= max_order
;
3402 while (this_order
&& left
< order_to_size(this_order
- 1))
3406 page
= alloc_pages_node(node
,
3407 GFP_NOIO
| __GFP_NOWARN
| __GFP_NORETRY
| __GFP_ZERO
,
3413 if (order_to_size(this_order
) < rq_size
)
3420 page
->private = this_order
;
3421 list_add_tail(&page
->lru
, &tags
->page_list
);
3423 p
= page_address(page
);
3425 * Allow kmemleak to scan these pages as they contain pointers
3426 * to additional allocations like via ops->init_request().
3428 kmemleak_alloc(p
, order_to_size(this_order
), 1, GFP_NOIO
);
3429 entries_per_page
= order_to_size(this_order
) / rq_size
;
3430 to_do
= min(entries_per_page
, depth
- i
);
3431 left
-= to_do
* rq_size
;
3432 for (j
= 0; j
< to_do
; j
++) {
3433 struct request
*rq
= p
;
3435 tags
->static_rqs
[i
] = rq
;
3436 if (blk_mq_init_request(set
, rq
, hctx_idx
, node
)) {
3437 tags
->static_rqs
[i
] = NULL
;
3448 blk_mq_free_rqs(set
, tags
, hctx_idx
);
struct rq_iter_data {
	struct blk_mq_hw_ctx *hctx;
	bool has_rq;
};

static bool blk_mq_has_request(struct request *rq, void *data)
{
	struct rq_iter_data *iter_data = data;

	if (rq->mq_hctx != iter_data->hctx)
		return true;
	iter_data->has_rq = true;
	return false;
}

static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->sched_tags ?
			hctx->sched_tags : hctx->tags;
	struct rq_iter_data data = {
		.hctx = hctx,
	};

	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
	return data.has_rq;
}
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
		struct blk_mq_hw_ctx *hctx)
{
	if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
		return false;
	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
		return false;
	return true;
}
3489 static int blk_mq_hctx_notify_offline(unsigned int cpu
, struct hlist_node
*node
)
3491 struct blk_mq_hw_ctx
*hctx
= hlist_entry_safe(node
,
3492 struct blk_mq_hw_ctx
, cpuhp_online
);
3494 if (!cpumask_test_cpu(cpu
, hctx
->cpumask
) ||
3495 !blk_mq_last_cpu_in_hctx(cpu
, hctx
))
3499 * Prevent new request from being allocated on the current hctx.
3501 * The smp_mb__after_atomic() Pairs with the implied barrier in
3502 * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
3503 * seen once we return from the tag allocator.
3505 set_bit(BLK_MQ_S_INACTIVE
, &hctx
->state
);
3506 smp_mb__after_atomic();
3509 * Try to grab a reference to the queue and wait for any outstanding
3510 * requests. If we could not grab a reference the queue has been
3511 * frozen and there are no requests.
3513 if (percpu_ref_tryget(&hctx
->queue
->q_usage_counter
)) {
3514 while (blk_mq_hctx_has_requests(hctx
))
3516 percpu_ref_put(&hctx
->queue
->q_usage_counter
);
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
			struct blk_mq_hw_ctx, cpuhp_online);

	if (cpumask_test_cpu(cpu, hctx->cpumask))
		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
	return 0;
}
3533 * 'cpu' is going away. splice any existing rq_list entries from this
3534 * software queue to the hw queue dispatch list, and ensure that it
3537 static int blk_mq_hctx_notify_dead(unsigned int cpu
, struct hlist_node
*node
)
3539 struct blk_mq_hw_ctx
*hctx
;
3540 struct blk_mq_ctx
*ctx
;
3542 enum hctx_type type
;
3544 hctx
= hlist_entry_safe(node
, struct blk_mq_hw_ctx
, cpuhp_dead
);
3545 if (!cpumask_test_cpu(cpu
, hctx
->cpumask
))
3548 ctx
= __blk_mq_get_ctx(hctx
->queue
, cpu
);
3551 spin_lock(&ctx
->lock
);
3552 if (!list_empty(&ctx
->rq_lists
[type
])) {
3553 list_splice_init(&ctx
->rq_lists
[type
], &tmp
);
3554 blk_mq_hctx_clear_pending(hctx
, ctx
);
3556 spin_unlock(&ctx
->lock
);
3558 if (list_empty(&tmp
))
3561 spin_lock(&hctx
->lock
);
3562 list_splice_tail_init(&tmp
, &hctx
->dispatch
);
3563 spin_unlock(&hctx
->lock
);
3565 blk_mq_run_hw_queue(hctx
, true);
3569 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx
*hctx
)
3571 if (!(hctx
->flags
& BLK_MQ_F_STACKING
))
3572 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE
,
3573 &hctx
->cpuhp_online
);
3574 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD
,
3579 * Before freeing hw queue, clearing the flush request reference in
3580 * tags->rqs[] for avoiding potential UAF.
3582 static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags
*tags
,
3583 unsigned int queue_depth
, struct request
*flush_rq
)
3586 unsigned long flags
;
3588 /* The hw queue may not be mapped yet */
3592 WARN_ON_ONCE(req_ref_read(flush_rq
) != 0);
3594 for (i
= 0; i
< queue_depth
; i
++)
3595 cmpxchg(&tags
->rqs
[i
], flush_rq
, NULL
);
3598 * Wait until all pending iteration is done.
3600 * Request reference is cleared and it is guaranteed to be observed
3601 * after the ->lock is released.
3603 spin_lock_irqsave(&tags
->lock
, flags
);
3604 spin_unlock_irqrestore(&tags
->lock
, flags
);
3607 /* hctx->ctxs will be freed in queue's release handler */
3608 static void blk_mq_exit_hctx(struct request_queue
*q
,
3609 struct blk_mq_tag_set
*set
,
3610 struct blk_mq_hw_ctx
*hctx
, unsigned int hctx_idx
)
3612 struct request
*flush_rq
= hctx
->fq
->flush_rq
;
3614 if (blk_mq_hw_queue_mapped(hctx
))
3615 blk_mq_tag_idle(hctx
);
3617 if (blk_queue_init_done(q
))
3618 blk_mq_clear_flush_rq_mapping(set
->tags
[hctx_idx
],
3619 set
->queue_depth
, flush_rq
);
3620 if (set
->ops
->exit_request
)
3621 set
->ops
->exit_request(set
, flush_rq
, hctx_idx
);
3623 if (set
->ops
->exit_hctx
)
3624 set
->ops
->exit_hctx(hctx
, hctx_idx
);
3626 blk_mq_remove_cpuhp(hctx
);
3628 xa_erase(&q
->hctx_table
, hctx_idx
);
3630 spin_lock(&q
->unused_hctx_lock
);
3631 list_add(&hctx
->hctx_list
, &q
->unused_hctx_list
);
3632 spin_unlock(&q
->unused_hctx_lock
);
3635 static void blk_mq_exit_hw_queues(struct request_queue
*q
,
3636 struct blk_mq_tag_set
*set
, int nr_queue
)
3638 struct blk_mq_hw_ctx
*hctx
;
3641 queue_for_each_hw_ctx(q
, hctx
, i
) {
3644 blk_mq_exit_hctx(q
, set
, hctx
, i
);
3648 static int blk_mq_init_hctx(struct request_queue
*q
,
3649 struct blk_mq_tag_set
*set
,
3650 struct blk_mq_hw_ctx
*hctx
, unsigned hctx_idx
)
3652 hctx
->queue_num
= hctx_idx
;
3654 if (!(hctx
->flags
& BLK_MQ_F_STACKING
))
3655 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE
,
3656 &hctx
->cpuhp_online
);
3657 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD
, &hctx
->cpuhp_dead
);
3659 hctx
->tags
= set
->tags
[hctx_idx
];
3661 if (set
->ops
->init_hctx
&&
3662 set
->ops
->init_hctx(hctx
, set
->driver_data
, hctx_idx
))
3663 goto unregister_cpu_notifier
;
3665 if (blk_mq_init_request(set
, hctx
->fq
->flush_rq
, hctx_idx
,
3669 if (xa_insert(&q
->hctx_table
, hctx_idx
, hctx
, GFP_KERNEL
))
3675 if (set
->ops
->exit_request
)
3676 set
->ops
->exit_request(set
, hctx
->fq
->flush_rq
, hctx_idx
);
3678 if (set
->ops
->exit_hctx
)
3679 set
->ops
->exit_hctx(hctx
, hctx_idx
);
3680 unregister_cpu_notifier
:
3681 blk_mq_remove_cpuhp(hctx
);
3685 static struct blk_mq_hw_ctx
*
3686 blk_mq_alloc_hctx(struct request_queue
*q
, struct blk_mq_tag_set
*set
,
3689 struct blk_mq_hw_ctx
*hctx
;
3690 gfp_t gfp
= GFP_NOIO
| __GFP_NOWARN
| __GFP_NORETRY
;
3692 hctx
= kzalloc_node(sizeof(struct blk_mq_hw_ctx
), gfp
, node
);
3694 goto fail_alloc_hctx
;
3696 if (!zalloc_cpumask_var_node(&hctx
->cpumask
, gfp
, node
))
3699 atomic_set(&hctx
->nr_active
, 0);
3700 if (node
== NUMA_NO_NODE
)
3701 node
= set
->numa_node
;
3702 hctx
->numa_node
= node
;
3704 INIT_DELAYED_WORK(&hctx
->run_work
, blk_mq_run_work_fn
);
3705 spin_lock_init(&hctx
->lock
);
3706 INIT_LIST_HEAD(&hctx
->dispatch
);
3708 hctx
->flags
= set
->flags
& ~BLK_MQ_F_TAG_QUEUE_SHARED
;
3710 INIT_LIST_HEAD(&hctx
->hctx_list
);
3713 * Allocate space for all possible cpus to avoid allocation at
3716 hctx
->ctxs
= kmalloc_array_node(nr_cpu_ids
, sizeof(void *),
3721 if (sbitmap_init_node(&hctx
->ctx_map
, nr_cpu_ids
, ilog2(8),
3722 gfp
, node
, false, false))
3726 spin_lock_init(&hctx
->dispatch_wait_lock
);
3727 init_waitqueue_func_entry(&hctx
->dispatch_wait
, blk_mq_dispatch_wake
);
3728 INIT_LIST_HEAD(&hctx
->dispatch_wait
.entry
);
3730 hctx
->fq
= blk_alloc_flush_queue(hctx
->numa_node
, set
->cmd_size
, gfp
);
3734 blk_mq_hctx_kobj_init(hctx
);
3739 sbitmap_free(&hctx
->ctx_map
);
3743 free_cpumask_var(hctx
->cpumask
);
3750 static void blk_mq_init_cpu_queues(struct request_queue
*q
,
3751 unsigned int nr_hw_queues
)
3753 struct blk_mq_tag_set
*set
= q
->tag_set
;
3756 for_each_possible_cpu(i
) {
3757 struct blk_mq_ctx
*__ctx
= per_cpu_ptr(q
->queue_ctx
, i
);
3758 struct blk_mq_hw_ctx
*hctx
;
3762 spin_lock_init(&__ctx
->lock
);
3763 for (k
= HCTX_TYPE_DEFAULT
; k
< HCTX_MAX_TYPES
; k
++)
3764 INIT_LIST_HEAD(&__ctx
->rq_lists
[k
]);
3769 * Set local node, IFF we have more than one hw queue. If
3770 * not, we remain on the home node of the device
3772 for (j
= 0; j
< set
->nr_maps
; j
++) {
3773 hctx
= blk_mq_map_queue_type(q
, j
, i
);
3774 if (nr_hw_queues
> 1 && hctx
->numa_node
== NUMA_NO_NODE
)
3775 hctx
->numa_node
= cpu_to_node(i
);
3780 struct blk_mq_tags
*blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set
*set
,
3781 unsigned int hctx_idx
,
3784 struct blk_mq_tags
*tags
;
3787 tags
= blk_mq_alloc_rq_map(set
, hctx_idx
, depth
, set
->reserved_tags
);
3791 ret
= blk_mq_alloc_rqs(set
, tags
, hctx_idx
, depth
);
3793 blk_mq_free_rq_map(tags
);
3800 static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set
*set
,
3803 if (blk_mq_is_shared_tags(set
->flags
)) {
3804 set
->tags
[hctx_idx
] = set
->shared_tags
;
3809 set
->tags
[hctx_idx
] = blk_mq_alloc_map_and_rqs(set
, hctx_idx
,
3812 return set
->tags
[hctx_idx
];
3815 void blk_mq_free_map_and_rqs(struct blk_mq_tag_set
*set
,
3816 struct blk_mq_tags
*tags
,
3817 unsigned int hctx_idx
)
3820 blk_mq_free_rqs(set
, tags
, hctx_idx
);
3821 blk_mq_free_rq_map(tags
);
3825 static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set
*set
,
3826 unsigned int hctx_idx
)
3828 if (!blk_mq_is_shared_tags(set
->flags
))
3829 blk_mq_free_map_and_rqs(set
, set
->tags
[hctx_idx
], hctx_idx
);
3831 set
->tags
[hctx_idx
] = NULL
;
3834 static void blk_mq_map_swqueue(struct request_queue
*q
)
3836 unsigned int j
, hctx_idx
;
3838 struct blk_mq_hw_ctx
*hctx
;
3839 struct blk_mq_ctx
*ctx
;
3840 struct blk_mq_tag_set
*set
= q
->tag_set
;
3842 queue_for_each_hw_ctx(q
, hctx
, i
) {
3843 cpumask_clear(hctx
->cpumask
);
3845 hctx
->dispatch_from
= NULL
;
3849 * Map software to hardware queues.
3851 * If the cpu isn't present, the cpu is mapped to first hctx.
3853 for_each_possible_cpu(i
) {
3855 ctx
= per_cpu_ptr(q
->queue_ctx
, i
);
3856 for (j
= 0; j
< set
->nr_maps
; j
++) {
3857 if (!set
->map
[j
].nr_queues
) {
3858 ctx
->hctxs
[j
] = blk_mq_map_queue_type(q
,
3859 HCTX_TYPE_DEFAULT
, i
);
3862 hctx_idx
= set
->map
[j
].mq_map
[i
];
3863 /* unmapped hw queue can be remapped after CPU topo changed */
3864 if (!set
->tags
[hctx_idx
] &&
3865 !__blk_mq_alloc_map_and_rqs(set
, hctx_idx
)) {
3867 * If tags initialization fail for some hctx,
3868 * that hctx won't be brought online. In this
3869 * case, remap the current ctx to hctx[0] which
3870 * is guaranteed to always have tags allocated
3872 set
->map
[j
].mq_map
[i
] = 0;
3875 hctx
= blk_mq_map_queue_type(q
, j
, i
);
3876 ctx
->hctxs
[j
] = hctx
;
3878 * If the CPU is already set in the mask, then we've
3879 * mapped this one already. This can happen if
3880 * devices share queues across queue maps.
3882 if (cpumask_test_cpu(i
, hctx
->cpumask
))
3885 cpumask_set_cpu(i
, hctx
->cpumask
);
3887 ctx
->index_hw
[hctx
->type
] = hctx
->nr_ctx
;
3888 hctx
->ctxs
[hctx
->nr_ctx
++] = ctx
;
3891 * If the nr_ctx type overflows, we have exceeded the
3892 * amount of sw queues we can support.
3894 BUG_ON(!hctx
->nr_ctx
);
3897 for (; j
< HCTX_MAX_TYPES
; j
++)
3898 ctx
->hctxs
[j
] = blk_mq_map_queue_type(q
,
3899 HCTX_TYPE_DEFAULT
, i
);
3902 queue_for_each_hw_ctx(q
, hctx
, i
) {
3904 * If no software queues are mapped to this hardware queue,
3905 * disable it and free the request entries.
3907 if (!hctx
->nr_ctx
) {
3908 /* Never unmap queue 0. We need it as a
3909 * fallback in case of a new remap fails
3913 __blk_mq_free_map_and_rqs(set
, i
);
3919 hctx
->tags
= set
->tags
[i
];
3920 WARN_ON(!hctx
->tags
);
3923 * Set the map size to the number of mapped software queues.
3924 * This is more accurate and more efficient than looping
3925 * over all possibly mapped software queues.
3927 sbitmap_resize(&hctx
->ctx_map
, hctx
->nr_ctx
);
3930 * Initialize batch roundrobin counts
3932 hctx
->next_cpu
= blk_mq_first_mapped_cpu(hctx
);
3933 hctx
->next_cpu_batch
= BLK_MQ_CPU_WORK_BATCH
;
3938 * Caller needs to ensure that we're either frozen/quiesced, or that
3939 * the queue isn't live yet.
3941 static void queue_set_hctx_shared(struct request_queue
*q
, bool shared
)
3943 struct blk_mq_hw_ctx
*hctx
;
3946 queue_for_each_hw_ctx(q
, hctx
, i
) {
3948 hctx
->flags
|= BLK_MQ_F_TAG_QUEUE_SHARED
;
3950 blk_mq_tag_idle(hctx
);
3951 hctx
->flags
&= ~BLK_MQ_F_TAG_QUEUE_SHARED
;
3956 static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set
*set
,
3959 struct request_queue
*q
;
3961 lockdep_assert_held(&set
->tag_list_lock
);
3963 list_for_each_entry(q
, &set
->tag_list
, tag_set_list
) {
3964 blk_mq_freeze_queue(q
);
3965 queue_set_hctx_shared(q
, shared
);
3966 blk_mq_unfreeze_queue(q
);
3970 static void blk_mq_del_queue_tag_set(struct request_queue
*q
)
3972 struct blk_mq_tag_set
*set
= q
->tag_set
;
3974 mutex_lock(&set
->tag_list_lock
);
3975 list_del(&q
->tag_set_list
);
3976 if (list_is_singular(&set
->tag_list
)) {
3977 /* just transitioned to unshared */
3978 set
->flags
&= ~BLK_MQ_F_TAG_QUEUE_SHARED
;
3979 /* update existing queue */
3980 blk_mq_update_tag_set_shared(set
, false);
3982 mutex_unlock(&set
->tag_list_lock
);
3983 INIT_LIST_HEAD(&q
->tag_set_list
);
3986 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set
*set
,
3987 struct request_queue
*q
)
3989 mutex_lock(&set
->tag_list_lock
);
3992 * Check to see if we're transitioning to shared (from 1 to 2 queues).
3994 if (!list_empty(&set
->tag_list
) &&
3995 !(set
->flags
& BLK_MQ_F_TAG_QUEUE_SHARED
)) {
3996 set
->flags
|= BLK_MQ_F_TAG_QUEUE_SHARED
;
3997 /* update existing queue */
3998 blk_mq_update_tag_set_shared(set
, true);
4000 if (set
->flags
& BLK_MQ_F_TAG_QUEUE_SHARED
)
4001 queue_set_hctx_shared(q
, true);
4002 list_add_tail(&q
->tag_set_list
, &set
->tag_list
);
4004 mutex_unlock(&set
->tag_list_lock
);
4007 /* All allocations will be freed in release handler of q->mq_kobj */
4008 static int blk_mq_alloc_ctxs(struct request_queue
*q
)
4010 struct blk_mq_ctxs
*ctxs
;
4013 ctxs
= kzalloc(sizeof(*ctxs
), GFP_KERNEL
);
4017 ctxs
->queue_ctx
= alloc_percpu(struct blk_mq_ctx
);
4018 if (!ctxs
->queue_ctx
)
4021 for_each_possible_cpu(cpu
) {
4022 struct blk_mq_ctx
*ctx
= per_cpu_ptr(ctxs
->queue_ctx
, cpu
);
4026 q
->mq_kobj
= &ctxs
->kobj
;
4027 q
->queue_ctx
= ctxs
->queue_ctx
;
4036 * It is the actual release handler for mq, but we do it from
4037 * request queue's release handler for avoiding use-after-free
4038 * and headache because q->mq_kobj shouldn't have been introduced,
4039 * but we can't group ctx/kctx kobj without it.
4041 void blk_mq_release(struct request_queue
*q
)
4043 struct blk_mq_hw_ctx
*hctx
, *next
;
4046 queue_for_each_hw_ctx(q
, hctx
, i
)
4047 WARN_ON_ONCE(hctx
&& list_empty(&hctx
->hctx_list
));
4049 /* all hctx are in .unused_hctx_list now */
4050 list_for_each_entry_safe(hctx
, next
, &q
->unused_hctx_list
, hctx_list
) {
4051 list_del_init(&hctx
->hctx_list
);
4052 kobject_put(&hctx
->kobj
);
4055 xa_destroy(&q
->hctx_table
);
4058 * release .mq_kobj and sw queue's kobject now because
4059 * both share lifetime with request queue.
4061 blk_mq_sysfs_deinit(q
);
4064 static struct request_queue
*blk_mq_init_queue_data(struct blk_mq_tag_set
*set
,
4067 struct request_queue
*q
;
4070 q
= blk_alloc_queue(set
->numa_node
);
4072 return ERR_PTR(-ENOMEM
);
4073 q
->queuedata
= queuedata
;
4074 ret
= blk_mq_init_allocated_queue(set
, q
);
4077 return ERR_PTR(ret
);
4082 struct request_queue
*blk_mq_init_queue(struct blk_mq_tag_set
*set
)
4084 return blk_mq_init_queue_data(set
, NULL
);
4086 EXPORT_SYMBOL(blk_mq_init_queue
);
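
/*
 * Illustrative call sequence (hypothetical "mydrv" names): a driver that
 * manages its own gendisk can, once blk_mq_alloc_tag_set() has succeeded, do
 *
 *	q = blk_mq_init_queue(&mydrv->tag_set);
 *	if (IS_ERR(q))
 *		return PTR_ERR(q);
 *
 * Most drivers instead use blk_mq_alloc_disk()/__blk_mq_alloc_disk() below,
 * which allocates the queue and the gendisk together.
 */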
/**
 * blk_mq_destroy_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * This shuts down a request queue allocated by blk_mq_init_queue(). All future
 * requests will be failed with -ENODEV. The caller is responsible for dropping
 * the reference from blk_mq_init_queue() by calling blk_put_queue().
 *
 * Context: can sleep
 */
void blk_mq_destroy_queue(struct request_queue *q)
{
	WARN_ON_ONCE(!queue_is_mq(q));
	WARN_ON_ONCE(blk_queue_registered(q));

	might_sleep();

	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
	blk_queue_start_drain(q);
	blk_mq_freeze_queue_wait(q);

	blk_sync_queue(q);
	blk_mq_cancel_work_sync(q);
	blk_mq_exit_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
4115 struct gendisk
*__blk_mq_alloc_disk(struct blk_mq_tag_set
*set
, void *queuedata
,
4116 struct lock_class_key
*lkclass
)
4118 struct request_queue
*q
;
4119 struct gendisk
*disk
;
4121 q
= blk_mq_init_queue_data(set
, queuedata
);
4125 disk
= __alloc_disk_node(q
, set
->numa_node
, lkclass
);
4127 blk_mq_destroy_queue(q
);
4129 return ERR_PTR(-ENOMEM
);
4131 set_bit(GD_OWNS_QUEUE
, &disk
->state
);
4134 EXPORT_SYMBOL(__blk_mq_alloc_disk
);
4136 struct gendisk
*blk_mq_alloc_disk_for_queue(struct request_queue
*q
,
4137 struct lock_class_key
*lkclass
)
4139 struct gendisk
*disk
;
4141 if (!blk_get_queue(q
))
4143 disk
= __alloc_disk_node(q
, NUMA_NO_NODE
, lkclass
);
4148 EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue
);
4150 static struct blk_mq_hw_ctx
*blk_mq_alloc_and_init_hctx(
4151 struct blk_mq_tag_set
*set
, struct request_queue
*q
,
4152 int hctx_idx
, int node
)
4154 struct blk_mq_hw_ctx
*hctx
= NULL
, *tmp
;
4156 /* reuse dead hctx first */
4157 spin_lock(&q
->unused_hctx_lock
);
4158 list_for_each_entry(tmp
, &q
->unused_hctx_list
, hctx_list
) {
4159 if (tmp
->numa_node
== node
) {
4165 list_del_init(&hctx
->hctx_list
);
4166 spin_unlock(&q
->unused_hctx_lock
);
4169 hctx
= blk_mq_alloc_hctx(q
, set
, node
);
4173 if (blk_mq_init_hctx(q
, set
, hctx
, hctx_idx
))
4179 kobject_put(&hctx
->kobj
);
4184 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set
*set
,
4185 struct request_queue
*q
)
4187 struct blk_mq_hw_ctx
*hctx
;
4190 /* protect against switching io scheduler */
4191 mutex_lock(&q
->sysfs_lock
);
4192 for (i
= 0; i
< set
->nr_hw_queues
; i
++) {
4194 int node
= blk_mq_get_hctx_node(set
, i
);
4195 struct blk_mq_hw_ctx
*old_hctx
= xa_load(&q
->hctx_table
, i
);
4198 old_node
= old_hctx
->numa_node
;
4199 blk_mq_exit_hctx(q
, set
, old_hctx
, i
);
4202 if (!blk_mq_alloc_and_init_hctx(set
, q
, i
, node
)) {
4205 pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
4207 hctx
= blk_mq_alloc_and_init_hctx(set
, q
, i
, old_node
);
4208 WARN_ON_ONCE(!hctx
);
4212 * Increasing nr_hw_queues fails. Free the newly allocated
4213 * hctxs and keep the previous q->nr_hw_queues.
4215 if (i
!= set
->nr_hw_queues
) {
4216 j
= q
->nr_hw_queues
;
4219 q
->nr_hw_queues
= set
->nr_hw_queues
;
4222 xa_for_each_start(&q
->hctx_table
, j
, hctx
, j
)
4223 blk_mq_exit_hctx(q
, set
, hctx
, j
);
4224 mutex_unlock(&q
->sysfs_lock
);
4227 static void blk_mq_update_poll_flag(struct request_queue
*q
)
4229 struct blk_mq_tag_set
*set
= q
->tag_set
;
4231 if (set
->nr_maps
> HCTX_TYPE_POLL
&&
4232 set
->map
[HCTX_TYPE_POLL
].nr_queues
)
4233 blk_queue_flag_set(QUEUE_FLAG_POLL
, q
);
4235 blk_queue_flag_clear(QUEUE_FLAG_POLL
, q
);
4238 int blk_mq_init_allocated_queue(struct blk_mq_tag_set
*set
,
4239 struct request_queue
*q
)
4241 /* mark the queue as mq asap */
4242 q
->mq_ops
= set
->ops
;
4244 if (blk_mq_alloc_ctxs(q
))
4247 /* init q->mq_kobj and sw queues' kobjects */
4248 blk_mq_sysfs_init(q
);
4250 INIT_LIST_HEAD(&q
->unused_hctx_list
);
4251 spin_lock_init(&q
->unused_hctx_lock
);
4253 xa_init(&q
->hctx_table
);
4255 blk_mq_realloc_hw_ctxs(set
, q
);
4256 if (!q
->nr_hw_queues
)
4259 INIT_WORK(&q
->timeout_work
, blk_mq_timeout_work
);
4260 blk_queue_rq_timeout(q
, set
->timeout
? set
->timeout
: 30 * HZ
);
4264 q
->queue_flags
|= QUEUE_FLAG_MQ_DEFAULT
;
4265 blk_mq_update_poll_flag(q
);
4267 INIT_DELAYED_WORK(&q
->requeue_work
, blk_mq_requeue_work
);
4268 INIT_LIST_HEAD(&q
->flush_list
);
4269 INIT_LIST_HEAD(&q
->requeue_list
);
4270 spin_lock_init(&q
->requeue_lock
);
4272 q
->nr_requests
= set
->queue_depth
;
4274 blk_mq_init_cpu_queues(q
, set
->nr_hw_queues
);
4275 blk_mq_add_queue_tag_set(set
, q
);
4276 blk_mq_map_swqueue(q
);
4285 EXPORT_SYMBOL(blk_mq_init_allocated_queue
);
4287 /* tags can _not_ be used after returning from blk_mq_exit_queue */
4288 void blk_mq_exit_queue(struct request_queue
*q
)
4290 struct blk_mq_tag_set
*set
= q
->tag_set
;
4292 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
4293 blk_mq_exit_hw_queues(q
, set
, set
->nr_hw_queues
);
4294 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
4295 blk_mq_del_queue_tag_set(q
);
4298 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set
*set
)
4302 if (blk_mq_is_shared_tags(set
->flags
)) {
4303 set
->shared_tags
= blk_mq_alloc_map_and_rqs(set
,
4306 if (!set
->shared_tags
)
4310 for (i
= 0; i
< set
->nr_hw_queues
; i
++) {
4311 if (!__blk_mq_alloc_map_and_rqs(set
, i
))
4320 __blk_mq_free_map_and_rqs(set
, i
);
4322 if (blk_mq_is_shared_tags(set
->flags
)) {
4323 blk_mq_free_map_and_rqs(set
, set
->shared_tags
,
4324 BLK_MQ_NO_HCTX_IDX
);
4331 * Allocate the request maps associated with this tag_set. Note that this
4332 * may reduce the depth asked for, if memory is tight. set->queue_depth
4333 * will be updated to reflect the allocated depth.
4335 static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set
*set
)
4340 depth
= set
->queue_depth
;
4342 err
= __blk_mq_alloc_rq_maps(set
);
4346 set
->queue_depth
>>= 1;
4347 if (set
->queue_depth
< set
->reserved_tags
+ BLK_MQ_TAG_MIN
) {
4351 } while (set
->queue_depth
);
4353 if (!set
->queue_depth
|| err
) {
4354 pr_err("blk-mq: failed to allocate request map\n");
4358 if (depth
!= set
->queue_depth
)
4359 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
4360 depth
, set
->queue_depth
);
4365 static void blk_mq_update_queue_map(struct blk_mq_tag_set
*set
)
4368 * blk_mq_map_queues() and multiple .map_queues() implementations
4369 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
4370 * number of hardware queues.
4372 if (set
->nr_maps
== 1)
4373 set
->map
[HCTX_TYPE_DEFAULT
].nr_queues
= set
->nr_hw_queues
;
4375 if (set
->ops
->map_queues
&& !is_kdump_kernel()) {
4379 * transport .map_queues is usually done in the following
4382 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
4383 * mask = get_cpu_mask(queue)
4384 * for_each_cpu(cpu, mask)
4385 * set->map[x].mq_map[cpu] = queue;
4388 * When we need to remap, the table has to be cleared for
4389 * killing stale mapping since one CPU may not be mapped
4392 for (i
= 0; i
< set
->nr_maps
; i
++)
4393 blk_mq_clear_mq_map(&set
->map
[i
]);
4395 set
->ops
->map_queues(set
);
4397 BUG_ON(set
->nr_maps
> 1);
4398 blk_mq_map_queues(&set
->map
[HCTX_TYPE_DEFAULT
]);
4402 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set
*set
,
4403 int new_nr_hw_queues
)
4405 struct blk_mq_tags
**new_tags
;
4408 if (set
->nr_hw_queues
>= new_nr_hw_queues
) {
4409 for (i
= new_nr_hw_queues
; i
< set
->nr_hw_queues
; i
++)
4410 __blk_mq_free_map_and_rqs(set
, i
);
4414 new_tags
= kcalloc_node(new_nr_hw_queues
, sizeof(struct blk_mq_tags
*),
4415 GFP_KERNEL
, set
->numa_node
);
4420 memcpy(new_tags
, set
->tags
, set
->nr_hw_queues
*
4421 sizeof(*set
->tags
));
4423 set
->tags
= new_tags
;
4425 for (i
= set
->nr_hw_queues
; i
< new_nr_hw_queues
; i
++) {
4426 if (!__blk_mq_alloc_map_and_rqs(set
, i
)) {
4427 while (--i
>= set
->nr_hw_queues
)
4428 __blk_mq_free_map_and_rqs(set
, i
);
4435 set
->nr_hw_queues
= new_nr_hw_queues
;
4440 * Alloc a tag set to be associated with one or more request queues.
4441 * May fail with EINVAL for various error conditions. May adjust the
4442 * requested depth down, if it's too large. In that case, the set
4443 * value will be stored in set->queue_depth.
4445 int blk_mq_alloc_tag_set(struct blk_mq_tag_set
*set
)
4449 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH
> 1 << BLK_MQ_UNIQUE_TAG_BITS
);
4451 if (!set
->nr_hw_queues
)
4453 if (!set
->queue_depth
)
4455 if (set
->queue_depth
< set
->reserved_tags
+ BLK_MQ_TAG_MIN
)
4458 if (!set
->ops
->queue_rq
)
4461 if (!set
->ops
->get_budget
^ !set
->ops
->put_budget
)
4464 if (set
->queue_depth
> BLK_MQ_MAX_DEPTH
) {
4465 pr_info("blk-mq: reduced tag depth to %u\n",
4467 set
->queue_depth
= BLK_MQ_MAX_DEPTH
;
4472 else if (set
->nr_maps
> HCTX_MAX_TYPES
)
4476 * If a crashdump is active, then we are potentially in a very
4477 * memory constrained environment. Limit us to 1 queue and
4478 * 64 tags to prevent using too much memory.
4480 if (is_kdump_kernel()) {
4481 set
->nr_hw_queues
= 1;
4483 set
->queue_depth
= min(64U, set
->queue_depth
);
4486 * There is no use for more h/w queues than cpus if we just have
4489 if (set
->nr_maps
== 1 && set
->nr_hw_queues
> nr_cpu_ids
)
4490 set
->nr_hw_queues
= nr_cpu_ids
;
4492 if (set
->flags
& BLK_MQ_F_BLOCKING
) {
4493 set
->srcu
= kmalloc(sizeof(*set
->srcu
), GFP_KERNEL
);
4496 ret
= init_srcu_struct(set
->srcu
);
4502 set
->tags
= kcalloc_node(set
->nr_hw_queues
,
4503 sizeof(struct blk_mq_tags
*), GFP_KERNEL
,
4506 goto out_cleanup_srcu
;
4508 for (i
= 0; i
< set
->nr_maps
; i
++) {
4509 set
->map
[i
].mq_map
= kcalloc_node(nr_cpu_ids
,
4510 sizeof(set
->map
[i
].mq_map
[0]),
4511 GFP_KERNEL
, set
->numa_node
);
4512 if (!set
->map
[i
].mq_map
)
4513 goto out_free_mq_map
;
4514 set
->map
[i
].nr_queues
= is_kdump_kernel() ? 1 : set
->nr_hw_queues
;
4517 blk_mq_update_queue_map(set
);
4519 ret
= blk_mq_alloc_set_map_and_rqs(set
);
4521 goto out_free_mq_map
;
4523 mutex_init(&set
->tag_list_lock
);
4524 INIT_LIST_HEAD(&set
->tag_list
);
4529 for (i
= 0; i
< set
->nr_maps
; i
++) {
4530 kfree(set
->map
[i
].mq_map
);
4531 set
->map
[i
].mq_map
= NULL
;
4536 if (set
->flags
& BLK_MQ_F_BLOCKING
)
4537 cleanup_srcu_struct(set
->srcu
);
4539 if (set
->flags
& BLK_MQ_F_BLOCKING
)
4543 EXPORT_SYMBOL(blk_mq_alloc_tag_set
);
/* allocate and initialize a tagset for a simple single-queue device */
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags)
{
	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->nr_hw_queues = 1;
	set->nr_maps = 1;
	set->queue_depth = queue_depth;
	set->numa_node = NUMA_NO_NODE;
	set->flags = set_flags;
	return blk_mq_alloc_tag_set(set);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
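
/*
 * Illustrative example (hypothetical "mydrv" names): a simple single-queue
 * driver typically pairs this helper with blk_mq_alloc_disk():
 *
 *	err = blk_mq_alloc_sq_tag_set(&mydrv->tag_set, &mydrv_mq_ops, 16, 0);
 *	if (err)
 *		return err;
 *	disk = blk_mq_alloc_disk(&mydrv->tag_set, mydrv);
 *	if (IS_ERR(disk)) {
 *		blk_mq_free_tag_set(&mydrv->tag_set);
 *		return PTR_ERR(disk);
 *	}
 *
 * where mydrv_mq_ops must provide at least .queue_rq(), as enforced by
 * blk_mq_alloc_tag_set() above.
 */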
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
	int i, j;

	for (i = 0; i < set->nr_hw_queues; i++)
		__blk_mq_free_map_and_rqs(set, i);

	if (blk_mq_is_shared_tags(set->flags)) {
		blk_mq_free_map_and_rqs(set, set->shared_tags,
					BLK_MQ_NO_HCTX_IDX);
	}

	for (j = 0; j < set->nr_maps; j++) {
		kfree(set->map[j].mq_map);
		set->map[j].mq_map = NULL;
	}

	kfree(set->tags);
	set->tags = NULL;
	if (set->flags & BLK_MQ_F_BLOCKING) {
		cleanup_srcu_struct(set->srcu);
		kfree(set->srcu);
	}
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
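/*
 * Illustrative teardown order for a hypothetical driver: the tag set must
 * outlive every queue allocated from it, so removal normally looks like:
 *
 *	del_gendisk(my_dev->disk);
 *	put_disk(my_dev->disk);
 *	blk_mq_free_tag_set(&my_dev->tag_set);
 *
 * where put_disk() drops the last reference to the request_queue before the
 * set itself is torn down.
 */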
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int ret;
	unsigned long i;

	if (!set)
		return -EINVAL;

	if (q->nr_requests == nr)
		return 0;

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		if (!hctx->tags)
			continue;
		/*
		 * If we're using an MQ scheduler, just update the scheduler
		 * queue depth. This is similar to what the old code would do.
		 */
		if (hctx->sched_tags) {
			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
						      nr, true);
		} else {
			ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
						      false);
		}
		if (ret)
			break;
		if (q->elevator && q->elevator->type->ops.depth_updated)
			q->elevator->type->ops.depth_updated(hctx);
	}
	if (!ret) {
		q->nr_requests = nr;
		if (blk_mq_is_shared_tags(set->flags)) {
			if (q->elevator)
				blk_mq_tag_update_sched_shared_tags(q);
			else
				blk_mq_tag_resize_shared_tags(set, nr);
		}
	}

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return ret;
}
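/*
 * Illustrative note: blk_mq_update_nr_requests() is normally reached through
 * the queue's "nr_requests" sysfs attribute rather than called by drivers. A
 * hypothetical in-kernel caller would simply pass the queue and the new
 * depth:
 *
 *	ret = blk_mq_update_nr_requests(q, 256);
 *
 * and rely on the freeze/quiesce done above to keep the queue drained while
 * the tag depths are resized.
 */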
/*
 * request_queue and elevator_type pair.
 * It is just used by __blk_mq_update_nr_hw_queues to cache
 * the elevator_type associated with a request_queue.
 */
struct blk_mq_qe_pair {
	struct list_head node;
	struct request_queue *q;
	struct elevator_type *type;
};

/*
 * Cache the elevator_type in qe pair list and switch the
 * io scheduler to 'none'
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
		struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;

	qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
	if (!qe)
		return false;

	/* q->elevator needs protection from ->sysfs_lock */
	mutex_lock(&q->sysfs_lock);

	/* the check has to be done with holding sysfs_lock */
	if (!q->elevator) {
		kfree(qe);
		goto unlock;
	}

	INIT_LIST_HEAD(&qe->node);
	qe->q = q;
	qe->type = q->elevator->type;
	/* keep a reference to the elevator module as we'll switch back */
	__elevator_get(qe->type);
	list_add(&qe->node, head);
	elevator_disable(q);
unlock:
	mutex_unlock(&q->sysfs_lock);

	return true;
}

static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
						struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;

	list_for_each_entry(qe, head, node)
		if (qe->q == q)
			return qe;

	return NULL;
}

static void blk_mq_elv_switch_back(struct list_head *head,
				  struct request_queue *q)
{
	struct blk_mq_qe_pair *qe;
	struct elevator_type *t;

	qe = blk_lookup_qe_pair(head, q);
	if (!qe)
		return;
	t = qe->type;
	list_del(&qe->node);
	kfree(qe);

	mutex_lock(&q->sysfs_lock);
	elevator_switch(q, t);
	/* drop the reference acquired in blk_mq_elv_switch_none */
	elevator_put(t);
	mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
							int nr_hw_queues)
{
	struct request_queue *q;
	LIST_HEAD(head);
	int prev_nr_hw_queues;

	lockdep_assert_held(&set->tag_list_lock);

	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
		nr_hw_queues = nr_cpu_ids;
	if (nr_hw_queues < 1)
		return;
	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
		return;

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_freeze_queue(q);
	/*
	 * Switch IO scheduler to 'none', cleaning up the data associated
	 * with the previous scheduler. We will switch back once we are done
	 * updating the new sw to hw queue mappings.
	 */
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		if (!blk_mq_elv_switch_none(&head, q))
			goto switch_back;

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_debugfs_unregister_hctxs(q);
		blk_mq_sysfs_unregister_hctxs(q);
	}

	prev_nr_hw_queues = set->nr_hw_queues;
	if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
		goto reregister;

fallback:
	blk_mq_update_queue_map(set);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_realloc_hw_ctxs(set, q);
		blk_mq_update_poll_flag(q);
		if (q->nr_hw_queues != set->nr_hw_queues) {
			int i = prev_nr_hw_queues;

			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
					nr_hw_queues, prev_nr_hw_queues);
			for (; i < set->nr_hw_queues; i++)
				__blk_mq_free_map_and_rqs(set, i);

			set->nr_hw_queues = prev_nr_hw_queues;
			goto fallback;
		}
		blk_mq_map_swqueue(q);
	}

reregister:
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_sysfs_register_hctxs(q);
		blk_mq_debugfs_register_hctxs(q);
	}

switch_back:
	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_elv_switch_back(&head, q);

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_unfreeze_queue(q);
}

void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
	mutex_lock(&set->tag_list_lock);
	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
	mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
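/*
 * Illustrative sketch (hypothetical reset path): drivers that can gain or
 * lose hardware queues at runtime, for instance after a controller reset,
 * re-size the set and let the core do the rest:
 *
 *	nr = min_t(unsigned int, my_dev->nr_hw_vectors, num_online_cpus());
 *	blk_mq_update_nr_hw_queues(&my_dev->tag_set, nr);
 *
 * The core freezes every queue in the set, drops the elevators, remaps the
 * sw to hw queues and switches the elevators back, as implemented above.
 */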
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			 struct io_comp_batch *iob, unsigned int flags)
{
	long state = get_current_state();
	int ret;

	do {
		ret = q->mq_ops->poll(hctx, iob);
		if (ret > 0) {
			__set_current_state(TASK_RUNNING);
			return ret;
		}

		if (signal_pending_state(state, current))
			__set_current_state(TASK_RUNNING);
		if (task_is_running(current))
			return 1;

		if (ret < 0 || (flags & BLK_POLL_ONESHOT))
			break;
		cpu_relax();
	} while (!need_resched());

	__set_current_state(TASK_RUNNING);
	return 0;
}

int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
		struct io_comp_batch *iob, unsigned int flags)
{
	struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);

	return blk_hctx_poll(q, hctx, iob, flags);
}

int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
		unsigned int poll_flags)
{
	struct request_queue *q = rq->q;
	int ret;

	if (!blk_rq_is_poll(rq))
		return 0;
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return 0;

	ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
	blk_queue_exit(q);

	return ret;
}
EXPORT_SYMBOL_GPL(blk_rq_poll);
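/*
 * Illustrative sketch (hypothetical synchronous caller): blk_rq_poll() is
 * meant for spinning on a polled passthrough request, e.g.
 *
 *	while (!READ_ONCE(done))
 *		blk_rq_poll(rq, NULL, 0);
 *
 * where "done" is set from the request's end_io handler. The request must
 * have been submitted on a polled queue (blk_rq_is_poll() returns true);
 * otherwise the call is a no-op returning 0.
 */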
unsigned int blk_mq_rq_cpu(struct request *rq)
{
	return rq->mq_ctx->cpu;
}
EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned long i;

	cancel_delayed_work_sync(&q->requeue_work);

	queue_for_each_hw_ctx(q, hctx, i)
		cancel_delayed_work_sync(&hctx->run_work);
}
static int __init blk_mq_init(void)
{
	int i;

	for_each_possible_cpu(i)
		init_llist_head(&per_cpu(blk_cpu_done, i));
	for_each_possible_cpu(i)
		INIT_CSD(&per_cpu(blk_cpu_csd, i),
			 __blk_mq_complete_request_remote, NULL);
	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);

	cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
				  "block/softirq:dead", NULL,
				  blk_softirq_cpu_dead);
	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
				blk_mq_hctx_notify_dead);
	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
				blk_mq_hctx_notify_online,
				blk_mq_hctx_notify_offline);
	return 0;
}
subsys_initcall(blk_mq_init);