block/blk-mq.c

   1 /*
   2  * Block multiqueue core code
   3  *
   4  * Copyright (C) 2013-2014 Jens Axboe
   5  * Copyright (C) 2013-2014 Christoph Hellwig
   6  */
   7 #include <linux/kernel.h>
   8 #include <linux/module.h>
   9 #include <linux/backing-dev.h>
  10 #include <linux/bio.h>
  11 #include <linux/blkdev.h>
  12 #include <linux/kmemleak.h>
  13 #include <linux/mm.h>
  14 #include <linux/init.h>
  15 #include <linux/slab.h>
  16 #include <linux/workqueue.h>
  17 #include <linux/smp.h>
  18 #include <linux/llist.h>
  19 #include <linux/list_sort.h>
  20 #include <linux/cpu.h>
  21 #include <linux/cache.h>
  22 #include <linux/sched/sysctl.h>
  23 #include <linux/sched/topology.h>
  24 #include <linux/sched/signal.h>
  25 #include <linux/delay.h>
  26 #include <linux/crash_dump.h>
  27 #include <linux/prefetch.h>
  28
  29 #include <trace/events/block.h>
  30
  31 #include <linux/blk-mq.h>
  32 #include "blk.h"
  33 #include "blk-mq.h"
  34 #include "blk-mq-debugfs.h"
  35 #include "blk-mq-tag.h"
  36 #include "blk-pm.h"
  37 #include "blk-stat.h"
  38 #include "blk-mq-sched.h"
  39 #include "blk-rq-qos.h"
  40
/*
 * Forward declarations for the polling helpers.  They are referenced before
 * their definitions appear, e.g. blk_mq_poll_stats_start() is called from
 * __blk_mq_end_request().
 */
static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
  44
  45 static int blk_mq_poll_stats_bkt(const struct request *rq)
  46 {
  47         int ddir, bytes, bucket;
  48
  49         ddir = rq_data_dir(rq);
  50         bytes = blk_rq_bytes(rq);
  51
  52         bucket = ddir + 2*(ilog2(bytes) - 9);
  53
  54         if (bucket < 0)
  55                 return -1;
  56         else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
  57                 return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
  58
  59         return bucket;
  60 }
  61
  62 /*
  63  * Check if any of the ctx's have pending work in this hardware queue
  64  */
  65 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  66 {
  67         return !list_empty_careful(&hctx->dispatch) ||
  68                 sbitmap_any_bit_set(&hctx->ctx_map) ||
  69                         blk_mq_sched_has_work(hctx);
  70 }
  71
  72 /*
  73  * Mark this ctx as having pending work in this hardware queue
  74  */
  75 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  76                                      struct blk_mq_ctx *ctx)
  77 {
  78         const int bit = ctx->index_hw[hctx->type];
  79
  80         if (!sbitmap_test_bit(&hctx->ctx_map, bit))
  81                 sbitmap_set_bit(&hctx->ctx_map, bit);
  82 }
  83
  84 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  85                                       struct blk_mq_ctx *ctx)
  86 {
  87         const int bit = ctx->index_hw[hctx->type];
  88
  89         sbitmap_clear_bit(&hctx->ctx_map, bit);
  90 }
  91
/*
 * Private data handed to the in-flight tag iterators
 * (blk_mq_check_inflight() and blk_mq_check_inflight_rw()).
 */
struct mq_inflight {
        /* partition that each visited request's rq->part is compared to */
        struct hd_struct *part;
        /* caller-provided two-entry counter array the iterators increment */
        unsigned int *inflight;
};
  96
  97 static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
  98                                   struct request *rq, void *priv,
  99                                   bool reserved)
 100 {
 101         struct mq_inflight *mi = priv;
 102
 103         /*
 104          * index[0] counts the specific partition that was asked for. index[1]
 105          * counts the ones that are active on the whole device, so increment
 106          * that if mi->part is indeed a partition, and not a whole device.
 107          */
 108         if (rq->part == mi->part)
 109                 mi->inflight[0]++;
 110         if (mi->part->partno)
 111                 mi->inflight[1]++;
 112 }
 113
 114 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 115                       unsigned int inflight[2])
 116 {
 117         struct mq_inflight mi = { .part = part, .inflight = inflight, };
 118
 119         inflight[0] = inflight[1] = 0;
 120         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 121 }
 122
 123 static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
 124                                      struct request *rq, void *priv,
 125                                      bool reserved)
 126 {
 127         struct mq_inflight *mi = priv;
 128
 129         if (rq->part == mi->part)
 130                 mi->inflight[rq_data_dir(rq)]++;
 131 }
 132
 133 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 134                          unsigned int inflight[2])
 135 {
 136         struct mq_inflight mi = { .part = part, .inflight = inflight, };
 137
 138         inflight[0] = inflight[1] = 0;
 139         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
 140 }
 141
 142 void blk_freeze_queue_start(struct request_queue *q)
 143 {
 144         int freeze_depth;
 145
 146         freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 147         if (freeze_depth == 1) {
 148                 percpu_ref_kill(&q->q_usage_counter);
 149                 if (q->mq_ops)
 150                         blk_mq_run_hw_queues(q, false);
 151         }
 152 }
 153 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 154
/*
 * Sleep on q->mq_freeze_wq until percpu_ref_is_zero() reports that
 * q->q_usage_counter has reached zero.
 */
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 160
/*
 * Like blk_mq_freeze_queue_wait(), but bounded by @timeout, which is passed
 * straight to wait_event_timeout().  The return value is whatever
 * wait_event_timeout() returns.
 */
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout)
{
        return wait_event_timeout(q->mq_freeze_wq,
                                        percpu_ref_is_zero(&q->q_usage_counter),
                                        timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 169
/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 *
 * This starts the freeze with blk_freeze_queue_start() and then blocks in
 * blk_mq_freeze_queue_wait() until the usage counter has drained.
 */
void blk_freeze_queue(struct request_queue *q)
{
        /*
         * In the !blk_mq case we are only calling this to kill the
         * q_usage_counter, otherwise this increases the freeze depth
         * and waits for it to return to zero.  For this reason there is
         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
}
 186
/*
 * Exported blk_mq_* spelling of blk_freeze_queue(); the counterpart is
 * blk_mq_unfreeze_queue().
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
        /*
         * ...just an alias to keep freeze and unfreeze actions balanced
         * in the blk_mq_* namespace
         */
        blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 196
 197 void blk_mq_unfreeze_queue(struct request_queue *q)
 198 {
 199         int freeze_depth;
 200
 201         freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 202         WARN_ON_ONCE(freeze_depth < 0);
 203         if (!freeze_depth) {
 204                 percpu_ref_resurrect(&q->q_usage_counter);
 205                 wake_up_all(&q->mq_freeze_wq);
 206         }
 207 }
 208 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 209
/*
 * Set QUEUE_FLAG_QUIESCED on @q and return immediately.  Unlike
 * blk_mq_quiesce_queue(), this does not wait for an RCU/SRCU grace period.
 *
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
        blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 219
 220 /**
 221  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 222  * @q: request queue.
 223  *
 224  * Note: this function does not prevent that the struct request end_io()
 225  * callback function is invoked. Once this function is returned, we make
 226  * sure no dispatch can happen until the queue is unquiesced via
 227  * blk_mq_unquiesce_queue().
 228  */
 229 void blk_mq_quiesce_queue(struct request_queue *q)
 230 {
 231         struct blk_mq_hw_ctx *hctx;
 232         unsigned int i;
 233         bool rcu = false;
 234
 235         blk_mq_quiesce_queue_nowait(q);
 236
 237         queue_for_each_hw_ctx(q, hctx, i) {
 238                 if (hctx->flags & BLK_MQ_F_BLOCKING)
 239                         synchronize_srcu(hctx->srcu);
 240                 else
 241                         rcu = true;
 242         }
 243         if (rcu)
 244                 synchronize_rcu();
 245 }
 246 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 247
/**
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function restores the queue to the state it had before it was
 * quiesced by blk_mq_quiesce_queue(): QUEUE_FLAG_QUIESCED is cleared and
 * the hardware queues are run.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
        blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

        /* dispatch requests which are inserted during quiescing */
        blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 263
 264 void blk_mq_wake_waiters(struct request_queue *q)
 265 {
 266         struct blk_mq_hw_ctx *hctx;
 267         unsigned int i;
 268
 269         queue_for_each_hw_ctx(q, hctx, i)
 270                 if (blk_mq_hw_queue_mapped(hctx))
 271                         blk_mq_tag_wakeup_all(hctx->tags, true);
 272 }
 273
/*
 * Return the result of blk_mq_has_free_tags() for @hctx's driver tag set.
 */
bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
        return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);
 279
/*
 * Initialize the preallocated request that belongs to @tag.
 *
 * @data: allocation context; supplies the queue, software/hardware context
 *        and the BLK_MQ_REQ_* flags
 * @tag:  tag just obtained for this allocation
 * @op:   value stored in rq->cmd_flags
 *
 * The request is taken from tags->static_rqs[@tag] and returned with its
 * fields reset and its reference count set to 1.
 */
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                unsigned int tag, unsigned int op)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
        req_flags_t rq_flags = 0;

        if (data->flags & BLK_MQ_REQ_INTERNAL) {
                /* @tag is recorded as internal_tag; no driver tag yet */
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
                /* @tag is the driver tag; account it on shared tag sets */
                if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
                        rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
                rq->tag = tag;
                rq->internal_tag = -1;
                /* make the request reachable through the tag -> rq table */
                data->hctx->tags->rqs[rq->tag] = rq;
        }

        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
        rq->rq_flags = rq_flags;
        rq->cmd_flags = op;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        if (blk_queue_io_stat(data->q))
                rq->rq_flags |= RQF_IO_STAT;
        INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
        rq->start_time_ns = ktime_get_ns();
        rq->io_start_time_ns = 0;
        rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
#endif
        rq->special = NULL;
        /* tag was already set */
        rq->extra_len = 0;
        rq->__deadline = 0;

        INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;

        rq->end_io = NULL;
        rq->end_io_data = NULL;
        rq->next_rq = NULL;

        /* per-ctx dispatch counter, indexed by sync vs. async */
        data->ctx->rq_dispatched[op_is_sync(op)]++;
        refcount_set(&rq->ref, 1);
        return rq;
}
 336
/*
 * Allocate a tag and set up the matching request.
 *
 * @q:    request queue to allocate from
 * @bio:  bio handed to the scheduler's prepare_request hook and to rq_ioc();
 *        the callers visible in this file pass NULL
 * @data: allocation context.  data->ctx and data->hctx are filled in here
 *        when the caller left them NULL.
 *
 * Returns the new request, or NULL when blk_mq_get_tag() fails.  On failure
 * the queue reference taken here is dropped again, and a ctx obtained here
 * is put and data->ctx is reset to NULL.  On success the ctx stays held and
 * the queue reference is kept.
 * NOTE(review): the kept reference is presumably the one released by
 * blk_queue_exit() in __blk_mq_free_request() - confirm.
 */
static struct request *blk_mq_get_request(struct request_queue *q,
                                          struct bio *bio,
                                          struct blk_mq_alloc_data *data)
{
        struct elevator_queue *e = q->elevator;
        struct request *rq;
        unsigned int tag;
        bool put_ctx_on_error = false;

        blk_queue_enter_live(q);
        data->q = q;
        if (likely(!data->ctx)) {
                data->ctx = blk_mq_get_ctx(q);
                /* we own this ctx, so we must put it if allocation fails */
                put_ctx_on_error = true;
        }
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->cmd_flags,
                                                data->ctx->cpu);
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;

        if (e) {
                /* with an elevator the tag obtained below is a scheduler tag */
                data->flags |= BLK_MQ_REQ_INTERNAL;

                /*
                 * Flush requests are special and go directly to the
                 * dispatch list. Don't include reserved tags in the
                 * limiting, as it isn't useful.
                 */
                if (!op_is_flush(data->cmd_flags) &&
                    e->type->ops.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
                        e->type->ops.limit_depth(data->cmd_flags, data);
        } else {
                blk_mq_tag_busy(data->hctx);
        }

        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_TAG_FAIL) {
                if (put_ctx_on_error) {
                        blk_mq_put_ctx(data->ctx);
                        data->ctx = NULL;
                }
                blk_queue_exit(q);
                return NULL;
        }

        rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
        if (!op_is_flush(data->cmd_flags)) {
                rq->elv.icq = NULL;
                if (e && e->type->ops.prepare_request) {
                        if (e->type->icq_cache && rq_ioc(bio))
                                blk_mq_sched_assign_ioc(rq, bio);

                        e->type->ops.prepare_request(rq, bio);
                        /* blk_mq_free_request() keys scheduler teardown on this */
                        rq->rq_flags |= RQF_ELVPRIV;
                }
        }
        data->hctx->queued++;
        return rq;
}
 398
 399 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 400                 blk_mq_req_flags_t flags)
 401 {
 402         struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 403         struct request *rq;
 404         int ret;
 405
 406         ret = blk_queue_enter(q, flags);
 407         if (ret)
 408                 return ERR_PTR(ret);
 409
 410         rq = blk_mq_get_request(q, NULL, &alloc_data);
 411         blk_queue_exit(q);
 412
 413         if (!rq)
 414                 return ERR_PTR(-EWOULDBLOCK);
 415
 416         blk_mq_put_ctx(alloc_data.ctx);
 417
 418         rq->__data_len = 0;
 419         rq->__sector = (sector_t) -1;
 420         rq->bio = rq->biotail = NULL;
 421         return rq;
 422 }
 423 EXPORT_SYMBOL(blk_mq_alloc_request);
 424
 425 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 426         unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 427 {
 428         struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 429         struct request *rq;
 430         unsigned int cpu;
 431         int ret;
 432
 433         /*
 434          * If the tag allocator sleeps we could get an allocation for a
 435          * different hardware context.  No need to complicate the low level
 436          * allocator for this for the rare use case of a command tied to
 437          * a specific queue.
 438          */
 439         if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
 440                 return ERR_PTR(-EINVAL);
 441
 442         if (hctx_idx >= q->nr_hw_queues)
 443                 return ERR_PTR(-EIO);
 444
 445         ret = blk_queue_enter(q, flags);
 446         if (ret)
 447                 return ERR_PTR(ret);
 448
 449         /*
 450          * Check if the hardware context is actually mapped to anything.
 451          * If not tell the caller that it should skip this queue.
 452          */
 453         alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
 454         if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
 455                 blk_queue_exit(q);
 456                 return ERR_PTR(-EXDEV);
 457         }
 458         cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 459         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 460
 461         rq = blk_mq_get_request(q, NULL, &alloc_data);
 462         blk_queue_exit(q);
 463
 464         if (!rq)
 465                 return ERR_PTR(-EWOULDBLOCK);
 466
 467         return rq;
 468 }
 469 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 470
/*
 * Final release of a request: give back its driver tag and scheduler tag
 * (whichever are set, i.e. not -1), restart the scheduler on the hardware
 * queue and drop a queue usage reference.
 *
 * Called once rq->ref has dropped to zero, see blk_mq_free_request() and
 * blk_mq_check_expired().
 */
static void __blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
        /*
         * Read before any tag is released.
         * NOTE(review): presumably because @rq may be reused as soon as its
         * tags are put - confirm.
         */
        const int sched_tag = rq->internal_tag;

        blk_pm_mark_last_busy(rq);
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
                blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
}
 486
/*
 * Release the caller's reference on @rq.
 *
 * Scheduler private data is torn down, completion accounting is updated,
 * the request state is set back to MQ_RQ_IDLE and one reference is dropped.
 * The tags are only returned (via __blk_mq_free_request()) when that was
 * the last reference.
 */
void blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);

        /* RQF_ELVPRIV is set in blk_mq_get_request() after prepare_request */
        if (rq->rq_flags & RQF_ELVPRIV) {
                if (e && e->type->ops.finish_request)
                        e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
                        rq->elv.icq = NULL;
                }
        }

        ctx->rq_completed[rq_is_sync(rq)]++;
        /* undo the nr_active increment done in blk_mq_rq_ctx_init() */
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                atomic_dec(&hctx->nr_active);

        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->backing_dev_info);

        rq_qos_done(q, rq);

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        /* blk_mq_check_expired() may hold a temporary reference */
        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
 517
 518 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 519 {
 520         u64 now = ktime_get_ns();
 521
 522         if (rq->rq_flags & RQF_STATS) {
 523                 blk_mq_poll_stats_start(rq->q);
 524                 blk_stat_add(rq, now);
 525         }
 526
 527         if (rq->internal_tag != -1)
 528                 blk_mq_sched_completed_request(rq, now);
 529
 530         blk_account_io_done(rq, now);
 531
 532         if (rq->end_io) {
 533                 rq_qos_done(rq->q, rq);
 534                 rq->end_io(rq, error);
 535         } else {
 536                 if (unlikely(blk_bidi_rq(rq)))
 537                         blk_mq_free_request(rq->next_rq);
 538                 blk_mq_free_request(rq);
 539         }
 540 }
 541 EXPORT_SYMBOL(__blk_mq_end_request);
 542
 543 void blk_mq_end_request(struct request *rq, blk_status_t error)
 544 {
 545         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 546                 BUG();
 547         __blk_mq_end_request(rq, error);
 548 }
 549 EXPORT_SYMBOL(blk_mq_end_request);
 550
 551 static void __blk_mq_complete_request_remote(void *data)
 552 {
 553         struct request *rq = data;
 554         struct request_queue *q = rq->q;
 555
 556         q->mq_ops->complete(rq);
 557 }
 558
 559 static void __blk_mq_complete_request(struct request *rq)
 560 {
 561         struct blk_mq_ctx *ctx = rq->mq_ctx;
 562         struct request_queue *q = rq->q;
 563         bool shared = false;
 564         int cpu;
 565
 566         if (!blk_mq_mark_complete(rq))
 567                 return;
 568
 569         /*
 570          * Most of single queue controllers, there is only one irq vector
 571          * for handling IO completion, and the only irq's affinity is set
 572          * as all possible CPUs. On most of ARCHs, this affinity means the
 573          * irq is handled on one specific CPU.
 574          *
 575          * So complete IO reqeust in softirq context in case of single queue
 576          * for not degrading IO performance by irqsoff latency.
 577          */
 578         if (q->nr_hw_queues == 1) {
 579                 __blk_complete_request(rq);
 580                 return;
 581         }
 582
 583         if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
 584                 q->mq_ops->complete(rq);
 585                 return;
 586         }
 587
 588         cpu = get_cpu();
 589         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
 590                 shared = cpus_share_cache(cpu, ctx->cpu);
 591
 592         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 593                 rq->csd.func = __blk_mq_complete_request_remote;
 594                 rq->csd.info = rq;
 595                 rq->csd.flags = 0;
 596                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 597         } else {
 598                 q->mq_ops->complete(rq);
 599         }
 600         put_cpu();
 601 }
 602
 603 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
 604         __releases(hctx->srcu)
 605 {
 606         if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 607                 rcu_read_unlock();
 608         else
 609                 srcu_read_unlock(hctx->srcu, srcu_idx);
 610 }
 611
 612 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 613         __acquires(hctx->srcu)
 614 {
 615         if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 616                 /* shut up gcc false positive */
 617                 *srcu_idx = 0;
 618                 rcu_read_lock();
 619         } else
 620                 *srcu_idx = srcu_read_lock(hctx->srcu);
 621 }
 622
 623 /**
 624  * blk_mq_complete_request - end I/O on a request
 625  * @rq:         the request being processed
 626  *
 627  * Description:
 628  *      Ends all I/O on a request. It does not handle partial completions.
 629  *      The actual completion happens out-of-order, through a IPI handler.
 630  **/
 631 void blk_mq_complete_request(struct request *rq)
 632 {
 633         if (unlikely(blk_should_fake_timeout(rq->q)))
 634                 return;
 635         __blk_mq_complete_request(rq);
 636 }
 637 EXPORT_SYMBOL(blk_mq_complete_request);
 638
 639 int blk_mq_request_started(struct request *rq)
 640 {
 641         return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
 642 }
 643 EXPORT_SYMBOL_GPL(blk_mq_request_started);
 644
/*
 * Mark @rq as started: notify the scheduler and tracing, record statistics
 * data when QUEUE_FLAG_STATS is set, arm the timeout timer and move the
 * request to MQ_RQ_IN_FLIGHT.  The request is expected to be in
 * MQ_RQ_IDLE state on entry; anything else triggers a one-time warning.
 */
void blk_mq_start_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        blk_mq_sched_started_request(rq);

        trace_block_rq_issue(q, rq);

        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                rq->io_start_time_ns = ktime_get_ns();
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
                rq->throtl_size = blk_rq_sectors(rq);
#endif
                /* __blk_mq_end_request() adds stats only when this is set */
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
        }

        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

        /* the timer is armed before the state becomes IN_FLIGHT */
        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
                 * Make sure space for the drain appears.  We know we can do
                 * this because max_hw_segments has been adjusted to be one
                 * fewer than the device can handle.
                 */
                rq->nr_phys_segments++;
        }
}
EXPORT_SYMBOL(blk_mq_start_request);
 677
 678 static void __blk_mq_requeue_request(struct request *rq)
 679 {
 680         struct request_queue *q = rq->q;
 681
 682         blk_mq_put_driver_tag(rq);
 683
 684         trace_block_rq_requeue(q, rq);
 685         rq_qos_requeue(q, rq);
 686
 687         if (blk_mq_request_started(rq)) {
 688                 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 689                 rq->rq_flags &= ~RQF_TIMED_OUT;
 690                 if (q->dma_drain_size && blk_rq_bytes(rq))
 691                         rq->nr_phys_segments--;
 692         }
 693 }
 694
/*
 * Requeue @rq: reset it with __blk_mq_requeue_request(), notify the I/O
 * scheduler and add it to the head of the queue's requeue list.
 *
 * @kick_requeue_list: passed on to blk_mq_add_to_requeue_list(); when true
 *                     the requeue work is kicked right away.
 *
 * @rq must not be linked on any list (BUG otherwise).
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
        __blk_mq_requeue_request(rq);

        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);

        BUG_ON(!list_empty(&rq->queuelist));
        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
 706
 707 static void blk_mq_requeue_work(struct work_struct *work)
 708 {
 709         struct request_queue *q =
 710                 container_of(work, struct request_queue, requeue_work.work);
 711         LIST_HEAD(rq_list);
 712         struct request *rq, *next;
 713
 714         spin_lock_irq(&q->requeue_lock);
 715         list_splice_init(&q->requeue_list, &rq_list);
 716         spin_unlock_irq(&q->requeue_lock);
 717
 718         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 719                 if (!(rq->rq_flags & RQF_SOFTBARRIER))
 720                         continue;
 721
 722                 rq->rq_flags &= ~RQF_SOFTBARRIER;
 723                 list_del_init(&rq->queuelist);
 724                 blk_mq_sched_insert_request(rq, true, false, false);
 725         }
 726
 727         while (!list_empty(&rq_list)) {
 728                 rq = list_entry(rq_list.next, struct request, queuelist);
 729                 list_del_init(&rq->queuelist);
 730                 blk_mq_sched_insert_request(rq, false, false, false);
 731         }
 732
 733         blk_mq_run_hw_queues(q, false);
 734 }
 735
 736 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 737                                 bool kick_requeue_list)
 738 {
 739         struct request_queue *q = rq->q;
 740         unsigned long flags;
 741
 742         /*
 743          * We abuse this flag that is otherwise used by the I/O scheduler to
 744          * request head insertion from the workqueue.
 745          */
 746         BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 747
 748         spin_lock_irqsave(&q->requeue_lock, flags);
 749         if (at_head) {
 750                 rq->rq_flags |= RQF_SOFTBARRIER;
 751                 list_add(&rq->queuelist, &q->requeue_list);
 752         } else {
 753                 list_add_tail(&rq->queuelist, &q->requeue_list);
 754         }
 755         spin_unlock_irqrestore(&q->requeue_lock, flags);
 756
 757         if (kick_requeue_list)
 758                 blk_mq_kick_requeue_list(q);
 759 }
 760 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 761
/*
 * Schedule q->requeue_work on kblockd with no delay and no CPU binding.
 */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 767
 768 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 769                                     unsigned long msecs)
 770 {
 771         kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
 772                                     msecs_to_jiffies(msecs));
 773 }
 774 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 775
 776 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 777 {
 778         if (tag < tags->nr_tags) {
 779                 prefetch(tags->rqs[tag]);
 780                 return tags->rqs[tag];
 781         }
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 786
 787 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 788 {
 789         req->rq_flags |= RQF_TIMED_OUT;
 790         if (req->q->mq_ops->timeout) {
 791                 enum blk_eh_timer_return ret;
 792
 793                 ret = req->q->mq_ops->timeout(req, reserved);
 794                 if (ret == BLK_EH_DONE)
 795                         return;
 796                 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
 797         }
 798
 799         blk_add_timer(req);
 800 }
 801
 802 static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 803 {
 804         unsigned long deadline;
 805
 806         if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
 807                 return false;
 808         if (rq->rq_flags & RQF_TIMED_OUT)
 809                 return false;
 810
 811         deadline = blk_rq_deadline(rq);
 812         if (time_after_eq(jiffies, deadline))
 813                 return true;
 814
 815         if (*next == 0)
 816                 *next = deadline;
 817         else if (time_after(*next, deadline))
 818                 *next = deadline;
 819         return false;
 820 }
 821
/*
 * Busy-tag iterator callback used by blk_mq_timeout_work().
 *
 * @priv points to the unsigned long that collects the earliest pending
 * deadline (see blk_mq_req_expired()).  @hctx is unused; @reserved is
 * forwarded to blk_mq_rq_timed_out().
 */
static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                struct request *rq, void *priv, bool reserved)
{
        unsigned long *next = priv;

        /*
         * Just do a quick check if it is expired before locking the request in
         * so we're not unnecessarily synchronizing across CPUs.
         */
        if (!blk_mq_req_expired(rq, next))
                return;

        /*
         * We have reason to believe the request may be expired. Take a
         * reference on the request to lock this request lifetime into its
         * currently allocated context to prevent it from being reallocated in
         * the event the completion by-passes this timeout handler.
         *
         * If the reference was already released, then the driver beat the
         * timeout handler to posting a natural completion.
         */
        if (!refcount_inc_not_zero(&rq->ref))
                return;

        /*
         * The request is now locked and cannot be reallocated underneath the
         * timeout handler's processing. Re-verify this exact request is truly
         * expired; if it is not expired, then the request was completed and
         * reallocated as a new request.
         */
        if (blk_mq_req_expired(rq, next))
                blk_mq_rq_timed_out(rq, reserved);
        /* if ours was the last reference, finish freeing the request here */
        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
}
 857
 858 static void blk_mq_timeout_work(struct work_struct *work)
 859 {
 860         struct request_queue *q =
 861                 container_of(work, struct request_queue, timeout_work);
 862         unsigned long next = 0;
 863         struct blk_mq_hw_ctx *hctx;
 864         int i;
 865
 866         /* A deadlock might occur if a request is stuck requiring a
 867          * timeout at the same time a queue freeze is waiting
 868          * completion, since the timeout code would not be able to
 869          * acquire the queue reference here.
 870          *
 871          * That's why we don't use blk_queue_enter here; instead, we use
 872          * percpu_ref_tryget directly, because we need to be able to
 873          * obtain a reference even in the short window between the queue
 874          * starting to freeze, by dropping the first reference in
 875          * blk_freeze_queue_start, and the moment the last request is
 876          * consumed, marked by the instant q_usage_counter reaches
 877          * zero.
 878          */
 879         if (!percpu_ref_tryget(&q->q_usage_counter))
 880                 return;
 881
 882         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 883
 884         if (next != 0) {
 885                 mod_timer(&q->timeout, next);
 886         } else {
 887                 /*
 888                  * Request timeouts are handled as a forward rolling timer. If
 889                  * we end up here it means that no requests are pending and
 890                  * also that no request has been pending for a while. Mark
 891                  * each hctx as idle.
 892                  */
 893                 queue_for_each_hw_ctx(q, hctx, i) {
 894                         /* the hctx may be unmapped, so check it here */
 895                         if (blk_mq_hw_queue_mapped(hctx))
 896                                 blk_mq_tag_idle(hctx);
 897                 }
 898         }
 899         blk_queue_exit(q);
 900 }
 901
/*
 * Argument bundle handed to flush_busy_ctx() through the sbitmap iterator's
 * opaque data pointer.
 */
struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;	/* hardware context whose ctxs[] is indexed */
	struct list_head *list;		/* destination list the requests are spliced onto */
};
 906
 907 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 908 {
 909         struct flush_busy_ctx_data *flush_data = data;
 910         struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 911         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 912
 913         spin_lock(&ctx->lock);
 914         list_splice_tail_init(&ctx->rq_list, flush_data->list);
 915         sbitmap_clear_bit(sb, bitnr);
 916         spin_unlock(&ctx->lock);
 917         return true;
 918 }
 919
 920 /*
 921  * Process software queues that have been marked busy, splicing them
 922  * to the for-dispatch
 923  */
 924 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 925 {
 926         struct flush_busy_ctx_data data = {
 927                 .hctx = hctx,
 928                 .list = list,
 929         };
 930
 931         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 932 }
 933 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 934
/*
 * Argument bundle handed to dispatch_rq_from_ctx() through the sbitmap
 * iterator's opaque data pointer.
 */
struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;	/* hardware context whose ctxs[] is indexed */
	struct request *rq;		/* out: request that was dequeued, if any */
};
 939
 940 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 941                 void *data)
 942 {
 943         struct dispatch_rq_data *dispatch_data = data;
 944         struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
 945         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 946
 947         spin_lock(&ctx->lock);
 948         if (!list_empty(&ctx->rq_list)) {
 949                 dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
 950                 list_del_init(&dispatch_data->rq->queuelist);
 951                 if (list_empty(&ctx->rq_list))
 952                         sbitmap_clear_bit(sb, bitnr);
 953         }
 954         spin_unlock(&ctx->lock);
 955
 956         return !dispatch_data->rq;
 957 }
 958
 959 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 960                                         struct blk_mq_ctx *start)
 961 {
 962         unsigned off = start ? start->index_hw[hctx->type] : 0;
 963         struct dispatch_rq_data data = {
 964                 .hctx = hctx,
 965                 .rq   = NULL,
 966         };
 967
 968         __sbitmap_for_each_set(&hctx->ctx_map, off,
 969                                dispatch_rq_from_ctx, &data);
 970
 971         return data.rq;
 972 }
 973
 974 static inline unsigned int queued_to_index(unsigned int queued)
 975 {
 976         if (!queued)
 977                 return 0;
 978
 979         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 980 }
 981
 982 bool blk_mq_get_driver_tag(struct request *rq)
 983 {
 984         struct blk_mq_alloc_data data = {
 985                 .q = rq->q,
 986                 .hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu),
 987                 .flags = BLK_MQ_REQ_NOWAIT,
 988                 .cmd_flags = rq->cmd_flags,
 989         };
 990         bool shared;
 991
 992         if (rq->tag != -1)
 993                 goto done;
 994
 995         if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
 996                 data.flags |= BLK_MQ_REQ_RESERVED;
 997
 998         shared = blk_mq_tag_busy(data.hctx);
 999         rq->tag = blk_mq_get_tag(&data);
1000         if (rq->tag >= 0) {
1001                 if (shared) {
1002                         rq->rq_flags |= RQF_MQ_INFLIGHT;
1003                         atomic_inc(&data.hctx->nr_active);
1004                 }
1005                 data.hctx->tags->rqs[rq->tag] = rq;
1006         }
1007
1008 done:
1009         return rq->tag != -1;
1010 }
1011
1012 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1013                                 int flags, void *key)
1014 {
1015         struct blk_mq_hw_ctx *hctx;
1016
1017         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1018
1019         spin_lock(&hctx->dispatch_wait_lock);
1020         list_del_init(&wait->entry);
1021         spin_unlock(&hctx->dispatch_wait_lock);
1022
1023         blk_mq_run_hw_queue(hctx, true);
1024         return 1;
1025 }
1026
1027 /*
1028  * Mark us waiting for a tag. For shared tags, this involves hooking us into
1029  * the tag wakeups. For non-shared tags, we can simply mark us needing a
1030  * restart. For both cases, take care to check the condition again after
1031  * marking us as waiting.
1032  */
1033 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1034                                  struct request *rq)
1035 {
1036         struct wait_queue_head *wq;
1037         wait_queue_entry_t *wait;
1038         bool ret;
1039
1040         if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1041                 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1042                         set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1043
1044                 /*
1045                  * It's possible that a tag was freed in the window between the
1046                  * allocation failure and adding the hardware queue to the wait
1047                  * queue.
1048                  *
1049                  * Don't clear RESTART here, someone else could have set it.
1050                  * At most this will cost an extra queue run.
1051                  */
1052                 return blk_mq_get_driver_tag(rq);
1053         }
1054
1055         wait = &hctx->dispatch_wait;
1056         if (!list_empty_careful(&wait->entry))
1057                 return false;
1058
1059         wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1060
1061         spin_lock_irq(&wq->lock);
1062         spin_lock(&hctx->dispatch_wait_lock);
1063         if (!list_empty(&wait->entry)) {
1064                 spin_unlock(&hctx->dispatch_wait_lock);
1065                 spin_unlock_irq(&wq->lock);
1066                 return false;
1067         }
1068
1069         wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1070         __add_wait_queue(wq, wait);
1071
1072         /*
1073          * It's possible that a tag was freed in the window between the
1074          * allocation failure and adding the hardware queue to the wait
1075          * queue.
1076          */
1077         ret = blk_mq_get_driver_tag(rq);
1078         if (!ret) {
1079                 spin_unlock(&hctx->dispatch_wait_lock);
1080                 spin_unlock_irq(&wq->lock);
1081                 return false;
1082         }
1083
1084         /*
1085          * We got a tag, remove ourselves from the wait queue to ensure
1086          * someone else gets the wakeup.
1087          */
1088         list_del_init(&wait->entry);
1089         spin_unlock(&hctx->dispatch_wait_lock);
1090         spin_unlock_irq(&wq->lock);
1091
1092         return true;
1093 }
1094
1095 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
1096 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
1097 /*
1098  * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1099  * - EWMA is one simple way to compute running average value
1100  * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1101  * - take 4 as factor for avoiding to get too small(0) result, and this
1102  *   factor doesn't matter because EWMA decreases exponentially
1103  */
1104 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1105 {
1106         unsigned int ewma;
1107
1108         if (hctx->queue->elevator)
1109                 return;
1110
1111         ewma = hctx->dispatch_busy;
1112
1113         if (!ewma && !busy)
1114                 return;
1115
1116         ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1117         if (busy)
1118                 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1119         ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1120
1121         hctx->dispatch_busy = ewma;
1122 }
1123
1124 #define BLK_MQ_RESOURCE_DELAY   3               /* ms units */
1125
1126 /*
1127  * Returns true if we did some work AND can potentially do more.
1128  */
1129 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1130                              bool got_budget)
1131 {
1132         struct blk_mq_hw_ctx *hctx;
1133         struct request *rq, *nxt;
1134         bool no_tag = false;
1135         int errors, queued;
1136         blk_status_t ret = BLK_STS_OK;
1137
1138         if (list_empty(list))
1139                 return false;
1140
1141         WARN_ON(!list_is_singular(list) && got_budget);
1142
1143         /*
1144          * Now process all the entries, sending them to the driver.
1145          */
1146         errors = queued = 0;
1147         do {
1148                 struct blk_mq_queue_data bd;
1149
1150                 rq = list_first_entry(list, struct request, queuelist);
1151
1152                 hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
1153                 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1154                         break;
1155
1156                 if (!blk_mq_get_driver_tag(rq)) {
1157                         /*
1158                          * The initial allocation attempt failed, so we need to
1159                          * rerun the hardware queue when a tag is freed. The
1160                          * waitqueue takes care of that. If the queue is run
1161                          * before we add this entry back on the dispatch list,
1162                          * we'll re-run it below.
1163                          */
1164                         if (!blk_mq_mark_tag_wait(hctx, rq)) {
1165                                 blk_mq_put_dispatch_budget(hctx);
1166                                 /*
1167                                  * For non-shared tags, the RESTART check
1168                                  * will suffice.
1169                                  */
1170                                 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1171                                         no_tag = true;
1172                                 break;
1173                         }
1174                 }
1175
1176                 list_del_init(&rq->queuelist);
1177
1178                 bd.rq = rq;
1179
1180                 /*
1181                  * Flag last if we have no more requests, or if we have more
1182                  * but can't assign a driver tag to it.
1183                  */
1184                 if (list_empty(list))
1185                         bd.last = true;
1186                 else {
1187                         nxt = list_first_entry(list, struct request, queuelist);
1188                         bd.last = !blk_mq_get_driver_tag(nxt);
1189                 }
1190
1191                 ret = q->mq_ops->queue_rq(hctx, &bd);
1192                 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1193                         /*
1194                          * If an I/O scheduler has been configured and we got a
1195                          * driver tag for the next request already, free it
1196                          * again.
1197                          */
1198                         if (!list_empty(list)) {
1199                                 nxt = list_first_entry(list, struct request, queuelist);
1200                                 blk_mq_put_driver_tag(nxt);
1201                         }
1202                         list_add(&rq->queuelist, list);
1203                         __blk_mq_requeue_request(rq);
1204                         break;
1205                 }
1206
1207                 if (unlikely(ret != BLK_STS_OK)) {
1208                         errors++;
1209                         blk_mq_end_request(rq, BLK_STS_IOERR);
1210                         continue;
1211                 }
1212
1213                 queued++;
1214         } while (!list_empty(list));
1215
1216         hctx->dispatched[queued_to_index(queued)]++;
1217
1218         /*
1219          * Any items that need requeuing? Stuff them into hctx->dispatch,
1220          * that is where we will continue on next queue run.
1221          */
1222         if (!list_empty(list)) {
1223                 bool needs_restart;
1224
1225                 spin_lock(&hctx->lock);
1226                 list_splice_init(list, &hctx->dispatch);
1227                 spin_unlock(&hctx->lock);
1228
1229                 /*
1230                  * If SCHED_RESTART was set by the caller of this function and
1231                  * it is no longer set that means that it was cleared by another
1232                  * thread and hence that a queue rerun is needed.
1233                  *
1234                  * If 'no_tag' is set, that means that we failed getting
1235                  * a driver tag with an I/O scheduler attached. If our dispatch
1236                  * waitqueue is no longer active, ensure that we run the queue
1237                  * AFTER adding our entries back to the list.
1238                  *
1239                  * If no I/O scheduler has been configured it is possible that
1240                  * the hardware queue got stopped and restarted before requests
1241                  * were pushed back onto the dispatch list. Rerun the queue to
1242                  * avoid starvation. Notes:
1243                  * - blk_mq_run_hw_queue() checks whether or not a queue has
1244                  *   been stopped before rerunning a queue.
1245                  * - Some but not all block drivers stop a queue before
1246                  *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1247                  *   and dm-rq.
1248                  *
1249                  * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
1250                  * bit is set, run queue after a delay to avoid IO stalls
1251                  * that could otherwise occur if the queue is idle.
1252                  */
1253                 needs_restart = blk_mq_sched_needs_restart(hctx);
1254                 if (!needs_restart ||
1255                     (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1256                         blk_mq_run_hw_queue(hctx, true);
1257                 else if (needs_restart && (ret == BLK_STS_RESOURCE))
1258                         blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
1259
1260                 blk_mq_update_dispatch_busy(hctx, true);
1261                 return false;
1262         } else
1263                 blk_mq_update_dispatch_busy(hctx, false);
1264
1265         /*
1266          * If the host/device is unable to accept more work, inform the
1267          * caller of that.
1268          */
1269         if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1270                 return false;
1271
1272         return (queued + errors) != 0;
1273 }
1274
1275 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1276 {
1277         int srcu_idx;
1278
1279         /*
1280          * We should be running this queue from one of the CPUs that
1281          * are mapped to it.
1282          *
1283          * There are at least two related races now between setting
1284          * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
1285          * __blk_mq_run_hw_queue():
1286          *
1287          * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
1288          *   but later it becomes online, then this warning is harmless
1289          *   at all
1290          *
1291          * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
1292          *   but later it becomes offline, then the warning can't be
1293          *   triggered, and we depend on blk-mq timeout handler to
1294          *   handle dispatched requests to this hctx
1295          */
1296         if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1297                 cpu_online(hctx->next_cpu)) {
1298                 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
1299                         raw_smp_processor_id(),
1300                         cpumask_empty(hctx->cpumask) ? "inactive": "active");
1301                 dump_stack();
1302         }
1303
1304         /*
1305          * We can't run the queue inline with ints disabled. Ensure that
1306          * we catch bad users of this early.
1307          */
1308         WARN_ON_ONCE(in_interrupt());
1309
1310         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1311
1312         hctx_lock(hctx, &srcu_idx);
1313         blk_mq_sched_dispatch_requests(hctx);
1314         hctx_unlock(hctx, srcu_idx);
1315 }
1316
1317 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
1318 {
1319         int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
1320
1321         if (cpu >= nr_cpu_ids)
1322                 cpu = cpumask_first(hctx->cpumask);
1323         return cpu;
1324 }
1325
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU chosen for this hctx, or WORK_CPU_UNBOUND when the
 * queue has a single hardware queue or when no online CPU could be found
 * after one retry. Updates hctx->next_cpu and hctx->next_cpu_batch.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	bool tried = false;	/* set once the selection has been retried */
	int next_cpu = hctx->next_cpu;

	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	/* batch exhausted: advance to the next online CPU in the mask */
	if (--hctx->next_cpu_batch <= 0) {
select_cpu:
		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				cpu_online_mask);
		/* ran off the end of the mask: wrap to the first mapped CPU */
		if (next_cpu >= nr_cpu_ids)
			next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}

	/*
	 * Do unbound schedule if we can't find a online CPU for this hctx,
	 * and it should only happen in the path of handling CPU DEAD.
	 */
	if (!cpu_online(next_cpu)) {
		/* retry the selection exactly once before giving up */
		if (!tried) {
			tried = true;
			goto select_cpu;
		}

		/*
		 * Make sure to re-select CPU next time once after CPUs
		 * in hctx->cpumask become online again.
		 */
		hctx->next_cpu = next_cpu;
		hctx->next_cpu_batch = 1;
		return WORK_CPU_UNBOUND;
	}

	hctx->next_cpu = next_cpu;
	return next_cpu;
}
1371
1372 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1373                                         unsigned long msecs)
1374 {
1375         if (unlikely(blk_mq_hctx_stopped(hctx)))
1376                 return;
1377
1378         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1379                 int cpu = get_cpu();
1380                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1381                         __blk_mq_run_hw_queue(hctx);
1382                         put_cpu();
1383                         return;
1384                 }
1385
1386                 put_cpu();
1387         }
1388
1389         kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1390                                     msecs_to_jiffies(msecs));
1391 }
1392
/*
 * Request an asynchronous run of @hctx after @msecs milliseconds. This is
 * __blk_mq_delay_run_hw_queue() with async forced to true.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1398
1399 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1400 {
1401         int srcu_idx;
1402         bool need_run;
1403
1404         /*
1405          * When queue is quiesced, we may be switching io scheduler, or
1406          * updating nr_hw_queues, or other things, and we can't run queue
1407          * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1408          *
1409          * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1410          * quiesced.
1411          */
1412         hctx_lock(hctx, &srcu_idx);
1413         need_run = !blk_queue_quiesced(hctx->queue) &&
1414                 blk_mq_hctx_has_pending(hctx);
1415         hctx_unlock(hctx, srcu_idx);
1416
1417         if (need_run) {
1418                 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1419                 return true;
1420         }
1421
1422         return false;
1423 }
1424 EXPORT_SYMBOL(blk_mq_run_hw_queue);
1425
1426 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1427 {
1428         struct blk_mq_hw_ctx *hctx;
1429         int i;
1430
1431         queue_for_each_hw_ctx(q, hctx, i) {
1432                 if (blk_mq_hctx_stopped(hctx))
1433                         continue;
1434
1435                 blk_mq_run_hw_queue(hctx, async);
1436         }
1437 }
1438 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1439
1440 /**
1441  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1442  * @q: request queue.
1443  *
1444  * The caller is responsible for serializing this function against
1445  * blk_mq_{start,stop}_hw_queue().
1446  */
1447 bool blk_mq_queue_stopped(struct request_queue *q)
1448 {
1449         struct blk_mq_hw_ctx *hctx;
1450         int i;
1451
1452         queue_for_each_hw_ctx(q, hctx, i)
1453                 if (blk_mq_hctx_stopped(hctx))
1454                         return true;
1455
1456         return false;
1457 }
1458 EXPORT_SYMBOL(blk_mq_queue_stopped);
1459
/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	/* drop any pending delayed run; does not wait for a running one */
	cancel_delayed_work(&hctx->run_work);

	/* mark the hctx stopped so later run attempts bail out */
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1476
1477 /*
1478  * This function is often used for pausing .queue_rq() by driver when
1479  * there isn't enough resource or some conditions aren't satisfied, and
1480  * BLK_STS_RESOURCE is usually returned.
1481  *
1482  * We do not guarantee that dispatch can be drained or blocked
1483  * after blk_mq_stop_hw_queues() returns. Please use
1484  * blk_mq_quiesce_queue() for that requirement.
1485  */
1486 void blk_mq_stop_hw_queues(struct request_queue *q)
1487 {
1488         struct blk_mq_hw_ctx *hctx;
1489         int i;
1490
1491         queue_for_each_hw_ctx(q, hctx, i)
1492                 blk_mq_stop_hw_queue(hctx);
1493 }
1494 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1495
/*
 * Clear the stopped state of @hctx unconditionally and then run it
 * synchronously (async == false) via blk_mq_run_hw_queue().
 */
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
1503
1504 void blk_mq_start_hw_queues(struct request_queue *q)
1505 {
1506         struct blk_mq_hw_ctx *hctx;
1507         int i;
1508
1509         queue_for_each_hw_ctx(q, hctx, i)
1510                 blk_mq_start_hw_queue(hctx);
1511 }
1512 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1513
1514 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1515 {
1516         if (!blk_mq_hctx_stopped(hctx))
1517                 return;
1518
1519         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1520         blk_mq_run_hw_queue(hctx, async);
1521 }
1522 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1523
1524 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1525 {
1526         struct blk_mq_hw_ctx *hctx;
1527         int i;
1528
1529         queue_for_each_hw_ctx(q, hctx, i)
1530                 blk_mq_start_stopped_hw_queue(hctx, async);
1531 }
1532 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1533
1534 static void blk_mq_run_work_fn(struct work_struct *work)
1535 {
1536         struct blk_mq_hw_ctx *hctx;
1537
1538         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1539
1540         /*
1541          * If we are stopped, don't run the queue.
1542          */
1543         if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1544                 return;
1545
1546         __blk_mq_run_hw_queue(hctx);
1547 }
1548
1549 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1550                                             struct request *rq,
1551                                             bool at_head)
1552 {
1553         struct blk_mq_ctx *ctx = rq->mq_ctx;
1554
1555         lockdep_assert_held(&ctx->lock);
1556
1557         trace_block_rq_insert(hctx->queue, rq);
1558
1559         if (at_head)
1560                 list_add(&rq->queuelist, &ctx->rq_list);
1561         else
1562                 list_add_tail(&rq->queuelist, &ctx->rq_list);
1563 }
1564
1565 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1566                              bool at_head)
1567 {
1568         struct blk_mq_ctx *ctx = rq->mq_ctx;
1569
1570         lockdep_assert_held(&ctx->lock);
1571
1572         __blk_mq_insert_req_list(hctx, rq, at_head);
1573         blk_mq_hctx_mark_pending(hctx, ctx);
1574 }
1575
1576 /*
1577  * Should only be used carefully, when the caller knows we want to
1578  * bypass a potential IO scheduler on the target device.
1579  */
1580 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1581 {
1582         struct blk_mq_ctx *ctx = rq->mq_ctx;
1583         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
1584                                                         ctx->cpu);
1585
1586         spin_lock(&hctx->lock);
1587         list_add_tail(&rq->queuelist, &hctx->dispatch);
1588         spin_unlock(&hctx->lock);
1589
1590         if (run_queue)
1591                 blk_mq_run_hw_queue(hctx, false);
1592 }
1593
1594 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1595                             struct list_head *list)
1596
1597 {
1598         struct request *rq;
1599
1600         /*
1601          * preemption doesn't flush plug list, so it's possible ctx->cpu is
1602          * offline now
1603          */
1604         list_for_each_entry(rq, list, queuelist) {
1605                 BUG_ON(rq->mq_ctx != ctx);
1606                 trace_block_rq_insert(hctx->queue, rq);
1607         }
1608
1609         spin_lock(&ctx->lock);
1610         list_splice_tail_init(list, &ctx->rq_list);
1611         blk_mq_hctx_mark_pending(hctx, ctx);
1612         spin_unlock(&ctx->lock);
1613 }
1614
1615 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1616 {
1617         struct request *rqa = container_of(a, struct request, queuelist);
1618         struct request *rqb = container_of(b, struct request, queuelist);
1619
1620         return !(rqa->mq_ctx < rqb->mq_ctx ||
1621                  (rqa->mq_ctx == rqb->mq_ctx &&
1622                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1623 }
1624
1625 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1626 {
1627         struct blk_mq_ctx *this_ctx;
1628         struct request_queue *this_q;
1629         struct request *rq;
1630         LIST_HEAD(list);
1631         LIST_HEAD(ctx_list);
1632         unsigned int depth;
1633
1634         list_splice_init(&plug->mq_list, &list);
1635
1636         list_sort(NULL, &list, plug_ctx_cmp);
1637
1638         this_q = NULL;
1639         this_ctx = NULL;
1640         depth = 0;
1641
1642         while (!list_empty(&list)) {
1643                 rq = list_entry_rq(list.next);
1644                 list_del_init(&rq->queuelist);
1645                 BUG_ON(!rq->q);
1646                 if (rq->mq_ctx != this_ctx) {
1647                         if (this_ctx) {
1648                                 trace_block_unplug(this_q, depth, !from_schedule);
1649                                 blk_mq_sched_insert_requests(this_q, this_ctx,
1650                                                                 &ctx_list,
1651                                                                 from_schedule);
1652                         }
1653
1654                         this_ctx = rq->mq_ctx;
1655                         this_q = rq->q;
1656                         depth = 0;
1657                 }
1658
1659                 depth++;
1660                 list_add_tail(&rq->queuelist, &ctx_list);
1661         }
1662
1663         /*
1664          * If 'this_ctx' is set, we know we have entries to complete
1665          * on 'ctx_list'. Do those.
1666          */
1667         if (this_ctx) {
1668                 trace_block_unplug(this_q, depth, !from_schedule);
1669                 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1670                                                 from_schedule);
1671         }
1672 }
1673
/*
 * Initialise @rq from @bio and then start I/O accounting for the request.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	blk_init_request_from_bio(rq, bio);

	blk_account_io_start(rq, true);
}
1680
1681 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1682 {
1683         if (rq->tag != -1)
1684                 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1685
1686         return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1687 }
1688
1689 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1690                                             struct request *rq,
1691                                             blk_qc_t *cookie)
1692 {
1693         struct request_queue *q = rq->q;
1694         struct blk_mq_queue_data bd = {
1695                 .rq = rq,
1696                 .last = true,
1697         };
1698         blk_qc_t new_cookie;
1699         blk_status_t ret;
1700
1701         new_cookie = request_to_qc_t(hctx, rq);
1702
1703         /*
1704          * For OK queue, we are done. For error, caller may kill it.
1705          * Any other error (busy), just add it to our list as we
1706          * previously would have done.
1707          */
1708         ret = q->mq_ops->queue_rq(hctx, &bd);
1709         switch (ret) {
1710         case BLK_STS_OK:
1711                 blk_mq_update_dispatch_busy(hctx, false);
1712                 *cookie = new_cookie;
1713                 break;
1714         case BLK_STS_RESOURCE:
1715         case BLK_STS_DEV_RESOURCE:
1716                 blk_mq_update_dispatch_busy(hctx, true);
1717                 __blk_mq_requeue_request(rq);
1718                 break;
1719         default:
1720                 blk_mq_update_dispatch_busy(hctx, false);
1721                 *cookie = BLK_QC_T_NONE;
1722                 break;
1723         }
1724
1725         return ret;
1726 }
1727
/*
 * Try to issue @rq directly to the driver, falling back to inserting it
 * through the scheduler path when that is not possible.
 *
 * @bypass_insert: when true the fallback does not insert the request;
 *                 BLK_STS_RESOURCE is returned instead. This is ignored
 *                 (forced to false) when the hctx is stopped or the queue
 *                 is quiesced.
 *
 * Returns the status of __blk_mq_issue_directly() when the request reached
 * the driver, BLK_STS_OK when it was inserted instead, or BLK_STS_RESOURCE
 * when @bypass_insert suppressed the insert.
 */
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
						struct request *rq,
						blk_qc_t *cookie,
						bool bypass_insert)
{
	struct request_queue *q = rq->q;
	bool run_queue = true;

	/*
	 * RCU or SRCU read lock is needed before checking quiesced flag.
	 *
	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
	 * and avoid driver to try to dispatch again.
	 */
	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
		run_queue = false;
		bypass_insert = false;
		goto insert;
	}

	/* with an elevator attached, only bypass callers may issue directly */
	if (q->elevator && !bypass_insert)
		goto insert;

	if (!blk_mq_get_dispatch_budget(hctx))
		goto insert;

	if (!blk_mq_get_driver_tag(rq)) {
		/* no driver tag: give the budget back before falling back */
		blk_mq_put_dispatch_budget(hctx);
		goto insert;
	}

	return __blk_mq_issue_directly(hctx, rq, cookie);
insert:
	if (bypass_insert)
		return BLK_STS_RESOURCE;

	blk_mq_sched_insert_request(rq, false, run_queue, false);
	return BLK_STS_OK;
}
1768
1769 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1770                 struct request *rq, blk_qc_t *cookie)
1771 {
1772         blk_status_t ret;
1773         int srcu_idx;
1774
1775         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1776
1777         hctx_lock(hctx, &srcu_idx);
1778
1779         ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1780         if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1781                 blk_mq_sched_insert_request(rq, false, true, false);
1782         else if (ret != BLK_STS_OK)
1783                 blk_mq_end_request(rq, ret);
1784
1785         hctx_unlock(hctx, srcu_idx);
1786 }
1787
1788 blk_status_t blk_mq_request_issue_directly(struct request *rq)
1789 {
1790         blk_status_t ret;
1791         int srcu_idx;
1792         blk_qc_t unused_cookie;
1793         struct blk_mq_ctx *ctx = rq->mq_ctx;
1794         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
1795                                                         ctx->cpu);
1796
1797         hctx_lock(hctx, &srcu_idx);
1798         ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
1799         hctx_unlock(hctx, srcu_idx);
1800
1801         return ret;
1802 }
1803
1804 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1805                 struct list_head *list)
1806 {
1807         while (!list_empty(list)) {
1808                 blk_status_t ret;
1809                 struct request *rq = list_first_entry(list, struct request,
1810                                 queuelist);
1811
1812                 list_del_init(&rq->queuelist);
1813                 ret = blk_mq_request_issue_directly(rq);
1814                 if (ret != BLK_STS_OK) {
1815                         if (ret == BLK_STS_RESOURCE ||
1816                                         ret == BLK_STS_DEV_RESOURCE) {
1817                                 list_add(&rq->queuelist, list);
1818                                 break;
1819                         }
1820                         blk_mq_end_request(rq, ret);
1821                 }
1822         }
1823 }
1824
1825 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1826 {
1827         const int is_sync = op_is_sync(bio->bi_opf);
1828         const int is_flush_fua = op_is_flush(bio->bi_opf);
1829         struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
1830         struct request *rq;
1831         unsigned int request_count = 0;
1832         struct blk_plug *plug;
1833         struct request *same_queue_rq = NULL;
1834         blk_qc_t cookie;
1835
1836         blk_queue_bounce(q, &bio);
1837
1838         blk_queue_split(q, &bio);
1839
1840         if (!bio_integrity_prep(bio))
1841                 return BLK_QC_T_NONE;
1842
1843         if (!is_flush_fua && !blk_queue_nomerges(q) &&
1844             blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1845                 return BLK_QC_T_NONE;
1846
1847         if (blk_mq_sched_bio_merge(q, bio))
1848                 return BLK_QC_T_NONE;
1849
1850         rq_qos_throttle(q, bio, NULL);
1851
1852         rq = blk_mq_get_request(q, bio, &data);
1853         if (unlikely(!rq)) {
1854                 rq_qos_cleanup(q, bio);
1855                 if (bio->bi_opf & REQ_NOWAIT)
1856                         bio_wouldblock_error(bio);
1857                 return BLK_QC_T_NONE;
1858         }
1859
1860         trace_block_getrq(q, bio, bio->bi_opf);
1861
1862         rq_qos_track(q, rq, bio);
1863
1864         cookie = request_to_qc_t(data.hctx, rq);
1865
1866         plug = current->plug;
1867         if (unlikely(is_flush_fua)) {
1868                 blk_mq_put_ctx(data.ctx);
1869                 blk_mq_bio_to_request(rq, bio);
1870
1871                 /* bypass scheduler for flush rq */
1872                 blk_insert_flush(rq);
1873                 blk_mq_run_hw_queue(data.hctx, true);
1874         } else if (plug && q->nr_hw_queues == 1) {
1875                 struct request *last = NULL;
1876
1877                 blk_mq_put_ctx(data.ctx);
1878                 blk_mq_bio_to_request(rq, bio);
1879
1880                 /*
1881                  * @request_count may become stale because of schedule
1882                  * out, so check the list again.
1883                  */
1884                 if (list_empty(&plug->mq_list))
1885                         request_count = 0;
1886                 else if (blk_queue_nomerges(q))
1887                         request_count = blk_plug_queued_count(q);
1888
1889                 if (!request_count)
1890                         trace_block_plug(q);
1891                 else
1892                         last = list_entry_rq(plug->mq_list.prev);
1893
1894                 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1895                     blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1896                         blk_flush_plug_list(plug, false);
1897                         trace_block_plug(q);
1898                 }
1899
1900                 list_add_tail(&rq->queuelist, &plug->mq_list);
1901         } else if (plug && !blk_queue_nomerges(q)) {
1902                 blk_mq_bio_to_request(rq, bio);
1903
1904                 /*
1905                  * We do limited plugging. If the bio can be merged, do that.
1906                  * Otherwise the existing request in the plug list will be
1907                  * issued. So the plug list will have one request at most
1908                  * The plug list might get flushed before this. If that happens,
1909                  * the plug list is empty, and same_queue_rq is invalid.
1910                  */
1911                 if (list_empty(&plug->mq_list))
1912                         same_queue_rq = NULL;
1913                 if (same_queue_rq)
1914                         list_del_init(&same_queue_rq->queuelist);
1915                 list_add_tail(&rq->queuelist, &plug->mq_list);
1916
1917                 blk_mq_put_ctx(data.ctx);
1918
1919                 if (same_queue_rq) {
1920                         data.hctx = blk_mq_map_queue(q,
1921                                         same_queue_rq->cmd_flags,
1922                                         same_queue_rq->mq_ctx->cpu);
1923                         blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1924                                         &cookie);
1925                 }
1926         } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1927                         !data.hctx->dispatch_busy)) {
1928                 blk_mq_put_ctx(data.ctx);
1929                 blk_mq_bio_to_request(rq, bio);
1930                 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
1931         } else {
1932                 blk_mq_put_ctx(data.ctx);
1933                 blk_mq_bio_to_request(rq, bio);
1934                 blk_mq_sched_insert_request(rq, false, true, true);
1935         }
1936
1937         return cookie;
1938 }
1939
1940 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1941                      unsigned int hctx_idx)
1942 {
1943         struct page *page;
1944
1945         if (tags->rqs && set->ops->exit_request) {
1946                 int i;
1947
1948                 for (i = 0; i < tags->nr_tags; i++) {
1949                         struct request *rq = tags->static_rqs[i];
1950
1951                         if (!rq)
1952                                 continue;
1953                         set->ops->exit_request(set, rq, hctx_idx);
1954                         tags->static_rqs[i] = NULL;
1955                 }
1956         }
1957
1958         while (!list_empty(&tags->page_list)) {
1959                 page = list_first_entry(&tags->page_list, struct page, lru);
1960                 list_del_init(&page->lru);
1961                 /*
1962                  * Remove kmemleak object previously allocated in
1963                  * blk_mq_init_rq_map().
1964                  */
1965                 kmemleak_free(page_address(page));
1966                 __free_pages(page, page->private);
1967         }
1968 }
1969
1970 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
1971 {
1972         kfree(tags->rqs);
1973         tags->rqs = NULL;
1974         kfree(tags->static_rqs);
1975         tags->static_rqs = NULL;
1976
1977         blk_mq_free_tags(tags);
1978 }
1979
1980 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1981                                         unsigned int hctx_idx,
1982                                         unsigned int nr_tags,
1983                                         unsigned int reserved_tags)
1984 {
1985         struct blk_mq_tags *tags;
1986         int node;
1987
1988         node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
1989         if (node == NUMA_NO_NODE)
1990                 node = set->numa_node;
1991
1992         tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
1993                                 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1994         if (!tags)
1995                 return NULL;
1996
1997         tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
1998                                  GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1999                                  node);
2000         if (!tags->rqs) {
2001                 blk_mq_free_tags(tags);
2002                 return NULL;
2003         }
2004
2005         tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2006                                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2007                                         node);
2008         if (!tags->static_rqs) {
2009                 kfree(tags->rqs);
2010                 blk_mq_free_tags(tags);
2011                 return NULL;
2012         }
2013
2014         return tags;
2015 }
2016
2017 static size_t order_to_size(unsigned int order)
2018 {
2019         return (size_t)PAGE_SIZE << order;
2020 }
2021
2022 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2023                                unsigned int hctx_idx, int node)
2024 {
2025         int ret;
2026
2027         if (set->ops->init_request) {
2028                 ret = set->ops->init_request(set, rq, hctx_idx, node);
2029                 if (ret)
2030                         return ret;
2031         }
2032
2033         WRITE_ONCE(rq->state, MQ_RQ_IDLE);
2034         return 0;
2035 }
2036
2037 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2038                      unsigned int hctx_idx, unsigned int depth)
2039 {
2040         unsigned int i, j, entries_per_page, max_order = 4;
2041         size_t rq_size, left;
2042         int node;
2043
2044         node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
2045         if (node == NUMA_NO_NODE)
2046                 node = set->numa_node;
2047
2048         INIT_LIST_HEAD(&tags->page_list);
2049
2050         /*
2051          * rq_size is the size of the request plus driver payload, rounded
2052          * to the cacheline size
2053          */
2054         rq_size = round_up(sizeof(struct request) + set->cmd_size,
2055                                 cache_line_size());
2056         left = rq_size * depth;
2057
2058         for (i = 0; i < depth; ) {
2059                 int this_order = max_order;
2060                 struct page *page;
2061                 int to_do;
2062                 void *p;
2063
2064                 while (this_order && left < order_to_size(this_order - 1))
2065                         this_order--;
2066
2067                 do {
2068                         page = alloc_pages_node(node,
2069                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
2070                                 this_order);
2071                         if (page)
2072                                 break;
2073                         if (!this_order--)
2074                                 break;
2075                         if (order_to_size(this_order) < rq_size)
2076                                 break;
2077                 } while (1);
2078
2079                 if (!page)
2080                         goto fail;
2081
2082                 page->private = this_order;
2083                 list_add_tail(&page->lru, &tags->page_list);
2084
2085                 p = page_address(page);
2086                 /*
2087                  * Allow kmemleak to scan these pages as they contain pointers
2088                  * to additional allocations like via ops->init_request().
2089                  */
2090                 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
2091                 entries_per_page = order_to_size(this_order) / rq_size;
2092                 to_do = min(entries_per_page, depth - i);
2093                 left -= to_do * rq_size;
2094                 for (j = 0; j < to_do; j++) {
2095                         struct request *rq = p;
2096
2097                         tags->static_rqs[i] = rq;
2098                         if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2099                                 tags->static_rqs[i] = NULL;
2100                                 goto fail;
2101                         }
2102
2103                         p += rq_size;
2104                         i++;
2105                 }
2106         }
2107         return 0;
2108
2109 fail:
2110         blk_mq_free_rqs(set, tags, hctx_idx);
2111         return -ENOMEM;
2112 }
2113
2114 /*
2115  * 'cpu' is going away. splice any existing rq_list entries from this
2116  * software queue to the hw queue dispatch list, and ensure that it
2117  * gets run.
2118  */
2119 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2120 {
2121         struct blk_mq_hw_ctx *hctx;
2122         struct blk_mq_ctx *ctx;
2123         LIST_HEAD(tmp);
2124
2125         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2126         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2127
2128         spin_lock(&ctx->lock);
2129         if (!list_empty(&ctx->rq_list)) {
2130                 list_splice_init(&ctx->rq_list, &tmp);
2131                 blk_mq_hctx_clear_pending(hctx, ctx);
2132         }
2133         spin_unlock(&ctx->lock);
2134
2135         if (list_empty(&tmp))
2136                 return 0;
2137
2138         spin_lock(&hctx->lock);
2139         list_splice_tail_init(&tmp, &hctx->dispatch);
2140         spin_unlock(&hctx->lock);
2141
2142         blk_mq_run_hw_queue(hctx, true);
2143         return 0;
2144 }
2145
2146 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
2147 {
2148         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2149                                             &hctx->cpuhp_dead);
2150 }
2151
2152 /* hctx->ctxs will be freed in queue's release handler */
2153 static void blk_mq_exit_hctx(struct request_queue *q,
2154                 struct blk_mq_tag_set *set,
2155                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2156 {
2157         if (blk_mq_hw_queue_mapped(hctx))
2158                 blk_mq_tag_idle(hctx);
2159
2160         if (set->ops->exit_request)
2161                 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2162
2163         if (set->ops->exit_hctx)
2164                 set->ops->exit_hctx(hctx, hctx_idx);
2165
2166         if (hctx->flags & BLK_MQ_F_BLOCKING)
2167                 cleanup_srcu_struct(hctx->srcu);
2168
2169         blk_mq_remove_cpuhp(hctx);
2170         blk_free_flush_queue(hctx->fq);
2171         sbitmap_free(&hctx->ctx_map);
2172 }
2173
2174 static void blk_mq_exit_hw_queues(struct request_queue *q,
2175                 struct blk_mq_tag_set *set, int nr_queue)
2176 {
2177         struct blk_mq_hw_ctx *hctx;
2178         unsigned int i;
2179
2180         queue_for_each_hw_ctx(q, hctx, i) {
2181                 if (i == nr_queue)
2182                         break;
2183                 blk_mq_debugfs_unregister_hctx(hctx);
2184                 blk_mq_exit_hctx(q, set, hctx, i);
2185         }
2186 }
2187
2188 static int blk_mq_init_hctx(struct request_queue *q,
2189                 struct blk_mq_tag_set *set,
2190                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2191 {
2192         int node;
2193
2194         node = hctx->numa_node;
2195         if (node == NUMA_NO_NODE)
2196                 node = hctx->numa_node = set->numa_node;
2197
2198         INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2199         spin_lock_init(&hctx->lock);
2200         INIT_LIST_HEAD(&hctx->dispatch);
2201         hctx->queue = q;
2202         hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2203
2204         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2205
2206         hctx->tags = set->tags[hctx_idx];
2207
2208         /*
2209          * Allocate space for all possible cpus to avoid allocation at
2210          * runtime
2211          */
2212         hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2213                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
2214         if (!hctx->ctxs)
2215                 goto unregister_cpu_notifier;
2216
2217         if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2218                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
2219                 goto free_ctxs;
2220
2221         hctx->nr_ctx = 0;
2222
2223         spin_lock_init(&hctx->dispatch_wait_lock);
2224         init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2225         INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2226
2227         if (set->ops->init_hctx &&
2228             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2229                 goto free_bitmap;
2230
2231         hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2232                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
2233         if (!hctx->fq)
2234                 goto exit_hctx;
2235
2236         if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2237                 goto free_fq;
2238
2239         if (hctx->flags & BLK_MQ_F_BLOCKING)
2240                 init_srcu_struct(hctx->srcu);
2241
2242         return 0;
2243
2244  free_fq:
2245         kfree(hctx->fq);
2246  exit_hctx:
2247         if (set->ops->exit_hctx)
2248                 set->ops->exit_hctx(hctx, hctx_idx);
2249  free_bitmap:
2250         sbitmap_free(&hctx->ctx_map);
2251  free_ctxs:
2252         kfree(hctx->ctxs);
2253  unregister_cpu_notifier:
2254         blk_mq_remove_cpuhp(hctx);
2255         return -1;
2256 }
2257
2258 static void blk_mq_init_cpu_queues(struct request_queue *q,
2259                                    unsigned int nr_hw_queues)
2260 {
2261         struct blk_mq_tag_set *set = q->tag_set;
2262         unsigned int i, j;
2263
2264         for_each_possible_cpu(i) {
2265                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2266                 struct blk_mq_hw_ctx *hctx;
2267
2268                 __ctx->cpu = i;
2269                 spin_lock_init(&__ctx->lock);
2270                 INIT_LIST_HEAD(&__ctx->rq_list);
2271                 __ctx->queue = q;
2272
2273                 /*
2274                  * Set local node, IFF we have more than one hw queue. If
2275                  * not, we remain on the home node of the device
2276                  */
2277                 for (j = 0; j < set->nr_maps; j++) {
2278                         hctx = blk_mq_map_queue_type(q, j, i);
2279                         if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2280                                 hctx->numa_node = local_memory_node(cpu_to_node(i));
2281                 }
2282         }
2283 }
2284
2285 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2286 {
2287         int ret = 0;
2288
2289         set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2290                                         set->queue_depth, set->reserved_tags);
2291         if (!set->tags[hctx_idx])
2292                 return false;
2293
2294         ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2295                                 set->queue_depth);
2296         if (!ret)
2297                 return true;
2298
2299         blk_mq_free_rq_map(set->tags[hctx_idx]);
2300         set->tags[hctx_idx] = NULL;
2301         return false;
2302 }
2303
2304 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2305                                          unsigned int hctx_idx)
2306 {
2307         if (set->tags[hctx_idx]) {
2308                 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2309                 blk_mq_free_rq_map(set->tags[hctx_idx]);
2310                 set->tags[hctx_idx] = NULL;
2311         }
2312 }
2313
2314 static void blk_mq_map_swqueue(struct request_queue *q)
2315 {
2316         unsigned int i, j, hctx_idx;
2317         struct blk_mq_hw_ctx *hctx;
2318         struct blk_mq_ctx *ctx;
2319         struct blk_mq_tag_set *set = q->tag_set;
2320
2321         /*
2322          * Avoid others reading imcomplete hctx->cpumask through sysfs
2323          */
2324         mutex_lock(&q->sysfs_lock);
2325
2326         queue_for_each_hw_ctx(q, hctx, i) {
2327                 cpumask_clear(hctx->cpumask);
2328                 hctx->nr_ctx = 0;
2329                 hctx->dispatch_from = NULL;
2330         }
2331
2332         /*
2333          * Map software to hardware queues.
2334          *
2335          * If the cpu isn't present, the cpu is mapped to first hctx.
2336          */
2337         for_each_possible_cpu(i) {
2338                 hctx_idx = set->map[0].mq_map[i];
2339                 /* unmapped hw queue can be remapped after CPU topo changed */
2340                 if (!set->tags[hctx_idx] &&
2341                     !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2342                         /*
2343                          * If tags initialization fail for some hctx,
2344                          * that hctx won't be brought online.  In this
2345                          * case, remap the current ctx to hctx[0] which
2346                          * is guaranteed to always have tags allocated
2347                          */
2348                         set->map[0].mq_map[i] = 0;
2349                 }
2350
2351                 ctx = per_cpu_ptr(q->queue_ctx, i);
2352                 for (j = 0; j < set->nr_maps; j++) {
2353                         hctx = blk_mq_map_queue_type(q, j, i);
2354
2355                         /*
2356                          * If the CPU is already set in the mask, then we've
2357                          * mapped this one already. This can happen if
2358                          * devices share queues across queue maps.
2359                          */
2360                         if (cpumask_test_cpu(i, hctx->cpumask))
2361                                 continue;
2362
2363                         cpumask_set_cpu(i, hctx->cpumask);
2364                         hctx->type = j;
2365                         ctx->index_hw[hctx->type] = hctx->nr_ctx;
2366                         hctx->ctxs[hctx->nr_ctx++] = ctx;
2367
2368                         /*
2369                          * If the nr_ctx type overflows, we have exceeded the
2370                          * amount of sw queues we can support.
2371                          */
2372                         BUG_ON(!hctx->nr_ctx);
2373                 }
2374         }
2375
2376         mutex_unlock(&q->sysfs_lock);
2377
2378         queue_for_each_hw_ctx(q, hctx, i) {
2379                 /*
2380                  * If no software queues are mapped to this hardware queue,
2381                  * disable it and free the request entries.
2382                  */
2383                 if (!hctx->nr_ctx) {
2384                         /* Never unmap queue 0.  We need it as a
2385                          * fallback in case of a new remap fails
2386                          * allocation
2387                          */
2388                         if (i && set->tags[i])
2389                                 blk_mq_free_map_and_requests(set, i);
2390
2391                         hctx->tags = NULL;
2392                         continue;
2393                 }
2394
2395                 hctx->tags = set->tags[i];
2396                 WARN_ON(!hctx->tags);
2397
2398                 /*
2399                  * Set the map size to the number of mapped software queues.
2400                  * This is more accurate and more efficient than looping
2401                  * over all possibly mapped software queues.
2402                  */
2403                 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2404
2405                 /*
2406                  * Initialize batch roundrobin counts
2407                  */
2408                 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
2409                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2410         }
2411 }
2412
2413 /*
2414  * Caller needs to ensure that we're either frozen/quiesced, or that
2415  * the queue isn't live yet.
2416  */
2417 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2418 {
2419         struct blk_mq_hw_ctx *hctx;
2420         int i;
2421
2422         queue_for_each_hw_ctx(q, hctx, i) {
2423                 if (shared)
2424                         hctx->flags |= BLK_MQ_F_TAG_SHARED;
2425                 else
2426                         hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2427         }
2428 }
2429
2430 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2431                                         bool shared)
2432 {
2433         struct request_queue *q;
2434
2435         lockdep_assert_held(&set->tag_list_lock);
2436
2437         list_for_each_entry(q, &set->tag_list, tag_set_list) {
2438                 blk_mq_freeze_queue(q);
2439                 queue_set_hctx_shared(q, shared);
2440                 blk_mq_unfreeze_queue(q);
2441         }
2442 }
2443
2444 static void blk_mq_del_queue_tag_set(struct request_queue *q)
2445 {
2446         struct blk_mq_tag_set *set = q->tag_set;
2447
2448         mutex_lock(&set->tag_list_lock);
2449         list_del_rcu(&q->tag_set_list);
2450         if (list_is_singular(&set->tag_list)) {
2451                 /* just transitioned to unshared */
2452                 set->flags &= ~BLK_MQ_F_TAG_SHARED;
2453                 /* update existing queue */
2454                 blk_mq_update_tag_set_depth(set, false);
2455         }
2456         mutex_unlock(&set->tag_list_lock);
2457         INIT_LIST_HEAD(&q->tag_set_list);
2458 }
2459
2460 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2461                                      struct request_queue *q)
2462 {
2463         mutex_lock(&set->tag_list_lock);
2464
2465         /*
2466          * Check to see if we're transitioning to shared (from 1 to 2 queues).
2467          */
2468         if (!list_empty(&set->tag_list) &&
2469             !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2470                 set->flags |= BLK_MQ_F_TAG_SHARED;
2471                 /* update existing queue */
2472                 blk_mq_update_tag_set_depth(set, true);
2473         }
2474         if (set->flags & BLK_MQ_F_TAG_SHARED)
2475                 queue_set_hctx_shared(q, true);
2476         list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2477
2478         mutex_unlock(&set->tag_list_lock);
2479 }
2480
2481 /*
2482  * It is the actual release handler for mq, but we do it from
2483  * request queue's release handler for avoiding use-after-free
2484  * and headache because q->mq_kobj shouldn't have been introduced,
2485  * but we can't group ctx/kctx kobj without it.
2486  */
2487 void blk_mq_release(struct request_queue *q)
2488 {
2489         struct blk_mq_hw_ctx *hctx;
2490         unsigned int i;
2491
2492         /* hctx kobj stays in hctx */
2493         queue_for_each_hw_ctx(q, hctx, i) {
2494                 if (!hctx)
2495                         continue;
2496                 kobject_put(&hctx->kobj);
2497         }
2498
2499         kfree(q->queue_hw_ctx);
2500
2501         /*
2502          * release .mq_kobj and sw queue's kobject now because
2503          * both share lifetime with request queue.
2504          */
2505         blk_mq_sysfs_deinit(q);
2506
2507         free_percpu(q->queue_ctx);
2508 }
2509
2510 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2511 {
2512         struct request_queue *uninit_q, *q;
2513
2514         uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
2515         if (!uninit_q)
2516                 return ERR_PTR(-ENOMEM);
2517
2518         q = blk_mq_init_allocated_queue(set, uninit_q);
2519         if (IS_ERR(q))
2520                 blk_cleanup_queue(uninit_q);
2521
2522         return q;
2523 }
2524 EXPORT_SYMBOL(blk_mq_init_queue);
2525
2526 /*
2527  * Helper for setting up a queue with mq ops, given queue depth, and
2528  * the passed in mq ops flags.
2529  */
2530 struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2531                                            const struct blk_mq_ops *ops,
2532                                            unsigned int queue_depth,
2533                                            unsigned int set_flags)
2534 {
2535         struct request_queue *q;
2536         int ret;
2537
2538         memset(set, 0, sizeof(*set));
2539         set->ops = ops;
2540         set->nr_hw_queues = 1;
2541         set->nr_maps = 1;
2542         set->queue_depth = queue_depth;
2543         set->numa_node = NUMA_NO_NODE;
2544         set->flags = set_flags;
2545
2546         ret = blk_mq_alloc_tag_set(set);
2547         if (ret)
2548                 return ERR_PTR(ret);
2549
2550         q = blk_mq_init_queue(set);
2551         if (IS_ERR(q)) {
2552                 blk_mq_free_tag_set(set);
2553                 return q;
2554         }
2555
2556         return q;
2557 }
2558 EXPORT_SYMBOL(blk_mq_init_sq_queue);
2559
2560 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2561 {
2562         int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2563
2564         BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2565                            __alignof__(struct blk_mq_hw_ctx)) !=
2566                      sizeof(struct blk_mq_hw_ctx));
2567
2568         if (tag_set->flags & BLK_MQ_F_BLOCKING)
2569                 hw_ctx_size += sizeof(struct srcu_struct);
2570
2571         return hw_ctx_size;
2572 }
2573
2574 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
2575                 struct blk_mq_tag_set *set, struct request_queue *q,
2576                 int hctx_idx, int node)
2577 {
2578         struct blk_mq_hw_ctx *hctx;
2579
2580         hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
2581                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2582                         node);
2583         if (!hctx)
2584                 return NULL;
2585
2586         if (!zalloc_cpumask_var_node(&hctx->cpumask,
2587                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2588                                 node)) {
2589                 kfree(hctx);
2590                 return NULL;
2591         }
2592
2593         atomic_set(&hctx->nr_active, 0);
2594         hctx->numa_node = node;
2595         hctx->queue_num = hctx_idx;
2596
2597         if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
2598                 free_cpumask_var(hctx->cpumask);
2599                 kfree(hctx);
2600                 return NULL;
2601         }
2602         blk_mq_hctx_kobj_init(hctx);
2603
2604         return hctx;
2605 }
2606
2607 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2608                                                 struct request_queue *q)
2609 {
2610         int i, j, end;
2611         struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2612
2613         /* protect against switching io scheduler  */
2614         mutex_lock(&q->sysfs_lock);
2615         for (i = 0; i < set->nr_hw_queues; i++) {
2616                 int node;
2617                 struct blk_mq_hw_ctx *hctx;
2618
2619                 node = blk_mq_hw_queue_to_node(&set->map[0], i);
2620                 /*
2621                  * If the hw queue has been mapped to another numa node,
2622                  * we need to realloc the hctx. If allocation fails, fallback
2623                  * to use the previous one.
2624                  */
2625                 if (hctxs[i] && (hctxs[i]->numa_node == node))
2626                         continue;
2627
2628                 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
2629                 if (hctx) {
2630                         if (hctxs[i]) {
2631                                 blk_mq_exit_hctx(q, set, hctxs[i], i);
2632                                 kobject_put(&hctxs[i]->kobj);
2633                         }
2634                         hctxs[i] = hctx;
2635                 } else {
2636                         if (hctxs[i])
2637                                 pr_warn("Allocate new hctx on node %d fails,\
2638                                                 fallback to previous one on node %d\n",
2639                                                 node, hctxs[i]->numa_node);
2640                         else
2641                                 break;
2642                 }
2643         }
2644         /*
2645          * Increasing nr_hw_queues fails. Free the newly allocated
2646          * hctxs and keep the previous q->nr_hw_queues.
2647          */
2648         if (i != set->nr_hw_queues) {
2649                 j = q->nr_hw_queues;
2650                 end = i;
2651         } else {
2652                 j = i;
2653                 end = q->nr_hw_queues;
2654                 q->nr_hw_queues = set->nr_hw_queues;
2655         }
2656
2657         for (; j < end; j++) {
2658                 struct blk_mq_hw_ctx *hctx = hctxs[j];
2659
2660                 if (hctx) {
2661                         if (hctx->tags)
2662                                 blk_mq_free_map_and_requests(set, j);
2663                         blk_mq_exit_hctx(q, set, hctx, j);
2664                         kobject_put(&hctx->kobj);
2665                         hctxs[j] = NULL;
2666
2667                 }
2668         }
2669         mutex_unlock(&q->sysfs_lock);
2670 }
2671
2672 /*
2673  * Maximum number of hardware queues we support. For single sets, we'll never
2674  * have more than the CPUs (software queues). For multiple sets, the tag_set
2675  * user may have set ->nr_hw_queues larger.
2676  */
2677 static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
2678 {
2679         if (set->nr_maps == 1)
2680                 return nr_cpu_ids;
2681
2682         return max(set->nr_hw_queues, nr_cpu_ids);
2683 }
2684
2685 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2686                                                   struct request_queue *q)
2687 {
2688         /* mark the queue as mq asap */
2689         q->mq_ops = set->ops;
2690
2691         q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2692                                              blk_mq_poll_stats_bkt,
2693                                              BLK_MQ_POLL_STATS_BKTS, q);
2694         if (!q->poll_cb)
2695                 goto err_exit;
2696
2697         q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2698         if (!q->queue_ctx)
2699                 goto err_exit;
2700
2701         /* init q->mq_kobj and sw queues' kobjects */
2702         blk_mq_sysfs_init(q);
2703
2704         q->nr_queues = nr_hw_queues(set);
2705         q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
2706                                                 GFP_KERNEL, set->numa_node);
2707         if (!q->queue_hw_ctx)
2708                 goto err_percpu;
2709
2710         blk_mq_realloc_hw_ctxs(set, q);
2711         if (!q->nr_hw_queues)
2712                 goto err_hctxs;
2713
2714         INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2715         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2716
2717         q->tag_set = set;
2718
2719         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2720
2721         if (!(set->flags & BLK_MQ_F_SG_MERGE))
2722                 queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
2723
2724         q->sg_reserved_size = INT_MAX;
2725
2726         INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2727         INIT_LIST_HEAD(&q->requeue_list);
2728         spin_lock_init(&q->requeue_lock);
2729
2730         blk_queue_make_request(q, blk_mq_make_request);
2731         if (q->mq_ops->poll)
2732                 q->poll_fn = blk_mq_poll;
2733
2734         /*
2735          * Do this after blk_queue_make_request() overrides it...
2736          */
2737         q->nr_requests = set->queue_depth;
2738
2739         /*
2740          * Default to classic polling
2741          */
2742         q->poll_nsec = -1;
2743
2744         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2745         blk_mq_add_queue_tag_set(set, q);
2746         blk_mq_map_swqueue(q);
2747
2748         if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2749                 int ret;
2750
2751                 ret = elevator_init_mq(q);
2752                 if (ret)
2753                         return ERR_PTR(ret);
2754         }
2755
2756         return q;
2757
2758 err_hctxs:
2759         kfree(q->queue_hw_ctx);
2760 err_percpu:
2761         free_percpu(q->queue_ctx);
2762 err_exit:
2763         q->mq_ops = NULL;
2764         return ERR_PTR(-ENOMEM);
2765 }
2766 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2767
2768 void blk_mq_free_queue(struct request_queue *q)
2769 {
2770         struct blk_mq_tag_set   *set = q->tag_set;
2771
2772         blk_mq_del_queue_tag_set(q);
2773         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2774 }
2775
2776 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2777 {
2778         int i;
2779
2780         for (i = 0; i < set->nr_hw_queues; i++)
2781                 if (!__blk_mq_alloc_rq_map(set, i))
2782                         goto out_unwind;
2783
2784         return 0;
2785
2786 out_unwind:
2787         while (--i >= 0)
2788                 blk_mq_free_rq_map(set->tags[i]);
2789
2790         return -ENOMEM;
2791 }
2792
2793 /*
2794  * Allocate the request maps associated with this tag_set. Note that this
2795  * may reduce the depth asked for, if memory is tight. set->queue_depth
2796  * will be updated to reflect the allocated depth.
2797  */
2798 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2799 {
2800         unsigned int depth;
2801         int err;
2802
2803         depth = set->queue_depth;
2804         do {
2805                 err = __blk_mq_alloc_rq_maps(set);
2806                 if (!err)
2807                         break;
2808
2809                 set->queue_depth >>= 1;
2810                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2811                         err = -ENOMEM;
2812                         break;
2813                 }
2814         } while (set->queue_depth);
2815
2816         if (!set->queue_depth || err) {
2817                 pr_err("blk-mq: failed to allocate request map\n");
2818                 return -ENOMEM;
2819         }
2820
2821         if (depth != set->queue_depth)
2822                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2823                                                 depth, set->queue_depth);
2824
2825         return 0;
2826 }
2827
2828 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2829 {
2830         if (set->ops->map_queues) {
2831                 int i;
2832
2833                 /*
2834                  * transport .map_queues is usually done in the following
2835                  * way:
2836                  *
2837                  * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2838                  *      mask = get_cpu_mask(queue)
2839                  *      for_each_cpu(cpu, mask)
2840                  *              set->map[x].mq_map[cpu] = queue;
2841                  * }
2842                  *
2843                  * When we need to remap, the table has to be cleared for
2844                  * killing stale mapping since one CPU may not be mapped
2845                  * to any hw queue.
2846                  */
2847                 for (i = 0; i < set->nr_maps; i++)
2848                         blk_mq_clear_mq_map(&set->map[i]);
2849
2850                 return set->ops->map_queues(set);
2851         } else {
2852                 BUG_ON(set->nr_maps > 1);
2853                 return blk_mq_map_queues(&set->map[0]);
2854         }
2855 }
2856
2857 /*
2858  * Alloc a tag set to be associated with one or more request queues.
2859  * May fail with EINVAL for various error conditions. May adjust the
2860  * requested depth down, if it's too large. In that case, the set
2861  * value will be stored in set->queue_depth.
2862  */
2863 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2864 {
2865         int i, ret;
2866
2867         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2868
2869         if (!set->nr_hw_queues)
2870                 return -EINVAL;
2871         if (!set->queue_depth)
2872                 return -EINVAL;
2873         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2874                 return -EINVAL;
2875
2876         if (!set->ops->queue_rq)
2877                 return -EINVAL;
2878
2879         if (!set->ops->get_budget ^ !set->ops->put_budget)
2880                 return -EINVAL;
2881
2882         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2883                 pr_info("blk-mq: reduced tag depth to %u\n",
2884                         BLK_MQ_MAX_DEPTH);
2885                 set->queue_depth = BLK_MQ_MAX_DEPTH;
2886         }
2887
2888         if (!set->nr_maps)
2889                 set->nr_maps = 1;
2890         else if (set->nr_maps > HCTX_MAX_TYPES)
2891                 return -EINVAL;
2892
2893         /*
2894          * If a crashdump is active, then we are potentially in a very
2895          * memory constrained environment. Limit us to 1 queue and
2896          * 64 tags to prevent using too much memory.
2897          */
2898         if (is_kdump_kernel()) {
2899                 set->nr_hw_queues = 1;
2900                 set->queue_depth = min(64U, set->queue_depth);
2901         }
2902         /*
2903          * There is no use for more h/w queues than cpus if we just have
2904          * a single map
2905          */
2906         if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
2907                 set->nr_hw_queues = nr_cpu_ids;
2908
2909         set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
2910                                  GFP_KERNEL, set->numa_node);
2911         if (!set->tags)
2912                 return -ENOMEM;
2913
2914         ret = -ENOMEM;
2915         for (i = 0; i < set->nr_maps; i++) {
2916                 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
2917                                                   sizeof(struct blk_mq_queue_map),
2918                                                   GFP_KERNEL, set->numa_node);
2919                 if (!set->map[i].mq_map)
2920                         goto out_free_mq_map;
2921                 set->map[i].nr_queues = set->nr_hw_queues;
2922         }
2923
2924         ret = blk_mq_update_queue_map(set);
2925         if (ret)
2926                 goto out_free_mq_map;
2927
2928         ret = blk_mq_alloc_rq_maps(set);
2929         if (ret)
2930                 goto out_free_mq_map;
2931
2932         mutex_init(&set->tag_list_lock);
2933         INIT_LIST_HEAD(&set->tag_list);
2934
2935         return 0;
2936
2937 out_free_mq_map:
2938         for (i = 0; i < set->nr_maps; i++) {
2939                 kfree(set->map[i].mq_map);
2940                 set->map[i].mq_map = NULL;
2941         }
2942         kfree(set->tags);
2943         set->tags = NULL;
2944         return ret;
2945 }
2946 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2947
2948 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2949 {
2950         int i, j;
2951
2952         for (i = 0; i < nr_hw_queues(set); i++)
2953                 blk_mq_free_map_and_requests(set, i);
2954
2955         for (j = 0; j < set->nr_maps; j++) {
2956                 kfree(set->map[j].mq_map);
2957                 set->map[j].mq_map = NULL;
2958         }
2959
2960         kfree(set->tags);
2961         set->tags = NULL;
2962 }
2963 EXPORT_SYMBOL(blk_mq_free_tag_set);
2964
2965 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2966 {
2967         struct blk_mq_tag_set *set = q->tag_set;
2968         struct blk_mq_hw_ctx *hctx;
2969         int i, ret;
2970
2971         if (!set)
2972                 return -EINVAL;
2973
2974         blk_mq_freeze_queue(q);
2975         blk_mq_quiesce_queue(q);
2976
2977         ret = 0;
2978         queue_for_each_hw_ctx(q, hctx, i) {
2979                 if (!hctx->tags)
2980                         continue;
2981                 /*
2982                  * If we're using an MQ scheduler, just update the scheduler
2983                  * queue depth. This is similar to what the old code would do.
2984                  */
2985                 if (!hctx->sched_tags) {
2986                         ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
2987                                                         false);
2988                 } else {
2989                         ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
2990                                                         nr, true);
2991                 }
2992                 if (ret)
2993                         break;
2994         }
2995
2996         if (!ret)
2997                 q->nr_requests = nr;
2998
2999         blk_mq_unquiesce_queue(q);
3000         blk_mq_unfreeze_queue(q);
3001
3002         return ret;
3003 }
3004
3005 /*
3006  * request_queue and elevator_type pair.
3007  * It is just used by __blk_mq_update_nr_hw_queues to cache
3008  * the elevator_type associated with a request_queue.
3009  */
3010 struct blk_mq_qe_pair {
3011         struct list_head node;
3012         struct request_queue *q;
3013         struct elevator_type *type;
3014 };
3015
3016 /*
3017  * Cache the elevator_type in qe pair list and switch the
3018  * io scheduler to 'none'
3019  */
3020 static bool blk_mq_elv_switch_none(struct list_head *head,
3021                 struct request_queue *q)
3022 {
3023         struct blk_mq_qe_pair *qe;
3024
3025         if (!q->elevator)
3026                 return true;
3027
3028         qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
3029         if (!qe)
3030                 return false;
3031
3032         INIT_LIST_HEAD(&qe->node);
3033         qe->q = q;
3034         qe->type = q->elevator->type;
3035         list_add(&qe->node, head);
3036
3037         mutex_lock(&q->sysfs_lock);
3038         /*
3039          * After elevator_switch_mq, the previous elevator_queue will be
3040          * released by elevator_release. The reference of the io scheduler
3041          * module get by elevator_get will also be put. So we need to get
3042          * a reference of the io scheduler module here to prevent it to be
3043          * removed.
3044          */
3045         __module_get(qe->type->elevator_owner);
3046         elevator_switch_mq(q, NULL);
3047         mutex_unlock(&q->sysfs_lock);
3048
3049         return true;
3050 }
3051
3052 static void blk_mq_elv_switch_back(struct list_head *head,
3053                 struct request_queue *q)
3054 {
3055         struct blk_mq_qe_pair *qe;
3056         struct elevator_type *t = NULL;
3057
3058         list_for_each_entry(qe, head, node)
3059                 if (qe->q == q) {
3060                         t = qe->type;
3061                         break;
3062                 }
3063
3064         if (!t)
3065                 return;
3066
3067         list_del(&qe->node);
3068         kfree(qe);
3069
3070         mutex_lock(&q->sysfs_lock);
3071         elevator_switch_mq(q, t);
3072         mutex_unlock(&q->sysfs_lock);
3073 }
3074
3075 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3076                                                         int nr_hw_queues)
3077 {
3078         struct request_queue *q;
3079         LIST_HEAD(head);
3080         int prev_nr_hw_queues;
3081
3082         lockdep_assert_held(&set->tag_list_lock);
3083
3084         if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3085                 nr_hw_queues = nr_cpu_ids;
3086         if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3087                 return;
3088
3089         list_for_each_entry(q, &set->tag_list, tag_set_list)
3090                 blk_mq_freeze_queue(q);
3091         /*
3092          * Sync with blk_mq_queue_tag_busy_iter.
3093          */
3094         synchronize_rcu();
3095         /*
3096          * Switch IO scheduler to 'none', cleaning up the data associated
3097          * with the previous scheduler. We will switch back once we are done
3098          * updating the new sw to hw queue mappings.
3099          */
3100         list_for_each_entry(q, &set->tag_list, tag_set_list)
3101                 if (!blk_mq_elv_switch_none(&head, q))
3102                         goto switch_back;
3103
3104         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3105                 blk_mq_debugfs_unregister_hctxs(q);
3106                 blk_mq_sysfs_unregister(q);
3107         }
3108
3109         prev_nr_hw_queues = set->nr_hw_queues;
3110         set->nr_hw_queues = nr_hw_queues;
3111         blk_mq_update_queue_map(set);
3112 fallback:
3113         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3114                 blk_mq_realloc_hw_ctxs(set, q);
3115                 if (q->nr_hw_queues != set->nr_hw_queues) {
3116                         pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3117                                         nr_hw_queues, prev_nr_hw_queues);
3118                         set->nr_hw_queues = prev_nr_hw_queues;
3119                         blk_mq_map_queues(&set->map[0]);
3120                         goto fallback;
3121                 }
3122                 blk_mq_map_swqueue(q);
3123         }
3124
3125         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3126                 blk_mq_sysfs_register(q);
3127                 blk_mq_debugfs_register_hctxs(q);
3128         }
3129
3130 switch_back:
3131         list_for_each_entry(q, &set->tag_list, tag_set_list)
3132                 blk_mq_elv_switch_back(&head, q);
3133
3134         list_for_each_entry(q, &set->tag_list, tag_set_list)
3135                 blk_mq_unfreeze_queue(q);
3136 }
3137
3138 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
3139 {
3140         mutex_lock(&set->tag_list_lock);
3141         __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
3142         mutex_unlock(&set->tag_list_lock);
3143 }
3144 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
3145
3146 /* Enable polling stats and return whether they were already enabled. */
3147 static bool blk_poll_stats_enable(struct request_queue *q)
3148 {
3149         if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3150             blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
3151                 return true;
3152         blk_stat_add_callback(q, q->poll_cb);
3153         return false;
3154 }
3155
3156 static void blk_mq_poll_stats_start(struct request_queue *q)
3157 {
3158         /*
3159          * We don't arm the callback if polling stats are not enabled or the
3160          * callback is already active.
3161          */
3162         if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3163             blk_stat_is_active(q->poll_cb))
3164                 return;
3165
3166         blk_stat_activate_msecs(q->poll_cb, 100);
3167 }
3168
3169 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
3170 {
3171         struct request_queue *q = cb->data;
3172         int bucket;
3173
3174         for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
3175                 if (cb->stat[bucket].nr_samples)
3176                         q->poll_stat[bucket] = cb->stat[bucket];
3177         }
3178 }
3179
3180 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3181                                        struct blk_mq_hw_ctx *hctx,
3182                                        struct request *rq)
3183 {
3184         unsigned long ret = 0;
3185         int bucket;
3186
3187         /*
3188          * If stats collection isn't on, don't sleep but turn it on for
3189          * future users
3190          */
3191         if (!blk_poll_stats_enable(q))
3192                 return 0;
3193
3194         /*
3195          * As an optimistic guess, use half of the mean service time
3196          * for this type of request. We can (and should) make this smarter.
3197          * For instance, if the completion latencies are tight, we can
3198          * get closer than just half the mean. This is especially
3199          * important on devices where the completion latencies are longer
3200          * than ~10 usec. We do use the stats for the relevant IO size
3201          * if available which does lead to better estimates.
3202          */
3203         bucket = blk_mq_poll_stats_bkt(rq);
3204         if (bucket < 0)
3205                 return ret;
3206
3207         if (q->poll_stat[bucket].nr_samples)
3208                 ret = (q->poll_stat[bucket].mean + 1) / 2;
3209
3210         return ret;
3211 }
3212
3213 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3214                                      struct blk_mq_hw_ctx *hctx,
3215                                      struct request *rq)
3216 {
3217         struct hrtimer_sleeper hs;
3218         enum hrtimer_mode mode;
3219         unsigned int nsecs;
3220         ktime_t kt;
3221
3222         if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
3223                 return false;
3224
3225         /*
3226          * poll_nsec can be:
3227          *
3228          * -1:  don't ever hybrid sleep
3229          *  0:  use half of prev avg
3230          * >0:  use this specific value
3231          */
3232         if (q->poll_nsec == -1)
3233                 return false;
3234         else if (q->poll_nsec > 0)
3235                 nsecs = q->poll_nsec;
3236         else
3237                 nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3238
3239         if (!nsecs)
3240                 return false;
3241
3242         rq->rq_flags |= RQF_MQ_POLL_SLEPT;
3243
3244         /*
3245          * This will be replaced with the stats tracking code, using
3246          * 'avg_completion_time / 2' as the pre-sleep target.
3247          */
3248         kt = nsecs;
3249
3250         mode = HRTIMER_MODE_REL;
3251         hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3252         hrtimer_set_expires(&hs.timer, kt);
3253
3254         hrtimer_init_sleeper(&hs, current);
3255         do {
3256                 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
3257                         break;
3258                 set_current_state(TASK_UNINTERRUPTIBLE);
3259                 hrtimer_start_expires(&hs.timer, mode);
3260                 if (hs.task)
3261                         io_schedule();
3262                 hrtimer_cancel(&hs.timer);
3263                 mode = HRTIMER_MODE_ABS;
3264         } while (hs.task && !signal_pending(current));
3265
3266         __set_current_state(TASK_RUNNING);
3267         destroy_hrtimer_on_stack(&hs.timer);
3268         return true;
3269 }
3270
3271 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3272 {
3273         struct request_queue *q = hctx->queue;
3274         long state;
3275
3276         /*
3277          * If we sleep, have the caller restart the poll loop to reset
3278          * the state. Like for the other success return cases, the
3279          * caller is responsible for checking if the IO completed. If
3280          * the IO isn't complete, we'll get called again and will go
3281          * straight to the busy poll loop.
3282          */
3283         if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3284                 return true;
3285
3286         hctx->poll_considered++;
3287
3288         state = current->state;
3289         while (!need_resched()) {
3290                 int ret;
3291
3292                 hctx->poll_invoked++;
3293
3294                 ret = q->mq_ops->poll(hctx, rq->tag);
3295                 if (ret > 0) {
3296                         hctx->poll_success++;
3297                         set_current_state(TASK_RUNNING);
3298                         return true;
3299                 }
3300
3301                 if (signal_pending_state(state, current))
3302                         set_current_state(TASK_RUNNING);
3303
3304                 if (current->state == TASK_RUNNING)
3305                         return true;
3306                 if (ret < 0)
3307                         break;
3308                 cpu_relax();
3309         }
3310
3311         __set_current_state(TASK_RUNNING);
3312         return false;
3313 }
3314
3315 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3316 {
3317         struct blk_mq_hw_ctx *hctx;
3318         struct request *rq;
3319
3320         if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3321                 return false;
3322
3323         hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3324         if (!blk_qc_t_is_internal(cookie))
3325                 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3326         else {
3327                 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3328                 /*
3329                  * With scheduling, if the request has completed, we'll
3330                  * get a NULL return here, as we clear the sched tag when
3331                  * that happens. The request still remains valid, like always,
3332                  * so we should be safe with just the NULL check.
3333                  */
3334                 if (!rq)
3335                         return false;
3336         }
3337
3338         return __blk_mq_poll(hctx, rq);
3339 }
3340
3341 unsigned int blk_mq_rq_cpu(struct request *rq)
3342 {
3343         return rq->mq_ctx->cpu;
3344 }
3345 EXPORT_SYMBOL(blk_mq_rq_cpu);
3346
3347 static int __init blk_mq_init(void)
3348 {
3349         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
3350                                 blk_mq_hctx_notify_dead);
3351         return 0;
3352 }
3353 subsys_initcall(blk_mq_init);