block/blk-mq.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Block multiqueue core code
   4  *
   5  * Copyright (C) 2013-2014 Jens Axboe
   6  * Copyright (C) 2013-2014 Christoph Hellwig
   7  */
   8 #include <linux/kernel.h>
   9 #include <linux/module.h>
  10 #include <linux/backing-dev.h>
  11 #include <linux/bio.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/kmemleak.h>
  14 #include <linux/mm.h>
  15 #include <linux/init.h>
  16 #include <linux/slab.h>
  17 #include <linux/workqueue.h>
  18 #include <linux/smp.h>
  19 #include <linux/llist.h>
  20 #include <linux/list_sort.h>
  21 #include <linux/cpu.h>
  22 #include <linux/cache.h>
  23 #include <linux/sched/sysctl.h>
  24 #include <linux/sched/topology.h>
  25 #include <linux/sched/signal.h>
  26 #include <linux/delay.h>
  27 #include <linux/crash_dump.h>
  28 #include <linux/prefetch.h>
  29 #include <linux/blk-crypto.h>
  30
  31 #include <trace/events/block.h>
  32
  33 #include <linux/blk-mq.h>
  34 #include <linux/t10-pi.h>
  35 #include "blk.h"
  36 #include "blk-mq.h"
  37 #include "blk-mq-debugfs.h"
  38 #include "blk-mq-tag.h"
  39 #include "blk-pm.h"
  40 #include "blk-stat.h"
  41 #include "blk-mq-sched.h"
  42 #include "blk-rq-qos.h"
  43
  44 static void blk_mq_poll_stats_start(struct request_queue *q);
  45 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
  46
  47 static int blk_mq_poll_stats_bkt(const struct request *rq)
  48 {
  49         int ddir, sectors, bucket;
  50
  51         ddir = rq_data_dir(rq);
  52         sectors = blk_rq_stats_sectors(rq);
  53
  54         bucket = ddir + 2 * ilog2(sectors);
  55
  56         if (bucket < 0)
  57                 return -1;
  58         else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
  59                 return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
  60
  61         return bucket;
  62 }
  63
  64 /*
  65  * Check if any of the ctx, dispatch list or elevator
  66  * have pending work in this hardware queue.
  67  */
  68 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  69 {
  70         return !list_empty_careful(&hctx->dispatch) ||
  71                 sbitmap_any_bit_set(&hctx->ctx_map) ||
  72                         blk_mq_sched_has_work(hctx);
  73 }
  74
  75 /*
  76  * Mark this ctx as having pending work in this hardware queue
  77  */
  78 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  79                                      struct blk_mq_ctx *ctx)
  80 {
  81         const int bit = ctx->index_hw[hctx->type];
  82
  83         if (!sbitmap_test_bit(&hctx->ctx_map, bit))
  84                 sbitmap_set_bit(&hctx->ctx_map, bit);
  85 }
  86
  87 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  88                                       struct blk_mq_ctx *ctx)
  89 {
  90         const int bit = ctx->index_hw[hctx->type];
  91
  92         sbitmap_clear_bit(&hctx->ctx_map, bit);
  93 }
  94
  95 struct mq_inflight {
  96         struct hd_struct *part;
  97         unsigned int inflight[2];
  98 };
  99
 100 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 101                                   struct request *rq, void *priv,
 102                                   bool reserved)
 103 {
 104         struct mq_inflight *mi = priv;
 105
 106         if (rq->part == mi->part)
 107                 mi->inflight[rq_data_dir(rq)]++;
 108
 109         return true;
 110 }
 111
 112 unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
 113 {
 114         struct mq_inflight mi = { .part = part };
 115
 116         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 117
 118         return mi.inflight[0] + mi.inflight[1];
 119 }
 120
 121 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 122                          unsigned int inflight[2])
 123 {
 124         struct mq_inflight mi = { .part = part };
 125
 126         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 127         inflight[0] = mi.inflight[0];
 128         inflight[1] = mi.inflight[1];
 129 }
 130
 131 void blk_freeze_queue_start(struct request_queue *q)
 132 {
 133         mutex_lock(&q->mq_freeze_lock);
 134         if (++q->mq_freeze_depth == 1) {
 135                 percpu_ref_kill(&q->q_usage_counter);
 136                 mutex_unlock(&q->mq_freeze_lock);
 137                 if (queue_is_mq(q))
 138                         blk_mq_run_hw_queues(q, false);
 139         } else {
 140                 mutex_unlock(&q->mq_freeze_lock);
 141         }
 142 }
 143 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 144
 145 void blk_mq_freeze_queue_wait(struct request_queue *q)
 146 {
 147         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 148 }
 149 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 150
 151 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 152                                      unsigned long timeout)
 153 {
 154         return wait_event_timeout(q->mq_freeze_wq,
 155                                         percpu_ref_is_zero(&q->q_usage_counter),
 156                                         timeout);
 157 }
 158 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 159
 160 /*
 161  * Guarantee no request is in use, so we can change any data structure of
 162  * the queue afterward.
 163  */
 164 void blk_freeze_queue(struct request_queue *q)
 165 {
 166         /*
 167          * In the !blk_mq case we are only calling this to kill the
 168          * q_usage_counter, otherwise this increases the freeze depth
 169          * and waits for it to return to zero.  For this reason there is
 170          * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 171          * exported to drivers as the only user for unfreeze is blk_mq.
 172          */
 173         blk_freeze_queue_start(q);
 174         blk_mq_freeze_queue_wait(q);
 175 }
 176
 177 void blk_mq_freeze_queue(struct request_queue *q)
 178 {
 179         /*
 180          * ...just an alias to keep freeze and unfreeze actions balanced
 181          * in the blk_mq_* namespace
 182          */
 183         blk_freeze_queue(q);
 184 }
 185 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 186
 187 void blk_mq_unfreeze_queue(struct request_queue *q)
 188 {
 189         mutex_lock(&q->mq_freeze_lock);
 190         q->mq_freeze_depth--;
 191         WARN_ON_ONCE(q->mq_freeze_depth < 0);
 192         if (!q->mq_freeze_depth) {
 193                 percpu_ref_resurrect(&q->q_usage_counter);
 194                 wake_up_all(&q->mq_freeze_wq);
 195         }
 196         mutex_unlock(&q->mq_freeze_lock);
 197 }
 198 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 199
 200 /*
 201  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 202  * mpt3sas driver such that this function can be removed.
 203  */
 204 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 205 {
 206         blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
 207 }
 208 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 209
 210 /**
 211  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 212  * @q: request queue.
 213  *
 214  * Note: this function does not prevent that the struct request end_io()
 215  * callback function is invoked. Once this function is returned, we make
 216  * sure no dispatch can happen until the queue is unquiesced via
 217  * blk_mq_unquiesce_queue().
 218  */
 219 void blk_mq_quiesce_queue(struct request_queue *q)
 220 {
 221         struct blk_mq_hw_ctx *hctx;
 222         unsigned int i;
 223         bool rcu = false;
 224
 225         blk_mq_quiesce_queue_nowait(q);
 226
 227         queue_for_each_hw_ctx(q, hctx, i) {
 228                 if (hctx->flags & BLK_MQ_F_BLOCKING)
 229                         synchronize_srcu(hctx->srcu);
 230                 else
 231                         rcu = true;
 232         }
 233         if (rcu)
 234                 synchronize_rcu();
 235 }
 236 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 237
 238 /*
 239  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 240  * @q: request queue.
 241  *
 242  * This function recovers queue into the state before quiescing
 243  * which is done by blk_mq_quiesce_queue.
 244  */
 245 void blk_mq_unquiesce_queue(struct request_queue *q)
 246 {
 247         blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 248
 249         /* dispatch requests which are inserted during quiescing */
 250         blk_mq_run_hw_queues(q, true);
 251 }
 252 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 253
 254 void blk_mq_wake_waiters(struct request_queue *q)
 255 {
 256         struct blk_mq_hw_ctx *hctx;
 257         unsigned int i;
 258
 259         queue_for_each_hw_ctx(q, hctx, i)
 260                 if (blk_mq_hw_queue_mapped(hctx))
 261                         blk_mq_tag_wakeup_all(hctx->tags, true);
 262 }
 263
 264 /*
 265  * Only need start/end time stamping if we have iostat or
 266  * blk stats enabled, or using an IO scheduler.
 267  */
 268 static inline bool blk_mq_need_time_stamp(struct request *rq)
 269 {
 270         return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 271 }
 272
 273 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 274                 unsigned int tag, unsigned int op, u64 alloc_time_ns)
 275 {
 276         struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 277         struct request *rq = tags->static_rqs[tag];
 278         req_flags_t rq_flags = 0;
 279
 280         if (data->flags & BLK_MQ_REQ_INTERNAL) {
 281                 rq->tag = -1;
 282                 rq->internal_tag = tag;
 283         } else {
 284                 if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
 285                         rq_flags = RQF_MQ_INFLIGHT;
 286                         atomic_inc(&data->hctx->nr_active);
 287                 }
 288                 rq->tag = tag;
 289                 rq->internal_tag = -1;
 290                 data->hctx->tags->rqs[rq->tag] = rq;
 291         }
 292
 293         /* csd/requeue_work/fifo_time is initialized before use */
 294         rq->q = data->q;
 295         rq->mq_ctx = data->ctx;
 296         rq->mq_hctx = data->hctx;
 297         rq->rq_flags = rq_flags;
 298         rq->cmd_flags = op;
 299         if (data->flags & BLK_MQ_REQ_PREEMPT)
 300                 rq->rq_flags |= RQF_PREEMPT;
 301         if (blk_queue_io_stat(data->q))
 302                 rq->rq_flags |= RQF_IO_STAT;
 303         INIT_LIST_HEAD(&rq->queuelist);
 304         INIT_HLIST_NODE(&rq->hash);
 305         RB_CLEAR_NODE(&rq->rb_node);
 306         rq->rq_disk = NULL;
 307         rq->part = NULL;
 308 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 309         rq->alloc_time_ns = alloc_time_ns;
 310 #endif
 311         if (blk_mq_need_time_stamp(rq))
 312                 rq->start_time_ns = ktime_get_ns();
 313         else
 314                 rq->start_time_ns = 0;
 315         rq->io_start_time_ns = 0;
 316         rq->stats_sectors = 0;
 317         rq->nr_phys_segments = 0;
 318 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 319         rq->nr_integrity_segments = 0;
 320 #endif
 321         blk_crypto_rq_set_defaults(rq);
 322         /* tag was already set */
 323         WRITE_ONCE(rq->deadline, 0);
 324
 325         rq->timeout = 0;
 326
 327         rq->end_io = NULL;
 328         rq->end_io_data = NULL;
 329
 330         data->ctx->rq_dispatched[op_is_sync(op)]++;
 331         refcount_set(&rq->ref, 1);
 332         return rq;
 333 }
 334
 335 static struct request *blk_mq_get_request(struct request_queue *q,
 336                                           struct bio *bio,
 337                                           struct blk_mq_alloc_data *data)
 338 {
 339         struct elevator_queue *e = q->elevator;
 340         struct request *rq;
 341         unsigned int tag;
 342         bool clear_ctx_on_error = false;
 343         u64 alloc_time_ns = 0;
 344
 345         /* alloc_time includes depth and tag waits */
 346         if (blk_queue_rq_alloc_time(q))
 347                 alloc_time_ns = ktime_get_ns();
 348
 349         data->q = q;
 350         if (likely(!data->ctx)) {
 351                 data->ctx = blk_mq_get_ctx(q);
 352                 clear_ctx_on_error = true;
 353         }
 354         if (likely(!data->hctx))
 355                 data->hctx = blk_mq_map_queue(q, data->cmd_flags,
 356                                                 data->ctx);
 357         if (data->cmd_flags & REQ_NOWAIT)
 358                 data->flags |= BLK_MQ_REQ_NOWAIT;
 359
 360         if (e) {
 361                 data->flags |= BLK_MQ_REQ_INTERNAL;
 362
 363                 /*
 364                  * Flush requests are special and go directly to the
 365                  * dispatch list. Don't include reserved tags in the
 366                  * limiting, as it isn't useful.
 367                  */
 368                 if (!op_is_flush(data->cmd_flags) &&
 369                     e->type->ops.limit_depth &&
 370                     !(data->flags & BLK_MQ_REQ_RESERVED))
 371                         e->type->ops.limit_depth(data->cmd_flags, data);
 372         } else {
 373                 blk_mq_tag_busy(data->hctx);
 374         }
 375
 376         tag = blk_mq_get_tag(data);
 377         if (tag == BLK_MQ_TAG_FAIL) {
 378                 if (clear_ctx_on_error)
 379                         data->ctx = NULL;
 380                 return NULL;
 381         }
 382
 383         rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
 384         if (!op_is_flush(data->cmd_flags)) {
 385                 rq->elv.icq = NULL;
 386                 if (e && e->type->ops.prepare_request) {
 387                         if (e->type->icq_cache)
 388                                 blk_mq_sched_assign_ioc(rq);
 389
 390                         e->type->ops.prepare_request(rq, bio);
 391                         rq->rq_flags |= RQF_ELVPRIV;
 392                 }
 393         }
 394         data->hctx->queued++;
 395         return rq;
 396 }
 397
 398 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 399                 blk_mq_req_flags_t flags)
 400 {
 401         struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 402         struct request *rq;
 403         int ret;
 404
 405         ret = blk_queue_enter(q, flags);
 406         if (ret)
 407                 return ERR_PTR(ret);
 408
 409         rq = blk_mq_get_request(q, NULL, &alloc_data);
 410         if (!rq)
 411                 goto out_queue_exit;
 412         rq->__data_len = 0;
 413         rq->__sector = (sector_t) -1;
 414         rq->bio = rq->biotail = NULL;
 415         return rq;
 416 out_queue_exit:
 417         blk_queue_exit(q);
 418         return ERR_PTR(-EWOULDBLOCK);
 419 }
 420 EXPORT_SYMBOL(blk_mq_alloc_request);
 421
 422 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 423         unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 424 {
 425         struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 426         struct request *rq;
 427         unsigned int cpu;
 428         int ret;
 429
 430         /*
 431          * If the tag allocator sleeps we could get an allocation for a
 432          * different hardware context.  No need to complicate the low level
 433          * allocator for this for the rare use case of a command tied to
 434          * a specific queue.
 435          */
 436         if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
 437                 return ERR_PTR(-EINVAL);
 438
 439         if (hctx_idx >= q->nr_hw_queues)
 440                 return ERR_PTR(-EIO);
 441
 442         ret = blk_queue_enter(q, flags);
 443         if (ret)
 444                 return ERR_PTR(ret);
 445
 446         /*
 447          * Check if the hardware context is actually mapped to anything.
 448          * If not tell the caller that it should skip this queue.
 449          */
 450         ret = -EXDEV;
 451         alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
 452         if (!blk_mq_hw_queue_mapped(alloc_data.hctx))
 453                 goto out_queue_exit;
 454         cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 455         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 456
 457         ret = -EWOULDBLOCK;
 458         rq = blk_mq_get_request(q, NULL, &alloc_data);
 459         if (!rq)
 460                 goto out_queue_exit;
 461         return rq;
 462 out_queue_exit:
 463         blk_queue_exit(q);
 464         return ERR_PTR(ret);
 465 }
 466 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 467
 468 static void __blk_mq_free_request(struct request *rq)
 469 {
 470         struct request_queue *q = rq->q;
 471         struct blk_mq_ctx *ctx = rq->mq_ctx;
 472         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 473         const int sched_tag = rq->internal_tag;
 474
 475         blk_crypto_free_request(rq);
 476         blk_pm_mark_last_busy(rq);
 477         rq->mq_hctx = NULL;
 478         if (rq->tag != -1)
 479                 blk_mq_put_tag(hctx->tags, ctx, rq->tag);
 480         if (sched_tag != -1)
 481                 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 482         blk_mq_sched_restart(hctx);
 483         blk_queue_exit(q);
 484 }
 485
 486 void blk_mq_free_request(struct request *rq)
 487 {
 488         struct request_queue *q = rq->q;
 489         struct elevator_queue *e = q->elevator;
 490         struct blk_mq_ctx *ctx = rq->mq_ctx;
 491         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 492
 493         if (rq->rq_flags & RQF_ELVPRIV) {
 494                 if (e && e->type->ops.finish_request)
 495                         e->type->ops.finish_request(rq);
 496                 if (rq->elv.icq) {
 497                         put_io_context(rq->elv.icq->ioc);
 498                         rq->elv.icq = NULL;
 499                 }
 500         }
 501
 502         ctx->rq_completed[rq_is_sync(rq)]++;
 503         if (rq->rq_flags & RQF_MQ_INFLIGHT)
 504                 atomic_dec(&hctx->nr_active);
 505
 506         if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 507                 laptop_io_completion(q->backing_dev_info);
 508
 509         rq_qos_done(q, rq);
 510
 511         WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 512         if (refcount_dec_and_test(&rq->ref))
 513                 __blk_mq_free_request(rq);
 514 }
 515 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 516
 517 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 518 {
 519         u64 now = 0;
 520
 521         if (blk_mq_need_time_stamp(rq))
 522                 now = ktime_get_ns();
 523
 524         if (rq->rq_flags & RQF_STATS) {
 525                 blk_mq_poll_stats_start(rq->q);
 526                 blk_stat_add(rq, now);
 527         }
 528
 529         if (rq->internal_tag != -1)
 530                 blk_mq_sched_completed_request(rq, now);
 531
 532         blk_account_io_done(rq, now);
 533
 534         if (rq->end_io) {
 535                 rq_qos_done(rq->q, rq);
 536                 rq->end_io(rq, error);
 537         } else {
 538                 blk_mq_free_request(rq);
 539         }
 540 }
 541 EXPORT_SYMBOL(__blk_mq_end_request);
 542
 543 void blk_mq_end_request(struct request *rq, blk_status_t error)
 544 {
 545         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 546                 BUG();
 547         __blk_mq_end_request(rq, error);
 548 }
 549 EXPORT_SYMBOL(blk_mq_end_request);
 550
 551 static void __blk_mq_complete_request_remote(void *data)
 552 {
 553         struct request *rq = data;
 554         struct request_queue *q = rq->q;
 555
 556         q->mq_ops->complete(rq);
 557 }
 558
 559 /**
 560  * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
 561  *                              injection that could drop the completion.
 562  * @rq: Request to be force completed
 563  *
 564  * Drivers should use blk_mq_complete_request() to complete requests in their
 565  * normal IO path. For timeout error recovery, drivers may call this forced
 566  * completion routine after they've reclaimed timed out requests to bypass
 567  * potentially subsequent fake timeouts.
 568  */
 569 void blk_mq_force_complete_rq(struct request *rq)
 570 {
 571         struct blk_mq_ctx *ctx = rq->mq_ctx;
 572         struct request_queue *q = rq->q;
 573         bool shared = false;
 574         int cpu;
 575
 576         WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 577         /*
 578          * Most of single queue controllers, there is only one irq vector
 579          * for handling IO completion, and the only irq's affinity is set
 580          * as all possible CPUs. On most of ARCHs, this affinity means the
 581          * irq is handled on one specific CPU.
 582          *
 583          * So complete IO reqeust in softirq context in case of single queue
 584          * for not degrading IO performance by irqsoff latency.
 585          */
 586         if (q->nr_hw_queues == 1) {
 587                 __blk_complete_request(rq);
 588                 return;
 589         }
 590
 591         /*
 592          * For a polled request, always complete locallly, it's pointless
 593          * to redirect the completion.
 594          */
 595         if ((rq->cmd_flags & REQ_HIPRI) ||
 596             !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
 597                 q->mq_ops->complete(rq);
 598                 return;
 599         }
 600
 601         cpu = get_cpu();
 602         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
 603                 shared = cpus_share_cache(cpu, ctx->cpu);
 604
 605         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 606                 rq->csd.func = __blk_mq_complete_request_remote;
 607                 rq->csd.info = rq;
 608                 rq->csd.flags = 0;
 609                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 610         } else {
 611                 q->mq_ops->complete(rq);
 612         }
 613         put_cpu();
 614 }
 615 EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
 616
 617 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
 618         __releases(hctx->srcu)
 619 {
 620         if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 621                 rcu_read_unlock();
 622         else
 623                 srcu_read_unlock(hctx->srcu, srcu_idx);
 624 }
 625
 626 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 627         __acquires(hctx->srcu)
 628 {
 629         if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 630                 /* shut up gcc false positive */
 631                 *srcu_idx = 0;
 632                 rcu_read_lock();
 633         } else
 634                 *srcu_idx = srcu_read_lock(hctx->srcu);
 635 }
 636
 637 /**
 638  * blk_mq_complete_request - end I/O on a request
 639  * @rq:         the request being processed
 640  *
 641  * Description:
 642  *      Ends all I/O on a request. It does not handle partial completions.
 643  *      The actual completion happens out-of-order, through a IPI handler.
 644  **/
 645 bool blk_mq_complete_request(struct request *rq)
 646 {
 647         if (unlikely(blk_should_fake_timeout(rq->q)))
 648                 return false;
 649         blk_mq_force_complete_rq(rq);
 650         return true;
 651 }
 652 EXPORT_SYMBOL(blk_mq_complete_request);
 653
 654 /**
 655  * blk_mq_start_request - Start processing a request
 656  * @rq: Pointer to request to be started
 657  *
 658  * Function used by device drivers to notify the block layer that a request
 659  * is going to be processed now, so blk layer can do proper initializations
 660  * such as starting the timeout timer.
 661  */
 662 void blk_mq_start_request(struct request *rq)
 663 {
 664         struct request_queue *q = rq->q;
 665
 666         trace_block_rq_issue(q, rq);
 667
 668         if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 669                 rq->io_start_time_ns = ktime_get_ns();
 670                 rq->stats_sectors = blk_rq_sectors(rq);
 671                 rq->rq_flags |= RQF_STATS;
 672                 rq_qos_issue(q, rq);
 673         }
 674
 675         WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 676
 677         blk_add_timer(rq);
 678         WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 679
 680 #ifdef CONFIG_BLK_DEV_INTEGRITY
 681         if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
 682                 q->integrity.profile->prepare_fn(rq);
 683 #endif
 684 }
 685 EXPORT_SYMBOL(blk_mq_start_request);
 686
 687 static void __blk_mq_requeue_request(struct request *rq)
 688 {
 689         struct request_queue *q = rq->q;
 690
 691         blk_mq_put_driver_tag(rq);
 692
 693         trace_block_rq_requeue(q, rq);
 694         rq_qos_requeue(q, rq);
 695
 696         if (blk_mq_request_started(rq)) {
 697                 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 698                 rq->rq_flags &= ~RQF_TIMED_OUT;
 699         }
 700 }
 701
 702 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 703 {
 704         __blk_mq_requeue_request(rq);
 705
 706         /* this request will be re-inserted to io scheduler queue */
 707         blk_mq_sched_requeue_request(rq);
 708
 709         BUG_ON(!list_empty(&rq->queuelist));
 710         blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 711 }
 712 EXPORT_SYMBOL(blk_mq_requeue_request);
 713
 714 static void blk_mq_requeue_work(struct work_struct *work)
 715 {
 716         struct request_queue *q =
 717                 container_of(work, struct request_queue, requeue_work.work);
 718         LIST_HEAD(rq_list);
 719         struct request *rq, *next;
 720
 721         spin_lock_irq(&q->requeue_lock);
 722         list_splice_init(&q->requeue_list, &rq_list);
 723         spin_unlock_irq(&q->requeue_lock);
 724
 725         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 726                 if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
 727                         continue;
 728
 729                 rq->rq_flags &= ~RQF_SOFTBARRIER;
 730                 list_del_init(&rq->queuelist);
 731                 /*
 732                  * If RQF_DONTPREP, rq has contained some driver specific
 733                  * data, so insert it to hctx dispatch list to avoid any
 734                  * merge.
 735                  */
 736                 if (rq->rq_flags & RQF_DONTPREP)
 737                         blk_mq_request_bypass_insert(rq, false, false);
 738                 else
 739                         blk_mq_sched_insert_request(rq, true, false, false);
 740         }
 741
 742         while (!list_empty(&rq_list)) {
 743                 rq = list_entry(rq_list.next, struct request, queuelist);
 744                 list_del_init(&rq->queuelist);
 745                 blk_mq_sched_insert_request(rq, false, false, false);
 746         }
 747
 748         blk_mq_run_hw_queues(q, false);
 749 }
 750
 751 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 752                                 bool kick_requeue_list)
 753 {
 754         struct request_queue *q = rq->q;
 755         unsigned long flags;
 756
 757         /*
 758          * We abuse this flag that is otherwise used by the I/O scheduler to
 759          * request head insertion from the workqueue.
 760          */
 761         BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 762
 763         spin_lock_irqsave(&q->requeue_lock, flags);
 764         if (at_head) {
 765                 rq->rq_flags |= RQF_SOFTBARRIER;
 766                 list_add(&rq->queuelist, &q->requeue_list);
 767         } else {
 768                 list_add_tail(&rq->queuelist, &q->requeue_list);
 769         }
 770         spin_unlock_irqrestore(&q->requeue_lock, flags);
 771
 772         if (kick_requeue_list)
 773                 blk_mq_kick_requeue_list(q);
 774 }
 775
 776 void blk_mq_kick_requeue_list(struct request_queue *q)
 777 {
 778         kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
 779 }
 780 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 781
 782 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 783                                     unsigned long msecs)
 784 {
 785         kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
 786                                     msecs_to_jiffies(msecs));
 787 }
 788 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 789
 790 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 791 {
 792         if (tag < tags->nr_tags) {
 793                 prefetch(tags->rqs[tag]);
 794                 return tags->rqs[tag];
 795         }
 796
 797         return NULL;
 798 }
 799 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 800
 801 static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 802                                void *priv, bool reserved)
 803 {
 804         /*
 805          * If we find a request that is inflight and the queue matches,
 806          * we know the queue is busy. Return false to stop the iteration.
 807          */
 808         if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
 809                 bool *busy = priv;
 810
 811                 *busy = true;
 812                 return false;
 813         }
 814
 815         return true;
 816 }
 817
 818 bool blk_mq_queue_inflight(struct request_queue *q)
 819 {
 820         bool busy = false;
 821
 822         blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
 823         return busy;
 824 }
 825 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
 826
 827 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 828 {
 829         req->rq_flags |= RQF_TIMED_OUT;
 830         if (req->q->mq_ops->timeout) {
 831                 enum blk_eh_timer_return ret;
 832
 833                 ret = req->q->mq_ops->timeout(req, reserved);
 834                 if (ret == BLK_EH_DONE)
 835                         return;
 836                 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
 837         }
 838
 839         blk_add_timer(req);
 840 }
 841
 842 static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 843 {
 844         unsigned long deadline;
 845
 846         if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
 847                 return false;
 848         if (rq->rq_flags & RQF_TIMED_OUT)
 849                 return false;
 850
 851         deadline = READ_ONCE(rq->deadline);
 852         if (time_after_eq(jiffies, deadline))
 853                 return true;
 854
 855         if (*next == 0)
 856                 *next = deadline;
 857         else if (time_after(*next, deadline))
 858                 *next = deadline;
 859         return false;
 860 }
 861
 862 static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 863                 struct request *rq, void *priv, bool reserved)
 864 {
 865         unsigned long *next = priv;
 866
 867         /*
 868          * Just do a quick check if it is expired before locking the request in
 869          * so we're not unnecessarilly synchronizing across CPUs.
 870          */
 871         if (!blk_mq_req_expired(rq, next))
 872                 return true;
 873
 874         /*
 875          * We have reason to believe the request may be expired. Take a
 876          * reference on the request to lock this request lifetime into its
 877          * currently allocated context to prevent it from being reallocated in
 878          * the event the completion by-passes this timeout handler.
 879          *
 880          * If the reference was already released, then the driver beat the
 881          * timeout handler to posting a natural completion.
 882          */
 883         if (!refcount_inc_not_zero(&rq->ref))
 884                 return true;
 885
 886         /*
 887          * The request is now locked and cannot be reallocated underneath the
 888          * timeout handler's processing. Re-verify this exact request is truly
 889          * expired; if it is not expired, then the request was completed and
 890          * reallocated as a new request.
 891          */
 892         if (blk_mq_req_expired(rq, next))
 893                 blk_mq_rq_timed_out(rq, reserved);
 894
 895         if (is_flush_rq(rq, hctx))
 896                 rq->end_io(rq, 0);
 897         else if (refcount_dec_and_test(&rq->ref))
 898                 __blk_mq_free_request(rq);
 899
 900         return true;
 901 }
 902
 903 static void blk_mq_timeout_work(struct work_struct *work)
 904 {
 905         struct request_queue *q =
 906                 container_of(work, struct request_queue, timeout_work);
 907         unsigned long next = 0;
 908         struct blk_mq_hw_ctx *hctx;
 909         int i;
 910
 911         /* A deadlock might occur if a request is stuck requiring a
 912          * timeout at the same time a queue freeze is waiting
 913          * completion, since the timeout code would not be able to
 914          * acquire the queue reference here.
 915          *
 916          * That's why we don't use blk_queue_enter here; instead, we use
 917          * percpu_ref_tryget directly, because we need to be able to
 918          * obtain a reference even in the short window between the queue
 919          * starting to freeze, by dropping the first reference in
 920          * blk_freeze_queue_start, and the moment the last request is
 921          * consumed, marked by the instant q_usage_counter reaches
 922          * zero.
 923          */
 924         if (!percpu_ref_tryget(&q->q_usage_counter))
 925                 return;
 926
 927         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 928
 929         if (next != 0) {
 930                 mod_timer(&q->timeout, next);
 931         } else {
 932                 /*
 933                  * Request timeouts are handled as a forward rolling timer. If
 934                  * we end up here it means that no requests are pending and
 935                  * also that no request has been pending for a while. Mark
 936                  * each hctx as idle.
 937                  */
 938                 queue_for_each_hw_ctx(q, hctx, i) {
 939                         /* the hctx may be unmapped, so check it here */
 940                         if (blk_mq_hw_queue_mapped(hctx))
 941                                 blk_mq_tag_idle(hctx);
 942                 }
 943         }
 944         blk_queue_exit(q);
 945 }
 946
 947 struct flush_busy_ctx_data {
 948         struct blk_mq_hw_ctx *hctx;
 949         struct list_head *list;
 950 };
 951
 952 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 953 {
 954         struct flush_busy_ctx_data *flush_data = data;
 955         struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 956         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 957         enum hctx_type type = hctx->type;
 958
 959         spin_lock(&ctx->lock);
 960         list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
 961         sbitmap_clear_bit(sb, bitnr);
 962         spin_unlock(&ctx->lock);
 963         return true;
 964 }
 965
 966 /*
 967  * Process software queues that have been marked busy, splicing them
 968  * to the for-dispatch
 969  */
 970 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 971 {
 972         struct flush_busy_ctx_data data = {
 973                 .hctx = hctx,
 974                 .list = list,
 975         };
 976
 977         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 978 }
 979 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 980
 981 struct dispatch_rq_data {
 982         struct blk_mq_hw_ctx *hctx;
 983         struct request *rq;
 984 };
 985
 986 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 987                 void *data)
 988 {
 989         struct dispatch_rq_data *dispatch_data = data;
 990         struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
 991         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 992         enum hctx_type type = hctx->type;
 993
 994         spin_lock(&ctx->lock);
 995         if (!list_empty(&ctx->rq_lists[type])) {
 996                 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
 997                 list_del_init(&dispatch_data->rq->queuelist);
 998                 if (list_empty(&ctx->rq_lists[type]))
 999                         sbitmap_clear_bit(sb, bitnr);
1000         }
1001         spin_unlock(&ctx->lock);
1002
1003         return !dispatch_data->rq;
1004 }
1005
1006 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1007                                         struct blk_mq_ctx *start)
1008 {
1009         unsigned off = start ? start->index_hw[hctx->type] : 0;
1010         struct dispatch_rq_data data = {
1011                 .hctx = hctx,
1012                 .rq   = NULL,
1013         };
1014
1015         __sbitmap_for_each_set(&hctx->ctx_map, off,
1016                                dispatch_rq_from_ctx, &data);
1017
1018         return data.rq;
1019 }
1020
1021 static inline unsigned int queued_to_index(unsigned int queued)
1022 {
1023         if (!queued)
1024                 return 0;
1025
1026         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
1027 }
1028
1029 bool blk_mq_get_driver_tag(struct request *rq)
1030 {
1031         struct blk_mq_alloc_data data = {
1032                 .q = rq->q,
1033                 .hctx = rq->mq_hctx,
1034                 .flags = BLK_MQ_REQ_NOWAIT,
1035                 .cmd_flags = rq->cmd_flags,
1036         };
1037         bool shared;
1038
1039         if (rq->tag != -1)
1040                 return true;
1041
1042         if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1043                 data.flags |= BLK_MQ_REQ_RESERVED;
1044
1045         shared = blk_mq_tag_busy(data.hctx);
1046         rq->tag = blk_mq_get_tag(&data);
1047         if (rq->tag >= 0) {
1048                 if (shared) {
1049                         rq->rq_flags |= RQF_MQ_INFLIGHT;
1050                         atomic_inc(&data.hctx->nr_active);
1051                 }
1052                 data.hctx->tags->rqs[rq->tag] = rq;
1053         }
1054
1055         return rq->tag != -1;
1056 }
1057
1058 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1059                                 int flags, void *key)
1060 {
1061         struct blk_mq_hw_ctx *hctx;
1062
1063         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1064
1065         spin_lock(&hctx->dispatch_wait_lock);
1066         if (!list_empty(&wait->entry)) {
1067                 struct sbitmap_queue *sbq;
1068
1069                 list_del_init(&wait->entry);
1070                 sbq = &hctx->tags->bitmap_tags;
1071                 atomic_dec(&sbq->ws_active);
1072         }
1073         spin_unlock(&hctx->dispatch_wait_lock);
1074
1075         blk_mq_run_hw_queue(hctx, true);
1076         return 1;
1077 }
1078
1079 /*
1080  * Mark us waiting for a tag. For shared tags, this involves hooking us into
1081  * the tag wakeups. For non-shared tags, we can simply mark us needing a
1082  * restart. For both cases, take care to check the condition again after
1083  * marking us as waiting.
1084  */
1085 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1086                                  struct request *rq)
1087 {
1088         struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
1089         struct wait_queue_head *wq;
1090         wait_queue_entry_t *wait;
1091         bool ret;
1092
1093         if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1094                 blk_mq_sched_mark_restart_hctx(hctx);
1095
1096                 /*
1097                  * It's possible that a tag was freed in the window between the
1098                  * allocation failure and adding the hardware queue to the wait
1099                  * queue.
1100                  *
1101                  * Don't clear RESTART here, someone else could have set it.
1102                  * At most this will cost an extra queue run.
1103                  */
1104                 return blk_mq_get_driver_tag(rq);
1105         }
1106
1107         wait = &hctx->dispatch_wait;
1108         if (!list_empty_careful(&wait->entry))
1109                 return false;
1110
1111         wq = &bt_wait_ptr(sbq, hctx)->wait;
1112
1113         spin_lock_irq(&wq->lock);
1114         spin_lock(&hctx->dispatch_wait_lock);
1115         if (!list_empty(&wait->entry)) {
1116                 spin_unlock(&hctx->dispatch_wait_lock);
1117                 spin_unlock_irq(&wq->lock);
1118                 return false;
1119         }
1120
1121         atomic_inc(&sbq->ws_active);
1122         wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1123         __add_wait_queue(wq, wait);
1124
1125         /*
1126          * It's possible that a tag was freed in the window between the
1127          * allocation failure and adding the hardware queue to the wait
1128          * queue.
1129          */
1130         ret = blk_mq_get_driver_tag(rq);
1131         if (!ret) {
1132                 spin_unlock(&hctx->dispatch_wait_lock);
1133                 spin_unlock_irq(&wq->lock);
1134                 return false;
1135         }
1136
1137         /*
1138          * We got a tag, remove ourselves from the wait queue to ensure
1139          * someone else gets the wakeup.
1140          */
1141         list_del_init(&wait->entry);
1142         atomic_dec(&sbq->ws_active);
1143         spin_unlock(&hctx->dispatch_wait_lock);
1144         spin_unlock_irq(&wq->lock);
1145
1146         return true;
1147 }
1148
1149 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
1150 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
1151 /*
1152  * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1153  * - EWMA is one simple way to compute running average value
1154  * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1155  * - take 4 as factor for avoiding to get too small(0) result, and this
1156  *   factor doesn't matter because EWMA decreases exponentially
1157  */
1158 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1159 {
1160         unsigned int ewma;
1161
1162         if (hctx->queue->elevator)
1163                 return;
1164
1165         ewma = hctx->dispatch_busy;
1166
1167         if (!ewma && !busy)
1168                 return;
1169
1170         ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1171         if (busy)
1172                 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1173         ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1174
1175         hctx->dispatch_busy = ewma;
1176 }
1177
1178 #define BLK_MQ_RESOURCE_DELAY   3               /* ms units */
1179
1180 static void blk_mq_handle_dev_resource(struct request *rq,
1181                                        struct list_head *list)
1182 {
1183         struct request *next =
1184                 list_first_entry_or_null(list, struct request, queuelist);
1185
1186         /*
1187          * If an I/O scheduler has been configured and we got a driver tag for
1188          * the next request already, free it.
1189          */
1190         if (next)
1191                 blk_mq_put_driver_tag(next);
1192
1193         list_add(&rq->queuelist, list);
1194         __blk_mq_requeue_request(rq);
1195 }
1196
1197 static void blk_mq_handle_zone_resource(struct request *rq,
1198                                         struct list_head *zone_list)
1199 {
1200         /*
1201          * If we end up here it is because we cannot dispatch a request to a
1202          * specific zone due to LLD level zone-write locking or other zone
1203          * related resource not being available. In this case, set the request
1204          * aside in zone_list for retrying it later.
1205          */
1206         list_add(&rq->queuelist, zone_list);
1207         __blk_mq_requeue_request(rq);
1208 }
1209
1210 /*
1211  * Returns true if we did some work AND can potentially do more.
1212  */
1213 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1214                              bool got_budget)
1215 {
1216         struct blk_mq_hw_ctx *hctx;
1217         struct request *rq, *nxt;
1218         bool no_tag = false;
1219         int errors, queued;
1220         blk_status_t ret = BLK_STS_OK;
1221         bool no_budget_avail = false;
1222         LIST_HEAD(zone_list);
1223
1224         if (list_empty(list))
1225                 return false;
1226
1227         WARN_ON(!list_is_singular(list) && got_budget);
1228
1229         /*
1230          * Now process all the entries, sending them to the driver.
1231          */
1232         errors = queued = 0;
1233         do {
1234                 struct blk_mq_queue_data bd;
1235
1236                 rq = list_first_entry(list, struct request, queuelist);
1237
1238                 hctx = rq->mq_hctx;
1239                 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
1240                         blk_mq_put_driver_tag(rq);
1241                         no_budget_avail = true;
1242                         break;
1243                 }
1244
1245                 if (!blk_mq_get_driver_tag(rq)) {
1246                         /*
1247                          * The initial allocation attempt failed, so we need to
1248                          * rerun the hardware queue when a tag is freed. The
1249                          * waitqueue takes care of that. If the queue is run
1250                          * before we add this entry back on the dispatch list,
1251                          * we'll re-run it below.
1252                          */
1253                         if (!blk_mq_mark_tag_wait(hctx, rq)) {
1254                                 blk_mq_put_dispatch_budget(hctx);
1255                                 /*
1256                                  * For non-shared tags, the RESTART check
1257                                  * will suffice.
1258                                  */
1259                                 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1260                                         no_tag = true;
1261                                 break;
1262                         }
1263                 }
1264
1265                 list_del_init(&rq->queuelist);
1266
1267                 bd.rq = rq;
1268
1269                 /*
1270                  * Flag last if we have no more requests, or if we have more
1271                  * but can't assign a driver tag to it.
1272                  */
1273                 if (list_empty(list))
1274                         bd.last = true;
1275                 else {
1276                         nxt = list_first_entry(list, struct request, queuelist);
1277                         bd.last = !blk_mq_get_driver_tag(nxt);
1278                 }
1279
1280                 ret = q->mq_ops->queue_rq(hctx, &bd);
1281                 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
1282                         blk_mq_handle_dev_resource(rq, list);
1283                         break;
1284                 } else if (ret == BLK_STS_ZONE_RESOURCE) {
1285                         /*
1286                          * Move the request to zone_list and keep going through
1287                          * the dispatch list to find more requests the drive can
1288                          * accept.
1289                          */
1290                         blk_mq_handle_zone_resource(rq, &zone_list);
1291                         if (list_empty(list))
1292                                 break;
1293                         continue;
1294                 }
1295
1296                 if (unlikely(ret != BLK_STS_OK)) {
1297                         errors++;
1298                         blk_mq_end_request(rq, BLK_STS_IOERR);
1299                         continue;
1300                 }
1301
1302                 queued++;
1303         } while (!list_empty(list));
1304
1305         if (!list_empty(&zone_list))
1306                 list_splice_tail_init(&zone_list, list);
1307
1308         hctx->dispatched[queued_to_index(queued)]++;
1309
1310         /*
1311          * Any items that need requeuing? Stuff them into hctx->dispatch,
1312          * that is where we will continue on next queue run.
1313          */
1314         if (!list_empty(list)) {
1315                 bool needs_restart;
1316
1317                 /*
1318                  * If we didn't flush the entire list, we could have told
1319                  * the driver there was more coming, but that turned out to
1320                  * be a lie.
1321                  */
1322                 if (q->mq_ops->commit_rqs && queued)
1323                         q->mq_ops->commit_rqs(hctx);
1324
1325                 spin_lock(&hctx->lock);
1326                 list_splice_tail_init(list, &hctx->dispatch);
1327                 spin_unlock(&hctx->lock);
1328
1329                 /*
1330                  * If SCHED_RESTART was set by the caller of this function and
1331                  * it is no longer set that means that it was cleared by another
1332                  * thread and hence that a queue rerun is needed.
1333                  *
1334                  * If 'no_tag' is set, that means that we failed getting
1335                  * a driver tag with an I/O scheduler attached. If our dispatch
1336                  * waitqueue is no longer active, ensure that we run the queue
1337                  * AFTER adding our entries back to the list.
1338                  *
1339                  * If no I/O scheduler has been configured it is possible that
1340                  * the hardware queue got stopped and restarted before requests
1341                  * were pushed back onto the dispatch list. Rerun the queue to
1342                  * avoid starvation. Notes:
1343                  * - blk_mq_run_hw_queue() checks whether or not a queue has
1344                  *   been stopped before rerunning a queue.
1345                  * - Some but not all block drivers stop a queue before
1346                  *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1347                  *   and dm-rq.
1348                  *
1349                  * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
1350                  * bit is set, run queue after a delay to avoid IO stalls
1351                  * that could otherwise occur if the queue is idle.  We'll do
1352                  * similar if we couldn't get budget and SCHED_RESTART is set.
1353                  */
1354                 needs_restart = blk_mq_sched_needs_restart(hctx);
1355                 if (!needs_restart ||
1356                     (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1357                         blk_mq_run_hw_queue(hctx, true);
1358                 else if (needs_restart && (ret == BLK_STS_RESOURCE ||
1359                                            no_budget_avail))
1360                         blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
1361
1362                 blk_mq_update_dispatch_busy(hctx, true);
1363                 return false;
1364         } else
1365                 blk_mq_update_dispatch_busy(hctx, false);
1366
1367         /*
1368          * If the host/device is unable to accept more work, inform the
1369          * caller of that.
1370          */
1371         if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1372                 return false;
1373
1374         return (queued + errors) != 0;
1375 }
1376
1377 /**
1378  * __blk_mq_run_hw_queue - Run a hardware queue.
1379  * @hctx: Pointer to the hardware queue to run.
1380  *
1381  * Send pending requests to the hardware.
1382  */
1383 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1384 {
1385         int srcu_idx;
1386
1387         /*
1388          * We should be running this queue from one of the CPUs that
1389          * are mapped to it.
1390          *
1391          * There are at least two related races now between setting
1392          * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
1393          * __blk_mq_run_hw_queue():
1394          *
1395          * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
1396          *   but later it becomes online, then this warning is harmless
1397          *   at all
1398          *
1399          * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
1400          *   but later it becomes offline, then the warning can't be
1401          *   triggered, and we depend on blk-mq timeout handler to
1402          *   handle dispatched requests to this hctx
1403          */
1404         if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1405                 cpu_online(hctx->next_cpu)) {
1406                 printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
1407                         raw_smp_processor_id(),
1408                         cpumask_empty(hctx->cpumask) ? "inactive": "active");
1409                 dump_stack();
1410         }
1411
1412         /*
1413          * We can't run the queue inline with ints disabled. Ensure that
1414          * we catch bad users of this early.
1415          */
1416         WARN_ON_ONCE(in_interrupt());
1417
1418         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1419
1420         hctx_lock(hctx, &srcu_idx);
1421         blk_mq_sched_dispatch_requests(hctx);
1422         hctx_unlock(hctx, srcu_idx);
1423 }
1424
1425 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
1426 {
1427         int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
1428
1429         if (cpu >= nr_cpu_ids)
1430                 cpu = cpumask_first(hctx->cpumask);
1431         return cpu;
1432 }
1433
1434 /*
1435  * It'd be great if the workqueue API had a way to pass
1436  * in a mask and had some smarts for more clever placement.
1437  * For now we just round-robin here, switching for every
1438  * BLK_MQ_CPU_WORK_BATCH queued items.
1439  */
1440 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1441 {
1442         bool tried = false;
1443         int next_cpu = hctx->next_cpu;
1444
1445         if (hctx->queue->nr_hw_queues == 1)
1446                 return WORK_CPU_UNBOUND;
1447
1448         if (--hctx->next_cpu_batch <= 0) {
1449 select_cpu:
1450                 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
1451                                 cpu_online_mask);
1452                 if (next_cpu >= nr_cpu_ids)
1453                         next_cpu = blk_mq_first_mapped_cpu(hctx);
1454                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1455         }
1456
1457         /*
1458          * Do unbound schedule if we can't find a online CPU for this hctx,
1459          * and it should only happen in the path of handling CPU DEAD.
1460          */
1461         if (!cpu_online(next_cpu)) {
1462                 if (!tried) {
1463                         tried = true;
1464                         goto select_cpu;
1465                 }
1466
1467                 /*
1468                  * Make sure to re-select CPU next time once after CPUs
1469                  * in hctx->cpumask become online again.
1470                  */
1471                 hctx->next_cpu = next_cpu;
1472                 hctx->next_cpu_batch = 1;
1473                 return WORK_CPU_UNBOUND;
1474         }
1475
1476         hctx->next_cpu = next_cpu;
1477         return next_cpu;
1478 }
1479
1480 /**
1481  * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1482  * @hctx: Pointer to the hardware queue to run.
1483  * @async: If we want to run the queue asynchronously.
1484  * @msecs: Microseconds of delay to wait before running the queue.
1485  *
1486  * If !@async, try to run the queue now. Else, run the queue asynchronously and
1487  * with a delay of @msecs.
1488  */
1489 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1490                                         unsigned long msecs)
1491 {
1492         if (unlikely(blk_mq_hctx_stopped(hctx)))
1493                 return;
1494
1495         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1496                 int cpu = get_cpu();
1497                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1498                         __blk_mq_run_hw_queue(hctx);
1499                         put_cpu();
1500                         return;
1501                 }
1502
1503                 put_cpu();
1504         }
1505
1506         kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1507                                     msecs_to_jiffies(msecs));
1508 }
1509
1510 /**
1511  * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1512  * @hctx: Pointer to the hardware queue to run.
1513  * @msecs: Microseconds of delay to wait before running the queue.
1514  *
1515  * Run a hardware queue asynchronously with a delay of @msecs.
1516  */
1517 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1518 {
1519         __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1520 }
1521 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1522
1523 /**
1524  * blk_mq_run_hw_queue - Start to run a hardware queue.
1525  * @hctx: Pointer to the hardware queue to run.
1526  * @async: If we want to run the queue asynchronously.
1527  *
1528  * Check if the request queue is not in a quiesced state and if there are
1529  * pending requests to be sent. If this is true, run the queue to send requests
1530  * to hardware.
1531  */
1532 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1533 {
1534         int srcu_idx;
1535         bool need_run;
1536
1537         /*
1538          * When queue is quiesced, we may be switching io scheduler, or
1539          * updating nr_hw_queues, or other things, and we can't run queue
1540          * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1541          *
1542          * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1543          * quiesced.
1544          */
1545         hctx_lock(hctx, &srcu_idx);
1546         need_run = !blk_queue_quiesced(hctx->queue) &&
1547                 blk_mq_hctx_has_pending(hctx);
1548         hctx_unlock(hctx, srcu_idx);
1549
1550         if (need_run)
1551                 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1552 }
1553 EXPORT_SYMBOL(blk_mq_run_hw_queue);
1554
1555 /**
1556  * blk_mq_run_hw_queue - Run all hardware queues in a request queue.
1557  * @q: Pointer to the request queue to run.
1558  * @async: If we want to run the queue asynchronously.
1559  */
1560 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1561 {
1562         struct blk_mq_hw_ctx *hctx;
1563         int i;
1564
1565         queue_for_each_hw_ctx(q, hctx, i) {
1566                 if (blk_mq_hctx_stopped(hctx))
1567                         continue;
1568
1569                 blk_mq_run_hw_queue(hctx, async);
1570         }
1571 }
1572 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1573
1574 /**
1575  * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
1576  * @q: Pointer to the request queue to run.
1577  * @msecs: Microseconds of delay to wait before running the queues.
1578  */
1579 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
1580 {
1581         struct blk_mq_hw_ctx *hctx;
1582         int i;
1583
1584         queue_for_each_hw_ctx(q, hctx, i) {
1585                 if (blk_mq_hctx_stopped(hctx))
1586                         continue;
1587
1588                 blk_mq_delay_run_hw_queue(hctx, msecs);
1589         }
1590 }
1591 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
1592
1593 /**
1594  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1595  * @q: request queue.
1596  *
1597  * The caller is responsible for serializing this function against
1598  * blk_mq_{start,stop}_hw_queue().
1599  */
1600 bool blk_mq_queue_stopped(struct request_queue *q)
1601 {
1602         struct blk_mq_hw_ctx *hctx;
1603         int i;
1604
1605         queue_for_each_hw_ctx(q, hctx, i)
1606                 if (blk_mq_hctx_stopped(hctx))
1607                         return true;
1608
1609         return false;
1610 }
1611 EXPORT_SYMBOL(blk_mq_queue_stopped);
1612
1613 /*
1614  * This function is often used for pausing .queue_rq() by driver when
1615  * there isn't enough resource or some conditions aren't satisfied, and
1616  * BLK_STS_RESOURCE is usually returned.
1617  *
1618  * We do not guarantee that dispatch can be drained or blocked
1619  * after blk_mq_stop_hw_queue() returns. Please use
1620  * blk_mq_quiesce_queue() for that requirement.
1621  */
1622 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1623 {
1624         cancel_delayed_work(&hctx->run_work);
1625
1626         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1627 }
1628 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1629
1630 /*
1631  * This function is often used for pausing .queue_rq() by driver when
1632  * there isn't enough resource or some conditions aren't satisfied, and
1633  * BLK_STS_RESOURCE is usually returned.
1634  *
1635  * We do not guarantee that dispatch can be drained or blocked
1636  * after blk_mq_stop_hw_queues() returns. Please use
1637  * blk_mq_quiesce_queue() for that requirement.
1638  */
1639 void blk_mq_stop_hw_queues(struct request_queue *q)
1640 {
1641         struct blk_mq_hw_ctx *hctx;
1642         int i;
1643
1644         queue_for_each_hw_ctx(q, hctx, i)
1645                 blk_mq_stop_hw_queue(hctx);
1646 }
1647 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1648
1649 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1650 {
1651         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1652
1653         blk_mq_run_hw_queue(hctx, false);
1654 }
1655 EXPORT_SYMBOL(blk_mq_start_hw_queue);
1656
1657 void blk_mq_start_hw_queues(struct request_queue *q)
1658 {
1659         struct blk_mq_hw_ctx *hctx;
1660         int i;
1661
1662         queue_for_each_hw_ctx(q, hctx, i)
1663                 blk_mq_start_hw_queue(hctx);
1664 }
1665 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1666
1667 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1668 {
1669         if (!blk_mq_hctx_stopped(hctx))
1670                 return;
1671
1672         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1673         blk_mq_run_hw_queue(hctx, async);
1674 }
1675 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1676
1677 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1678 {
1679         struct blk_mq_hw_ctx *hctx;
1680         int i;
1681
1682         queue_for_each_hw_ctx(q, hctx, i)
1683                 blk_mq_start_stopped_hw_queue(hctx, async);
1684 }
1685 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1686
1687 static void blk_mq_run_work_fn(struct work_struct *work)
1688 {
1689         struct blk_mq_hw_ctx *hctx;
1690
1691         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1692
1693         /*
1694          * If we are stopped, don't run the queue.
1695          */
1696         if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
1697                 return;
1698
1699         __blk_mq_run_hw_queue(hctx);
1700 }
1701
1702 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1703                                             struct request *rq,
1704                                             bool at_head)
1705 {
1706         struct blk_mq_ctx *ctx = rq->mq_ctx;
1707         enum hctx_type type = hctx->type;
1708
1709         lockdep_assert_held(&ctx->lock);
1710
1711         trace_block_rq_insert(hctx->queue, rq);
1712
1713         if (at_head)
1714                 list_add(&rq->queuelist, &ctx->rq_lists[type]);
1715         else
1716                 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
1717 }
1718
1719 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1720                              bool at_head)
1721 {
1722         struct blk_mq_ctx *ctx = rq->mq_ctx;
1723
1724         lockdep_assert_held(&ctx->lock);
1725
1726         __blk_mq_insert_req_list(hctx, rq, at_head);
1727         blk_mq_hctx_mark_pending(hctx, ctx);
1728 }
1729
1730 /**
1731  * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1732  * @rq: Pointer to request to be inserted.
1733  * @run_queue: If we should run the hardware queue after inserting the request.
1734  *
1735  * Should only be used carefully, when the caller knows we want to
1736  * bypass a potential IO scheduler on the target device.
1737  */
1738 void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1739                                   bool run_queue)
1740 {
1741         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1742
1743         spin_lock(&hctx->lock);
1744         if (at_head)
1745                 list_add(&rq->queuelist, &hctx->dispatch);
1746         else
1747                 list_add_tail(&rq->queuelist, &hctx->dispatch);
1748         spin_unlock(&hctx->lock);
1749
1750         if (run_queue)
1751                 blk_mq_run_hw_queue(hctx, false);
1752 }
1753
1754 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1755                             struct list_head *list)
1756
1757 {
1758         struct request *rq;
1759         enum hctx_type type = hctx->type;
1760
1761         /*
1762          * preemption doesn't flush plug list, so it's possible ctx->cpu is
1763          * offline now
1764          */
1765         list_for_each_entry(rq, list, queuelist) {
1766                 BUG_ON(rq->mq_ctx != ctx);
1767                 trace_block_rq_insert(hctx->queue, rq);
1768         }
1769
1770         spin_lock(&ctx->lock);
1771         list_splice_tail_init(list, &ctx->rq_lists[type]);
1772         blk_mq_hctx_mark_pending(hctx, ctx);
1773         spin_unlock(&ctx->lock);
1774 }
1775
1776 static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
1777 {
1778         struct request *rqa = container_of(a, struct request, queuelist);
1779         struct request *rqb = container_of(b, struct request, queuelist);
1780
1781         if (rqa->mq_ctx != rqb->mq_ctx)
1782                 return rqa->mq_ctx > rqb->mq_ctx;
1783         if (rqa->mq_hctx != rqb->mq_hctx)
1784                 return rqa->mq_hctx > rqb->mq_hctx;
1785
1786         return blk_rq_pos(rqa) > blk_rq_pos(rqb);
1787 }
1788
1789 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1790 {
1791         LIST_HEAD(list);
1792
1793         if (list_empty(&plug->mq_list))
1794                 return;
1795         list_splice_init(&plug->mq_list, &list);
1796
1797         if (plug->rq_count > 2 && plug->multiple_queues)
1798                 list_sort(NULL, &list, plug_rq_cmp);
1799
1800         plug->rq_count = 0;
1801
1802         do {
1803                 struct list_head rq_list;
1804                 struct request *rq, *head_rq = list_entry_rq(list.next);
1805                 struct list_head *pos = &head_rq->queuelist; /* skip first */
1806                 struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1807                 struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1808                 unsigned int depth = 1;
1809
1810                 list_for_each_continue(pos, &list) {
1811                         rq = list_entry_rq(pos);
1812                         BUG_ON(!rq->q);
1813                         if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1814                                 break;
1815                         depth++;
1816                 }
1817
1818                 list_cut_before(&rq_list, &list, pos);
1819                 trace_block_unplug(head_rq->q, depth, !from_schedule);
1820                 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
1821                                                 from_schedule);
1822         } while(!list_empty(&list));
1823 }
1824
1825 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1826                 unsigned int nr_segs)
1827 {
1828         if (bio->bi_opf & REQ_RAHEAD)
1829                 rq->cmd_flags |= REQ_FAILFAST_MASK;
1830
1831         rq->__sector = bio->bi_iter.bi_sector;
1832         rq->write_hint = bio->bi_write_hint;
1833         blk_rq_bio_prep(rq, bio, nr_segs);
1834         blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
1835
1836         blk_account_io_start(rq);
1837 }
1838
1839 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1840                                             struct request *rq,
1841                                             blk_qc_t *cookie, bool last)
1842 {
1843         struct request_queue *q = rq->q;
1844         struct blk_mq_queue_data bd = {
1845                 .rq = rq,
1846                 .last = last,
1847         };
1848         blk_qc_t new_cookie;
1849         blk_status_t ret;
1850
1851         new_cookie = request_to_qc_t(hctx, rq);
1852
1853         /*
1854          * For OK queue, we are done. For error, caller may kill it.
1855          * Any other error (busy), just add it to our list as we
1856          * previously would have done.
1857          */
1858         ret = q->mq_ops->queue_rq(hctx, &bd);
1859         switch (ret) {
1860         case BLK_STS_OK:
1861                 blk_mq_update_dispatch_busy(hctx, false);
1862                 *cookie = new_cookie;
1863                 break;
1864         case BLK_STS_RESOURCE:
1865         case BLK_STS_DEV_RESOURCE:
1866                 blk_mq_update_dispatch_busy(hctx, true);
1867                 __blk_mq_requeue_request(rq);
1868                 break;
1869         default:
1870                 blk_mq_update_dispatch_busy(hctx, false);
1871                 *cookie = BLK_QC_T_NONE;
1872                 break;
1873         }
1874
1875         return ret;
1876 }
1877
1878 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1879                                                 struct request *rq,
1880                                                 blk_qc_t *cookie,
1881                                                 bool bypass_insert, bool last)
1882 {
1883         struct request_queue *q = rq->q;
1884         bool run_queue = true;
1885
1886         /*
1887          * RCU or SRCU read lock is needed before checking quiesced flag.
1888          *
1889          * When queue is stopped or quiesced, ignore 'bypass_insert' from
1890          * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
1891          * and avoid driver to try to dispatch again.
1892          */
1893         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1894                 run_queue = false;
1895                 bypass_insert = false;
1896                 goto insert;
1897         }
1898
1899         if (q->elevator && !bypass_insert)
1900                 goto insert;
1901
1902         if (!blk_mq_get_dispatch_budget(hctx))
1903                 goto insert;
1904
1905         if (!blk_mq_get_driver_tag(rq)) {
1906                 blk_mq_put_dispatch_budget(hctx);
1907                 goto insert;
1908         }
1909
1910         return __blk_mq_issue_directly(hctx, rq, cookie, last);
1911 insert:
1912         if (bypass_insert)
1913                 return BLK_STS_RESOURCE;
1914
1915         blk_mq_request_bypass_insert(rq, false, run_queue);
1916         return BLK_STS_OK;
1917 }
1918
1919 /**
1920  * blk_mq_try_issue_directly - Try to send a request directly to device driver.
1921  * @hctx: Pointer of the associated hardware queue.
1922  * @rq: Pointer to request to be sent.
1923  * @cookie: Request queue cookie.
1924  *
1925  * If the device has enough resources to accept a new request now, send the
1926  * request directly to device driver. Else, insert at hctx->dispatch queue, so
1927  * we can try send it another time in the future. Requests inserted at this
1928  * queue have higher priority.
1929  */
1930 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1931                 struct request *rq, blk_qc_t *cookie)
1932 {
1933         blk_status_t ret;
1934         int srcu_idx;
1935
1936         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1937
1938         hctx_lock(hctx, &srcu_idx);
1939
1940         ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
1941         if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1942                 blk_mq_request_bypass_insert(rq, false, true);
1943         else if (ret != BLK_STS_OK)
1944                 blk_mq_end_request(rq, ret);
1945
1946         hctx_unlock(hctx, srcu_idx);
1947 }
1948
1949 blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
1950 {
1951         blk_status_t ret;
1952         int srcu_idx;
1953         blk_qc_t unused_cookie;
1954         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1955
1956         hctx_lock(hctx, &srcu_idx);
1957         ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
1958         hctx_unlock(hctx, srcu_idx);
1959
1960         return ret;
1961 }
1962
1963 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1964                 struct list_head *list)
1965 {
1966         int queued = 0;
1967
1968         while (!list_empty(list)) {
1969                 blk_status_t ret;
1970                 struct request *rq = list_first_entry(list, struct request,
1971                                 queuelist);
1972
1973                 list_del_init(&rq->queuelist);
1974                 ret = blk_mq_request_issue_directly(rq, list_empty(list));
1975                 if (ret != BLK_STS_OK) {
1976                         if (ret == BLK_STS_RESOURCE ||
1977                                         ret == BLK_STS_DEV_RESOURCE) {
1978                                 blk_mq_request_bypass_insert(rq, false,
1979                                                         list_empty(list));
1980                                 break;
1981                         }
1982                         blk_mq_end_request(rq, ret);
1983                 } else
1984                         queued++;
1985         }
1986
1987         /*
1988          * If we didn't flush the entire list, we could have told
1989          * the driver there was more coming, but that turned out to
1990          * be a lie.
1991          */
1992         if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued)
1993                 hctx->queue->mq_ops->commit_rqs(hctx);
1994 }
1995
1996 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1997 {
1998         list_add_tail(&rq->queuelist, &plug->mq_list);
1999         plug->rq_count++;
2000         if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
2001                 struct request *tmp;
2002
2003                 tmp = list_first_entry(&plug->mq_list, struct request,
2004                                                 queuelist);
2005                 if (tmp->q != rq->q)
2006                         plug->multiple_queues = true;
2007         }
2008 }
2009
2010 /**
2011  * blk_mq_make_request - Create and send a request to block device.
2012  * @q: Request queue pointer.
2013  * @bio: Bio pointer.
2014  *
2015  * Builds up a request structure from @q and @bio and send to the device. The
2016  * request may not be queued directly to hardware if:
2017  * * This request can be merged with another one
2018  * * We want to place request at plug queue for possible future merging
2019  * * There is an IO scheduler active at this queue
2020  *
2021  * It will not queue the request if there is an error with the bio, or at the
2022  * request creation.
2023  *
2024  * Returns: Request queue cookie.
2025  */
2026 blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
2027 {
2028         const int is_sync = op_is_sync(bio->bi_opf);
2029         const int is_flush_fua = op_is_flush(bio->bi_opf);
2030         struct blk_mq_alloc_data data = { .flags = 0};
2031         struct request *rq;
2032         struct blk_plug *plug;
2033         struct request *same_queue_rq = NULL;
2034         unsigned int nr_segs;
2035         blk_qc_t cookie;
2036         blk_status_t ret;
2037
2038         blk_queue_bounce(q, &bio);
2039         __blk_queue_split(q, &bio, &nr_segs);
2040
2041         if (!bio_integrity_prep(bio))
2042                 goto queue_exit;
2043
2044         if (!is_flush_fua && !blk_queue_nomerges(q) &&
2045             blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
2046                 goto queue_exit;
2047
2048         if (blk_mq_sched_bio_merge(q, bio, nr_segs))
2049                 goto queue_exit;
2050
2051         rq_qos_throttle(q, bio);
2052
2053         data.cmd_flags = bio->bi_opf;
2054         rq = blk_mq_get_request(q, bio, &data);
2055         if (unlikely(!rq)) {
2056                 rq_qos_cleanup(q, bio);
2057                 if (bio->bi_opf & REQ_NOWAIT)
2058                         bio_wouldblock_error(bio);
2059                 goto queue_exit;
2060         }
2061
2062         trace_block_getrq(q, bio, bio->bi_opf);
2063
2064         rq_qos_track(q, rq, bio);
2065
2066         cookie = request_to_qc_t(data.hctx, rq);
2067
2068         blk_mq_bio_to_request(rq, bio, nr_segs);
2069
2070         ret = blk_crypto_init_request(rq);
2071         if (ret != BLK_STS_OK) {
2072                 bio->bi_status = ret;
2073                 bio_endio(bio);
2074                 blk_mq_free_request(rq);
2075                 return BLK_QC_T_NONE;
2076         }
2077
2078         plug = blk_mq_plug(q, bio);
2079         if (unlikely(is_flush_fua)) {
2080                 /* Bypass scheduler for flush requests */
2081                 blk_insert_flush(rq);
2082                 blk_mq_run_hw_queue(data.hctx, true);
2083         } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
2084                                 !blk_queue_nonrot(q))) {
2085                 /*
2086                  * Use plugging if we have a ->commit_rqs() hook as well, as
2087                  * we know the driver uses bd->last in a smart fashion.
2088                  *
2089                  * Use normal plugging if this disk is slow HDD, as sequential
2090                  * IO may benefit a lot from plug merging.
2091                  */
2092                 unsigned int request_count = plug->rq_count;
2093                 struct request *last = NULL;
2094
2095                 if (!request_count)
2096                         trace_block_plug(q);
2097                 else
2098                         last = list_entry_rq(plug->mq_list.prev);
2099
2100                 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
2101                     blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
2102                         blk_flush_plug_list(plug, false);
2103                         trace_block_plug(q);
2104                 }
2105
2106                 blk_add_rq_to_plug(plug, rq);
2107         } else if (q->elevator) {
2108                 /* Insert the request at the IO scheduler queue */
2109                 blk_mq_sched_insert_request(rq, false, true, true);
2110         } else if (plug && !blk_queue_nomerges(q)) {
2111                 /*
2112                  * We do limited plugging. If the bio can be merged, do that.
2113                  * Otherwise the existing request in the plug list will be
2114                  * issued. So the plug list will have one request at most
2115                  * The plug list might get flushed before this. If that happens,
2116                  * the plug list is empty, and same_queue_rq is invalid.
2117                  */
2118                 if (list_empty(&plug->mq_list))
2119                         same_queue_rq = NULL;
2120                 if (same_queue_rq) {
2121                         list_del_init(&same_queue_rq->queuelist);
2122                         plug->rq_count--;
2123                 }
2124                 blk_add_rq_to_plug(plug, rq);
2125                 trace_block_plug(q);
2126
2127                 if (same_queue_rq) {
2128                         data.hctx = same_queue_rq->mq_hctx;
2129                         trace_block_unplug(q, 1, true);
2130                         blk_mq_try_issue_directly(data.hctx, same_queue_rq,
2131                                         &cookie);
2132                 }
2133         } else if ((q->nr_hw_queues > 1 && is_sync) ||
2134                         !data.hctx->dispatch_busy) {
2135                 /*
2136                  * There is no scheduler and we can try to send directly
2137                  * to the hardware.
2138                  */
2139                 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
2140         } else {
2141                 /* Default case. */
2142                 blk_mq_sched_insert_request(rq, false, true, true);
2143         }
2144
2145         return cookie;
2146 queue_exit:
2147         blk_queue_exit(q);
2148         return BLK_QC_T_NONE;
2149 }
2150 EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
2151
2152 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2153                      unsigned int hctx_idx)
2154 {
2155         struct page *page;
2156
2157         if (tags->rqs && set->ops->exit_request) {
2158                 int i;
2159
2160                 for (i = 0; i < tags->nr_tags; i++) {
2161                         struct request *rq = tags->static_rqs[i];
2162
2163                         if (!rq)
2164                                 continue;
2165                         set->ops->exit_request(set, rq, hctx_idx);
2166                         tags->static_rqs[i] = NULL;
2167                 }
2168         }
2169
2170         while (!list_empty(&tags->page_list)) {
2171                 page = list_first_entry(&tags->page_list, struct page, lru);
2172                 list_del_init(&page->lru);
2173                 /*
2174                  * Remove kmemleak object previously allocated in
2175                  * blk_mq_alloc_rqs().
2176                  */
2177                 kmemleak_free(page_address(page));
2178                 __free_pages(page, page->private);
2179         }
2180 }
2181
2182 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
2183 {
2184         kfree(tags->rqs);
2185         tags->rqs = NULL;
2186         kfree(tags->static_rqs);
2187         tags->static_rqs = NULL;
2188
2189         blk_mq_free_tags(tags);
2190 }
2191
2192 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
2193                                         unsigned int hctx_idx,
2194                                         unsigned int nr_tags,
2195                                         unsigned int reserved_tags)
2196 {
2197         struct blk_mq_tags *tags;
2198         int node;
2199
2200         node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2201         if (node == NUMA_NO_NODE)
2202                 node = set->numa_node;
2203
2204         tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2205                                 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
2206         if (!tags)
2207                 return NULL;
2208
2209         tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2210                                  GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2211                                  node);
2212         if (!tags->rqs) {
2213                 blk_mq_free_tags(tags);
2214                 return NULL;
2215         }
2216
2217         tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2218                                         GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2219                                         node);
2220         if (!tags->static_rqs) {
2221                 kfree(tags->rqs);
2222                 blk_mq_free_tags(tags);
2223                 return NULL;
2224         }
2225
2226         return tags;
2227 }
2228
2229 static size_t order_to_size(unsigned int order)
2230 {
2231         return (size_t)PAGE_SIZE << order;
2232 }
2233
2234 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
2235                                unsigned int hctx_idx, int node)
2236 {
2237         int ret;
2238
2239         if (set->ops->init_request) {
2240                 ret = set->ops->init_request(set, rq, hctx_idx, node);
2241                 if (ret)
2242                         return ret;
2243         }
2244
2245         WRITE_ONCE(rq->state, MQ_RQ_IDLE);
2246         return 0;
2247 }
2248
2249 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2250                      unsigned int hctx_idx, unsigned int depth)
2251 {
2252         unsigned int i, j, entries_per_page, max_order = 4;
2253         size_t rq_size, left;
2254         int node;
2255
2256         node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
2257         if (node == NUMA_NO_NODE)
2258                 node = set->numa_node;
2259
2260         INIT_LIST_HEAD(&tags->page_list);
2261
2262         /*
2263          * rq_size is the size of the request plus driver payload, rounded
2264          * to the cacheline size
2265          */
2266         rq_size = round_up(sizeof(struct request) + set->cmd_size,
2267                                 cache_line_size());
2268         left = rq_size * depth;
2269
2270         for (i = 0; i < depth; ) {
2271                 int this_order = max_order;
2272                 struct page *page;
2273                 int to_do;
2274                 void *p;
2275
2276                 while (this_order && left < order_to_size(this_order - 1))
2277                         this_order--;
2278
2279                 do {
2280                         page = alloc_pages_node(node,
2281                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
2282                                 this_order);
2283                         if (page)
2284                                 break;
2285                         if (!this_order--)
2286                                 break;
2287                         if (order_to_size(this_order) < rq_size)
2288                                 break;
2289                 } while (1);
2290
2291                 if (!page)
2292                         goto fail;
2293
2294                 page->private = this_order;
2295                 list_add_tail(&page->lru, &tags->page_list);
2296
2297                 p = page_address(page);
2298                 /*
2299                  * Allow kmemleak to scan these pages as they contain pointers
2300                  * to additional allocations like via ops->init_request().
2301                  */
2302                 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
2303                 entries_per_page = order_to_size(this_order) / rq_size;
2304                 to_do = min(entries_per_page, depth - i);
2305                 left -= to_do * rq_size;
2306                 for (j = 0; j < to_do; j++) {
2307                         struct request *rq = p;
2308
2309                         tags->static_rqs[i] = rq;
2310                         if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2311                                 tags->static_rqs[i] = NULL;
2312                                 goto fail;
2313                         }
2314
2315                         p += rq_size;
2316                         i++;
2317                 }
2318         }
2319         return 0;
2320
2321 fail:
2322         blk_mq_free_rqs(set, tags, hctx_idx);
2323         return -ENOMEM;
2324 }
2325
2326 /*
2327  * 'cpu' is going away. splice any existing rq_list entries from this
2328  * software queue to the hw queue dispatch list, and ensure that it
2329  * gets run.
2330  */
2331 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2332 {
2333         struct blk_mq_hw_ctx *hctx;
2334         struct blk_mq_ctx *ctx;
2335         LIST_HEAD(tmp);
2336         enum hctx_type type;
2337
2338         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2339         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2340         type = hctx->type;
2341
2342         spin_lock(&ctx->lock);
2343         if (!list_empty(&ctx->rq_lists[type])) {
2344                 list_splice_init(&ctx->rq_lists[type], &tmp);
2345                 blk_mq_hctx_clear_pending(hctx, ctx);
2346         }
2347         spin_unlock(&ctx->lock);
2348
2349         if (list_empty(&tmp))
2350                 return 0;
2351
2352         spin_lock(&hctx->lock);
2353         list_splice_tail_init(&tmp, &hctx->dispatch);
2354         spin_unlock(&hctx->lock);
2355
2356         blk_mq_run_hw_queue(hctx, true);
2357         return 0;
2358 }
2359
2360 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
2361 {
2362         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2363                                             &hctx->cpuhp_dead);
2364 }
2365
2366 /* hctx->ctxs will be freed in queue's release handler */
2367 static void blk_mq_exit_hctx(struct request_queue *q,
2368                 struct blk_mq_tag_set *set,
2369                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2370 {
2371         if (blk_mq_hw_queue_mapped(hctx))
2372                 blk_mq_tag_idle(hctx);
2373
2374         if (set->ops->exit_request)
2375                 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2376
2377         if (set->ops->exit_hctx)
2378                 set->ops->exit_hctx(hctx, hctx_idx);
2379
2380         blk_mq_remove_cpuhp(hctx);
2381
2382         spin_lock(&q->unused_hctx_lock);
2383         list_add(&hctx->hctx_list, &q->unused_hctx_list);
2384         spin_unlock(&q->unused_hctx_lock);
2385 }
2386
2387 static void blk_mq_exit_hw_queues(struct request_queue *q,
2388                 struct blk_mq_tag_set *set, int nr_queue)
2389 {
2390         struct blk_mq_hw_ctx *hctx;
2391         unsigned int i;
2392
2393         queue_for_each_hw_ctx(q, hctx, i) {
2394                 if (i == nr_queue)
2395                         break;
2396                 blk_mq_debugfs_unregister_hctx(hctx);
2397                 blk_mq_exit_hctx(q, set, hctx, i);
2398         }
2399 }
2400
2401 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2402 {
2403         int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2404
2405         BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2406                            __alignof__(struct blk_mq_hw_ctx)) !=
2407                      sizeof(struct blk_mq_hw_ctx));
2408
2409         if (tag_set->flags & BLK_MQ_F_BLOCKING)
2410                 hw_ctx_size += sizeof(struct srcu_struct);
2411
2412         return hw_ctx_size;
2413 }
2414
2415 static int blk_mq_init_hctx(struct request_queue *q,
2416                 struct blk_mq_tag_set *set,
2417                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2418 {
2419         hctx->queue_num = hctx_idx;
2420
2421         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2422
2423         hctx->tags = set->tags[hctx_idx];
2424
2425         if (set->ops->init_hctx &&
2426             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2427                 goto unregister_cpu_notifier;
2428
2429         if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2430                                 hctx->numa_node))
2431                 goto exit_hctx;
2432         return 0;
2433
2434  exit_hctx:
2435         if (set->ops->exit_hctx)
2436                 set->ops->exit_hctx(hctx, hctx_idx);
2437  unregister_cpu_notifier:
2438         blk_mq_remove_cpuhp(hctx);
2439         return -1;
2440 }
2441
2442 static struct blk_mq_hw_ctx *
2443 blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2444                 int node)
2445 {
2446         struct blk_mq_hw_ctx *hctx;
2447         gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2448
2449         hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2450         if (!hctx)
2451                 goto fail_alloc_hctx;
2452
2453         if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2454                 goto free_hctx;
2455
2456         atomic_set(&hctx->nr_active, 0);
2457         if (node == NUMA_NO_NODE)
2458                 node = set->numa_node;
2459         hctx->numa_node = node;
2460
2461         INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2462         spin_lock_init(&hctx->lock);
2463         INIT_LIST_HEAD(&hctx->dispatch);
2464         hctx->queue = q;
2465         hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2466
2467         INIT_LIST_HEAD(&hctx->hctx_list);
2468
2469         /*
2470          * Allocate space for all possible cpus to avoid allocation at
2471          * runtime
2472          */
2473         hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2474                         gfp, node);
2475         if (!hctx->ctxs)
2476                 goto free_cpumask;
2477
2478         if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2479                                 gfp, node))
2480                 goto free_ctxs;
2481         hctx->nr_ctx = 0;
2482
2483         spin_lock_init(&hctx->dispatch_wait_lock);
2484         init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2485         INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2486
2487         hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
2488         if (!hctx->fq)
2489                 goto free_bitmap;
2490
2491         if (hctx->flags & BLK_MQ_F_BLOCKING)
2492                 init_srcu_struct(hctx->srcu);
2493         blk_mq_hctx_kobj_init(hctx);
2494
2495         return hctx;
2496
2497  free_bitmap:
2498         sbitmap_free(&hctx->ctx_map);
2499  free_ctxs:
2500         kfree(hctx->ctxs);
2501  free_cpumask:
2502         free_cpumask_var(hctx->cpumask);
2503  free_hctx:
2504         kfree(hctx);
2505  fail_alloc_hctx:
2506         return NULL;
2507 }
2508
2509 static void blk_mq_init_cpu_queues(struct request_queue *q,
2510                                    unsigned int nr_hw_queues)
2511 {
2512         struct blk_mq_tag_set *set = q->tag_set;
2513         unsigned int i, j;
2514
2515         for_each_possible_cpu(i) {
2516                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2517                 struct blk_mq_hw_ctx *hctx;
2518                 int k;
2519
2520                 __ctx->cpu = i;
2521                 spin_lock_init(&__ctx->lock);
2522                 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2523                         INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2524
2525                 __ctx->queue = q;
2526
2527                 /*
2528                  * Set local node, IFF we have more than one hw queue. If
2529                  * not, we remain on the home node of the device
2530                  */
2531                 for (j = 0; j < set->nr_maps; j++) {
2532                         hctx = blk_mq_map_queue_type(q, j, i);
2533                         if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2534                                 hctx->numa_node = local_memory_node(cpu_to_node(i));
2535                 }
2536         }
2537 }
2538
2539 static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
2540                                         int hctx_idx)
2541 {
2542         int ret = 0;
2543
2544         set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2545                                         set->queue_depth, set->reserved_tags);
2546         if (!set->tags[hctx_idx])
2547                 return false;
2548
2549         ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2550                                 set->queue_depth);
2551         if (!ret)
2552                 return true;
2553
2554         blk_mq_free_rq_map(set->tags[hctx_idx]);
2555         set->tags[hctx_idx] = NULL;
2556         return false;
2557 }
2558
2559 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2560                                          unsigned int hctx_idx)
2561 {
2562         if (set->tags && set->tags[hctx_idx]) {
2563                 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2564                 blk_mq_free_rq_map(set->tags[hctx_idx]);
2565                 set->tags[hctx_idx] = NULL;
2566         }
2567 }
2568
2569 static void blk_mq_map_swqueue(struct request_queue *q)
2570 {
2571         unsigned int i, j, hctx_idx;
2572         struct blk_mq_hw_ctx *hctx;
2573         struct blk_mq_ctx *ctx;
2574         struct blk_mq_tag_set *set = q->tag_set;
2575
2576         queue_for_each_hw_ctx(q, hctx, i) {
2577                 cpumask_clear(hctx->cpumask);
2578                 hctx->nr_ctx = 0;
2579                 hctx->dispatch_from = NULL;
2580         }
2581
2582         /*
2583          * Map software to hardware queues.
2584          *
2585          * If the cpu isn't present, the cpu is mapped to first hctx.
2586          */
2587         for_each_possible_cpu(i) {
2588
2589                 ctx = per_cpu_ptr(q->queue_ctx, i);
2590                 for (j = 0; j < set->nr_maps; j++) {
2591                         if (!set->map[j].nr_queues) {
2592                                 ctx->hctxs[j] = blk_mq_map_queue_type(q,
2593                                                 HCTX_TYPE_DEFAULT, i);
2594                                 continue;
2595                         }
2596                         hctx_idx = set->map[j].mq_map[i];
2597                         /* unmapped hw queue can be remapped after CPU topo changed */
2598                         if (!set->tags[hctx_idx] &&
2599                             !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
2600                                 /*
2601                                  * If tags initialization fail for some hctx,
2602                                  * that hctx won't be brought online.  In this
2603                                  * case, remap the current ctx to hctx[0] which
2604                                  * is guaranteed to always have tags allocated
2605                                  */
2606                                 set->map[j].mq_map[i] = 0;
2607                         }
2608
2609                         hctx = blk_mq_map_queue_type(q, j, i);
2610                         ctx->hctxs[j] = hctx;
2611                         /*
2612                          * If the CPU is already set in the mask, then we've
2613                          * mapped this one already. This can happen if
2614                          * devices share queues across queue maps.
2615                          */
2616                         if (cpumask_test_cpu(i, hctx->cpumask))
2617                                 continue;
2618
2619                         cpumask_set_cpu(i, hctx->cpumask);
2620                         hctx->type = j;
2621                         ctx->index_hw[hctx->type] = hctx->nr_ctx;
2622                         hctx->ctxs[hctx->nr_ctx++] = ctx;
2623
2624                         /*
2625                          * If the nr_ctx type overflows, we have exceeded the
2626                          * amount of sw queues we can support.
2627                          */
2628                         BUG_ON(!hctx->nr_ctx);
2629                 }
2630
2631                 for (; j < HCTX_MAX_TYPES; j++)
2632                         ctx->hctxs[j] = blk_mq_map_queue_type(q,
2633                                         HCTX_TYPE_DEFAULT, i);
2634         }
2635
2636         queue_for_each_hw_ctx(q, hctx, i) {
2637                 /*
2638                  * If no software queues are mapped to this hardware queue,
2639                  * disable it and free the request entries.
2640                  */
2641                 if (!hctx->nr_ctx) {
2642                         /* Never unmap queue 0.  We need it as a
2643                          * fallback in case of a new remap fails
2644                          * allocation
2645                          */
2646                         if (i && set->tags[i])
2647                                 blk_mq_free_map_and_requests(set, i);
2648
2649                         hctx->tags = NULL;
2650                         continue;
2651                 }
2652
2653                 hctx->tags = set->tags[i];
2654                 WARN_ON(!hctx->tags);
2655
2656                 /*
2657                  * Set the map size to the number of mapped software queues.
2658                  * This is more accurate and more efficient than looping
2659                  * over all possibly mapped software queues.
2660                  */
2661                 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2662
2663                 /*
2664                  * Initialize batch roundrobin counts
2665                  */
2666                 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
2667                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2668         }
2669 }
2670
2671 /*
2672  * Caller needs to ensure that we're either frozen/quiesced, or that
2673  * the queue isn't live yet.
2674  */
2675 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2676 {
2677         struct blk_mq_hw_ctx *hctx;
2678         int i;
2679
2680         queue_for_each_hw_ctx(q, hctx, i) {
2681                 if (shared)
2682                         hctx->flags |= BLK_MQ_F_TAG_SHARED;
2683                 else
2684                         hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2685         }
2686 }
2687
2688 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2689                                         bool shared)
2690 {
2691         struct request_queue *q;
2692
2693         lockdep_assert_held(&set->tag_list_lock);
2694
2695         list_for_each_entry(q, &set->tag_list, tag_set_list) {
2696                 blk_mq_freeze_queue(q);
2697                 queue_set_hctx_shared(q, shared);
2698                 blk_mq_unfreeze_queue(q);
2699         }
2700 }
2701
2702 static void blk_mq_del_queue_tag_set(struct request_queue *q)
2703 {
2704         struct blk_mq_tag_set *set = q->tag_set;
2705
2706         mutex_lock(&set->tag_list_lock);
2707         list_del_rcu(&q->tag_set_list);
2708         if (list_is_singular(&set->tag_list)) {
2709                 /* just transitioned to unshared */
2710                 set->flags &= ~BLK_MQ_F_TAG_SHARED;
2711                 /* update existing queue */
2712                 blk_mq_update_tag_set_depth(set, false);
2713         }
2714         mutex_unlock(&set->tag_list_lock);
2715         INIT_LIST_HEAD(&q->tag_set_list);
2716 }
2717
2718 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2719                                      struct request_queue *q)
2720 {
2721         mutex_lock(&set->tag_list_lock);
2722
2723         /*
2724          * Check to see if we're transitioning to shared (from 1 to 2 queues).
2725          */
2726         if (!list_empty(&set->tag_list) &&
2727             !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2728                 set->flags |= BLK_MQ_F_TAG_SHARED;
2729                 /* update existing queue */
2730                 blk_mq_update_tag_set_depth(set, true);
2731         }
2732         if (set->flags & BLK_MQ_F_TAG_SHARED)
2733                 queue_set_hctx_shared(q, true);
2734         list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2735
2736         mutex_unlock(&set->tag_list_lock);
2737 }
2738
2739 /* All allocations will be freed in release handler of q->mq_kobj */
2740 static int blk_mq_alloc_ctxs(struct request_queue *q)
2741 {
2742         struct blk_mq_ctxs *ctxs;
2743         int cpu;
2744
2745         ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
2746         if (!ctxs)
2747                 return -ENOMEM;
2748
2749         ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2750         if (!ctxs->queue_ctx)
2751                 goto fail;
2752
2753         for_each_possible_cpu(cpu) {
2754                 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
2755                 ctx->ctxs = ctxs;
2756         }
2757
2758         q->mq_kobj = &ctxs->kobj;
2759         q->queue_ctx = ctxs->queue_ctx;
2760
2761         return 0;
2762  fail:
2763         kfree(ctxs);
2764         return -ENOMEM;
2765 }
2766
2767 /*
2768  * It is the actual release handler for mq, but we do it from
2769  * request queue's release handler for avoiding use-after-free
2770  * and headache because q->mq_kobj shouldn't have been introduced,
2771  * but we can't group ctx/kctx kobj without it.
2772  */
2773 void blk_mq_release(struct request_queue *q)
2774 {
2775         struct blk_mq_hw_ctx *hctx, *next;
2776         int i;
2777
2778         queue_for_each_hw_ctx(q, hctx, i)
2779                 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
2780
2781         /* all hctx are in .unused_hctx_list now */
2782         list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
2783                 list_del_init(&hctx->hctx_list);
2784                 kobject_put(&hctx->kobj);
2785         }
2786
2787         kfree(q->queue_hw_ctx);
2788
2789         /*
2790          * release .mq_kobj and sw queue's kobject now because
2791          * both share lifetime with request queue.
2792          */
2793         blk_mq_sysfs_deinit(q);
2794 }
2795
2796 struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
2797                 void *queuedata)
2798 {
2799         struct request_queue *uninit_q, *q;
2800
2801         uninit_q = __blk_alloc_queue(set->numa_node);
2802         if (!uninit_q)
2803                 return ERR_PTR(-ENOMEM);
2804         uninit_q->queuedata = queuedata;
2805
2806         /*
2807          * Initialize the queue without an elevator. device_add_disk() will do
2808          * the initialization.
2809          */
2810         q = blk_mq_init_allocated_queue(set, uninit_q, false);
2811         if (IS_ERR(q))
2812                 blk_cleanup_queue(uninit_q);
2813
2814         return q;
2815 }
2816 EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
2817
2818 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2819 {
2820         return blk_mq_init_queue_data(set, NULL);
2821 }
2822 EXPORT_SYMBOL(blk_mq_init_queue);
2823
2824 /*
2825  * Helper for setting up a queue with mq ops, given queue depth, and
2826  * the passed in mq ops flags.
2827  */
2828 struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2829                                            const struct blk_mq_ops *ops,
2830                                            unsigned int queue_depth,
2831                                            unsigned int set_flags)
2832 {
2833         struct request_queue *q;
2834         int ret;
2835
2836         memset(set, 0, sizeof(*set));
2837         set->ops = ops;
2838         set->nr_hw_queues = 1;
2839         set->nr_maps = 1;
2840         set->queue_depth = queue_depth;
2841         set->numa_node = NUMA_NO_NODE;
2842         set->flags = set_flags;
2843
2844         ret = blk_mq_alloc_tag_set(set);
2845         if (ret)
2846                 return ERR_PTR(ret);
2847
2848         q = blk_mq_init_queue(set);
2849         if (IS_ERR(q)) {
2850                 blk_mq_free_tag_set(set);
2851                 return q;
2852         }
2853
2854         return q;
2855 }
2856 EXPORT_SYMBOL(blk_mq_init_sq_queue);
2857
2858 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
2859                 struct blk_mq_tag_set *set, struct request_queue *q,
2860                 int hctx_idx, int node)
2861 {
2862         struct blk_mq_hw_ctx *hctx = NULL, *tmp;
2863
2864         /* reuse dead hctx first */
2865         spin_lock(&q->unused_hctx_lock);
2866         list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
2867                 if (tmp->numa_node == node) {
2868                         hctx = tmp;
2869                         break;
2870                 }
2871         }
2872         if (hctx)
2873                 list_del_init(&hctx->hctx_list);
2874         spin_unlock(&q->unused_hctx_lock);
2875
2876         if (!hctx)
2877                 hctx = blk_mq_alloc_hctx(q, set, node);
2878         if (!hctx)
2879                 goto fail;
2880
2881         if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
2882                 goto free_hctx;
2883
2884         return hctx;
2885
2886  free_hctx:
2887         kobject_put(&hctx->kobj);
2888  fail:
2889         return NULL;
2890 }
2891
2892 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2893                                                 struct request_queue *q)
2894 {
2895         int i, j, end;
2896         struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2897
2898         if (q->nr_hw_queues < set->nr_hw_queues) {
2899                 struct blk_mq_hw_ctx **new_hctxs;
2900
2901                 new_hctxs = kcalloc_node(set->nr_hw_queues,
2902                                        sizeof(*new_hctxs), GFP_KERNEL,
2903                                        set->numa_node);
2904                 if (!new_hctxs)
2905                         return;
2906                 if (hctxs)
2907                         memcpy(new_hctxs, hctxs, q->nr_hw_queues *
2908                                sizeof(*hctxs));
2909                 q->queue_hw_ctx = new_hctxs;
2910                 kfree(hctxs);
2911                 hctxs = new_hctxs;
2912         }
2913
2914         /* protect against switching io scheduler  */
2915         mutex_lock(&q->sysfs_lock);
2916         for (i = 0; i < set->nr_hw_queues; i++) {
2917                 int node;
2918                 struct blk_mq_hw_ctx *hctx;
2919
2920                 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
2921                 /*
2922                  * If the hw queue has been mapped to another numa node,
2923                  * we need to realloc the hctx. If allocation fails, fallback
2924                  * to use the previous one.
2925                  */
2926                 if (hctxs[i] && (hctxs[i]->numa_node == node))
2927                         continue;
2928
2929                 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
2930                 if (hctx) {
2931                         if (hctxs[i])
2932                                 blk_mq_exit_hctx(q, set, hctxs[i], i);
2933                         hctxs[i] = hctx;
2934                 } else {
2935                         if (hctxs[i])
2936                                 pr_warn("Allocate new hctx on node %d fails,\
2937                                                 fallback to previous one on node %d\n",
2938                                                 node, hctxs[i]->numa_node);
2939                         else
2940                                 break;
2941                 }
2942         }
2943         /*
2944          * Increasing nr_hw_queues fails. Free the newly allocated
2945          * hctxs and keep the previous q->nr_hw_queues.
2946          */
2947         if (i != set->nr_hw_queues) {
2948                 j = q->nr_hw_queues;
2949                 end = i;
2950         } else {
2951                 j = i;
2952                 end = q->nr_hw_queues;
2953                 q->nr_hw_queues = set->nr_hw_queues;
2954         }
2955
2956         for (; j < end; j++) {
2957                 struct blk_mq_hw_ctx *hctx = hctxs[j];
2958
2959                 if (hctx) {
2960                         if (hctx->tags)
2961                                 blk_mq_free_map_and_requests(set, j);
2962                         blk_mq_exit_hctx(q, set, hctx, j);
2963                         hctxs[j] = NULL;
2964                 }
2965         }
2966         mutex_unlock(&q->sysfs_lock);
2967 }
2968
2969 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2970                                                   struct request_queue *q,
2971                                                   bool elevator_init)
2972 {
2973         /* mark the queue as mq asap */
2974         q->mq_ops = set->ops;
2975
2976         q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2977                                              blk_mq_poll_stats_bkt,
2978                                              BLK_MQ_POLL_STATS_BKTS, q);
2979         if (!q->poll_cb)
2980                 goto err_exit;
2981
2982         if (blk_mq_alloc_ctxs(q))
2983                 goto err_poll;
2984
2985         /* init q->mq_kobj and sw queues' kobjects */
2986         blk_mq_sysfs_init(q);
2987
2988         INIT_LIST_HEAD(&q->unused_hctx_list);
2989         spin_lock_init(&q->unused_hctx_lock);
2990
2991         blk_mq_realloc_hw_ctxs(set, q);
2992         if (!q->nr_hw_queues)
2993                 goto err_hctxs;
2994
2995         INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2996         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2997
2998         q->tag_set = set;
2999
3000         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
3001         if (set->nr_maps > HCTX_TYPE_POLL &&
3002             set->map[HCTX_TYPE_POLL].nr_queues)
3003                 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
3004
3005         q->sg_reserved_size = INT_MAX;
3006
3007         INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
3008         INIT_LIST_HEAD(&q->requeue_list);
3009         spin_lock_init(&q->requeue_lock);
3010
3011         q->nr_requests = set->queue_depth;
3012
3013         /*
3014          * Default to classic polling
3015          */
3016         q->poll_nsec = BLK_MQ_POLL_CLASSIC;
3017
3018         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
3019         blk_mq_add_queue_tag_set(set, q);
3020         blk_mq_map_swqueue(q);
3021
3022         if (elevator_init)
3023                 elevator_init_mq(q);
3024
3025         return q;
3026
3027 err_hctxs:
3028         kfree(q->queue_hw_ctx);
3029         q->nr_hw_queues = 0;
3030         blk_mq_sysfs_deinit(q);
3031 err_poll:
3032         blk_stat_free_callback(q->poll_cb);
3033         q->poll_cb = NULL;
3034 err_exit:
3035         q->mq_ops = NULL;
3036         return ERR_PTR(-ENOMEM);
3037 }
3038 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
3039
3040 /* tags can _not_ be used after returning from blk_mq_exit_queue */
3041 void blk_mq_exit_queue(struct request_queue *q)
3042 {
3043         struct blk_mq_tag_set   *set = q->tag_set;
3044
3045         blk_mq_del_queue_tag_set(q);
3046         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
3047 }
3048
3049 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3050 {
3051         int i;
3052
3053         for (i = 0; i < set->nr_hw_queues; i++)
3054                 if (!__blk_mq_alloc_map_and_request(set, i))
3055                         goto out_unwind;
3056
3057         return 0;
3058
3059 out_unwind:
3060         while (--i >= 0)
3061                 blk_mq_free_map_and_requests(set, i);
3062
3063         return -ENOMEM;
3064 }
3065
3066 /*
3067  * Allocate the request maps associated with this tag_set. Note that this
3068  * may reduce the depth asked for, if memory is tight. set->queue_depth
3069  * will be updated to reflect the allocated depth.
3070  */
3071 static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
3072 {
3073         unsigned int depth;
3074         int err;
3075
3076         depth = set->queue_depth;
3077         do {
3078                 err = __blk_mq_alloc_rq_maps(set);
3079                 if (!err)
3080                         break;
3081
3082                 set->queue_depth >>= 1;
3083                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
3084                         err = -ENOMEM;
3085                         break;
3086                 }
3087         } while (set->queue_depth);
3088
3089         if (!set->queue_depth || err) {
3090                 pr_err("blk-mq: failed to allocate request map\n");
3091                 return -ENOMEM;
3092         }
3093
3094         if (depth != set->queue_depth)
3095                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
3096                                                 depth, set->queue_depth);
3097
3098         return 0;
3099 }
3100
3101 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
3102 {
3103         /*
3104          * blk_mq_map_queues() and multiple .map_queues() implementations
3105          * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3106          * number of hardware queues.
3107          */
3108         if (set->nr_maps == 1)
3109                 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3110
3111         if (set->ops->map_queues && !is_kdump_kernel()) {
3112                 int i;
3113
3114                 /*
3115                  * transport .map_queues is usually done in the following
3116                  * way:
3117                  *
3118                  * for (queue = 0; queue < set->nr_hw_queues; queue++) {
3119                  *      mask = get_cpu_mask(queue)
3120                  *      for_each_cpu(cpu, mask)
3121                  *              set->map[x].mq_map[cpu] = queue;
3122                  * }
3123                  *
3124                  * When we need to remap, the table has to be cleared for
3125                  * killing stale mapping since one CPU may not be mapped
3126                  * to any hw queue.
3127                  */
3128                 for (i = 0; i < set->nr_maps; i++)
3129                         blk_mq_clear_mq_map(&set->map[i]);
3130
3131                 return set->ops->map_queues(set);
3132         } else {
3133                 BUG_ON(set->nr_maps > 1);
3134                 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3135         }
3136 }
3137
3138 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3139                                   int cur_nr_hw_queues, int new_nr_hw_queues)
3140 {
3141         struct blk_mq_tags **new_tags;
3142
3143         if (cur_nr_hw_queues >= new_nr_hw_queues)
3144                 return 0;
3145
3146         new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3147                                 GFP_KERNEL, set->numa_node);
3148         if (!new_tags)
3149                 return -ENOMEM;
3150
3151         if (set->tags)
3152                 memcpy(new_tags, set->tags, cur_nr_hw_queues *
3153                        sizeof(*set->tags));
3154         kfree(set->tags);
3155         set->tags = new_tags;
3156         set->nr_hw_queues = new_nr_hw_queues;
3157
3158         return 0;
3159 }
3160
3161 /*
3162  * Alloc a tag set to be associated with one or more request queues.
3163  * May fail with EINVAL for various error conditions. May adjust the
3164  * requested depth down, if it's too large. In that case, the set
3165  * value will be stored in set->queue_depth.
3166  */
3167 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
3168 {
3169         int i, ret;
3170
3171         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
3172
3173         if (!set->nr_hw_queues)
3174                 return -EINVAL;
3175         if (!set->queue_depth)
3176                 return -EINVAL;
3177         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
3178                 return -EINVAL;
3179
3180         if (!set->ops->queue_rq)
3181                 return -EINVAL;
3182
3183         if (!set->ops->get_budget ^ !set->ops->put_budget)
3184                 return -EINVAL;
3185
3186         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
3187                 pr_info("blk-mq: reduced tag depth to %u\n",
3188                         BLK_MQ_MAX_DEPTH);
3189                 set->queue_depth = BLK_MQ_MAX_DEPTH;
3190         }
3191
3192         if (!set->nr_maps)
3193                 set->nr_maps = 1;
3194         else if (set->nr_maps > HCTX_MAX_TYPES)
3195                 return -EINVAL;
3196
3197         /*
3198          * If a crashdump is active, then we are potentially in a very
3199          * memory constrained environment. Limit us to 1 queue and
3200          * 64 tags to prevent using too much memory.
3201          */
3202         if (is_kdump_kernel()) {
3203                 set->nr_hw_queues = 1;
3204                 set->nr_maps = 1;
3205                 set->queue_depth = min(64U, set->queue_depth);
3206         }
3207         /*
3208          * There is no use for more h/w queues than cpus if we just have
3209          * a single map
3210          */
3211         if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
3212                 set->nr_hw_queues = nr_cpu_ids;
3213
3214         if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
3215                 return -ENOMEM;
3216
3217         ret = -ENOMEM;
3218         for (i = 0; i < set->nr_maps; i++) {
3219                 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
3220                                                   sizeof(set->map[i].mq_map[0]),
3221                                                   GFP_KERNEL, set->numa_node);
3222                 if (!set->map[i].mq_map)
3223                         goto out_free_mq_map;
3224                 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3225         }
3226
3227         ret = blk_mq_update_queue_map(set);
3228         if (ret)
3229                 goto out_free_mq_map;
3230
3231         ret = blk_mq_alloc_map_and_requests(set);
3232         if (ret)
3233                 goto out_free_mq_map;
3234
3235         mutex_init(&set->tag_list_lock);
3236         INIT_LIST_HEAD(&set->tag_list);
3237
3238         return 0;
3239
3240 out_free_mq_map:
3241         for (i = 0; i < set->nr_maps; i++) {
3242                 kfree(set->map[i].mq_map);
3243                 set->map[i].mq_map = NULL;
3244         }
3245         kfree(set->tags);
3246         set->tags = NULL;
3247         return ret;
3248 }
3249 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
3250
3251 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
3252 {
3253         int i, j;
3254
3255         for (i = 0; i < set->nr_hw_queues; i++)
3256                 blk_mq_free_map_and_requests(set, i);
3257
3258         for (j = 0; j < set->nr_maps; j++) {
3259                 kfree(set->map[j].mq_map);
3260                 set->map[j].mq_map = NULL;
3261         }
3262
3263         kfree(set->tags);
3264         set->tags = NULL;
3265 }
3266 EXPORT_SYMBOL(blk_mq_free_tag_set);
3267
3268 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
3269 {
3270         struct blk_mq_tag_set *set = q->tag_set;
3271         struct blk_mq_hw_ctx *hctx;
3272         int i, ret;
3273
3274         if (!set)
3275                 return -EINVAL;
3276
3277         if (q->nr_requests == nr)
3278                 return 0;
3279
3280         blk_mq_freeze_queue(q);
3281         blk_mq_quiesce_queue(q);
3282
3283         ret = 0;
3284         queue_for_each_hw_ctx(q, hctx, i) {
3285                 if (!hctx->tags)
3286                         continue;
3287                 /*
3288                  * If we're using an MQ scheduler, just update the scheduler
3289                  * queue depth. This is similar to what the old code would do.
3290                  */
3291                 if (!hctx->sched_tags) {
3292                         ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
3293                                                         false);
3294                 } else {
3295                         ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
3296                                                         nr, true);
3297                 }
3298                 if (ret)
3299                         break;
3300                 if (q->elevator && q->elevator->type->ops.depth_updated)
3301                         q->elevator->type->ops.depth_updated(hctx);
3302         }
3303
3304         if (!ret)
3305                 q->nr_requests = nr;
3306
3307         blk_mq_unquiesce_queue(q);
3308         blk_mq_unfreeze_queue(q);
3309
3310         return ret;
3311 }
3312
3313 /*
3314  * request_queue and elevator_type pair.
3315  * It is just used by __blk_mq_update_nr_hw_queues to cache
3316  * the elevator_type associated with a request_queue.
3317  */
3318 struct blk_mq_qe_pair {
3319         struct list_head node;
3320         struct request_queue *q;
3321         struct elevator_type *type;
3322 };
3323
3324 /*
3325  * Cache the elevator_type in qe pair list and switch the
3326  * io scheduler to 'none'
3327  */
3328 static bool blk_mq_elv_switch_none(struct list_head *head,
3329                 struct request_queue *q)
3330 {
3331         struct blk_mq_qe_pair *qe;
3332
3333         if (!q->elevator)
3334                 return true;
3335
3336         qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
3337         if (!qe)
3338                 return false;
3339
3340         INIT_LIST_HEAD(&qe->node);
3341         qe->q = q;
3342         qe->type = q->elevator->type;
3343         list_add(&qe->node, head);
3344
3345         mutex_lock(&q->sysfs_lock);
3346         /*
3347          * After elevator_switch_mq, the previous elevator_queue will be
3348          * released by elevator_release. The reference of the io scheduler
3349          * module get by elevator_get will also be put. So we need to get
3350          * a reference of the io scheduler module here to prevent it to be
3351          * removed.
3352          */
3353         __module_get(qe->type->elevator_owner);
3354         elevator_switch_mq(q, NULL);
3355         mutex_unlock(&q->sysfs_lock);
3356
3357         return true;
3358 }
3359
3360 static void blk_mq_elv_switch_back(struct list_head *head,
3361                 struct request_queue *q)
3362 {
3363         struct blk_mq_qe_pair *qe;
3364         struct elevator_type *t = NULL;
3365
3366         list_for_each_entry(qe, head, node)
3367                 if (qe->q == q) {
3368                         t = qe->type;
3369                         break;
3370                 }
3371
3372         if (!t)
3373                 return;
3374
3375         list_del(&qe->node);
3376         kfree(qe);
3377
3378         mutex_lock(&q->sysfs_lock);
3379         elevator_switch_mq(q, t);
3380         mutex_unlock(&q->sysfs_lock);
3381 }
3382
3383 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3384                                                         int nr_hw_queues)
3385 {
3386         struct request_queue *q;
3387         LIST_HEAD(head);
3388         int prev_nr_hw_queues;
3389
3390         lockdep_assert_held(&set->tag_list_lock);
3391
3392         if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3393                 nr_hw_queues = nr_cpu_ids;
3394         if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3395                 return;
3396
3397         list_for_each_entry(q, &set->tag_list, tag_set_list)
3398                 blk_mq_freeze_queue(q);
3399         /*
3400          * Switch IO scheduler to 'none', cleaning up the data associated
3401          * with the previous scheduler. We will switch back once we are done
3402          * updating the new sw to hw queue mappings.
3403          */
3404         list_for_each_entry(q, &set->tag_list, tag_set_list)
3405                 if (!blk_mq_elv_switch_none(&head, q))
3406                         goto switch_back;
3407
3408         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3409                 blk_mq_debugfs_unregister_hctxs(q);
3410                 blk_mq_sysfs_unregister(q);
3411         }
3412
3413         prev_nr_hw_queues = set->nr_hw_queues;
3414         if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3415             0)
3416                 goto reregister;
3417
3418         set->nr_hw_queues = nr_hw_queues;
3419 fallback:
3420         blk_mq_update_queue_map(set);
3421         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3422                 blk_mq_realloc_hw_ctxs(set, q);
3423                 if (q->nr_hw_queues != set->nr_hw_queues) {
3424                         pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3425                                         nr_hw_queues, prev_nr_hw_queues);
3426                         set->nr_hw_queues = prev_nr_hw_queues;
3427                         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
3428                         goto fallback;
3429                 }
3430                 blk_mq_map_swqueue(q);
3431         }
3432
3433 reregister:
3434         list_for_each_entry(q, &set->tag_list, tag_set_list) {
3435                 blk_mq_sysfs_register(q);
3436                 blk_mq_debugfs_register_hctxs(q);
3437         }
3438
3439 switch_back:
3440         list_for_each_entry(q, &set->tag_list, tag_set_list)
3441                 blk_mq_elv_switch_back(&head, q);
3442
3443         list_for_each_entry(q, &set->tag_list, tag_set_list)
3444                 blk_mq_unfreeze_queue(q);
3445 }
3446
3447 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
3448 {
3449         mutex_lock(&set->tag_list_lock);
3450         __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
3451         mutex_unlock(&set->tag_list_lock);
3452 }
3453 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
3454
3455 /* Enable polling stats and return whether they were already enabled. */
3456 static bool blk_poll_stats_enable(struct request_queue *q)
3457 {
3458         if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3459             blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
3460                 return true;
3461         blk_stat_add_callback(q, q->poll_cb);
3462         return false;
3463 }
3464
3465 static void blk_mq_poll_stats_start(struct request_queue *q)
3466 {
3467         /*
3468          * We don't arm the callback if polling stats are not enabled or the
3469          * callback is already active.
3470          */
3471         if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3472             blk_stat_is_active(q->poll_cb))
3473                 return;
3474
3475         blk_stat_activate_msecs(q->poll_cb, 100);
3476 }
3477
3478 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
3479 {
3480         struct request_queue *q = cb->data;
3481         int bucket;
3482
3483         for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
3484                 if (cb->stat[bucket].nr_samples)
3485                         q->poll_stat[bucket] = cb->stat[bucket];
3486         }
3487 }
3488
3489 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
3490                                        struct request *rq)
3491 {
3492         unsigned long ret = 0;
3493         int bucket;
3494
3495         /*
3496          * If stats collection isn't on, don't sleep but turn it on for
3497          * future users
3498          */
3499         if (!blk_poll_stats_enable(q))
3500                 return 0;
3501
3502         /*
3503          * As an optimistic guess, use half of the mean service time
3504          * for this type of request. We can (and should) make this smarter.
3505          * For instance, if the completion latencies are tight, we can
3506          * get closer than just half the mean. This is especially
3507          * important on devices where the completion latencies are longer
3508          * than ~10 usec. We do use the stats for the relevant IO size
3509          * if available which does lead to better estimates.
3510          */
3511         bucket = blk_mq_poll_stats_bkt(rq);
3512         if (bucket < 0)
3513                 return ret;
3514
3515         if (q->poll_stat[bucket].nr_samples)
3516                 ret = (q->poll_stat[bucket].mean + 1) / 2;
3517
3518         return ret;
3519 }
3520
3521 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3522                                      struct request *rq)
3523 {
3524         struct hrtimer_sleeper hs;
3525         enum hrtimer_mode mode;
3526         unsigned int nsecs;
3527         ktime_t kt;
3528
3529         if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
3530                 return false;
3531
3532         /*
3533          * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
3534          *
3535          *  0:  use half of prev avg
3536          * >0:  use this specific value
3537          */
3538         if (q->poll_nsec > 0)
3539                 nsecs = q->poll_nsec;
3540         else
3541                 nsecs = blk_mq_poll_nsecs(q, rq);
3542
3543         if (!nsecs)
3544                 return false;
3545
3546         rq->rq_flags |= RQF_MQ_POLL_SLEPT;
3547
3548         /*
3549          * This will be replaced with the stats tracking code, using
3550          * 'avg_completion_time / 2' as the pre-sleep target.
3551          */
3552         kt = nsecs;
3553
3554         mode = HRTIMER_MODE_REL;
3555         hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
3556         hrtimer_set_expires(&hs.timer, kt);
3557
3558         do {
3559                 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
3560                         break;
3561                 set_current_state(TASK_UNINTERRUPTIBLE);
3562                 hrtimer_sleeper_start_expires(&hs, mode);
3563                 if (hs.task)
3564                         io_schedule();
3565                 hrtimer_cancel(&hs.timer);
3566                 mode = HRTIMER_MODE_ABS;
3567         } while (hs.task && !signal_pending(current));
3568
3569         __set_current_state(TASK_RUNNING);
3570         destroy_hrtimer_on_stack(&hs.timer);
3571         return true;
3572 }
3573
3574 static bool blk_mq_poll_hybrid(struct request_queue *q,
3575                                struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
3576 {
3577         struct request *rq;
3578
3579         if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
3580                 return false;
3581
3582         if (!blk_qc_t_is_internal(cookie))
3583                 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3584         else {
3585                 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3586                 /*
3587                  * With scheduling, if the request has completed, we'll
3588                  * get a NULL return here, as we clear the sched tag when
3589                  * that happens. The request still remains valid, like always,
3590                  * so we should be safe with just the NULL check.
3591                  */
3592                 if (!rq)
3593                         return false;
3594         }
3595
3596         return blk_mq_poll_hybrid_sleep(q, rq);
3597 }
3598
3599 /**
3600  * blk_poll - poll for IO completions
3601  * @q:  the queue
3602  * @cookie: cookie passed back at IO submission time
3603  * @spin: whether to spin for completions
3604  *
3605  * Description:
3606  *    Poll for completions on the passed in queue. Returns number of
3607  *    completed entries found. If @spin is true, then blk_poll will continue
3608  *    looping until at least one completion is found, unless the task is
3609  *    otherwise marked running (or we need to reschedule).
3610  */
3611 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3612 {
3613         struct blk_mq_hw_ctx *hctx;
3614         long state;
3615
3616         if (!blk_qc_t_valid(cookie) ||
3617             !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3618                 return 0;
3619
3620         if (current->plug)
3621                 blk_flush_plug_list(current->plug, false);
3622
3623         hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3624
3625         /*
3626          * If we sleep, have the caller restart the poll loop to reset
3627          * the state. Like for the other success return cases, the
3628          * caller is responsible for checking if the IO completed. If
3629          * the IO isn't complete, we'll get called again and will go
3630          * straight to the busy poll loop.
3631          */
3632         if (blk_mq_poll_hybrid(q, hctx, cookie))
3633                 return 1;
3634
3635         hctx->poll_considered++;
3636
3637         state = current->state;
3638         do {
3639                 int ret;
3640
3641                 hctx->poll_invoked++;
3642
3643                 ret = q->mq_ops->poll(hctx);
3644                 if (ret > 0) {
3645                         hctx->poll_success++;
3646                         __set_current_state(TASK_RUNNING);
3647                         return ret;
3648                 }
3649
3650                 if (signal_pending_state(state, current))
3651                         __set_current_state(TASK_RUNNING);
3652
3653                 if (current->state == TASK_RUNNING)
3654                         return 1;
3655                 if (ret < 0 || !spin)
3656                         break;
3657                 cpu_relax();
3658         } while (!need_resched());
3659
3660         __set_current_state(TASK_RUNNING);
3661         return 0;
3662 }
3663 EXPORT_SYMBOL_GPL(blk_poll);
3664
3665 unsigned int blk_mq_rq_cpu(struct request *rq)
3666 {
3667         return rq->mq_ctx->cpu;
3668 }
3669 EXPORT_SYMBOL(blk_mq_rq_cpu);
3670
3671 static int __init blk_mq_init(void)
3672 {
3673         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
3674                                 blk_mq_hctx_notify_dead);
3675         return 0;
3676 }
3677 subsys_initcall(blk_mq_init);