1 From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
2 Subject: Request-based multipath patches
3 References: FATE#302108
4
5 This is the latest version of the request-based multipathing patches,
6 posted to dm-devel and linux-scsi on 03.10.2008.
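Below is a minimal, hypothetical sketch of how a target driver would wire up the request-based hooks these patches introduce (map_rq, rq_end_io and busy), modelled on the dm-mpath conversion further down. The "example-rq" target, its example_ctx context and the single-device constructor are illustrative assumptions only, not part of the patch:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical per-target context: a single underlying device. */
struct example_ctx {
	struct dm_dev *dev;
};

/* Constructor: <dev_path>. Opens the one device this target remaps to. */
static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct example_ctx *c;

	if (argc != 1) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c)
		return -ENOMEM;

	if (dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &c->dev)) {
		ti->error = "Device lookup failed";
		kfree(c);
		return -ENXIO;
	}

	ti->private = c;
	return 0;
}

static void example_dtr(struct dm_target *ti)
{
	struct example_ctx *c = ti->private;

	dm_put_device(ti, c->dev);
	kfree(c);
}

/* Point the clone at the underlying queue, as dm-mpath's map_io() does. */
static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	struct example_ctx *c = ti->private;
	struct block_device *bdev = c->dev->bdev;

	clone->q = bdev_get_queue(bdev);
	clone->rq_disk = bdev->bd_disk;

	return DM_MAPIO_REMAPPED;	/* dm core dispatches the clone */
}

/*
 * Returning <= 0 completes the original request with that value;
 * DM_ENDIO_REQUEUE would ask dm core to requeue the original request.
 */
static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	return error;
}

/* Non-zero tells dm core to leave requests queued while the device is busy. */
static int example_busy(struct dm_target *ti)
{
	struct example_ctx *c = ti->private;

	return dm_underlying_device_busy(bdev_get_queue(c->dev->bdev));
}

static struct target_type example_target = {
	.name      = "example-rq",
	.version   = {0, 0, 1},
	.module    = THIS_MODULE,
	.ctr       = example_ctr,
	.dtr       = example_dtr,
	.map_rq    = example_map_rq,
	.rq_end_io = example_rq_end_io,
	.busy      = example_busy,
};

static int __init example_init(void)
{
	return dm_register_target(&example_target);
}

static void __exit example_exit(void)
{
	dm_unregister_target(&example_target);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");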
7
8 Signed-off-by: Hannes Reinecke <hare@suse.de>
9
10 ---
11 drivers/md/dm-ioctl.c | 13
12 drivers/md/dm-mpath.c | 192 +++++---
13 drivers/md/dm-table.c | 82 +++
14 drivers/md/dm.c | 952 +++++++++++++++++++++++++++++++++++++++---
15 drivers/md/dm.h | 17
16 include/linux/device-mapper.h | 24 +
17 6 files changed, 1158 insertions(+), 122 deletions(-)
18
19 --- a/drivers/md/dm-ioctl.c
20 +++ b/drivers/md/dm-ioctl.c
21 @@ -1046,6 +1046,12 @@ static int populate_table(struct dm_tabl
22 next = spec->next;
23 }
24
25 + r = dm_table_set_type(table);
26 + if (r) {
27 + DMWARN("unable to set table type");
28 + return r;
29 + }
30 +
31 return dm_table_complete(table);
32 }
33
34 @@ -1069,6 +1075,13 @@ static int table_load(struct dm_ioctl *p
35 dm_table_put(t);
36 goto out;
37 }
38 +
39 + r = dm_init_md_mempool(md, dm_table_get_type(t));
40 + if (r) {
41 + DMWARN("unable to initialize the md mempools for this table");
42 + dm_table_put(t);
43 + goto out;
44 + }
45
46 down_write(&_hash_lock);
47 hc = dm_get_mdptr(md);
48 --- a/drivers/md/dm-mpath.c
49 +++ b/drivers/md/dm-mpath.c
50 @@ -7,8 +7,6 @@
51
52 #include "dm.h"
53 #include "dm-path-selector.h"
54 -#include "dm-bio-list.h"
55 -#include "dm-bio-record.h"
56 #include "dm-uevent.h"
57
58 #include <linux/ctype.h>
59 @@ -83,7 +81,7 @@ struct multipath {
60 unsigned pg_init_count; /* Number of times pg_init called */
61
62 struct work_struct process_queued_ios;
63 - struct bio_list queued_ios;
64 + struct list_head queued_ios;
65 unsigned queue_size;
66
67 struct work_struct trigger_event;
68 @@ -100,7 +98,6 @@ struct multipath {
69 */
70 struct dm_mpath_io {
71 struct pgpath *pgpath;
72 - struct dm_bio_details details;
73 };
74
75 typedef int (*action_fn) (struct pgpath *pgpath);
76 @@ -197,6 +194,7 @@ static struct multipath *alloc_multipath
77 m = kzalloc(sizeof(*m), GFP_KERNEL);
78 if (m) {
79 INIT_LIST_HEAD(&m->priority_groups);
80 + INIT_LIST_HEAD(&m->queued_ios);
81 spin_lock_init(&m->lock);
82 m->queue_io = 1;
83 INIT_WORK(&m->process_queued_ios, process_queued_ios);
84 @@ -321,12 +319,13 @@ static int __must_push_back(struct multi
85 dm_noflush_suspending(m->ti));
86 }
87
88 -static int map_io(struct multipath *m, struct bio *bio,
89 +static int map_io(struct multipath *m, struct request *clone,
90 struct dm_mpath_io *mpio, unsigned was_queued)
91 {
92 int r = DM_MAPIO_REMAPPED;
93 unsigned long flags;
94 struct pgpath *pgpath;
95 + struct block_device *bdev;
96
97 spin_lock_irqsave(&m->lock, flags);
98
99 @@ -343,16 +342,18 @@ static int map_io(struct multipath *m, s
100 if ((pgpath && m->queue_io) ||
101 (!pgpath && m->queue_if_no_path)) {
102 /* Queue for the daemon to resubmit */
103 - bio_list_add(&m->queued_ios, bio);
104 + list_add_tail(&clone->queuelist, &m->queued_ios);
105 m->queue_size++;
106 if ((m->pg_init_required && !m->pg_init_in_progress) ||
107 !m->queue_io)
108 queue_work(kmultipathd, &m->process_queued_ios);
109 pgpath = NULL;
110 r = DM_MAPIO_SUBMITTED;
111 - } else if (pgpath)
112 - bio->bi_bdev = pgpath->path.dev->bdev;
113 - else if (__must_push_back(m))
114 + } else if (pgpath) {
115 + bdev = pgpath->path.dev->bdev;
116 + clone->q = bdev_get_queue(bdev);
117 + clone->rq_disk = bdev->bd_disk;
118 + } else if (__must_push_back(m))
119 r = DM_MAPIO_REQUEUE;
120 else
121 r = -EIO; /* Failed */
122 @@ -395,30 +396,31 @@ static void dispatch_queued_ios(struct m
123 {
124 int r;
125 unsigned long flags;
126 - struct bio *bio = NULL, *next;
127 struct dm_mpath_io *mpio;
128 union map_info *info;
129 + struct request *clone, *n;
130 + LIST_HEAD(cl);
131
132 spin_lock_irqsave(&m->lock, flags);
133 - bio = bio_list_get(&m->queued_ios);
134 + list_splice_init(&m->queued_ios, &cl);
135 spin_unlock_irqrestore(&m->lock, flags);
136
137 - while (bio) {
138 - next = bio->bi_next;
139 - bio->bi_next = NULL;
140 + list_for_each_entry_safe(clone, n, &cl, queuelist) {
141 + list_del_init(&clone->queuelist);
142
143 - info = dm_get_mapinfo(bio);
144 + info = dm_get_rq_mapinfo(clone);
145 mpio = info->ptr;
146
147 - r = map_io(m, bio, mpio, 1);
148 - if (r < 0)
149 - bio_endio(bio, r);
150 - else if (r == DM_MAPIO_REMAPPED)
151 - generic_make_request(bio);
152 - else if (r == DM_MAPIO_REQUEUE)
153 - bio_endio(bio, -EIO);
154 -
155 - bio = next;
156 + r = map_io(m, clone, mpio, 1);
157 + if (r < 0) {
158 + mempool_free(mpio, m->mpio_pool);
159 + dm_kill_request(clone, r);
160 + } else if (r == DM_MAPIO_REMAPPED)
161 + dm_dispatch_request(clone);
162 + else if (r == DM_MAPIO_REQUEUE) {
163 + mempool_free(mpio, m->mpio_pool);
164 + dm_requeue_request(clone);
165 + }
166 }
167 }
168
169 @@ -844,21 +846,24 @@ static void multipath_dtr(struct dm_targ
170 }
171
172 /*
173 - * Map bios, recording original fields for later in case we have to resubmit
174 + * Map cloned requests
175 */
176 -static int multipath_map(struct dm_target *ti, struct bio *bio,
177 +static int multipath_map(struct dm_target *ti, struct request *clone,
178 union map_info *map_context)
179 {
180 int r;
181 struct dm_mpath_io *mpio;
182 struct multipath *m = (struct multipath *) ti->private;
183
184 - mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
185 - dm_bio_record(&mpio->details, bio);
186 + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
187 + if (!mpio)
188 + /* ENOMEM, requeue */
189 + return DM_MAPIO_REQUEUE;
190 + memset(mpio, 0, sizeof(*mpio));
191
192 map_context->ptr = mpio;
193 - bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
194 - r = map_io(m, bio, mpio, 0);
195 + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
196 + r = map_io(m, clone, mpio, 0);
197 if (r < 0 || r == DM_MAPIO_REQUEUE)
198 mempool_free(mpio, m->mpio_pool);
199
200 @@ -1140,53 +1145,41 @@ static void activate_path(struct work_st
201 /*
202 * end_io handling
203 */
204 -static int do_end_io(struct multipath *m, struct bio *bio,
205 +static int do_end_io(struct multipath *m, struct request *clone,
206 int error, struct dm_mpath_io *mpio)
207 {
208 + /*
209 + * We don't queue any clone request inside the multipath target
210 + * during end I/O handling, since those clone requests don't have
211 + * bio clones. If we queue them inside the multipath target,
212 + * we need to make bio clones, which requires memory allocation.
213 + * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
214 + * don't have bio clones.)
215 + * Instead of queueing the clone request here, we queue the original
216 + * request into dm core, which will remake a clone request and
217 + * clone bios for it and resubmit it later.
218 + */
219 + int r = DM_ENDIO_REQUEUE;
220 unsigned long flags;
221
222 - if (!error)
223 + if (!error && !clone->errors)
224 return 0; /* I/O complete */
225
226 - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
227 - return error;
228 -
229 if (error == -EOPNOTSUPP)
230 return error;
231
232 - spin_lock_irqsave(&m->lock, flags);
233 - if (!m->nr_valid_paths) {
234 - if (__must_push_back(m)) {
235 - spin_unlock_irqrestore(&m->lock, flags);
236 - return DM_ENDIO_REQUEUE;
237 - } else if (!m->queue_if_no_path) {
238 - spin_unlock_irqrestore(&m->lock, flags);
239 - return -EIO;
240 - } else {
241 - spin_unlock_irqrestore(&m->lock, flags);
242 - goto requeue;
243 - }
244 - }
245 - spin_unlock_irqrestore(&m->lock, flags);
246 -
247 if (mpio->pgpath)
248 fail_path(mpio->pgpath);
249
250 - requeue:
251 - dm_bio_restore(&mpio->details, bio);
252 -
253 - /* queue for the daemon to resubmit or fail */
254 spin_lock_irqsave(&m->lock, flags);
255 - bio_list_add(&m->queued_ios, bio);
256 - m->queue_size++;
257 - if (!m->queue_io)
258 - queue_work(kmultipathd, &m->process_queued_ios);
259 + if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
260 + r = -EIO;
261 spin_unlock_irqrestore(&m->lock, flags);
262
263 - return DM_ENDIO_INCOMPLETE; /* io not complete */
264 + return r;
265 }
266
267 -static int multipath_end_io(struct dm_target *ti, struct bio *bio,
268 +static int multipath_end_io(struct dm_target *ti, struct request *clone,
269 int error, union map_info *map_context)
270 {
271 struct multipath *m = ti->private;
272 @@ -1195,14 +1188,13 @@ static int multipath_end_io(struct dm_ta
273 struct path_selector *ps;
274 int r;
275
276 - r = do_end_io(m, bio, error, mpio);
277 + r = do_end_io(m, clone, error, mpio);
278 if (pgpath) {
279 ps = &pgpath->pg->ps;
280 if (ps->type->end_io)
281 ps->type->end_io(ps, &pgpath->path);
282 }
283 - if (r != DM_ENDIO_INCOMPLETE)
284 - mempool_free(mpio, m->mpio_pool);
285 + mempool_free(mpio, m->mpio_pool);
286
287 return r;
288 }
289 @@ -1438,6 +1430,75 @@ static int multipath_ioctl(struct dm_tar
290 bdev->bd_disk, cmd, arg);
291 }
292
293 +static int __pgpath_busy(struct pgpath *pgpath)
294 +{
295 + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
296 +
297 + return dm_underlying_device_busy(q);
298 +}
299 +
300 +/*
301 + * We return "busy" only when we can map I/Os but the underlying devices
302 + * are busy (so even if we map I/Os now, the I/Os will wait on
303 + * the underlying queue).
304 + * In other words, if we want to kill I/Os or queue them inside us
305 + * due to map unavailability, we don't return "busy". Otherwise,
306 + * dm core won't give us the I/Os and we can't do what we want.
307 + */
308 +static int multipath_busy(struct dm_target *ti)
309 +{
310 + int busy = 0, has_active = 0;
311 + struct multipath *m = (struct multipath *) ti->private;
312 + struct priority_group *pg;
313 + struct pgpath *pgpath;
314 + unsigned long flags;
315 +
316 + spin_lock_irqsave(&m->lock, flags);
317 +
318 + /* Guess which priority_group will be used at next mapping time */
319 + if (unlikely(!m->current_pgpath && m->next_pg))
320 + pg = m->next_pg;
321 + else if (likely(m->current_pg))
322 + pg = m->current_pg;
323 + else
324 + /*
325 + * We don't know which pg will be used at next mapping time.
326 + * We don't call __choose_pgpath() here to avoid triggering
327 + * pg_init just by busy checking.
328 + * So we don't know whether underlying devices we will be using
329 + * at next mapping time are busy or not. Just try mapping.
330 + */
331 + goto out;
332 +
333 + /*
334 + * If there is at least one non-busy active path, the path selector
335 + * will be able to select it. So we consider such a pg as not busy.
336 + */
337 + busy = 1;
338 + list_for_each_entry(pgpath, &pg->pgpaths, list)
339 + if (pgpath->is_active) {
340 + has_active = 1;
341 +
342 + if (!__pgpath_busy(pgpath)) {
343 + busy = 0;
344 + break;
345 + }
346 + }
347 +
348 + if (!has_active)
349 + /*
350 + * No active path in this pg, so this pg won't be used and
351 + * the current_pg will be changed at next mapping time.
352 + * We need to try mapping to determine it.
353 + */
354 + busy = 0;
355 +
356 +out:
357 + spin_unlock_irqrestore(&m->lock, flags);
358 +
359 + return busy;
360 +}
361 +
362 /*-----------------------------------------------------------------
363 * Module setup
364 *---------------------------------------------------------------*/
365 @@ -1447,13 +1508,14 @@ static struct target_type multipath_targ
366 .module = THIS_MODULE,
367 .ctr = multipath_ctr,
368 .dtr = multipath_dtr,
369 - .map = multipath_map,
370 - .end_io = multipath_end_io,
371 + .map_rq = multipath_map,
372 + .rq_end_io = multipath_end_io,
373 .presuspend = multipath_presuspend,
374 .resume = multipath_resume,
375 .status = multipath_status,
376 .message = multipath_message,
377 .ioctl = multipath_ioctl,
378 + .busy = multipath_busy,
379 };
380
381 static int __init dm_multipath_init(void)
382 --- a/drivers/md/dm-table.c
383 +++ b/drivers/md/dm-table.c
384 @@ -108,6 +108,8 @@ static void combine_restrictions_low(str
385 lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
386
387 lhs->no_cluster |= rhs->no_cluster;
388 +
389 + lhs->no_request_stacking |= rhs->no_request_stacking;
390 }
391
392 /*
393 @@ -522,6 +524,8 @@ void dm_set_device_limits(struct dm_targ
394 rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn);
395
396 rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
397 +
398 + rs->no_request_stacking |= !blk_queue_stackable(q);
399 }
400 EXPORT_SYMBOL_GPL(dm_set_device_limits);
401
402 @@ -731,6 +735,66 @@ int dm_table_add_target(struct dm_table
403 return r;
404 }
405
406 +int dm_table_set_type(struct dm_table *t)
407 +{
408 + int i;
409 + int bio_based = 0, request_based = 0;
410 + struct dm_target *tgt;
411 +
412 + for (i = 0; i < t->num_targets; i++) {
413 + tgt = t->targets + i;
414 + if (tgt->type->map_rq)
415 + request_based = 1;
416 + else
417 + bio_based = 1;
418 +
419 + if (bio_based && request_based) {
420 + DMWARN("Inconsistent table: different target types"
421 + " can't be mixed up");
422 + return -EINVAL;
423 + }
424 + }
425 +
426 + if (bio_based) {
427 + /* We must use this table as bio-based */
428 + t->limits.no_request_stacking = 1;
429 + return 0;
430 + }
431 +
432 + BUG_ON(!request_based); /* No targets in this table */
433 +
434 + /* Non-request-stackable devices can't be used for request-based dm */
435 + if (t->limits.no_request_stacking) {
436 + DMWARN("table load rejected: including non-request-stackable"
437 + " devices");
438 + return -EINVAL;
439 + }
440 +
441 + /*
442 + * Request-based dm supports only tables that have a single target now.
443 + * To support multiple targets, request splitting support is needed,
444 + * and that needs lots of changes in the block-layer.
445 + * (e.g. request completion process for partial completion.)
446 + */
447 + if (t->num_targets > 1) {
448 + DMWARN("Request-based dm doesn't support multiple targets yet");
449 + return -EINVAL;
450 + }
451 +
452 + return 0;
453 +}
454 +
455 +int dm_table_get_type(struct dm_table *t)
456 +{
457 + return t->limits.no_request_stacking ?
458 + DM_TYPE_BIO_BASED : DM_TYPE_REQUEST_BASED;
459 +}
460 +
461 +int dm_table_request_based(struct dm_table *t)
462 +{
463 + return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
464 +}
465 +
466 static int setup_indexes(struct dm_table *t)
467 {
468 int i;
469 @@ -861,6 +925,10 @@ void dm_table_set_restrictions(struct dm
470 else
471 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
472
473 + if (t->limits.no_request_stacking)
474 + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, q);
475 + else
476 + queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
477 }
478
479 unsigned int dm_table_get_num_targets(struct dm_table *t)
480 @@ -949,6 +1017,20 @@ int dm_table_any_congested(struct dm_tab
481 return r;
482 }
483
484 +int dm_table_any_busy_target(struct dm_table *t)
485 +{
486 + int i;
487 + struct dm_target *ti;
488 +
489 + for (i = 0; i < t->num_targets; i++) {
490 + ti = t->targets + i;
491 + if (ti->type->busy && ti->type->busy(ti))
492 + return 1;
493 + }
494 +
495 + return 0;
496 +}
497 +
498 void dm_table_unplug_all(struct dm_table *t)
499 {
500 struct dm_dev *dd;
501 --- a/drivers/md/dm.c
502 +++ b/drivers/md/dm.c
503 @@ -32,6 +32,7 @@ static unsigned int _major = 0;
504
505 static DEFINE_SPINLOCK(_minor_lock);
506 /*
507 + * For bio based dm.
508 * One of these is allocated per bio.
509 */
510 struct dm_io {
511 @@ -43,6 +44,7 @@ struct dm_io {
512 };
513
514 /*
515 + * For bio based dm.
516 * One of these is allocated per target within a bio. Hopefully
517 * this will be simplified out one day.
518 */
519 @@ -52,6 +54,31 @@ struct dm_target_io {
520 union map_info info;
521 };
522
523 +/*
524 + * For request based dm.
525 + * One of these is allocated per request.
526 + *
527 + * Since we assume "original request : cloned request = 1 : 1" and
528 + * a counter for the number of clones (like struct dm_io.io_count) isn't needed,
529 + * struct dm_io and struct dm_target_io can be merged.
530 + */
531 +struct dm_rq_target_io {
532 + struct mapped_device *md;
533 + struct dm_target *ti;
534 + struct request *orig, clone;
535 + int error;
536 + union map_info info;
537 +};
538 +
539 +/*
540 + * For request based dm.
541 + * One of these is allocated per bio.
542 + */
543 +struct dm_clone_bio_info {
544 + struct bio *orig;
545 + struct request *rq;
546 +};
547 +
548 union map_info *dm_get_mapinfo(struct bio *bio)
549 {
550 if (bio && bio->bi_private)
551 @@ -59,6 +86,14 @@ union map_info *dm_get_mapinfo(struct bi
552 return NULL;
553 }
554
555 +union map_info *dm_get_rq_mapinfo(struct request *rq)
556 +{
557 + if (rq && rq->end_io_data)
558 + return &((struct dm_rq_target_io *)rq->end_io_data)->info;
559 + return NULL;
560 +}
561 +EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
562 +
563 #define MINOR_ALLOCED ((void *)-1)
564
565 /*
566 @@ -76,7 +111,6 @@ union map_info *dm_get_mapinfo(struct bi
567 */
568 struct dm_wq_req {
569 enum {
570 - DM_WQ_FLUSH_ALL,
571 DM_WQ_FLUSH_DEFERRED,
572 } type;
573 struct work_struct work;
574 @@ -126,6 +160,8 @@ struct mapped_device {
575
576 struct bio_set *bs;
577
578 + unsigned int mempool_type; /* Type of mempools above. */
579 +
580 /*
581 * Event handling.
582 */
583 @@ -143,52 +179,74 @@ struct mapped_device {
584
585 /* forced geometry settings */
586 struct hd_geometry geometry;
587 +
588 + /* marker of flush suspend for request-based dm */
589 + struct request suspend_rq;
590 +
591 + /* For saving the address of __make_request for request based dm */
592 + make_request_fn *saved_make_request_fn;
593 };
594
595 #define MIN_IOS 256
596 static struct kmem_cache *_io_cache;
597 static struct kmem_cache *_tio_cache;
598 +static struct kmem_cache *_rq_tio_cache;
599 +static struct kmem_cache *_bio_info_cache;
600
601 static int __init local_init(void)
602 {
603 - int r;
604 + int r = -ENOMEM;
605
606 /* allocate a slab for the dm_ios */
607 _io_cache = KMEM_CACHE(dm_io, 0);
608 if (!_io_cache)
609 - return -ENOMEM;
610 + return r;
611
612 /* allocate a slab for the target ios */
613 _tio_cache = KMEM_CACHE(dm_target_io, 0);
614 - if (!_tio_cache) {
615 - kmem_cache_destroy(_io_cache);
616 - return -ENOMEM;
617 - }
618 + if (!_tio_cache)
619 + goto out_free_io_cache;
620 +
621 + _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
622 + if (!_rq_tio_cache)
623 + goto out_free_tio_cache;
624 +
625 + _bio_info_cache = KMEM_CACHE(dm_clone_bio_info, 0);
626 + if (!_bio_info_cache)
627 + goto out_free_rq_tio_cache;
628
629 r = dm_uevent_init();
630 - if (r) {
631 - kmem_cache_destroy(_tio_cache);
632 - kmem_cache_destroy(_io_cache);
633 - return r;
634 - }
635 + if (r)
636 + goto out_free_bio_info_cache;
637
638 _major = major;
639 r = register_blkdev(_major, _name);
640 - if (r < 0) {
641 - kmem_cache_destroy(_tio_cache);
642 - kmem_cache_destroy(_io_cache);
643 - dm_uevent_exit();
644 - return r;
645 - }
646 + if (r < 0)
647 + goto out_uevent_exit;
648
649 if (!_major)
650 _major = r;
651
652 return 0;
653 +
654 +out_uevent_exit:
655 + dm_uevent_exit();
656 +out_free_bio_info_cache:
657 + kmem_cache_destroy(_bio_info_cache);
658 +out_free_rq_tio_cache:
659 + kmem_cache_destroy(_rq_tio_cache);
660 +out_free_tio_cache:
661 + kmem_cache_destroy(_tio_cache);
662 +out_free_io_cache:
663 + kmem_cache_destroy(_io_cache);
664 +
665 + return r;
666 }
667
668 static void local_exit(void)
669 {
670 + kmem_cache_destroy(_bio_info_cache);
671 + kmem_cache_destroy(_rq_tio_cache);
672 kmem_cache_destroy(_tio_cache);
673 kmem_cache_destroy(_io_cache);
674 unregister_blkdev(_major, _name);
675 @@ -380,6 +438,28 @@ static void free_tio(struct mapped_devic
676 mempool_free(tio, md->tio_pool);
677 }
678
679 +static inline struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
680 +{
681 + return mempool_alloc(md->tio_pool, GFP_ATOMIC);
682 +}
683 +
684 +static inline void free_rq_tio(struct mapped_device *md,
685 + struct dm_rq_target_io *tio)
686 +{
687 + mempool_free(tio, md->tio_pool);
688 +}
689 +
690 +static inline struct dm_clone_bio_info *alloc_bio_info(struct mapped_device *md)
691 +{
692 + return mempool_alloc(md->io_pool, GFP_ATOMIC);
693 +}
694 +
695 +static inline void free_bio_info(struct mapped_device *md,
696 + struct dm_clone_bio_info *info)
697 +{
698 + mempool_free(info, md->io_pool);
699 +}
700 +
701 static void start_io_acct(struct dm_io *io)
702 {
703 struct mapped_device *md = io->md;
704 @@ -568,6 +648,266 @@ static void clone_endio(struct bio *bio,
705 free_tio(md, tio);
706 }
707
708 +/*
709 + * Partial completion handling for request-based dm
710 + */
711 +static void end_clone_bio(struct bio *clone, int error)
712 +{
713 + struct dm_clone_bio_info *info = clone->bi_private;
714 + struct dm_rq_target_io *tio = info->rq->end_io_data;
715 + struct bio *bio = info->orig;
716 + unsigned int nr_bytes = info->orig->bi_size;
717 +
718 + free_bio_info(tio->md, info);
719 + clone->bi_private = tio->md->bs;
720 + bio_put(clone);
721 +
722 + if (tio->error) {
723 + /*
724 + * An error has already been detected on the request.
725 + * Once an error has occurred, just let clone->end_io() handle
726 + * the remainder.
727 + */
728 + return;
729 + } else if (error) {
730 + /*
731 + * Don't notify the upper layer of the error yet.
732 + * The error handling decision is made by the target driver,
733 + * when the request is completed.
734 + */
735 + tio->error = error;
736 + return;
737 + }
738 +
739 + /*
740 + * I/O for the bio successfully completed.
741 + * Notify the upper layer of the data completion.
742 + */
743 +
744 + /*
745 + * bios are processed from the head of the list.
746 + * So the completing bio should always be rq->bio.
747 + * If it's not, something is wrong.
748 + */
749 + if (tio->orig->bio != bio)
750 + DMERR("bio completion is going in the middle of the request");
751 +
752 + /*
753 + * Update the original request.
754 + * Do not use blk_end_request() here, because it may complete
755 + * the original request before the clone, and break the ordering.
756 + */
757 + blk_update_request(tio->orig, 0, nr_bytes);
758 +}
759 +
760 +static void free_bio_clone(struct request *clone)
761 +{
762 + struct dm_rq_target_io *tio = clone->end_io_data;
763 + struct mapped_device *md = tio->md;
764 + struct bio *bio;
765 + struct dm_clone_bio_info *info;
766 +
767 + while ((bio = clone->bio) != NULL) {
768 + clone->bio = bio->bi_next;
769 +
770 + info = bio->bi_private;
771 + free_bio_info(md, info);
772 +
773 + bio->bi_private = md->bs;
774 + bio_put(bio);
775 + }
776 +}
777 +
778 +static void dec_rq_pending(struct dm_rq_target_io *tio)
779 +{
780 + if (!atomic_dec_return(&tio->md->pending))
781 + /* nudge anyone waiting on suspend queue */
782 + wake_up(&tio->md->wait);
783 +}
784 +
785 +static void dm_unprep_request(struct request *rq)
786 +{
787 + struct request *clone = rq->special;
788 + struct dm_rq_target_io *tio = clone->end_io_data;
789 +
790 + rq->special = NULL;
791 + rq->cmd_flags &= ~REQ_DONTPREP;
792 +
793 + free_bio_clone(clone);
794 + dec_rq_pending(tio);
795 + free_rq_tio(tio->md, tio);
796 +}
797 +
798 +/*
799 + * Requeue the original request of a clone.
800 + */
801 +void dm_requeue_request(struct request *clone)
802 +{
803 + struct dm_rq_target_io *tio = clone->end_io_data;
804 + struct request *rq = tio->orig;
805 + struct request_queue *q = rq->q;
806 + unsigned long flags;
807 +
808 + dm_unprep_request(rq);
809 +
810 + spin_lock_irqsave(q->queue_lock, flags);
811 + if (elv_queue_empty(q))
812 + blk_plug_device(q);
813 + blk_requeue_request(q, rq);
814 + spin_unlock_irqrestore(q->queue_lock, flags);
815 +}
816 +EXPORT_SYMBOL_GPL(dm_requeue_request);
817 +
818 +static inline void __stop_queue(struct request_queue *q)
819 +{
820 + blk_stop_queue(q);
821 +}
822 +
823 +static void stop_queue(struct request_queue *q)
824 +{
825 + unsigned long flags;
826 +
827 + spin_lock_irqsave(q->queue_lock, flags);
828 + __stop_queue(q);
829 + spin_unlock_irqrestore(q->queue_lock, flags);
830 +}
831 +
832 +static inline void __start_queue(struct request_queue *q)
833 +{
834 + if (blk_queue_stopped(q))
835 + blk_start_queue(q);
836 +}
837 +
838 +static void start_queue(struct request_queue *q)
839 +{
840 + unsigned long flags;
841 +
842 + spin_lock_irqsave(q->queue_lock, flags);
843 + __start_queue(q);
844 + spin_unlock_irqrestore(q->queue_lock, flags);
845 +}
846 +
847 +/*
848 + * Complete the clone and the original request
849 + */
850 +static void dm_end_request(struct request *clone, int error)
851 +{
852 + struct dm_rq_target_io *tio = clone->end_io_data;
853 + struct request *rq = tio->orig;
854 + struct request_queue *q = rq->q;
855 + unsigned int nr_bytes = blk_rq_bytes(rq);
856 +
857 + if (blk_pc_request(rq)) {
858 + rq->errors = clone->errors;
859 + rq->data_len = clone->data_len;
860 +
861 + if (rq->sense)
862 + /*
863 + * We are using the sense buffer of the original
864 + * request.
865 + * So setting the length of the sense data is enough.
866 + */
867 + rq->sense_len = clone->sense_len;
868 + }
869 +
870 + free_bio_clone(clone);
871 + dec_rq_pending(tio);
872 + free_rq_tio(tio->md, tio);
873 +
874 + if (unlikely(blk_end_request(rq, error, nr_bytes)))
875 + BUG();
876 +
877 + blk_run_queue(q);
878 +}
879 +
880 +/*
881 + * Request completion handler for request-based dm
882 + */
883 +static void dm_softirq_done(struct request *rq)
884 +{
885 + struct request *clone = rq->completion_data;
886 + struct dm_rq_target_io *tio = clone->end_io_data;
887 + dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
888 + int error = tio->error;
889 + int r;
890 +
891 + if (rq->cmd_flags & REQ_FAILED)
892 + goto end_request;
893 +
894 + if (rq_end_io) {
895 + r = rq_end_io(tio->ti, clone, error, &tio->info);
896 + if (r <= 0)
897 + /* The target wants to complete the I/O */
898 + error = r;
899 + else if (r == DM_ENDIO_INCOMPLETE)
900 + /* The target will handle the I/O */
901 + return;
902 + else if (r == DM_ENDIO_REQUEUE) {
903 + /*
904 + * The target wants to requeue the I/O.
905 + * Don't invoke blk_run_queue() so that the requeued
906 + * request won't be dispatched again soon.
907 + */
908 + dm_requeue_request(clone);
909 + return;
910 + } else {
911 + DMWARN("unimplemented target endio return value: %d",
912 + r);
913 + BUG();
914 + }
915 + }
916 +
917 +end_request:
918 + dm_end_request(clone, error);
919 +}
920 +
921 +/*
922 + * Called with the queue lock held
923 + */
924 +static void end_clone_request(struct request *clone, int error)
925 +{
926 + struct dm_rq_target_io *tio = clone->end_io_data;
927 + struct request *rq = tio->orig;
928 +
929 + /*
930 + * This only cleans up the information of the queue in which
931 + * the clone was dispatched.
932 + * The clone is *NOT* actually freed here because it is allocated from
933 + * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
934 + */
935 + __blk_put_request(clone->q, clone);
936 +
937 + /*
938 + * Actual request completion is done in a softirq context which doesn't
939 + * hold the queue lock. Otherwise, deadlock could occur because:
940 + * - another request may be submitted by the upper level driver
941 + * of the stacking device during the completion
942 + * - a submission which requires the queue lock may be done
943 + * against this queue
944 + */
945 + tio->error = error;
946 + rq->completion_data = clone;
947 + blk_complete_request(rq);
948 +}
949 +
950 +/*
951 + * Complete the original request of a clone with an error status.
952 + * Target's rq_end_io() function isn't called.
953 + * This may be used by target's map_rq() function when the mapping fails.
954 + */
955 +void dm_kill_request(struct request *clone, int error)
956 +{
957 + struct dm_rq_target_io *tio = clone->end_io_data;
958 + struct request *rq = tio->orig;
959 +
960 + tio->error = error;
961 + /* Avoid printing an "I/O error" message, since we didn't actually do any I/O */
962 + rq->cmd_flags |= (REQ_FAILED | REQ_QUIET);
963 + rq->completion_data = clone;
964 + blk_complete_request(rq);
965 +}
966 +EXPORT_SYMBOL_GPL(dm_kill_request);
967 +
968 static sector_t max_io_len(struct mapped_device *md,
969 sector_t sector, struct dm_target *ti)
970 {
971 @@ -886,7 +1226,7 @@ out:
972 * The request function that just remaps the bio built up by
973 * dm_merge_bvec.
974 */
975 -static int dm_request(struct request_queue *q, struct bio *bio)
976 +static int _dm_request(struct request_queue *q, struct bio *bio)
977 {
978 int r = -EIO;
979 int rw = bio_data_dir(bio);
980 @@ -936,12 +1276,335 @@ out_req:
981 return 0;
982 }
983
984 +static int dm_make_request(struct request_queue *q, struct bio *bio)
985 +{
986 + struct mapped_device *md = (struct mapped_device *)q->queuedata;
987 +
988 + if (unlikely(bio_barrier(bio))) {
989 + bio_endio(bio, -EOPNOTSUPP);
990 + return 0;
991 + }
992 +
993 + if (unlikely(!md->map)) {
994 + bio_endio(bio, -EIO);
995 + return 0;
996 + }
997 +
998 + return md->saved_make_request_fn(q, bio); /* call __make_request() */
999 +}
1000 +
1001 +static inline int dm_request_based(struct mapped_device *md)
1002 +{
1003 + return blk_queue_stackable(md->queue);
1004 +}
1005 +
1006 +static int dm_request(struct request_queue *q, struct bio *bio)
1007 +{
1008 + struct mapped_device *md = q->queuedata;
1009 +
1010 + if (dm_request_based(md))
1011 + return dm_make_request(q, bio);
1012 +
1013 + return _dm_request(q, bio);
1014 +}
1015 +
1016 +void dm_dispatch_request(struct request *rq)
1017 +{
1018 + int r;
1019 +
1020 + rq->start_time = jiffies;
1021 + r = blk_insert_cloned_request(rq->q, rq);
1022 + if (r)
1023 + dm_kill_request(rq, r);
1024 +}
1025 +EXPORT_SYMBOL_GPL(dm_dispatch_request);
1026 +
1027 +static void copy_request_info(struct request *clone, struct request *rq)
1028 +{
1029 + clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE);
1030 + clone->cmd_type = rq->cmd_type;
1031 + clone->sector = rq->sector;
1032 + clone->hard_sector = rq->hard_sector;
1033 + clone->nr_sectors = rq->nr_sectors;
1034 + clone->hard_nr_sectors = rq->hard_nr_sectors;
1035 + clone->current_nr_sectors = rq->current_nr_sectors;
1036 + clone->hard_cur_sectors = rq->hard_cur_sectors;
1037 + clone->nr_phys_segments = rq->nr_phys_segments;
1038 + clone->ioprio = rq->ioprio;
1039 + clone->buffer = rq->buffer;
1040 + clone->cmd_len = rq->cmd_len;
1041 + if (rq->cmd_len)
1042 + clone->cmd = rq->cmd;
1043 + clone->data_len = rq->data_len;
1044 + clone->extra_len = rq->extra_len;
1045 + clone->sense_len = rq->sense_len;
1046 + clone->data = rq->data;
1047 + clone->sense = rq->sense;
1048 +}
1049 +
1050 +static int clone_request_bios(struct request *clone, struct request *rq,
1051 + struct mapped_device *md)
1052 +{
1053 + struct bio *bio, *clone_bio;
1054 + struct dm_clone_bio_info *info;
1055 +
1056 + for (bio = rq->bio; bio; bio = bio->bi_next) {
1057 + info = alloc_bio_info(md);
1058 + if (!info)
1059 + goto free_and_out;
1060 +
1061 + clone_bio = bio_alloc_bioset(GFP_ATOMIC, bio->bi_max_vecs,
1062 + md->bs);
1063 + if (!clone_bio) {
1064 + free_bio_info(md, info);
1065 + goto free_and_out;
1066 + }
1067 +
1068 + __bio_clone(clone_bio, bio);
1069 + clone_bio->bi_destructor = dm_bio_destructor;
1070 + clone_bio->bi_end_io = end_clone_bio;
1071 + info->rq = clone;
1072 + info->orig = bio;
1073 + clone_bio->bi_private = info;
1074 +
1075 + if (clone->bio) {
1076 + clone->biotail->bi_next = clone_bio;
1077 + clone->biotail = clone_bio;
1078 + } else
1079 + clone->bio = clone->biotail = clone_bio;
1080 + }
1081 +
1082 + return 0;
1083 +
1084 +free_and_out:
1085 + free_bio_clone(clone);
1086 +
1087 + return -ENOMEM;
1088 +}
1089 +
1090 +static int setup_clone(struct request *clone, struct request *rq,
1091 + struct dm_rq_target_io *tio)
1092 +{
1093 + int r;
1094 +
1095 + blk_rq_init(NULL, clone);
1096 +
1097 + r = clone_request_bios(clone, rq, tio->md);
1098 + if (r)
1099 + return r;
1100 +
1101 + copy_request_info(clone, rq);
1102 + clone->start_time = jiffies;
1103 + clone->end_io = end_clone_request;
1104 + clone->end_io_data = tio;
1105 +
1106 + return 0;
1107 +}
1108 +
1109 +static inline int dm_flush_suspending(struct mapped_device *md)
1110 +{
1111 + return !md->suspend_rq.data;
1112 +}
1113 +
1114 +/*
1115 + * Called with the queue lock held.
1116 + */
1117 +static int dm_prep_fn(struct request_queue *q, struct request *rq)
1118 +{
1119 + struct mapped_device *md = (struct mapped_device *)q->queuedata;
1120 + struct dm_rq_target_io *tio;
1121 + struct request *clone;
1122 +
1123 + if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
1124 + if (dm_flush_suspending(md)) {
1125 + if (q->in_flight)
1126 + return BLKPREP_DEFER;
1127 + else {
1128 + /* This device should be quiet now */
1129 + __stop_queue(q);
1130 + smp_mb();
1131 + BUG_ON(atomic_read(&md->pending));
1132 + wake_up(&md->wait);
1133 + return BLKPREP_KILL;
1134 + }
1135 + } else
1136 + /*
1137 + * The suspend process was interrupted.
1138 + * So no need to suspend now.
1139 + */
1140 + return BLKPREP_KILL;
1141 + }
1142 +
1143 + if (unlikely(rq->special)) {
1144 + DMWARN("Already has something in rq->special.");
1145 + return BLKPREP_KILL;
1146 + }
1147 +
1148 + if (unlikely(!dm_request_based(md))) {
1149 + DMWARN("Request was queued into bio-based device");
1150 + return BLKPREP_KILL;
1151 + }
1152 +
1153 + tio = alloc_rq_tio(md); /* Only one for each original request */
1154 + if (!tio)
1155 + /* -ENOMEM */
1156 + return BLKPREP_DEFER;
1157 +
1158 + tio->md = md;
1159 + tio->ti = NULL;
1160 + tio->orig = rq;
1161 + tio->error = 0;
1162 + memset(&tio->info, 0, sizeof(tio->info));
1163 +
1164 + clone = &tio->clone;
1165 + if (setup_clone(clone, rq, tio)) {
1166 + /* -ENOMEM */
1167 + free_rq_tio(md, tio);
1168 + return BLKPREP_DEFER;
1169 + }
1170 +
1171 + rq->special = clone;
1172 + rq->cmd_flags |= REQ_DONTPREP;
1173 +
1174 + return BLKPREP_OK;
1175 +}
1176 +
1177 +static void map_request(struct dm_target *ti, struct request *rq,
1178 + struct mapped_device *md)
1179 +{
1180 + int r;
1181 + struct request *clone = rq->special;
1182 + struct dm_rq_target_io *tio = clone->end_io_data;
1183 +
1184 + tio->ti = ti;
1185 + atomic_inc(&md->pending);
1186 +
1187 + /*
1188 + * Although requests submitted to the md->queue are checked against
1189 + * the table/queue limitations at submission time, the limitations
1190 + * may be changed by a table swap while those already-checked
1191 + * requests are still in the md->queue.
1192 + * If the limitations have been shrunk in such a situation, we may be
1193 + * dispatching requests here that violate the current limitations.
1194 + * Since the block layer and device drivers rely on struct request
1195 + * being consistent, dispatching such requests is dangerous
1196 + * (e.g. it may easily cause a kernel panic).
1197 + * Avoid dispatching such problematic requests in request-based dm.
1198 + *
1199 + * Since dm_kill_request() decrements md->pending, this has to
1200 + * be done after incrementing md->pending.
1201 + */
1202 + r = blk_rq_check_limits(rq->q, rq);
1203 + if (unlikely(r)) {
1204 + DMWARN("violating the queue limitation. the limitation may be"
1205 + " shrunk while there are some requests in the queue.");
1206 + dm_kill_request(clone, r);
1207 + return;
1208 + }
1209 +
1210 + r = ti->type->map_rq(ti, clone, &tio->info);
1211 + switch (r) {
1212 + case DM_MAPIO_SUBMITTED:
1213 + /* The target has taken the I/O to submit by itself later */
1214 + break;
1215 + case DM_MAPIO_REMAPPED:
1216 + /* The target has remapped the I/O so dispatch it */
1217 + dm_dispatch_request(clone);
1218 + break;
1219 + case DM_MAPIO_REQUEUE:
1220 + /* The target wants to requeue the I/O */
1221 + dm_requeue_request(clone);
1222 + break;
1223 + default:
1224 + if (r > 0) {
1225 + DMWARN("unimplemented target map return value: %d", r);
1226 + BUG();
1227 + }
1228 +
1229 + /* The target wants to complete the I/O */
1230 + dm_kill_request(clone, r);
1231 + break;
1232 + }
1233 +}
1234 +
1235 +/*
1236 + * q->request_fn for request-based dm.
1237 + * Called with the queue lock held.
1238 + */
1239 +static void dm_request_fn(struct request_queue *q)
1240 +{
1241 + struct mapped_device *md = (struct mapped_device *)q->queuedata;
1242 + struct dm_table *map = dm_get_table(md);
1243 + struct dm_target *ti;
1244 + struct request *rq;
1245 +
1246 + /*
1247 + * The check for blk_queue_stopped() is needed here, because:
1248 + * - device suspend uses blk_stop_queue() and expects that
1249 + * no I/O will be dispatched any more after the queue is stopped
1250 + * - generic_unplug_device() doesn't call q->request_fn()
1251 + * when the queue is stopped, so that is not a problem
1252 + * - but underlying device drivers may call q->request_fn()
1253 + * without the check through blk_run_queue()
1254 + */
1255 + while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1256 + rq = elv_next_request(q);
1257 + if (!rq)
1258 + goto plug_and_out;
1259 +
1260 + ti = dm_table_find_target(map, rq->sector);
1261 + if (ti->type->busy && ti->type->busy(ti))
1262 + goto plug_and_out;
1263 +
1264 + blkdev_dequeue_request(rq);
1265 + spin_unlock(q->queue_lock);
1266 + map_request(ti, rq, md);
1267 + spin_lock_irq(q->queue_lock);
1268 + }
1269 +
1270 + goto out;
1271 +
1272 +plug_and_out:
1273 + if (!elv_queue_empty(q))
1274 + /* Some requests still remain, retry later */
1275 + blk_plug_device(q);
1276 +
1277 +out:
1278 + dm_table_put(map);
1279 +
1280 + return;
1281 +}
1282 +
1283 +int dm_underlying_device_busy(struct request_queue *q)
1284 +{
1285 + return blk_lld_busy(q);
1286 +}
1287 +EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1288 +
1289 +static int dm_lld_busy(struct request_queue *q)
1290 +{
1291 + int r;
1292 + struct mapped_device *md = q->queuedata;
1293 + struct dm_table *map = dm_get_table(md);
1294 +
1295 + if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
1296 + r = 1;
1297 + else
1298 + r = dm_table_any_busy_target(map);
1299 +
1300 + dm_table_put(map);
1301 + return r;
1302 +}
1303 +
1304 static void dm_unplug_all(struct request_queue *q)
1305 {
1306 struct mapped_device *md = q->queuedata;
1307 struct dm_table *map = dm_get_table(md);
1308
1309 if (map) {
1310 + if (dm_request_based(md))
1311 + generic_unplug_device(q);
1312 +
1313 dm_table_unplug_all(map);
1314 dm_table_put(map);
1315 }
1316 @@ -955,6 +1618,12 @@ static int dm_any_congested(void *conges
1317
1318 if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
1319 r = bdi_bits;
1320 + else if (dm_request_based(md))
1321 + /*
1322 + * Request-based dm cares only about its own queue when
1323 + * queried for the congestion status of the request_queue
1324 + */
1325 + r = md->queue->backing_dev_info.state & bdi_bits;
1326 else
1327 r = dm_table_any_congested(map, bdi_bits);
1328
1329 @@ -1075,10 +1744,22 @@ static struct mapped_device *alloc_dev(i
1330 INIT_LIST_HEAD(&md->uevent_list);
1331 spin_lock_init(&md->uevent_lock);
1332
1333 - md->queue = blk_alloc_queue(GFP_KERNEL);
1334 + md->queue = blk_init_queue(dm_request_fn, NULL);
1335 if (!md->queue)
1336 goto bad_queue;
1337
1338 + /*
1339 + * Request-based dm devices cannot be stacked on top of bio-based dm
1340 + * devices. The type of this dm device has not been decided yet,
1341 + * although we initialized the queue using blk_init_queue().
1342 + * The type is decided at the first table loading time.
1343 + * To prevent problematic device stacking, clear the queue flag
1344 + * for request stacking support until then.
1345 + *
1346 + * This queue is new, so no concurrency on the queue_flags.
1347 + */
1348 + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1349 + md->saved_make_request_fn = md->queue->make_request_fn;
1350 md->queue->queuedata = md;
1351 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1352 md->queue->backing_dev_info.congested_data = md;
1353 @@ -1086,18 +1767,9 @@ static struct mapped_device *alloc_dev(i
1354 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1355 md->queue->unplug_fn = dm_unplug_all;
1356 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1357 -
1358 - md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
1359 - if (!md->io_pool)
1360 - goto bad_io_pool;
1361 -
1362 - md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1363 - if (!md->tio_pool)
1364 - goto bad_tio_pool;
1365 -
1366 - md->bs = bioset_create(16, 16);
1367 - if (!md->bs)
1368 - goto bad_no_bioset;
1369 + blk_queue_softirq_done(md->queue, dm_softirq_done);
1370 + blk_queue_prep_rq(md->queue, dm_prep_fn);
1371 + blk_queue_lld_busy(md->queue, dm_lld_busy);
1372
1373 md->disk = alloc_disk(1);
1374 if (!md->disk)
1375 @@ -1132,12 +1804,6 @@ static struct mapped_device *alloc_dev(i
1376 bad_thread:
1377 put_disk(md->disk);
1378 bad_disk:
1379 - bioset_free(md->bs);
1380 -bad_no_bioset:
1381 - mempool_destroy(md->tio_pool);
1382 -bad_tio_pool:
1383 - mempool_destroy(md->io_pool);
1384 -bad_io_pool:
1385 blk_cleanup_queue(md->queue);
1386 bad_queue:
1387 free_minor(minor);
1388 @@ -1159,9 +1825,12 @@ static void free_dev(struct mapped_devic
1389 bdput(md->suspended_bdev);
1390 }
1391 destroy_workqueue(md->wq);
1392 - mempool_destroy(md->tio_pool);
1393 - mempool_destroy(md->io_pool);
1394 - bioset_free(md->bs);
1395 + if (md->tio_pool)
1396 + mempool_destroy(md->tio_pool);
1397 + if (md->io_pool)
1398 + mempool_destroy(md->io_pool);
1399 + if (md->bs)
1400 + bioset_free(md->bs);
1401 del_gendisk(md->disk);
1402 free_minor(minor);
1403
1404 @@ -1224,6 +1893,16 @@ static int __bind(struct mapped_device *
1405 dm_table_get(t);
1406 dm_table_event_callback(t, event_callback, md);
1407
1408 + /*
1409 + * If the old table wasn't request-based, the queue hasn't been
1410 + * stopped during suspension. So stop it here to prevent I/O
1411 + * mapping before resume.
1412 + * This must be done before setting the queue restrictions,
1413 + * because request-based dm may start running right after they are set.
1414 + */
1415 + if (dm_table_request_based(t) && !blk_queue_stopped(q))
1416 + stop_queue(q);
1417 +
1418 write_lock(&md->map_lock);
1419 md->map = t;
1420 dm_table_set_restrictions(t, q);
1421 @@ -1346,7 +2025,11 @@ static int dm_wait_for_completion(struct
1422 set_current_state(TASK_INTERRUPTIBLE);
1423
1424 smp_mb();
1425 - if (!atomic_read(&md->pending))
1426 + if (dm_request_based(md)) {
1427 + if (!atomic_read(&md->pending) &&
1428 + blk_queue_stopped(md->queue))
1429 + break;
1430 + } else if (!atomic_read(&md->pending))
1431 break;
1432
1433 if (signal_pending(current)) {
1434 @@ -1369,7 +2052,13 @@ static void __flush_deferred_io(struct m
1435 struct bio *c;
1436
1437 while ((c = bio_list_pop(&md->deferred))) {
1438 - if (__split_bio(md, c))
1439 + /*
1440 + * Some bios might have been queued here during suspension,
1441 + * before the device was switched to request-based dm on resume
1442 + */
1443 + if (dm_request_based(md))
1444 + generic_make_request(c);
1445 + else if (__split_bio(md, c))
1446 bio_io_error(c);
1447 }
1448
1449 @@ -1394,9 +2083,6 @@ static void dm_wq_work(struct work_struc
1450
1451 down_write(&md->io_lock);
1452 switch (req->type) {
1453 - case DM_WQ_FLUSH_ALL:
1454 - __merge_pushback_list(md);
1455 - /* pass through */
1456 case DM_WQ_FLUSH_DEFERRED:
1457 __flush_deferred_io(md);
1458 break;
1459 @@ -1451,6 +2137,88 @@ out:
1460 return r;
1461 }
1462
1463 +static inline void dm_invalidate_flush_suspend(struct mapped_device *md)
1464 +{
1465 + md->suspend_rq.data = (void *)0x1;
1466 +}
1467 +
1468 +static void dm_abort_suspend(struct mapped_device *md, int noflush)
1469 +{
1470 + struct request_queue *q = md->queue;
1471 + unsigned long flags;
1472 +
1473 + /*
1474 + * For flush suspend, invalidation and queue restart must be protected
1475 + * by a single queue lock to prevent a race with dm_prep_fn().
1476 + */
1477 + spin_lock_irqsave(q->queue_lock, flags);
1478 + if (!noflush)
1479 + dm_invalidate_flush_suspend(md);
1480 + __start_queue(q);
1481 + spin_unlock_irqrestore(q->queue_lock, flags);
1482 +}
1483 +
1484 +/*
1485 + * Additional suspend work for request-based dm.
1486 + *
1487 + * In request-based dm, stopping the request_queue prevents mapping.
1488 + * Even after the request_queue is stopped, requests submitted by the
1489 + * upper layer can still be inserted into it. So the original (unmapped)
1490 + * requests are kept in the request_queue during suspension.
1491 + */
1492 +static void dm_start_suspend(struct mapped_device *md, int noflush)
1493 +{
1494 + struct request *rq = &md->suspend_rq;
1495 + struct request_queue *q = md->queue;
1496 + unsigned long flags;
1497 +
1498 + if (noflush) {
1499 + stop_queue(q);
1500 + return;
1501 + }
1502 +
1503 + /*
1504 + * For flush suspend, we need a marker to indicate the boundary
1505 + * between I/Os that need flushing and deferred I/Os, since all I/Os
1506 + * are queued in the request_queue during suspension.
1507 + *
1508 + * This marker must be inserted after setting DMF_BLOCK_IO,
1509 + * because dm_prep_fn() treats the absence of DMF_BLOCK_IO as
1510 + * a suspend interruption.
1511 + */
1512 + spin_lock_irqsave(q->queue_lock, flags);
1513 + if (unlikely(rq->ref_count)) {
1514 + /*
1515 + * This can happen when the previous suspend was interrupted,
1516 + * the suspend_rq inserted for that suspend is still in the queue,
1517 + * and this suspend has been invoked.
1518 + *
1519 + * We could re-insert the suspend_rq by deleting it from
1520 + * the queue forcibly using list_del_init(&rq->queuelist),
1521 + * but that would easily break the block layer.
1522 + * So we don't re-insert the suspend_rq in such a case.
1523 + * The suspend_rq should already have been invalidated during
1524 + * the previous suspend interruption, so just wait for it
1525 + * to be completed.
1526 + *
1527 + * This suspend will never complete, so warn the user to
1528 + * interrupt this suspend and retry later.
1529 + */
1530 + BUG_ON(!rq->data);
1531 + spin_unlock_irqrestore(q->queue_lock, flags);
1532 +
1533 + DMWARN("Invalidating the previous suspend is still in"
1534 + " progress. This suspend will be never done."
1535 + " Please interrupt this suspend and retry later.");
1536 + return;
1537 + }
1538 + spin_unlock_irqrestore(q->queue_lock, flags);
1539 +
1540 + /* Now no user of the suspend_rq */
1541 + blk_rq_init(q, rq);
1542 + blk_insert_request(q, rq, 0, NULL);
1543 +}
1544 +
1545 /*
1546 * Functions to lock and unlock any filesystem running on the
1547 * device.
1548 @@ -1526,7 +2294,7 @@ int dm_suspend(struct mapped_device *md,
1549 if (!md->suspended_bdev) {
1550 DMWARN("bdget failed in dm_suspend");
1551 r = -ENOMEM;
1552 - goto flush_and_out;
1553 + goto out;
1554 }
1555
1556 /*
1557 @@ -1549,6 +2317,9 @@ int dm_suspend(struct mapped_device *md,
1558 add_wait_queue(&md->wait, &wait);
1559 up_write(&md->io_lock);
1560
1561 + if (dm_request_based(md))
1562 + dm_start_suspend(md, noflush);
1563 +
1564 /* unplug */
1565 if (map)
1566 dm_table_unplug_all(map);
1567 @@ -1561,14 +2332,22 @@ int dm_suspend(struct mapped_device *md,
1568 down_write(&md->io_lock);
1569 remove_wait_queue(&md->wait, &wait);
1570
1571 - if (noflush)
1572 - __merge_pushback_list(md);
1573 + if (noflush) {
1574 + if (dm_request_based(md))
1575 + /* All requeued requests are already in md->queue */
1576 + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1577 + else
1578 + __merge_pushback_list(md);
1579 + }
1580 up_write(&md->io_lock);
1581
1582 /* were we interrupted ? */
1583 if (r < 0) {
1584 dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
1585
1586 + if (dm_request_based(md))
1587 + dm_abort_suspend(md, noflush);
1588 +
1589 unlock_fs(md);
1590 goto out; /* pushback list is already flushed, so skip flush */
1591 }
1592 @@ -1577,14 +2356,6 @@ int dm_suspend(struct mapped_device *md,
1593
1594 set_bit(DMF_SUSPENDED, &md->flags);
1595
1596 -flush_and_out:
1597 - if (r && noflush)
1598 - /*
1599 - * Because there may be already I/Os in the pushback list,
1600 - * flush them before return.
1601 - */
1602 - dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
1603 -
1604 out:
1605 if (r && md->suspended_bdev) {
1606 bdput(md->suspended_bdev);
1607 @@ -1617,6 +2388,14 @@ int dm_resume(struct mapped_device *md)
1608
1609 dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
1610
1611 + /*
1612 + * Flushing deferred I/Os must be done after targets are resumed
1613 + * so that mapping of targets can work correctly.
1614 + * Request-based dm is queueing the deferred I/Os in its request_queue.
1615 + */
1616 + if (dm_request_based(md))
1617 + start_queue(md->queue);
1618 +
1619 unlock_fs(md);
1620
1621 if (md->suspended_bdev) {
1622 @@ -1698,6 +2477,65 @@ int dm_noflush_suspending(struct dm_targ
1623 }
1624 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
1625
1626 +int dm_init_md_mempool(struct mapped_device *md, int type)
1627 +{
1628 + if (unlikely(type == DM_TYPE_NONE)) {
1629 + DMWARN("no type is specified, can't initialize mempool");
1630 + return -EINVAL;
1631 + }
1632 +
1633 + if (md->mempool_type == type)
1634 + return 0;
1635 +
1636 + if (md->map) {
1637 + /* The md is already in use, can't change the mempool type */
1638 + DMWARN("can't change mempool type after a table is bound");
1639 + return -EINVAL;
1640 + }
1641 +
1642 + /* Not using the md yet, we can still change the mempool type */
1643 + if (md->mempool_type != DM_TYPE_NONE) {
1644 + mempool_destroy(md->io_pool);
1645 + md->io_pool = NULL;
1646 + mempool_destroy(md->tio_pool);
1647 + md->tio_pool = NULL;
1648 + bioset_free(md->bs);
1649 + md->bs = NULL;
1650 + md->mempool_type = DM_TYPE_NONE;
1651 + }
1652 +
1653 + md->io_pool = (type == DM_TYPE_BIO_BASED) ?
1654 + mempool_create_slab_pool(MIN_IOS, _io_cache) :
1655 + mempool_create_slab_pool(MIN_IOS, _bio_info_cache);
1656 + if (!md->io_pool)
1657 + return -ENOMEM;
1658 +
1659 + md->tio_pool = (type == DM_TYPE_BIO_BASED) ?
1660 + mempool_create_slab_pool(MIN_IOS, _tio_cache) :
1661 + mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
1662 + if (!md->tio_pool)
1663 + goto free_io_pool_and_out;
1664 +
1665 + md->bs = (type == DM_TYPE_BIO_BASED) ?
1666 + bioset_create(16, 16) : bioset_create(MIN_IOS, MIN_IOS);
1667 + if (!md->bs)
1668 + goto free_tio_pool_and_out;
1669 +
1670 + md->mempool_type = type;
1671 +
1672 + return 0;
1673 +
1674 +free_tio_pool_and_out:
1675 + mempool_destroy(md->tio_pool);
1676 + md->tio_pool = NULL;
1677 +
1678 +free_io_pool_and_out:
1679 + mempool_destroy(md->io_pool);
1680 + md->io_pool = NULL;
1681 +
1682 + return -ENOMEM;
1683 +}
1684 +
1685 static struct block_device_operations dm_blk_dops = {
1686 .open = dm_blk_open,
1687 .release = dm_blk_close,
1688 --- a/drivers/md/dm.h
1689 +++ b/drivers/md/dm.h
1690 @@ -23,6 +23,13 @@
1691 #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
1692
1693 /*
1694 + * Type of table and mapped_device's mempool
1695 + */
1696 +#define DM_TYPE_NONE 0
1697 +#define DM_TYPE_BIO_BASED 1
1698 +#define DM_TYPE_REQUEST_BASED 2
1699 +
1700 +/*
1701 * List of devices that a metadevice uses and should open/close.
1702 */
1703 struct dm_dev {
1704 @@ -49,6 +56,10 @@ void dm_table_presuspend_targets(struct
1705 void dm_table_postsuspend_targets(struct dm_table *t);
1706 int dm_table_resume_targets(struct dm_table *t);
1707 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
1708 +int dm_table_any_busy_target(struct dm_table *t);
1709 +int dm_table_set_type(struct dm_table *t);
1710 +int dm_table_get_type(struct dm_table *t);
1711 +int dm_table_request_based(struct dm_table *t);
1712 void dm_table_unplug_all(struct dm_table *t);
1713
1714 /*
1715 @@ -97,10 +108,16 @@ void *dm_vcalloc(unsigned long nmemb, un
1716 union map_info *dm_get_mapinfo(struct bio *bio);
1717 int dm_open_count(struct mapped_device *md);
1718 int dm_lock_for_deletion(struct mapped_device *md);
1719 +union map_info *dm_get_rq_mapinfo(struct request *rq);
1720
1721 void dm_kobject_uevent(struct mapped_device *md);
1722
1723 int dm_kcopyd_init(void);
1724 void dm_kcopyd_exit(void);
1725
1726 +/*
1727 + * Mempool initializer for a mapped_device
1728 + */
1729 +int dm_init_md_mempool(struct mapped_device *md, int type);
1730 +
1731 #endif
1732 --- a/include/linux/device-mapper.h
1733 +++ b/include/linux/device-mapper.h
1734 @@ -46,6 +46,8 @@ typedef void (*dm_dtr_fn) (struct dm_tar
1735 */
1736 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
1737 union map_info *map_context);
1738 +typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
1739 + union map_info *map_context);
1740
1741 /*
1742 * Returns:
1743 @@ -58,6 +60,9 @@ typedef int (*dm_map_fn) (struct dm_targ
1744 typedef int (*dm_endio_fn) (struct dm_target *ti,
1745 struct bio *bio, int error,
1746 union map_info *map_context);
1747 +typedef int (*dm_request_endio_fn) (struct dm_target *ti,
1748 + struct request *clone, int error,
1749 + union map_info *map_context);
1750
1751 typedef void (*dm_flush_fn) (struct dm_target *ti);
1752 typedef void (*dm_presuspend_fn) (struct dm_target *ti);
1753 @@ -77,6 +82,13 @@ typedef int (*dm_ioctl_fn) (struct dm_ta
1754 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
1755 struct bio_vec *biovec, int max_size);
1756
1757 +/*
1758 + * Returns:
1759 + * 0: The target can handle the next I/O immediately.
1760 + * 1: The target can't handle the next I/O immediately.
1761 + */
1762 +typedef int (*dm_busy_fn) (struct dm_target *ti);
1763 +
1764 void dm_error(const char *message);
1765
1766 /*
1767 @@ -103,7 +115,9 @@ struct target_type {
1768 dm_ctr_fn ctr;
1769 dm_dtr_fn dtr;
1770 dm_map_fn map;
1771 + dm_map_request_fn map_rq;
1772 dm_endio_fn end_io;
1773 + dm_request_endio_fn rq_end_io;
1774 dm_flush_fn flush;
1775 dm_presuspend_fn presuspend;
1776 dm_postsuspend_fn postsuspend;
1777 @@ -113,6 +127,7 @@ struct target_type {
1778 dm_message_fn message;
1779 dm_ioctl_fn ioctl;
1780 dm_merge_fn merge;
1781 + dm_busy_fn busy;
1782 };
1783
1784 struct io_restrictions {
1785 @@ -125,6 +140,7 @@ struct io_restrictions {
1786 unsigned short max_hw_segments;
1787 unsigned short max_phys_segments;
1788 unsigned char no_cluster; /* inverted so that 0 is default */
1789 + unsigned char no_request_stacking;
1790 };
1791
1792 struct dm_target {
1793 @@ -348,4 +364,12 @@ static inline unsigned long to_bytes(sec
1794 return (n << SECTOR_SHIFT);
1795 }
1796
1797 +/*-----------------------------------------------------------------
1798 + * Helper for block layer and dm core operations
1799 + *---------------------------------------------------------------*/
1800 +void dm_dispatch_request(struct request *rq);
1801 +void dm_requeue_request(struct request *rq);
1802 +void dm_kill_request(struct request *rq, int error);
1803 +int dm_underlying_device_busy(struct request_queue *q);
1804 +
1805 #endif /* _LINUX_DEVICE_MAPPER_H */