/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/*
 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
 * dm_io into one list, and reuse bio->bi_private as the list head. Before
 * ending this fs bio, we will recover its ->bi_private.
 */
#define REQ_DM_POLL_LIST	REQ_DRV
static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

DEFINE_STATIC_KEY_FALSE(stats_enabled);
DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
DEFINE_STATIC_KEY_FALSE(zoned_enabled);
/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
	bool is_abnormal_io:1;
	bool submit_as_polled:1;
};
static inline struct dm_target_io *clone_to_tio(struct bio *clone)
{
	return container_of(clone, struct dm_target_io, clone);
}

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);

	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
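/*
 * The accessors above rely on the fixed layout used by DM core: a target's
 * per-bio data is placed immediately in front of either a struct dm_io or a
 * struct dm_target_io, with the clone bio embedded at a known offset inside
 * those structures.  A target that declared per_io_data_size typically moves
 * between the two views like this (illustrative sketch only; "my_pb_data" is
 * a made-up example type, not something defined by DM core):
 *
 *	struct my_pb_data *pb = dm_per_bio_data(bio, sizeof(*pb));
 *	struct bio *clone = dm_bio_from_per_bio_data(pb, sizeof(*pb));
 */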
#define MINOR_ALLOCED ((void *)-1)

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;

static int get_swap_bios(void)
{
	int latch = READ_ONCE(swap_bios);

	if (unlikely(latch <= 0))
		latch = DEFAULT_SWAP_BIOS;
	return latch;
}
struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r;

	r = dm_uevent_init();
	if (r)
		return r;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}
static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);
	int r, i;

#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
	       " Duplicate IMA measurements will not be recorded in the IMA log.");
#endif

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;
bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}
/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}
static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}
/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}
static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
			    struct block_device **bdev)
{
	struct dm_target *ti;
	struct dm_table *map;
	int r;

retry:
	r = -ENOTTY;
	map = dm_get_live_table(md, srcu_idx);
	if (!map || !dm_table_get_size(map))
		return r;

	/* We only support devices that have a single target */
	if (map->num_targets != 1)
		return r;

	ti = dm_table_get_target(map, 0);
	if (!ti->type->prepare_ioctl)
		return r;

	if (dm_suspended_md(md))
		return -EAGAIN;

	r = ti->type->prepare_ioctl(ti, bdev);
	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		dm_put_live_table(md, *srcu_idx);
		fsleep(10000);
		goto retry;
	}

	return r;
}

static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
{
	dm_put_live_table(md, srcu_idx);
}
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMDEBUG_LIMIT(
	"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	if (!bdev->bd_disk->fops->ioctl)
		r = -ENOTTY;
	else
		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}
u64 dm_start_time_ns_from_clone(struct bio *bio)
{
	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);

static bool bio_is_flush_with_data(struct bio *bio)
{
	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
}
static void dm_io_acct(struct dm_io *io, bool end)
{
	struct dm_stats_aux *stats_aux = &io->stats_aux;
	unsigned long start_time = io->start_time;
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;
	unsigned int sectors;

	/*
	 * If REQ_PREFLUSH set, don't account payload, it will be
	 * submitted (and accounted) after this flush completes.
	 */
	if (bio_is_flush_with_data(bio))
		sectors = 0;
	else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
		sectors = bio_sectors(bio);
	else
		sectors = io->sectors;

	if (!end)
		bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
				   start_time);
	else
		bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);

	if (static_branch_unlikely(&stats_enabled) &&
	    unlikely(dm_stats_used(&md->stats))) {
		sector_t sector;

		if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
			sector = bio->bi_iter.bi_sector;
		else
			sector = bio_end_sector(bio) - io->sector_offset;

		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    sector, sectors,
				    end, start_time, stats_aux);
	}
}
static void __dm_start_io_acct(struct dm_io *io)
{
	dm_io_acct(io, false);
}

static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
{
	/*
	 * Ensure IO accounting is only ever started once.
	 */
	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
		return;

	/* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
	if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
	} else {
		unsigned long flags;

		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
		spin_lock_irqsave(&io->lock, flags);
		if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
			spin_unlock_irqrestore(&io->lock, flags);
			return;
		}
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
		spin_unlock_irqrestore(&io->lock, flags);
	}

	__dm_start_io_acct(io);
}

static void dm_end_io_acct(struct dm_io *io)
{
	dm_io_acct(io, true);
}
static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
	tio = clone_to_tio(clone);
	tio->flags = 0;
	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = BLK_STS_OK;

	/* one ref is for submission, the other is for completion */
	atomic_set(&io->io_count, 2);
	this_cpu_inc(*md->pending_io);
	io->orig_bio = bio;
	io->md = md;
	spin_lock_init(&io->lock);
	io->start_time = jiffies;
	io->flags = 0;

	if (static_branch_unlikely(&stats_enabled))
		dm_stats_record_start(&md->stats, &io->stats_aux);

	return io;
}

static void free_io(struct dm_io *io)
{
	bio_put(&io->tio.clone);
}
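/*
 * Note on lifetimes: the first dm_target_io (and its clone bio) is embedded
 * inside struct dm_io, which is why free_io() only needs to drop the embedded
 * clone; any extra clones created by alloc_tio() below are released separately
 * via free_tio().  The io_count of 2 set in alloc_io() is one reference for
 * submission and one for completion, as the inline comment above notes.
 */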
static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
{
	struct mapped_device *md = ci->io->md;
	struct dm_target_io *tio;
	struct bio *clone;

	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
		/* alloc_io() already initialized embedded clone */
		clone = &tio->clone;
	} else {
		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
					&md->mempools->bs);
		if (!clone)
			return NULL;

		/* REQ_DM_POLL_LIST shouldn't be inherited */
		clone->bi_opf &= ~REQ_DM_POLL_LIST;

		tio = clone_to_tio(clone);
		tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
	}

	tio->magic = DM_TIO_MAGIC;
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;
	tio->len_ptr = len;
	tio->old_sector = 0;

	/* Set default bdev, but target must bio_set_dev() before issuing IO */
	clone->bi_bdev = md->disk->part0;
	if (unlikely(ti->needs_bio_set_dev))
		bio_set_dev(clone, md->disk->part0);

	if (len) {
		clone->bi_iter.bi_size = to_bytes(*len);
		if (bio_integrity(clone))
			bio_integrity_trim(clone);
	}

	return clone;
}

static void free_tio(struct bio *clone)
{
	if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
		return;
	bio_put(clone);
}
/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md,
				   int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md,
		       int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}
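/*
 * Typical use of the live-table accessors above (an illustrative sketch, not a
 * copy of any one caller in this file): readers bracket access with
 * dm_get_live_table()/dm_put_live_table() and must not keep the table pointer
 * afterwards, e.g.:
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map)
 *		... look up a target and issue I/O against it ...
 *	dm_put_live_table(md, srcu_idx);
 *
 * A writer publishes a new table with rcu_assign_pointer() and then calls
 * dm_sync_table() to wait for all such readers to drain.
 */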
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
						     int *srcu_idx, blk_opf_t bio_opf)
{
	if (bio_opf & REQ_NOWAIT)
		return dm_get_live_table_fast(md);
	else
		return dm_get_live_table(md, srcu_idx);
}

static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
					 blk_opf_t bio_opf)
{
	if (bio_opf & REQ_NOWAIT)
		dm_put_live_table_fast(md);
	else
		dm_put_live_table(md, srcu_idx);
}
static char *_dm_claim_ptr = "I belong to device-mapper";

/*
 * Open a table device so we can use it as a map destination.
 */
static struct table_device *open_table_device(struct mapped_device *md,
		dev_t dev, fmode_t mode)
{
	struct table_device *td;
	struct block_device *bdev;
	u64 part_off;
	int r;

	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
	if (!td)
		return ERR_PTR(-ENOMEM);
	refcount_set(&td->count, 1);

	bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev)) {
		r = PTR_ERR(bdev);
		goto out_free_td;
	}

	/*
	 * We can be called before the dm disk is added.  In that case we can't
	 * register the holder relation here.  It will be done once add_disk was
	 * called.
	 */
	if (md->disk->slave_dir) {
		r = bd_link_disk_holder(bdev, md->disk);
		if (r)
			goto out_blkdev_put;
	}

	td->dm_dev.mode = mode;
	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
	format_dev_t(td->dm_dev.name, dev);
	list_add(&td->list, &md->table_devices);
	return td;

out_blkdev_put:
	blkdev_put(bdev, mode | FMODE_EXCL);
out_free_td:
	kfree(td);
	return ERR_PTR(r);
}
/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (md->disk->slave_dir)
		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	list_del(&td->list);
	kfree(td);
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode)
{
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result)
{
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = open_table_device(md, dev, mode);
		if (IS_ERR(td)) {
			mutex_unlock(&md->table_devices_lock);
			return PTR_ERR(td);
		}
	} else {
		refcount_inc(&td->count);
	}
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}
void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (refcount_dec_and_test(&td->count))
		close_table_device(td, md);
	mutex_unlock(&md->table_devices_lock);
}
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMERR("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
{
	struct mapped_device *md = io->md;

	if (first_stage) {
		struct dm_io *next = md->requeue_list;

		md->requeue_list = io;
		io->next = next;
	} else {
		bio_list_add_head(&md->deferred, io->orig_bio);
	}
}

static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
{
	if (first_stage)
		queue_work(md->wq, &md->requeue_work);
	else
		queue_work(md->wq, &md->work);
}
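/*
 * Routing summary for the two helpers above: a first-stage requeue chains the
 * dm_io onto md->requeue_list (via io->next) and is serviced by
 * dm_wq_requeue_work(), while a non-first-stage requeue pushes the original
 * bio back onto md->deferred so dm_wq_work() resubmits it later.
 */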
/*
 * Return true if the dm_io's original bio is requeued.
 * io->status is updated with error if requeue disallowed.
 */
static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
{
	struct bio *bio = io->orig_bio;
	bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
	bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
				     (bio->bi_opf & REQ_POLLED));
	struct mapped_device *md = io->md;
	bool requeued = false;

	if (handle_requeue || handle_polled_eagain) {
		unsigned long flags;

		if (bio->bi_opf & REQ_POLLED) {
			/*
			 * Upper layer won't help us poll split bio
			 * (io->orig_bio may only reflect a subset of the
			 * pre-split original) so clear REQ_POLLED.
			 */
			bio_clear_polled(bio);
		}

		/*
		 * Target requested pushing back the I/O or
		 * polled IO hit BLK_STS_AGAIN.
		 */
		spin_lock_irqsave(&md->deferred_lock, flags);
		if ((__noflush_suspending(md) &&
		     !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
		    handle_polled_eagain || first_stage) {
			dm_requeue_add_io(io, first_stage);
			requeued = true;
		} else {
			/*
			 * noflush suspend was interrupted or this is
			 * a write to a zoned target.
			 */
			io->status = BLK_STS_IOERR;
		}
		spin_unlock_irqrestore(&md->deferred_lock, flags);
	}

	if (requeued)
		dm_kick_requeue(md, first_stage);

	return requeued;
}
static void __dm_io_complete(struct dm_io *io, bool first_stage)
{
	struct bio *bio = io->orig_bio;
	struct mapped_device *md = io->md;
	blk_status_t io_error;
	bool requeued;

	requeued = dm_handle_requeue(io, first_stage);
	if (requeued && first_stage)
		return;

	io_error = io->status;
	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
		dm_end_io_acct(io);
	else if (!io_error) {
		/*
		 * Must handle target that DM_MAPIO_SUBMITTED only to
		 * then bio_endio() rather than dm_submit_bio_remap()
		 */
		__dm_start_io_acct(io);
		dm_end_io_acct(io);
	}
	free_io(io);
	smp_wmb();
	this_cpu_dec(*md->pending_io);

	/* nudge anyone waiting on suspend queue */
	if (unlikely(wq_has_sleeper(&md->wait)))
		wake_up(&md->wait);

	/* Return early if the original bio was requeued */
	if (requeued)
		return;

	if (bio_is_flush_with_data(bio)) {
		/*
		 * Preflush done for flush with data, reissue
		 * without REQ_PREFLUSH.
		 */
		bio->bi_opf &= ~REQ_PREFLUSH;
		queue_io(md, bio);
	} else {
		/* done with normal IO or empty flush */
		if (io_error)
			bio->bi_status = io_error;
		bio_endio(bio);
	}
}
static void dm_wq_requeue_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						requeue_work);
	unsigned long flags;
	struct dm_io *io;

	/* reuse deferred lock to simplify dm_handle_requeue */
	spin_lock_irqsave(&md->deferred_lock, flags);
	io = md->requeue_list;
	md->requeue_list = NULL;
	spin_unlock_irqrestore(&md->deferred_lock, flags);

	while (io) {
		struct dm_io *next = io->next;

		dm_io_rewind(io, &md->disk->bio_split);

		io->next = NULL;
		__dm_io_complete(io, false);
		io = next;
	}
}
/*
 * Two staged requeue:
 *
 * 1) io->orig_bio points to the real original bio, and the part mapped to
 *    this io must be requeued, instead of other parts of the original bio.
 *
 * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
 */
static void dm_io_complete(struct dm_io *io)
{
	bool first_requeue;

	/*
	 * Only dm_io that has been split needs two stage requeue, otherwise
	 * we may run into long bio clone chain during suspend and OOM could
	 * be triggered.
	 *
	 * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they
	 * also aren't handled via the first stage requeue.
	 */
	if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
		first_requeue = true;
	else
		first_requeue = false;

	__dm_io_complete(io, first_requeue);
}
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
static inline void __dm_io_dec_pending(struct dm_io *io)
{
	if (atomic_dec_and_test(&io->io_count))
		dm_io_complete(io);
}

static void dm_io_set_error(struct dm_io *io, blk_status_t error)
{
	unsigned long flags;

	/* Push-back supersedes any I/O errors */
	spin_lock_irqsave(&io->lock, flags);
	if (!(io->status == BLK_STS_DM_REQUEUE &&
	      __noflush_suspending(io->md))) {
		io->status = error;
	}
	spin_unlock_irqrestore(&io->lock, flags);
}

static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
	if (unlikely(error))
		dm_io_set_error(io, error);

	__dm_io_dec_pending(io);
}
void disable_discard(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support DISCARD, disable it */
	limits->max_discard_sectors = 0;
}

void disable_write_zeroes(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE ZEROES, disable it */
	limits->max_write_zeroes_sectors = 0;
}

static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
}
static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = clone_to_tio(bio);
	struct dm_target *ti = tio->ti;
	dm_endio_fn endio = ti->type->end_io;
	struct dm_io *io = tio->io;
	struct mapped_device *md = io->md;

	if (unlikely(error == BLK_STS_TARGET)) {
		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !bdev_max_discard_sectors(bio->bi_bdev))
			disable_discard(md);
		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
			 !bdev_write_zeroes_sectors(bio->bi_bdev))
			disable_write_zeroes(md);
	}

	if (static_branch_unlikely(&zoned_enabled) &&
	    unlikely(bdev_is_zoned(bio->bi_bdev)))
		dm_zone_endio(io, bio);

	if (endio) {
		int r = endio(ti, bio, &error);
		switch (r) {
		case DM_ENDIO_REQUEUE:
			if (static_branch_unlikely(&zoned_enabled)) {
				/*
				 * Requeuing writes to a sequential zone of a zoned
				 * target will break the sequential write pattern:
				 * fail such IO.
				 */
				if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
					error = BLK_STS_IOERR;
				else
					error = BLK_STS_DM_REQUEUE;
			} else
				error = BLK_STS_DM_REQUEUE;
			fallthrough;
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
			return;
		default:
			DMCRIT("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (static_branch_unlikely(&swap_bios_enabled) &&
	    unlikely(swap_bios_limit(ti, bio)))
		up(&md->swap_bios_semaphore);

	free_tio(bio);
	dm_io_dec_pending(io, error);
}
/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
						  sector_t target_offset)
{
	return ti->len - target_offset;
}

static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
	sector_t target_offset = dm_target_offset(ti, sector);
	sector_t len = max_io_len_target_boundary(ti, target_offset);

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   bio_split_to_limits() isn't possible here.
	 */
	if (!ti->max_io_len)
		return len;
	return min_t(sector_t, len,
		     min(queue_max_sectors(ti->table->md->queue),
			 blk_chunk_sectors_left(target_offset, ti->max_io_len)));
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
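/*
 * In short, max_io_len() clamps an I/O to the smallest of: the remaining
 * length of the target (max_io_len_target_boundary()), the queue's
 * max_sectors, and the distance to the next ti->max_io_len boundary as
 * computed by blk_chunk_sectors_left().  Targets opt in to the last limit
 * with dm_set_target_max_io_len(); for example, a striped target can pass
 * its chunk size so that no bio ever straddles a chunk boundary.
 */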
static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
	__acquires(md->io_barrier)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!ti)
		return NULL;

	return ti;
}
static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
		long nr_pages, enum dax_access_mode mode, void **kaddr,
		pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(ti, sector) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);

 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}
static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
				  size_t nr_pages)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (WARN_ON(!ti->type->dax_zero_page_range)) {
		/*
		 * ->zero_page_range() is mandatory dax operation. If we are
		 *  here, something is wrong.
		 */
		goto out;
	}
	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}
static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int srcu_idx;
	long ret = 0;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);
	if (!ti || !ti->type->dax_recovery_write)
		goto out;

	ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
 * __send_duplicate_bios().
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <----- bio_sectors ----->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = clone_to_tio(bio);
	struct dm_io *io = tio->io;
	unsigned bio_sectors = bio_sectors(bio);

	BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
	BUG_ON(bio_sectors > *tio->len_ptr);
	BUG_ON(n_sectors > bio_sectors);

	*tio->len_ptr -= bio_sectors - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;

	/*
	 * __split_and_process_bio() may have already saved mapped part
	 * for accounting but it is being reduced so update accordingly.
	 */
	dm_io_set_flag(io, DM_IO_WAS_SPLIT);
	io->sectors = n_sectors;
	io->sector_offset = bio_sectors(io->orig_bio);
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
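/*
 * Illustrative use from a target's .map hook (sketch only; MY_MAX_SECTORS is
 * a made-up per-target limit, not something defined by DM core):
 *
 *	static int my_target_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		if (bio_sectors(bio) > MY_MAX_SECTORS)
 *			dm_accept_partial_bio(bio, MY_MAX_SECTORS);
 *		remap the (now shortened) bio and return DM_MAPIO_REMAPPED;
 *		DM core resubmits the untouched remainder as a new bio.
 *	}
 */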
/*
 * @clone: clone bio that DM core passed to target's .map function
 * @tgt_clone: clone of @clone bio that target needs submitted
 *
 * Targets should use this interface to submit bios they take
 * ownership of when returning DM_MAPIO_SUBMITTED.
 *
 * Target should also enable ti->accounts_remapped_io
 */
void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
{
	struct dm_target_io *tio = clone_to_tio(clone);
	struct dm_io *io = tio->io;

	/* establish bio that will get submitted */
	if (!tgt_clone)
		tgt_clone = clone;

	/*
	 * Account io->origin_bio to DM dev on behalf of target
	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
	 */
	dm_start_io_acct(io, clone);

	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
			      tio->old_sector);
	submit_bio_noacct(tgt_clone);
}
EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
{
	mutex_lock(&md->swap_bios_lock);
	while (latch < md->swap_bios) {
		cond_resched();
		down(&md->swap_bios_semaphore);
		md->swap_bios--;
	}
	while (latch > md->swap_bios) {
		cond_resched();
		up(&md->swap_bios_semaphore);
		md->swap_bios++;
	}
	mutex_unlock(&md->swap_bios_lock);
}
static void __map_bio(struct bio *clone)
{
	struct dm_target_io *tio = clone_to_tio(clone);
	struct dm_target *ti = tio->ti;
	struct dm_io *io = tio->io;
	struct mapped_device *md = io->md;
	int r;

	clone->bi_end_io = clone_endio;

	/*
	 * Map the clone.
	 */
	tio->old_sector = clone->bi_iter.bi_sector;

	if (static_branch_unlikely(&swap_bios_enabled) &&
	    unlikely(swap_bios_limit(ti, clone))) {
		int latch = get_swap_bios();
		if (unlikely(latch != md->swap_bios))
			__set_swap_bios_limit(md, latch);
		down(&md->swap_bios_semaphore);
	}

	if (static_branch_unlikely(&zoned_enabled)) {
		/*
		 * Check if the IO needs a special mapping due to zone append
		 * emulation on zoned target. In this case, dm_zone_map_bio()
		 * calls the target map operation.
		 */
		if (unlikely(dm_emulate_zone_append(md)))
			r = dm_zone_map_bio(tio);
		else
			r = ti->type->map(ti, clone);
	} else
		r = ti->type->map(ti, clone);

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* target has assumed ownership of this io */
		if (!ti->accounts_remapped_io)
			dm_start_io_acct(io, clone);
		break;
	case DM_MAPIO_REMAPPED:
		dm_submit_bio_remap(clone, NULL);
		break;
	case DM_MAPIO_KILL:
	case DM_MAPIO_REQUEUE:
		if (static_branch_unlikely(&swap_bios_enabled) &&
		    unlikely(swap_bios_limit(ti, clone)))
			up(&md->swap_bios_semaphore);
		free_tio(clone);
		if (r == DM_MAPIO_KILL)
			dm_io_dec_pending(io, BLK_STS_IOERR);
		else
			dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
		break;
	default:
		DMCRIT("unimplemented target map return value: %d", r);
		BUG();
	}
}
static void setup_split_accounting(struct clone_info *ci, unsigned len)
{
	struct dm_io *io = ci->io;

	if (ci->sector_count > len) {
		/*
		 * Split needed, save the mapped part for accounting.
		 * NOTE: dm_accept_partial_bio() will update accordingly.
		 */
		dm_io_set_flag(io, DM_IO_WAS_SPLIT);
		io->sectors = len;
		io->sector_offset = bio_sectors(ci->bio);
	}
}
static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
				struct dm_target *ti, unsigned num_bios)
{
	struct bio *bio;
	int try;

	for (try = 0; try < 2; try++) {
		int bio_nr;

		if (try)
			mutex_lock(&ci->io->md->table_devices_lock);
		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
			bio = alloc_tio(ci, ti, bio_nr, NULL,
					try ? GFP_NOIO : GFP_NOWAIT);
			if (!bio)
				break;

			bio_list_add(blist, bio);
		}
		if (try)
			mutex_unlock(&ci->io->md->table_devices_lock);
		if (bio_nr == num_bios)
			return;

		while ((bio = bio_list_pop(blist)))
			free_tio(bio);
	}
}
static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				 unsigned int num_bios, unsigned *len)
{
	struct bio_list blist = BIO_EMPTY_LIST;
	struct bio *clone;
	unsigned int ret = 0;

	switch (num_bios) {
	case 0:
		break;
	case 1:
		if (len)
			setup_split_accounting(ci, *len);
		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
		__map_bio(clone);
		ret = 1;
		break;
	default:
		/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
		alloc_multiple_bios(&blist, ci, ti, num_bios);
		while ((clone = bio_list_pop(&blist))) {
			dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
			__map_bio(clone);
			ret++;
		}
		break;
	}

	return ret;
}
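/*
 * __send_duplicate_bios() returns how many clones were actually issued, which
 * may be fewer than num_bios.  Callers (__send_empty_flush() and
 * __send_changing_extent_only() below) pre-add num_bios to io->io_count and
 * then subtract the shortfall, so the pending count always matches the number
 * of clones that will eventually complete.
 */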
static void __send_empty_flush(struct clone_info *ci)
{
	struct dm_table *t = ci->map;
	struct bio flush_bio;

	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);

	ci->bio = &flush_bio;
	ci->sector_count = 0;
	ci->io->tio.clone.bi_iter.bi_size = 0;

	for (unsigned int i = 0; i < t->num_targets; i++) {
		unsigned int bios;
		struct dm_target *ti = dm_table_get_target(t, i);

		atomic_add(ti->num_flush_bios, &ci->io->io_count);
		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
	}

	/*
	 * alloc_io() takes one extra reference for submission, so the
	 * reference won't reach 0 without the following subtraction
	 */
	atomic_sub(1, &ci->io->io_count);

	bio_uninit(ci->bio);
}
static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
					unsigned num_bios)
{
	unsigned len;
	unsigned int bios;

	len = min_t(sector_t, ci->sector_count,
		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));

	atomic_add(num_bios, &ci->io->io_count);
	bios = __send_duplicate_bios(ci, ti, num_bios, &len);
	/*
	 * alloc_io() takes one extra reference for submission, so the
	 * reference won't reach 0 without the following (+1) subtraction
	 */
	atomic_sub(num_bios - bios + 1, &ci->io->io_count);

	ci->sector += len;
	ci->sector_count -= len;
}
static bool is_abnormal_io(struct bio *bio)
{
	enum req_op op = bio_op(bio);

	if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
		switch (op) {
		case REQ_OP_DISCARD:
		case REQ_OP_SECURE_ERASE:
		case REQ_OP_WRITE_ZEROES:
			return true;
		default:
			break;
		}
	}

	return false;
}
static blk_status_t __process_abnormal_io(struct clone_info *ci,
					  struct dm_target *ti)
{
	unsigned num_bios = 0;

	switch (bio_op(ci->bio)) {
	case REQ_OP_DISCARD:
		num_bios = ti->num_discard_bios;
		break;
	case REQ_OP_SECURE_ERASE:
		num_bios = ti->num_secure_erase_bios;
		break;
	case REQ_OP_WRITE_ZEROES:
		num_bios = ti->num_write_zeroes_bios;
		break;
	default:
		break;
	}

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
	if (unlikely(!num_bios))
		return BLK_STS_NOTSUPP;

	__send_changing_extent_only(ci, ti, num_bios);
	return BLK_STS_OK;
}
/*
 * Reuse ->bi_private as dm_io list head for storing all dm_io instances
 * associated with this bio, and this bio's bi_private needs to be
 * stored in dm_io->data before the reuse.
 *
 * bio->bi_private is owned by fs or upper layer, so block layer won't
 * touch it after splitting. Meantime it won't be changed by anyone after
 * bio is submitted. So this reuse is safe.
 */
static inline struct dm_io **dm_poll_list_head(struct bio *bio)
{
	return (struct dm_io **)&bio->bi_private;
}

static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
{
	struct dm_io **head = dm_poll_list_head(bio);

	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
		bio->bi_opf |= REQ_DM_POLL_LIST;
		/*
		 * Save .bi_private into dm_io, so that we can reuse
		 * .bi_private as dm_io list head for storing dm_io list
		 */
		io->data = bio->bi_private;

		/* tell block layer to poll for completion */
		bio->bi_cookie = ~BLK_QC_T_NONE;

		io->next = NULL;
	} else {
		/*
		 * bio recursed due to split, reuse original poll list,
		 * and save bio->bi_private too.
		 */
		io->data = (*head)->data;
		io->next = *head;
	}

	*head = io;
}
/*
 * Select the correct strategy for processing a non-flush bio.
 */
static blk_status_t __split_and_process_bio(struct clone_info *ci)
{
	struct bio *clone;
	struct dm_target *ti;
	unsigned len;

	ti = dm_table_find_target(ci->map, ci->sector);
	if (unlikely(!ti))
		return BLK_STS_IOERR;

	if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
	    unlikely(!dm_target_supports_nowait(ti->type)))
		return BLK_STS_NOTSUPP;

	if (unlikely(ci->is_abnormal_io))
		return __process_abnormal_io(ci, ti);

	/*
	 * Only support bio polling for normal IO, and the target io is
	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
	 */
	ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);

	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
	setup_split_accounting(ci, len);
	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
	__map_bio(clone);

	ci->sector += len;
	ci->sector_count -= len;

	return BLK_STS_OK;
}
static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
			    struct dm_table *map, struct bio *bio, bool is_abnormal)
{
	ci->map = map;
	ci->io = alloc_io(md, bio);
	ci->bio = bio;
	ci->is_abnormal_io = is_abnormal;
	ci->submit_as_polled = false;
	ci->sector = bio->bi_iter.bi_sector;
	ci->sector_count = bio_sectors(bio);

	/* Shouldn't happen but sector_count was being set to 0 so... */
	if (static_branch_unlikely(&zoned_enabled) &&
	    WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
		ci->sector_count = 0;
}
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static void dm_split_and_process_bio(struct mapped_device *md,
				     struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	struct dm_io *io;
	blk_status_t error = BLK_STS_OK;
	bool is_abnormal;

	is_abnormal = is_abnormal_io(bio);
	if (unlikely(is_abnormal)) {
		/*
		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
		 * otherwise associated queue_limits won't be imposed.
		 */
		bio = bio_split_to_limits(bio);
		if (!bio)
			return;
	}

	init_clone_info(&ci, md, map, bio, is_abnormal);
	io = ci.io;

	if (bio->bi_opf & REQ_PREFLUSH) {
		__send_empty_flush(&ci);
		/* dm_io_complete submits any data associated with flush */
		goto out;
	}

	error = __split_and_process_bio(&ci);
	if (error || !ci.sector_count)
		goto out;
	/*
	 * Remainder must be passed to submit_bio_noacct() so it gets handled
	 * *after* bios already submitted have been completely processed.
	 */
	bio_trim(bio, io->sectors, ci.sector_count);
	trace_block_split(bio, bio->bi_iter.bi_sector);
	bio_inc_remaining(bio);
	submit_bio_noacct(bio);
out:
	/*
	 * Drop the extra reference count for non-POLLED bio, and hold one
	 * reference for POLLED bio, which will be released in dm_poll_bio
	 *
	 * Add every dm_io instance into the dm_io list head which is stored
	 * in bio->bi_private, so that dm_poll_bio can poll them all.
	 */
	if (error || !ci.submit_as_polled) {
		/*
		 * In case of submission failure, the extra reference for
		 * submitting io isn't consumed yet
		 */
		if (error)
			atomic_dec(&io->io_count);
		dm_io_dec_pending(io, error);
	} else
		dm_queue_poll_io(bio, io);
}
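/*
 * Completion-reference summary for the function above: alloc_io() starts every
 * dm_io with io_count == 2 (submission plus completion).  For non-polled bios
 * the extra submission reference is dropped here once splitting finishes; for
 * REQ_POLLED bios it is kept and the dm_io is queued on the bio's poll list,
 * to be released later from dm_poll_bio().
 */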
static void dm_submit_bio(struct bio *bio)
{
	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
	int srcu_idx;
	struct dm_table *map;
	blk_opf_t bio_opf = bio->bi_opf;

	map = dm_get_live_table_bio(md, &srcu_idx, bio_opf);

	/* If suspended, or map not yet available, queue this IO for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
	    unlikely(!map)) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else if (bio->bi_opf & REQ_RAHEAD)
			bio_io_error(bio);
		else
			queue_io(md, bio);
		goto out;
	}

	dm_split_and_process_bio(md, map, bio);
out:
	dm_put_live_table_bio(md, srcu_idx, bio_opf);
}
static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
			  unsigned int flags)
{
	WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));

	/* don't poll if the mapped io is done */
	if (atomic_read(&io->io_count) > 1)
		bio_poll(&io->tio.clone, iob, flags);

	/* bio_poll holds the last reference */
	return atomic_read(&io->io_count) == 1;
}
static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
		       unsigned int flags)
{
	struct dm_io **head = dm_poll_list_head(bio);
	struct dm_io *list = *head;
	struct dm_io *tmp = NULL;
	struct dm_io *curr, *next;

	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
		return 0;

	WARN_ON_ONCE(!list);

	/*
	 * Restore .bi_private before possibly completing dm_io.
	 *
	 * bio_poll() is only possible once @bio has been completely
	 * submitted via submit_bio_noacct()'s depth-first submission.
	 * So there is no dm_queue_poll_io() race associated with
	 * clearing REQ_DM_POLL_LIST here.
	 */
	bio->bi_opf &= ~REQ_DM_POLL_LIST;
	bio->bi_private = list->data;

	for (curr = list, next = curr->next; curr; curr = next, next =
			curr ? curr->next : NULL) {
		if (dm_poll_dm_io(curr, iob, flags)) {
			/*
			 * clone_endio() has already occurred, so no
			 * error handling is needed here.
			 */
			__dm_io_dec_pending(curr);
		} else {
			curr->next = tmp;
			tmp = curr;
		}
	}

	/* Not done? */
	if (tmp) {
		bio->bi_opf |= REQ_DM_POLL_LIST;
		/* Reset bio->bi_private to dm_io list head */
		*head = tmp;
		return 0;
	}
	return 1;
}
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;
static const struct block_device_operations dm_rq_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
static void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
	dm_destroy_crypto_profile(q->crypto_profile);
}

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	dm_free_md_mempools(md->mempools);

	if (md->dax_dev) {
		dax_remove_host(md->disk);
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	dm_cleanup_zoned_dev(md);
	if (md->disk) {
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		if (dm_get_md_type(md) != DM_TYPE_NONE) {
			struct table_device *td;

			list_for_each_entry(td, &md->table_devices, list) {
				bd_unlink_disk_holder(td->dm_dev.bdev,
						      md->disk);
			}

			/*
			 * Hold lock to make sure del_gendisk() won't concurrent
			 * with open/close_table_device().
			 */
			mutex_lock(&md->table_devices_lock);
			del_gendisk(md->disk);
			mutex_unlock(&md->table_devices_lock);
		}
		dm_queue_destroy_crypto_profile(md->queue);
		put_disk(md->disk);
	}

	if (md->pending_io) {
		free_percpu(md->pending_io);
		md->pending_io = NULL;
	}

	cleanup_srcu_struct(&md->io_barrier);

	mutex_destroy(&md->suspend_lock);
	mutex_destroy(&md->type_lock);
	mutex_destroy(&md->table_devices_lock);
	mutex_destroy(&md->swap_bios_lock);

	dm_mq_cleanup_mapped_device(md);
}
/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct mapped_device *md;
	void *old_md;

	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMERR("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
	md->disk = blk_alloc_disk(md->numa_node_id);
	if (!md->disk)
		goto bad;
	md->queue = md->disk->queue;

	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);

	md->requeue_list = NULL;
	md->swap_bios = get_swap_bios();
	sema_init(&md->swap_bios_semaphore, md->swap_bios);
	mutex_init(&md->swap_bios_lock);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->minors = 1;
	md->disk->flags |= GENHD_FL_NO_PART;
	md->disk->fops = &dm_blk_dops;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	if (IS_ENABLED(CONFIG_FS_DAX)) {
		md->dax_dev = alloc_dax(md, &dm_dax_ops);
		if (IS_ERR(md->dax_dev)) {
			md->dax_dev = NULL;
			goto bad;
		}
		set_dax_nocache(md->dax_dev);
		set_dax_nomc(md->dax_dev);
		if (dax_add_host(md->dax_dev, md->disk))
			goto bad;
	}

	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
	if (!md->wq)
		goto bad;

	md->pending_io = alloc_percpu(unsigned long);
	if (!md->pending_io)
		goto bad;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kvfree(md);
	return NULL;
}
static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	WARN_ON_ONCE(!list_empty(&md->table_devices));
	dm_stats_cleanup(&md->stats);

	free_minor(minor);
	module_put(THIS_MODULE);
	kvfree(md);
}
/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
	dm_issue_global_event();
}
/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	sector_t size;
	int ret;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	set_capacity(md->disk, size);

	dm_table_event_callback(t, event_callback, md);

	if (dm_table_request_based(t)) {
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons - used to optimize dm_mq_queue_rq.
		 */
		md->immutable_target = dm_table_get_immutable_target(t);

		/*
		 * There is no need to reload with request-based dm because the
		 * size of front_pad doesn't change.
		 *
		 * Note for future: If you are to reload bioset, prep-ed
		 * requests in the queue may refer to bio from the old bioset,
		 * so you must walk through the queue to unprep.
		 */
		if (!md->mempools) {
			md->mempools = t->mempools;
			t->mempools = NULL;
		}
	} else {
		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */
		dm_free_md_mempools(md->mempools);
		md->mempools = t->mempools;
		t->mempools = NULL;
	}

	ret = dm_table_set_restrictions(t, md->queue, limits);
	if (ret) {
		old_map = ERR_PTR(ret);
		goto out;
	}

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	if (old_map)
		dm_sync_table(md);
out:
	return old_map;
}
2228 static struct dm_table
*__unbind(struct mapped_device
*md
)
2230 struct dm_table
*map
= rcu_dereference_protected(md
->map
, 1);
2235 dm_table_event_callback(map
, NULL
, NULL
);
2236 RCU_INIT_POINTER(md
->map
, NULL
);
2243 * Constructor for a new device.
2245 int dm_create(int minor
, struct mapped_device
**result
)
2247 struct mapped_device
*md
;
2249 md
= alloc_dev(minor
);
2253 dm_ima_reset_data(md
);
2260 * Functions to manage md->type.
2261 * All are required to hold md->type_lock.
2263 void dm_lock_md_type(struct mapped_device
*md
)
2265 mutex_lock(&md
->type_lock
);
2268 void dm_unlock_md_type(struct mapped_device
*md
)
2270 mutex_unlock(&md
->type_lock
);
2273 void dm_set_md_type(struct mapped_device
*md
, enum dm_queue_mode type
)
2275 BUG_ON(!mutex_is_locked(&md
->type_lock
));
2279 enum dm_queue_mode
dm_get_md_type(struct mapped_device
*md
)
2284 struct target_type
*dm_get_immutable_target_type(struct mapped_device
*md
)
2286 return md
->immutable_target_type
;
2290 * The queue_limits are only valid as long as you have a reference
2293 struct queue_limits
*dm_get_queue_limits(struct mapped_device
*md
)
2295 BUG_ON(!atomic_read(&md
->holders
));
2296 return &md
->queue
->limits
;
2298 EXPORT_SYMBOL_GPL(dm_get_queue_limits
);
2301 * Setup the DM device's queue based on md's type
2303 int dm_setup_md_queue(struct mapped_device
*md
, struct dm_table
*t
)
2305 enum dm_queue_mode type
= dm_table_get_type(t
);
2306 struct queue_limits limits
;
2307 struct table_device
*td
;
2311 case DM_TYPE_REQUEST_BASED
:
2312 md
->disk
->fops
= &dm_rq_blk_dops
;
2313 r
= dm_mq_init_request_queue(md
, t
);
2315 DMERR("Cannot initialize queue for request-based dm mapped device");
2319 case DM_TYPE_BIO_BASED
:
2320 case DM_TYPE_DAX_BIO_BASED
:
2327 r
= dm_calculate_queue_limits(t
, &limits
);
2329 DMERR("Cannot calculate initial queue limits");
2332 r
= dm_table_set_restrictions(t
, md
->queue
, &limits
);
2337 * Hold lock to make sure add_disk() and del_gendisk() won't concurrent
2338 * with open_table_device() and close_table_device().
2340 mutex_lock(&md
->table_devices_lock
);
2341 r
= add_disk(md
->disk
);
2342 mutex_unlock(&md
->table_devices_lock
);
2347 * Register the holder relationship for devices added before the disk
2350 list_for_each_entry(td
, &md
->table_devices
, list
) {
2351 r
= bd_link_disk_holder(td
->dm_dev
.bdev
, md
->disk
);
2353 goto out_undo_holders
;
2356 r
= dm_sysfs_init(md
);
2358 goto out_undo_holders
;
2364 list_for_each_entry_continue_reverse(td
, &md
->table_devices
, list
)
2365 bd_unlink_disk_holder(td
->dm_dev
.bdev
, md
->disk
);
2366 mutex_lock(&md
->table_devices_lock
);
2367 del_gendisk(md
->disk
);
2368 mutex_unlock(&md
->table_devices_lock
);
struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);
*md
)
2398 return md
->interface_ptr
;
2401 void dm_set_mdptr(struct mapped_device
*md
, void *ptr
)
2403 md
->interface_ptr
= ptr
;
2406 void dm_get(struct mapped_device
*md
)
2408 atomic_inc(&md
->holders
);
2409 BUG_ON(test_bit(DMF_FREEING
, &md
->flags
));
2412 int dm_hold(struct mapped_device
*md
)
2414 spin_lock(&_minor_lock
);
2415 if (test_bit(DMF_FREEING
, &md
->flags
)) {
2416 spin_unlock(&_minor_lock
);
2420 spin_unlock(&_minor_lock
);
2423 EXPORT_SYMBOL_GPL(dm_hold
);
const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_mark_disk_dead(md->disk);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		set_bit(DMF_SUSPENDED, &md->flags);
		set_bit(DMF_POST_SUSPENDING, &md->flags);
		dm_table_postsuspend_targets(map);
	}
	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
static bool dm_in_flight_bios(struct mapped_device *md)
{
	int cpu;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(md->pending_io, cpu);

	return sum != 0;
}

static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (true) {
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!dm_in_flight_bios(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	smp_rmb();

	return r;
}

static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
{
	int r = 0;

	if (!queue_is_mq(md->queue))
		return dm_wait_for_bios_completion(md, task_state);

	while (true) {
		if (!blk_mq_queue_inflight(md->queue))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		msleep(5);
	}

	return r;
}
/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device, work);
	struct bio *bio;

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		bio = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!bio)
			break;

		submit_bio_noacct(bio);
	}
}
2572 static void dm_queue_flush(struct mapped_device
*md
)
2574 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND
, &md
->flags
);
2575 smp_mb__after_atomic();
2576 queue_work(md
->wq
, &md
->work
);
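/*
 * dm_wq_work() drains md->deferred only while DMF_BLOCK_IO_FOR_SUSPEND is
 * clear; dm_queue_flush() clears the flag (with a barrier after the atomic
 * op) and re-kicks the workqueue so deferred bios are resubmitted on resume.
 */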
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);
	dm_issue_global_event();

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(test_bit(DMF_FROZEN, &md->flags));

	r = freeze_bdev(md->disk->part0);
	if (!r)
		set_bit(DMF_FROZEN, &md->flags);
	return r;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;
	thaw_bdev(md->disk->part0);
	clear_bit(DMF_FROZEN, &md->flags);
}
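/*
 * lock_fs()/unlock_fs() bracket a filesystem freeze via
 * freeze_bdev()/thaw_bdev(); DMF_FROZEN records whether the freeze
 * succeeded so that unlock_fs() only thaws what was actually frozen.
 */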
/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, unsigned int task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		DMDEBUG("%s: suspending with flush", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * dm_split_and_process_bio from dm_submit_bio.
	 *
	 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
	 * we take the write lock. To prevent any process from reentering
	 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
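/*
 * Note: dm_suspend() and dm_resume() implement the userspace-driven suspend
 * cycle (typically reached through the device-mapper ioctl interface); the
 * internal suspend/resume helpers further down are the kernel-only
 * counterpart.
 */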
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}
/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nest suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
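/*
 * Note: the _fast variants are asymmetric with respect to locking:
 * dm_internal_suspend_fast() returns with md->suspend_lock held and
 * dm_internal_resume_fast() releases it, so the two must always be used
 * as a pair around the critical section.
 */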
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie, bool need_resize_uevent)
{
	int r;
	unsigned noio_flag;
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[3] = { NULL, NULL, NULL };
	char **envpp = envp;

	if (cookie) {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		*envpp++ = udev_cookie;
	}
	if (need_resize_uevent) {
		*envpp++ = "RESIZE=1";
	}

	noio_flag = memalloc_noio_save();

	r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);

	memalloc_noio_restore(noio_flag);

	return r;
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static int dm_post_suspending_md(struct mapped_device *md)
{
	return test_bit(DMF_POST_SUSPENDING, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_post_suspending(struct dm_target *ti)
{
	return dm_post_suspending_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_post_suspending);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}
struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	abort;
	bool	fail_early;
	int	ret;
	enum pr_type type;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      struct dm_pr *pr)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (table->num_targets != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	if (dm_suspended_md(md)) {
		ret = -EAGAIN;
		goto out;
	}

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ti->type->iterate_devices(ti, fn, pr);
	ret = 0;
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
	int ret;

	if (!ops || !ops->pr_register) {
		pr->ret = -EOPNOTSUPP;
		return -1;
	}

	ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
	if (!ret)
		return 0;

	if (!pr->ret)
		pr->ret = ret;

	if (pr->fail_early)
		return -1;

	return 0;
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret) {
		/* Didn't even get to register a path */
		return ret;
	}

	if (!pr.ret)
		return 0;
	ret = pr.ret;

	if (!new_key)
		return ret;

	/* unregister all paths if we failed to register any path */
	pr.old_key = new_key;
	pr.new_key = 0;
	pr.flags = 0;
	pr.fail_early = false;
	(void) dm_call_pr(bdev, __dm_pr_register, &pr);
	return ret;
}
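/*
 * Note: __dm_pr_register() honours pr->fail_early by returning non-zero from
 * the iterate_devices callback, which stops the walk at the first failing
 * path; dm_pr_register() then re-walks all paths with fail_early cleared to
 * unregister the new key so the paths stay consistent.
 */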
static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev,
			   sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_reserve) {
		pr->ret = -EOPNOTSUPP;
		return -1;
	}

	pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags);
	if (!pr->ret)
		return -1;

	return 0;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct dm_pr pr = {
		.old_key	= key,
		.flags		= flags,
		.type		= type,
		.fail_early	= false,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_reserve, &pr);
	if (ret)
		return ret;

	return pr.ret;
}
/*
 * If there is a non-All Registrants type of reservation, the release must be
 * sent down the holding path. For the cases where there is no reservation or
 * the path is not the holder the device will also return success, so we must
 * try each path to make sure we got the correct path.
 */
static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev,
			   sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_release) {
		pr->ret = -EOPNOTSUPP;
		return -1;
	}

	pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type);
	if (pr->ret)
		return -1;

	return 0;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct dm_pr pr = {
		.old_key	= key,
		.type		= type,
		.fail_early	= false,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_release, &pr);
	if (ret)
		return ret;

	return pr.ret;
}
static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev,
			   sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_preempt) {
		pr->ret = -EOPNOTSUPP;
		return -1;
	}

	pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type,
				  pr->abort);
	if (!pr->ret)
		return -1;

	return 0;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct dm_pr pr = {
		.new_key	= new_key,
		.old_key	= old_key,
		.type		= type,
		.abort		= abort,
		.fail_early	= false,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_preempt, &pr);
	if (ret)
		return ret;

	return pr.ret;
}
static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}
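/*
 * Note: unlike the other PR operations, dm_pr_clear() does not use
 * dm_call_pr(); it goes through dm_prepare_ioctl() and therefore only
 * issues the clear to the single underlying device of a single-target map.
 */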
static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.poll_bio = dm_poll_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct block_device_operations dm_rq_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.zero_page_range = dm_dax_zero_page_range,
	.recovery_write = dm_dax_recovery_write,
};
/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");