// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - block device which IO is handled from userspace
 *
 * Take full use of io_uring passthrough command for communicating with
 * ublk userspace daemon(ublksrvd) for handling basic IO request.
 *
 * Copyright 2022 Ming Lei <ming.lei@redhat.com>
 *
 * (part of code stolen from loop.c)
 */
12 #include <linux/module.h>
13 #include <linux/moduleparam.h>
14 #include <linux/sched.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/major.h>
21 #include <linux/wait.h>
22 #include <linux/blkdev.h>
23 #include <linux/init.h>
24 #include <linux/swap.h>
25 #include <linux/slab.h>
26 #include <linux/compat.h>
27 #include <linux/mutex.h>
28 #include <linux/writeback.h>
29 #include <linux/completion.h>
30 #include <linux/highmem.h>
31 #include <linux/sysfs.h>
32 #include <linux/miscdevice.h>
33 #include <linux/falloc.h>
34 #include <linux/uio.h>
35 #include <linux/ioprio.h>
36 #include <linux/sched/mm.h>
37 #include <linux/uaccess.h>
38 #include <linux/cdev.h>
39 #include <linux/io_uring.h>
40 #include <linux/blk-mq.h>
41 #include <linux/delay.h>
44 #include <linux/task_work.h>
45 #include <uapi/linux/ublk_cmd.h>
47 #define UBLK_MINORS (1U << MINORBITS)
49 /* All UBLK_F_* have to be included into UBLK_F_ALL */
50 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
51 | UBLK_F_URING_CMD_COMP_IN_TASK \
52 | UBLK_F_NEED_GET_DATA \
53 | UBLK_F_USER_RECOVERY \
54 | UBLK_F_USER_RECOVERY_REISSUE)
56 /* All UBLK_PARAM_TYPE_* should be included here */
57 #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
61 struct callback_head work
;
62 struct llist_node node
;
66 struct ublk_uring_cmd_pdu
{
67 struct ublk_queue
*ubq
;
71 * io command is active: sqe cmd is received, and its cqe isn't done
73 * If the flag is set, the io command is owned by ublk driver, and waited
74 * for incoming blk-mq request from the ublk block device.
76 * If the flag is cleared, the io command will be completed, and owned by
79 #define UBLK_IO_FLAG_ACTIVE 0x01
82 * IO command is completed via cqe, and it is being handled by ublksrv, and
85 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
88 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
91 * IO command is aborted, so this flag is set in case of
92 * !UBLK_IO_FLAG_ACTIVE.
94 * After this flag is observed, any pending or new incoming request
95 * associated with this io command will be failed immediately
97 #define UBLK_IO_FLAG_ABORTED 0x04
100 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
101 * get data buffer address from ublksrv.
103 * Then, bio data could be copied into this data buffer for a WRITE request
104 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
106 #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
109 /* userspace buffer address from io cmd */
114 struct io_uring_cmd
*cmd
;
122 struct task_struct
*ubq_daemon
;
125 struct llist_head io_cmds
;
127 unsigned long io_addr
; /* mapped vm address */
128 unsigned int max_io_sz
;
130 unsigned short nr_io_ready
; /* how many ios setup */
131 struct ublk_device
*dev
;
132 struct ublk_io ios
[];
135 #define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
138 struct gendisk
*ub_disk
;
142 unsigned short queue_size
;
143 struct ublksrv_ctrl_dev_info dev_info
;
145 struct blk_mq_tag_set tag_set
;
148 struct device cdev_dev
;
150 #define UB_STATE_OPEN 0
151 #define UB_STATE_USED 1
158 struct mm_struct
*mm
;
160 struct ublk_params params
;
162 struct completion completion
;
163 unsigned int nr_queues_ready
;
164 atomic_t nr_aborted_queues
;
167 * Our ubq->daemon may be killed without any notification, so
168 * monitor each queue's daemon periodically
170 struct delayed_work monitor_work
;
171 struct work_struct quiesce_work
;
172 struct work_struct stop_work
;
175 /* header of ublk_params */
176 struct ublk_params_header
{
181 static dev_t ublk_chr_devt
;
182 static struct class *ublk_chr_class
;
184 static DEFINE_IDR(ublk_index_idr
);
185 static DEFINE_SPINLOCK(ublk_idr_lock
);
186 static wait_queue_head_t ublk_idr_wq
; /* wait until one idr is freed */
188 static DEFINE_MUTEX(ublk_ctl_mutex
);
190 static struct miscdevice ublk_misc
;
192 static void ublk_dev_param_basic_apply(struct ublk_device
*ub
)
194 struct request_queue
*q
= ub
->ub_disk
->queue
;
195 const struct ublk_param_basic
*p
= &ub
->params
.basic
;
197 blk_queue_logical_block_size(q
, 1 << p
->logical_bs_shift
);
198 blk_queue_physical_block_size(q
, 1 << p
->physical_bs_shift
);
199 blk_queue_io_min(q
, 1 << p
->io_min_shift
);
200 blk_queue_io_opt(q
, 1 << p
->io_opt_shift
);
202 blk_queue_write_cache(q
, p
->attrs
& UBLK_ATTR_VOLATILE_CACHE
,
203 p
->attrs
& UBLK_ATTR_FUA
);
204 if (p
->attrs
& UBLK_ATTR_ROTATIONAL
)
205 blk_queue_flag_clear(QUEUE_FLAG_NONROT
, q
);
207 blk_queue_flag_set(QUEUE_FLAG_NONROT
, q
);
209 blk_queue_max_hw_sectors(q
, p
->max_sectors
);
210 blk_queue_chunk_sectors(q
, p
->chunk_sectors
);
211 blk_queue_virt_boundary(q
, p
->virt_boundary_mask
);
213 if (p
->attrs
& UBLK_ATTR_READ_ONLY
)
214 set_disk_ro(ub
->ub_disk
, true);
216 set_capacity(ub
->ub_disk
, p
->dev_sectors
);
219 static void ublk_dev_param_discard_apply(struct ublk_device
*ub
)
221 struct request_queue
*q
= ub
->ub_disk
->queue
;
222 const struct ublk_param_discard
*p
= &ub
->params
.discard
;
224 q
->limits
.discard_alignment
= p
->discard_alignment
;
225 q
->limits
.discard_granularity
= p
->discard_granularity
;
226 blk_queue_max_discard_sectors(q
, p
->max_discard_sectors
);
227 blk_queue_max_write_zeroes_sectors(q
,
228 p
->max_write_zeroes_sectors
);
229 blk_queue_max_discard_segments(q
, p
->max_discard_segments
);
232 static int ublk_validate_params(const struct ublk_device
*ub
)
234 /* basic param is the only one which must be set */
235 if (ub
->params
.types
& UBLK_PARAM_TYPE_BASIC
) {
236 const struct ublk_param_basic
*p
= &ub
->params
.basic
;
238 if (p
->logical_bs_shift
> PAGE_SHIFT
)
241 if (p
->logical_bs_shift
> p
->physical_bs_shift
)
244 if (p
->max_sectors
> (ub
->dev_info
.max_io_buf_bytes
>> 9))
249 if (ub
->params
.types
& UBLK_PARAM_TYPE_DISCARD
) {
250 const struct ublk_param_discard
*p
= &ub
->params
.discard
;
252 /* So far, only support single segment discard */
253 if (p
->max_discard_sectors
&& p
->max_discard_segments
!= 1)
256 if (!p
->discard_granularity
)
263 static int ublk_apply_params(struct ublk_device
*ub
)
265 if (!(ub
->params
.types
& UBLK_PARAM_TYPE_BASIC
))
268 ublk_dev_param_basic_apply(ub
);
270 if (ub
->params
.types
& UBLK_PARAM_TYPE_DISCARD
)
271 ublk_dev_param_discard_apply(ub
);
276 static inline bool ublk_can_use_task_work(const struct ublk_queue
*ubq
)
278 if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK
) &&
279 !(ubq
->flags
& UBLK_F_URING_CMD_COMP_IN_TASK
))
284 static inline bool ublk_need_get_data(const struct ublk_queue
*ubq
)
286 if (ubq
->flags
& UBLK_F_NEED_GET_DATA
)
291 static struct ublk_device
*ublk_get_device(struct ublk_device
*ub
)
293 if (kobject_get_unless_zero(&ub
->cdev_dev
.kobj
))
298 static void ublk_put_device(struct ublk_device
*ub
)
300 put_device(&ub
->cdev_dev
);
303 static inline struct ublk_queue
*ublk_get_queue(struct ublk_device
*dev
,
306 return (struct ublk_queue
*)&(dev
->__queues
[qid
* dev
->queue_size
]);
309 static inline bool ublk_rq_has_data(const struct request
*rq
)
311 return rq
->bio
&& bio_has_data(rq
->bio
);
314 static inline struct ublksrv_io_desc
*ublk_get_iod(struct ublk_queue
*ubq
,
317 return (struct ublksrv_io_desc
*)
318 &(ubq
->io_cmd_buf
[tag
* sizeof(struct ublksrv_io_desc
)]);
321 static inline char *ublk_queue_cmd_buf(struct ublk_device
*ub
, int q_id
)
323 return ublk_get_queue(ub
, q_id
)->io_cmd_buf
;
326 static inline int ublk_queue_cmd_buf_size(struct ublk_device
*ub
, int q_id
)
328 struct ublk_queue
*ubq
= ublk_get_queue(ub
, q_id
);
330 return round_up(ubq
->q_depth
* sizeof(struct ublksrv_io_desc
),
334 static inline bool ublk_queue_can_use_recovery_reissue(
335 struct ublk_queue
*ubq
)
337 if ((ubq
->flags
& UBLK_F_USER_RECOVERY
) &&
338 (ubq
->flags
& UBLK_F_USER_RECOVERY_REISSUE
))
343 static inline bool ublk_queue_can_use_recovery(
344 struct ublk_queue
*ubq
)
346 if (ubq
->flags
& UBLK_F_USER_RECOVERY
)
351 static inline bool ublk_can_use_recovery(struct ublk_device
*ub
)
353 if (ub
->dev_info
.flags
& UBLK_F_USER_RECOVERY
)
358 static void ublk_free_disk(struct gendisk
*disk
)
360 struct ublk_device
*ub
= disk
->private_data
;
362 clear_bit(UB_STATE_USED
, &ub
->state
);
363 put_device(&ub
->cdev_dev
);
366 static const struct block_device_operations ub_fops
= {
367 .owner
= THIS_MODULE
,
368 .free_disk
= ublk_free_disk
,
371 #define UBLK_MAX_PIN_PAGES 32
373 struct ublk_map_data
{
374 const struct ublk_queue
*ubq
;
375 const struct request
*rq
;
376 const struct ublk_io
*io
;
380 struct ublk_io_iter
{
381 struct page
*pages
[UBLK_MAX_PIN_PAGES
];
382 unsigned pg_off
; /* offset in the 1st page in pages */
383 int nr_pages
; /* how many page pointers in pages */
385 struct bvec_iter iter
;
388 static inline unsigned ublk_copy_io_pages(struct ublk_io_iter
*data
,
389 unsigned max_bytes
, bool to_vm
)
391 const unsigned total
= min_t(unsigned, max_bytes
,
392 PAGE_SIZE
- data
->pg_off
+
393 ((data
->nr_pages
- 1) << PAGE_SHIFT
));
397 while (done
< total
) {
398 struct bio_vec bv
= bio_iter_iovec(data
->bio
, data
->iter
);
399 const unsigned int bytes
= min3(bv
.bv_len
, total
- done
,
400 (unsigned)(PAGE_SIZE
- data
->pg_off
));
401 void *bv_buf
= bvec_kmap_local(&bv
);
402 void *pg_buf
= kmap_local_page(data
->pages
[pg_idx
]);
405 memcpy(pg_buf
+ data
->pg_off
, bv_buf
, bytes
);
407 memcpy(bv_buf
, pg_buf
+ data
->pg_off
, bytes
);
409 kunmap_local(pg_buf
);
410 kunmap_local(bv_buf
);
412 /* advance page array */
413 data
->pg_off
+= bytes
;
414 if (data
->pg_off
== PAGE_SIZE
) {
422 bio_advance_iter_single(data
->bio
, &data
->iter
, bytes
);
423 if (!data
->iter
.bi_size
) {
424 data
->bio
= data
->bio
->bi_next
;
425 if (data
->bio
== NULL
)
427 data
->iter
= data
->bio
->bi_iter
;
434 static inline int ublk_copy_user_pages(struct ublk_map_data
*data
,
437 const unsigned int gup_flags
= to_vm
? FOLL_WRITE
: 0;
438 const unsigned long start_vm
= data
->io
->addr
;
439 unsigned int done
= 0;
440 struct ublk_io_iter iter
= {
441 .pg_off
= start_vm
& (PAGE_SIZE
- 1),
442 .bio
= data
->rq
->bio
,
443 .iter
= data
->rq
->bio
->bi_iter
,
445 const unsigned int nr_pages
= round_up(data
->max_bytes
+
446 (start_vm
& (PAGE_SIZE
- 1)), PAGE_SIZE
) >> PAGE_SHIFT
;
448 while (done
< nr_pages
) {
449 const unsigned to_pin
= min_t(unsigned, UBLK_MAX_PIN_PAGES
,
453 iter
.nr_pages
= get_user_pages_fast(start_vm
+
454 (done
<< PAGE_SHIFT
), to_pin
, gup_flags
,
456 if (iter
.nr_pages
<= 0)
457 return done
== 0 ? iter
.nr_pages
: done
;
458 len
= ublk_copy_io_pages(&iter
, data
->max_bytes
, to_vm
);
459 for (i
= 0; i
< iter
.nr_pages
; i
++) {
461 set_page_dirty(iter
.pages
[i
]);
462 put_page(iter
.pages
[i
]);
464 data
->max_bytes
-= len
;
465 done
+= iter
.nr_pages
;
471 static int ublk_map_io(const struct ublk_queue
*ubq
, const struct request
*req
,
474 const unsigned int rq_bytes
= blk_rq_bytes(req
);
476 * no zero copy, we delay copy WRITE request data into ublksrv
477 * context and the big benefit is that pinning pages in current
478 * context is pretty fast, see ublk_pin_user_pages
480 if (req_op(req
) != REQ_OP_WRITE
&& req_op(req
) != REQ_OP_FLUSH
)
483 if (ublk_rq_has_data(req
)) {
484 struct ublk_map_data data
= {
488 .max_bytes
= rq_bytes
,
491 ublk_copy_user_pages(&data
, true);
493 return rq_bytes
- data
.max_bytes
;
498 static int ublk_unmap_io(const struct ublk_queue
*ubq
,
499 const struct request
*req
,
502 const unsigned int rq_bytes
= blk_rq_bytes(req
);
504 if (req_op(req
) == REQ_OP_READ
&& ublk_rq_has_data(req
)) {
505 struct ublk_map_data data
= {
509 .max_bytes
= io
->res
,
512 WARN_ON_ONCE(io
->res
> rq_bytes
);
514 ublk_copy_user_pages(&data
, false);
516 return io
->res
- data
.max_bytes
;
521 static inline unsigned int ublk_req_build_flags(struct request
*req
)
525 if (req
->cmd_flags
& REQ_FAILFAST_DEV
)
526 flags
|= UBLK_IO_F_FAILFAST_DEV
;
528 if (req
->cmd_flags
& REQ_FAILFAST_TRANSPORT
)
529 flags
|= UBLK_IO_F_FAILFAST_TRANSPORT
;
531 if (req
->cmd_flags
& REQ_FAILFAST_DRIVER
)
532 flags
|= UBLK_IO_F_FAILFAST_DRIVER
;
534 if (req
->cmd_flags
& REQ_META
)
535 flags
|= UBLK_IO_F_META
;
537 if (req
->cmd_flags
& REQ_FUA
)
538 flags
|= UBLK_IO_F_FUA
;
540 if (req
->cmd_flags
& REQ_NOUNMAP
)
541 flags
|= UBLK_IO_F_NOUNMAP
;
543 if (req
->cmd_flags
& REQ_SWAP
)
544 flags
|= UBLK_IO_F_SWAP
;
549 static blk_status_t
ublk_setup_iod(struct ublk_queue
*ubq
, struct request
*req
)
551 struct ublksrv_io_desc
*iod
= ublk_get_iod(ubq
, req
->tag
);
552 struct ublk_io
*io
= &ubq
->ios
[req
->tag
];
555 switch (req_op(req
)) {
557 ublk_op
= UBLK_IO_OP_READ
;
560 ublk_op
= UBLK_IO_OP_WRITE
;
563 ublk_op
= UBLK_IO_OP_FLUSH
;
566 ublk_op
= UBLK_IO_OP_DISCARD
;
568 case REQ_OP_WRITE_ZEROES
:
569 ublk_op
= UBLK_IO_OP_WRITE_ZEROES
;
572 return BLK_STS_IOERR
;
575 /* need to translate since kernel may change */
576 iod
->op_flags
= ublk_op
| ublk_req_build_flags(req
);
577 iod
->nr_sectors
= blk_rq_sectors(req
);
578 iod
->start_sector
= blk_rq_pos(req
);
579 iod
->addr
= io
->addr
;
584 static inline struct ublk_uring_cmd_pdu
*ublk_get_uring_cmd_pdu(
585 struct io_uring_cmd
*ioucmd
)
587 return (struct ublk_uring_cmd_pdu
*)&ioucmd
->pdu
;
590 static inline bool ubq_daemon_is_dying(struct ublk_queue
*ubq
)
592 return ubq
->ubq_daemon
->flags
& PF_EXITING
;
595 /* todo: handle partial completion */
596 static void ublk_complete_rq(struct request
*req
)
598 struct ublk_queue
*ubq
= req
->mq_hctx
->driver_data
;
599 struct ublk_io
*io
= &ubq
->ios
[req
->tag
];
600 unsigned int unmapped_bytes
;
602 /* failed read IO if nothing is read */
603 if (!io
->res
&& req_op(req
) == REQ_OP_READ
)
607 blk_mq_end_request(req
, errno_to_blk_status(io
->res
));
612 * FLUSH or DISCARD usually won't return bytes returned, so end them
615 * Both the two needn't unmap.
617 if (req_op(req
) != REQ_OP_READ
&& req_op(req
) != REQ_OP_WRITE
) {
618 blk_mq_end_request(req
, BLK_STS_OK
);
622 /* for READ request, writing data in iod->addr to rq buffers */
623 unmapped_bytes
= ublk_unmap_io(ubq
, req
, io
);
626 * Extremely impossible since we got data filled in just before
628 * Re-read simply for this unlikely case.
630 if (unlikely(unmapped_bytes
< io
->res
))
631 io
->res
= unmapped_bytes
;
633 if (blk_update_request(req
, BLK_STS_OK
, io
->res
))
634 blk_mq_requeue_request(req
, true);
636 __blk_mq_end_request(req
, BLK_STS_OK
);
640 * Since __ublk_rq_task_work always fails requests immediately during
641 * exiting, __ublk_fail_req() is only called from abort context during
642 * exiting. So lock is unnecessary.
644 * Also aborting may not be started yet, keep in mind that one failed
645 * request may be issued by block layer again.
647 static void __ublk_fail_req(struct ublk_queue
*ubq
, struct ublk_io
*io
,
650 WARN_ON_ONCE(io
->flags
& UBLK_IO_FLAG_ACTIVE
);
652 if (!(io
->flags
& UBLK_IO_FLAG_ABORTED
)) {
653 io
->flags
|= UBLK_IO_FLAG_ABORTED
;
654 if (ublk_queue_can_use_recovery_reissue(ubq
))
655 blk_mq_requeue_request(req
, false);
657 blk_mq_end_request(req
, BLK_STS_IOERR
);
661 static void ubq_complete_io_cmd(struct ublk_io
*io
, int res
)
663 /* mark this cmd owned by ublksrv */
664 io
->flags
|= UBLK_IO_FLAG_OWNED_BY_SRV
;
667 * clear ACTIVE since we are done with this sqe/cmd slot
668 * We can only accept io cmd in case of being not active.
670 io
->flags
&= ~UBLK_IO_FLAG_ACTIVE
;
672 /* tell ublksrv one io request is coming */
673 io_uring_cmd_done(io
->cmd
, res
, 0);
676 #define UBLK_REQUEUE_DELAY_MS 3
678 static inline void __ublk_abort_rq(struct ublk_queue
*ubq
,
681 /* We cannot process this rq so just requeue it. */
682 if (ublk_queue_can_use_recovery(ubq
))
683 blk_mq_requeue_request(rq
, false);
685 blk_mq_end_request(rq
, BLK_STS_IOERR
);
687 mod_delayed_work(system_wq
, &ubq
->dev
->monitor_work
, 0);
690 static inline void __ublk_rq_task_work(struct request
*req
)
692 struct ublk_queue
*ubq
= req
->mq_hctx
->driver_data
;
694 struct ublk_io
*io
= &ubq
->ios
[tag
];
695 unsigned int mapped_bytes
;
697 pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
698 __func__
, io
->cmd
->cmd_op
, ubq
->q_id
, req
->tag
, io
->flags
,
699 ublk_get_iod(ubq
, req
->tag
)->addr
);
702 * Task is exiting if either:
704 * (1) current != ubq_daemon.
705 * io_uring_cmd_complete_in_task() tries to run task_work
706 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
708 * (2) current->flags & PF_EXITING.
710 if (unlikely(current
!= ubq
->ubq_daemon
|| current
->flags
& PF_EXITING
)) {
711 __ublk_abort_rq(ubq
, req
);
715 if (ublk_need_get_data(ubq
) &&
716 (req_op(req
) == REQ_OP_WRITE
||
717 req_op(req
) == REQ_OP_FLUSH
)) {
719 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
720 * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
723 if (!(io
->flags
& UBLK_IO_FLAG_NEED_GET_DATA
)) {
724 io
->flags
|= UBLK_IO_FLAG_NEED_GET_DATA
;
725 pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
726 __func__
, io
->cmd
->cmd_op
, ubq
->q_id
,
727 req
->tag
, io
->flags
);
728 ubq_complete_io_cmd(io
, UBLK_IO_RES_NEED_GET_DATA
);
732 * We have handled UBLK_IO_NEED_GET_DATA command,
733 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
736 io
->flags
&= ~UBLK_IO_FLAG_NEED_GET_DATA
;
737 /* update iod->addr because ublksrv may have passed a new io buffer */
738 ublk_get_iod(ubq
, req
->tag
)->addr
= io
->addr
;
739 pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
740 __func__
, io
->cmd
->cmd_op
, ubq
->q_id
, req
->tag
, io
->flags
,
741 ublk_get_iod(ubq
, req
->tag
)->addr
);
744 mapped_bytes
= ublk_map_io(ubq
, req
, io
);
746 /* partially mapped, update io descriptor */
747 if (unlikely(mapped_bytes
!= blk_rq_bytes(req
))) {
749 * Nothing mapped, retry until we succeed.
751 * We may never succeed in mapping any bytes here because
752 * of OOM. TODO: reserve one buffer with single page pinned
753 * for providing forward progress guarantee.
755 if (unlikely(!mapped_bytes
)) {
756 blk_mq_requeue_request(req
, false);
757 blk_mq_delay_kick_requeue_list(req
->q
,
758 UBLK_REQUEUE_DELAY_MS
);
762 ublk_get_iod(ubq
, req
->tag
)->nr_sectors
=
766 ubq_complete_io_cmd(io
, UBLK_IO_RES_OK
);
769 static void ublk_rq_task_work_cb(struct io_uring_cmd
*cmd
)
771 struct ublk_uring_cmd_pdu
*pdu
= ublk_get_uring_cmd_pdu(cmd
);
772 struct ublk_queue
*ubq
= pdu
->ubq
;
773 struct llist_node
*io_cmds
= llist_del_all(&ubq
->io_cmds
);
774 struct ublk_rq_data
*data
;
776 llist_for_each_entry(data
, io_cmds
, node
)
777 __ublk_rq_task_work(blk_mq_rq_from_pdu(data
));
780 static void ublk_rq_task_work_fn(struct callback_head
*work
)
782 struct ublk_rq_data
*data
= container_of(work
,
783 struct ublk_rq_data
, work
);
784 struct request
*req
= blk_mq_rq_from_pdu(data
);
786 __ublk_rq_task_work(req
);
789 static void ublk_submit_cmd(struct ublk_queue
*ubq
, const struct request
*rq
)
791 struct ublk_io
*io
= &ubq
->ios
[rq
->tag
];
794 * If the check pass, we know that this is a re-issued request aborted
795 * previously in monitor_work because the ubq_daemon(cmd's task) is
796 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
797 * because this ioucmd's io_uring context may be freed now if no inflight
798 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
800 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
801 * the tag). Then the request is re-started(allocating the tag) and we are here.
802 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
803 * guarantees that here is a re-issued request aborted previously.
805 if (unlikely(io
->flags
& UBLK_IO_FLAG_ABORTED
)) {
806 struct llist_node
*io_cmds
= llist_del_all(&ubq
->io_cmds
);
807 struct ublk_rq_data
*data
;
809 llist_for_each_entry(data
, io_cmds
, node
)
810 __ublk_abort_rq(ubq
, blk_mq_rq_from_pdu(data
));
812 struct io_uring_cmd
*cmd
= io
->cmd
;
813 struct ublk_uring_cmd_pdu
*pdu
= ublk_get_uring_cmd_pdu(cmd
);
816 io_uring_cmd_complete_in_task(cmd
, ublk_rq_task_work_cb
);
820 static void ublk_queue_cmd(struct ublk_queue
*ubq
, struct request
*rq
,
823 struct ublk_rq_data
*data
= blk_mq_rq_to_pdu(rq
);
825 if (ublk_can_use_task_work(ubq
)) {
826 enum task_work_notify_mode notify_mode
= last
?
827 TWA_SIGNAL_NO_IPI
: TWA_NONE
;
829 if (task_work_add(ubq
->ubq_daemon
, &data
->work
, notify_mode
))
830 __ublk_abort_rq(ubq
, rq
);
832 if (llist_add(&data
->node
, &ubq
->io_cmds
))
833 ublk_submit_cmd(ubq
, rq
);
837 static blk_status_t
ublk_queue_rq(struct blk_mq_hw_ctx
*hctx
,
838 const struct blk_mq_queue_data
*bd
)
840 struct ublk_queue
*ubq
= hctx
->driver_data
;
841 struct request
*rq
= bd
->rq
;
844 /* fill iod to slot in io cmd buffer */
845 res
= ublk_setup_iod(ubq
, rq
);
846 if (unlikely(res
!= BLK_STS_OK
))
847 return BLK_STS_IOERR
;
849 /* With recovery feature enabled, force_abort is set in
850 * ublk_stop_dev() before calling del_gendisk(). We have to
851 * abort all requeued and new rqs here to let del_gendisk()
852 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
853 * to avoid UAF on io_uring ctx.
855 * Note: force_abort is guaranteed to be seen because it is set
856 * before request queue is unqiuesced.
858 if (ublk_queue_can_use_recovery(ubq
) && unlikely(ubq
->force_abort
))
859 return BLK_STS_IOERR
;
861 blk_mq_start_request(bd
->rq
);
863 if (unlikely(ubq_daemon_is_dying(ubq
))) {
864 __ublk_abort_rq(ubq
, rq
);
868 ublk_queue_cmd(ubq
, rq
, bd
->last
);
873 static void ublk_commit_rqs(struct blk_mq_hw_ctx
*hctx
)
875 struct ublk_queue
*ubq
= hctx
->driver_data
;
877 if (ublk_can_use_task_work(ubq
))
878 __set_notify_signal(ubq
->ubq_daemon
);
881 static int ublk_init_hctx(struct blk_mq_hw_ctx
*hctx
, void *driver_data
,
882 unsigned int hctx_idx
)
884 struct ublk_device
*ub
= driver_data
;
885 struct ublk_queue
*ubq
= ublk_get_queue(ub
, hctx
->queue_num
);
887 hctx
->driver_data
= ubq
;
891 static int ublk_init_rq(struct blk_mq_tag_set
*set
, struct request
*req
,
892 unsigned int hctx_idx
, unsigned int numa_node
)
894 struct ublk_rq_data
*data
= blk_mq_rq_to_pdu(req
);
896 init_task_work(&data
->work
, ublk_rq_task_work_fn
);
900 static const struct blk_mq_ops ublk_mq_ops
= {
901 .queue_rq
= ublk_queue_rq
,
902 .commit_rqs
= ublk_commit_rqs
,
903 .init_hctx
= ublk_init_hctx
,
904 .init_request
= ublk_init_rq
,
907 static int ublk_ch_open(struct inode
*inode
, struct file
*filp
)
909 struct ublk_device
*ub
= container_of(inode
->i_cdev
,
910 struct ublk_device
, cdev
);
912 if (test_and_set_bit(UB_STATE_OPEN
, &ub
->state
))
914 filp
->private_data
= ub
;
918 static int ublk_ch_release(struct inode
*inode
, struct file
*filp
)
920 struct ublk_device
*ub
= filp
->private_data
;
922 clear_bit(UB_STATE_OPEN
, &ub
->state
);
926 /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
927 static int ublk_ch_mmap(struct file
*filp
, struct vm_area_struct
*vma
)
929 struct ublk_device
*ub
= filp
->private_data
;
930 size_t sz
= vma
->vm_end
- vma
->vm_start
;
931 unsigned max_sz
= UBLK_MAX_QUEUE_DEPTH
* sizeof(struct ublksrv_io_desc
);
932 unsigned long pfn
, end
, phys_off
= vma
->vm_pgoff
<< PAGE_SHIFT
;
935 spin_lock(&ub
->mm_lock
);
937 ub
->mm
= current
->mm
;
938 if (current
->mm
!= ub
->mm
)
940 spin_unlock(&ub
->mm_lock
);
945 if (vma
->vm_flags
& VM_WRITE
)
948 end
= UBLKSRV_CMD_BUF_OFFSET
+ ub
->dev_info
.nr_hw_queues
* max_sz
;
949 if (phys_off
< UBLKSRV_CMD_BUF_OFFSET
|| phys_off
>= end
)
952 q_id
= (phys_off
- UBLKSRV_CMD_BUF_OFFSET
) / max_sz
;
953 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
954 __func__
, q_id
, current
->pid
, vma
->vm_start
,
955 phys_off
, (unsigned long)sz
);
957 if (sz
!= ublk_queue_cmd_buf_size(ub
, q_id
))
960 pfn
= virt_to_phys(ublk_queue_cmd_buf(ub
, q_id
)) >> PAGE_SHIFT
;
961 return remap_pfn_range(vma
, vma
->vm_start
, pfn
, sz
, vma
->vm_page_prot
);
964 static void ublk_commit_completion(struct ublk_device
*ub
,
965 struct ublksrv_io_cmd
*ub_cmd
)
967 u32 qid
= ub_cmd
->q_id
, tag
= ub_cmd
->tag
;
968 struct ublk_queue
*ubq
= ublk_get_queue(ub
, qid
);
969 struct ublk_io
*io
= &ubq
->ios
[tag
];
972 /* now this cmd slot is owned by nbd driver */
973 io
->flags
&= ~UBLK_IO_FLAG_OWNED_BY_SRV
;
974 io
->res
= ub_cmd
->result
;
976 /* find the io request and complete */
977 req
= blk_mq_tag_to_rq(ub
->tag_set
.tags
[qid
], tag
);
979 if (req
&& likely(!blk_should_fake_timeout(req
->q
)))
980 ublk_complete_rq(req
);
984 * When ->ubq_daemon is exiting, either new request is ended immediately,
985 * or any queued io command is drained, so it is safe to abort queue
988 static void ublk_abort_queue(struct ublk_device
*ub
, struct ublk_queue
*ubq
)
992 if (!ublk_get_device(ub
))
995 for (i
= 0; i
< ubq
->q_depth
; i
++) {
996 struct ublk_io
*io
= &ubq
->ios
[i
];
998 if (!(io
->flags
& UBLK_IO_FLAG_ACTIVE
)) {
1002 * Either we fail the request or ublk_rq_task_work_fn
1005 rq
= blk_mq_tag_to_rq(ub
->tag_set
.tags
[ubq
->q_id
], i
);
1007 __ublk_fail_req(ubq
, io
, rq
);
1010 ublk_put_device(ub
);
1013 static void ublk_daemon_monitor_work(struct work_struct
*work
)
1015 struct ublk_device
*ub
=
1016 container_of(work
, struct ublk_device
, monitor_work
.work
);
1019 for (i
= 0; i
< ub
->dev_info
.nr_hw_queues
; i
++) {
1020 struct ublk_queue
*ubq
= ublk_get_queue(ub
, i
);
1022 if (ubq_daemon_is_dying(ubq
)) {
1023 if (ublk_queue_can_use_recovery(ubq
))
1024 schedule_work(&ub
->quiesce_work
);
1026 schedule_work(&ub
->stop_work
);
1028 /* abort queue is for making forward progress */
1029 ublk_abort_queue(ub
, ubq
);
1034 * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
1035 * after ublk_remove() or __ublk_quiesce_dev() is started.
1037 * No need ub->mutex, monitor work are canceled after state is marked
1038 * as not LIVE, so new state is observed reliably.
1040 if (ub
->dev_info
.state
== UBLK_S_DEV_LIVE
)
1041 schedule_delayed_work(&ub
->monitor_work
,
1042 UBLK_DAEMON_MONITOR_PERIOD
);
1045 static inline bool ublk_queue_ready(struct ublk_queue
*ubq
)
1047 return ubq
->nr_io_ready
== ubq
->q_depth
;
1050 static void ublk_cancel_queue(struct ublk_queue
*ubq
)
1054 if (!ublk_queue_ready(ubq
))
1057 for (i
= 0; i
< ubq
->q_depth
; i
++) {
1058 struct ublk_io
*io
= &ubq
->ios
[i
];
1060 if (io
->flags
& UBLK_IO_FLAG_ACTIVE
)
1061 io_uring_cmd_done(io
->cmd
, UBLK_IO_RES_ABORT
, 0);
1064 /* all io commands are canceled */
1065 ubq
->nr_io_ready
= 0;
1068 /* Cancel all pending commands, must be called after del_gendisk() returns */
1069 static void ublk_cancel_dev(struct ublk_device
*ub
)
1073 for (i
= 0; i
< ub
->dev_info
.nr_hw_queues
; i
++)
1074 ublk_cancel_queue(ublk_get_queue(ub
, i
));
1077 static bool ublk_check_inflight_rq(struct request
*rq
, void *data
)
1081 if (blk_mq_request_started(rq
)) {
1088 static void ublk_wait_tagset_rqs_idle(struct ublk_device
*ub
)
1092 WARN_ON_ONCE(!blk_queue_quiesced(ub
->ub_disk
->queue
));
1095 blk_mq_tagset_busy_iter(&ub
->tag_set
,
1096 ublk_check_inflight_rq
, &idle
);
1099 msleep(UBLK_REQUEUE_DELAY_MS
);
1103 static void __ublk_quiesce_dev(struct ublk_device
*ub
)
1105 pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1106 __func__
, ub
->dev_info
.dev_id
,
1107 ub
->dev_info
.state
== UBLK_S_DEV_LIVE
?
1108 "LIVE" : "QUIESCED");
1109 blk_mq_quiesce_queue(ub
->ub_disk
->queue
);
1110 ublk_wait_tagset_rqs_idle(ub
);
1111 ub
->dev_info
.state
= UBLK_S_DEV_QUIESCED
;
1112 ublk_cancel_dev(ub
);
1113 /* we are going to release task_struct of ubq_daemon and resets
1114 * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
1115 * Besides, monitor_work is not necessary in QUIESCED state since we have
1116 * already scheduled quiesce_work and quiesced all ubqs.
1118 * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
1119 * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
1121 cancel_delayed_work_sync(&ub
->monitor_work
);
1124 static void ublk_quiesce_work_fn(struct work_struct
*work
)
1126 struct ublk_device
*ub
=
1127 container_of(work
, struct ublk_device
, quiesce_work
);
1129 mutex_lock(&ub
->mutex
);
1130 if (ub
->dev_info
.state
!= UBLK_S_DEV_LIVE
)
1132 __ublk_quiesce_dev(ub
);
1134 mutex_unlock(&ub
->mutex
);
1137 static void ublk_unquiesce_dev(struct ublk_device
*ub
)
1141 pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1142 __func__
, ub
->dev_info
.dev_id
,
1143 ub
->dev_info
.state
== UBLK_S_DEV_LIVE
?
1144 "LIVE" : "QUIESCED");
1145 /* quiesce_work has run. We let requeued rqs be aborted
1146 * before running fallback_wq. "force_abort" must be seen
1147 * after request queue is unqiuesced. Then del_gendisk()
1150 for (i
= 0; i
< ub
->dev_info
.nr_hw_queues
; i
++)
1151 ublk_get_queue(ub
, i
)->force_abort
= true;
1153 blk_mq_unquiesce_queue(ub
->ub_disk
->queue
);
1154 /* We may have requeued some rqs in ublk_quiesce_queue() */
1155 blk_mq_kick_requeue_list(ub
->ub_disk
->queue
);
1158 static void ublk_stop_dev(struct ublk_device
*ub
)
1160 mutex_lock(&ub
->mutex
);
1161 if (ub
->dev_info
.state
== UBLK_S_DEV_DEAD
)
1163 if (ublk_can_use_recovery(ub
)) {
1164 if (ub
->dev_info
.state
== UBLK_S_DEV_LIVE
)
1165 __ublk_quiesce_dev(ub
);
1166 ublk_unquiesce_dev(ub
);
1168 del_gendisk(ub
->ub_disk
);
1169 ub
->dev_info
.state
= UBLK_S_DEV_DEAD
;
1170 ub
->dev_info
.ublksrv_pid
= -1;
1171 put_disk(ub
->ub_disk
);
1174 ublk_cancel_dev(ub
);
1175 mutex_unlock(&ub
->mutex
);
1176 cancel_delayed_work_sync(&ub
->monitor_work
);
1179 /* device can only be started after all IOs are ready */
1180 static void ublk_mark_io_ready(struct ublk_device
*ub
, struct ublk_queue
*ubq
)
1182 mutex_lock(&ub
->mutex
);
1184 if (ublk_queue_ready(ubq
)) {
1185 ubq
->ubq_daemon
= current
;
1186 get_task_struct(ubq
->ubq_daemon
);
1187 ub
->nr_queues_ready
++;
1189 if (ub
->nr_queues_ready
== ub
->dev_info
.nr_hw_queues
)
1190 complete_all(&ub
->completion
);
1191 mutex_unlock(&ub
->mutex
);
1194 static void ublk_handle_need_get_data(struct ublk_device
*ub
, int q_id
,
1197 struct ublk_queue
*ubq
= ublk_get_queue(ub
, q_id
);
1198 struct request
*req
= blk_mq_tag_to_rq(ub
->tag_set
.tags
[q_id
], tag
);
1200 ublk_queue_cmd(ubq
, req
, true);
1203 static int ublk_ch_uring_cmd(struct io_uring_cmd
*cmd
, unsigned int issue_flags
)
1205 struct ublksrv_io_cmd
*ub_cmd
= (struct ublksrv_io_cmd
*)cmd
->cmd
;
1206 struct ublk_device
*ub
= cmd
->file
->private_data
;
1207 struct ublk_queue
*ubq
;
1209 u32 cmd_op
= cmd
->cmd_op
;
1210 unsigned tag
= ub_cmd
->tag
;
1213 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
1214 __func__
, cmd
->cmd_op
, ub_cmd
->q_id
, tag
,
1217 if (!(issue_flags
& IO_URING_F_SQE128
))
1220 if (ub_cmd
->q_id
>= ub
->dev_info
.nr_hw_queues
)
1223 ubq
= ublk_get_queue(ub
, ub_cmd
->q_id
);
1224 if (!ubq
|| ub_cmd
->q_id
!= ubq
->q_id
)
1227 if (ubq
->ubq_daemon
&& ubq
->ubq_daemon
!= current
)
1230 if (tag
>= ubq
->q_depth
)
1233 io
= &ubq
->ios
[tag
];
1235 /* there is pending io cmd, something must be wrong */
1236 if (io
->flags
& UBLK_IO_FLAG_ACTIVE
) {
1242 * ensure that the user issues UBLK_IO_NEED_GET_DATA
1243 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
1245 if ((!!(io
->flags
& UBLK_IO_FLAG_NEED_GET_DATA
))
1246 ^ (cmd_op
== UBLK_IO_NEED_GET_DATA
))
1250 case UBLK_IO_FETCH_REQ
:
1251 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
1252 if (ublk_queue_ready(ubq
)) {
1257 * The io is being handled by server, so COMMIT_RQ is expected
1258 * instead of FETCH_REQ
1260 if (io
->flags
& UBLK_IO_FLAG_OWNED_BY_SRV
)
1262 /* FETCH_RQ has to provide IO buffer */
1266 io
->flags
|= UBLK_IO_FLAG_ACTIVE
;
1267 io
->addr
= ub_cmd
->addr
;
1269 ublk_mark_io_ready(ub
, ubq
);
1271 case UBLK_IO_COMMIT_AND_FETCH_REQ
:
1272 /* FETCH_RQ has to provide IO buffer */
1275 if (!(io
->flags
& UBLK_IO_FLAG_OWNED_BY_SRV
))
1277 io
->addr
= ub_cmd
->addr
;
1278 io
->flags
|= UBLK_IO_FLAG_ACTIVE
;
1280 ublk_commit_completion(ub
, ub_cmd
);
1282 case UBLK_IO_NEED_GET_DATA
:
1283 if (!(io
->flags
& UBLK_IO_FLAG_OWNED_BY_SRV
))
1285 io
->addr
= ub_cmd
->addr
;
1287 io
->flags
|= UBLK_IO_FLAG_ACTIVE
;
1288 ublk_handle_need_get_data(ub
, ub_cmd
->q_id
, ub_cmd
->tag
);
1293 return -EIOCBQUEUED
;
1296 io_uring_cmd_done(cmd
, ret
, 0);
1297 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
1298 __func__
, cmd_op
, tag
, ret
, io
->flags
);
1299 return -EIOCBQUEUED
;
1302 static const struct file_operations ublk_ch_fops
= {
1303 .owner
= THIS_MODULE
,
1304 .open
= ublk_ch_open
,
1305 .release
= ublk_ch_release
,
1306 .llseek
= no_llseek
,
1307 .uring_cmd
= ublk_ch_uring_cmd
,
1308 .mmap
= ublk_ch_mmap
,
1311 static void ublk_deinit_queue(struct ublk_device
*ub
, int q_id
)
1313 int size
= ublk_queue_cmd_buf_size(ub
, q_id
);
1314 struct ublk_queue
*ubq
= ublk_get_queue(ub
, q_id
);
1316 if (ubq
->ubq_daemon
)
1317 put_task_struct(ubq
->ubq_daemon
);
1318 if (ubq
->io_cmd_buf
)
1319 free_pages((unsigned long)ubq
->io_cmd_buf
, get_order(size
));
1322 static int ublk_init_queue(struct ublk_device
*ub
, int q_id
)
1324 struct ublk_queue
*ubq
= ublk_get_queue(ub
, q_id
);
1325 gfp_t gfp_flags
= GFP_KERNEL
| __GFP_ZERO
;
1329 ubq
->flags
= ub
->dev_info
.flags
;
1331 ubq
->q_depth
= ub
->dev_info
.queue_depth
;
1332 size
= ublk_queue_cmd_buf_size(ub
, q_id
);
1334 ptr
= (void *) __get_free_pages(gfp_flags
, get_order(size
));
1338 ubq
->io_cmd_buf
= ptr
;
1343 static void ublk_deinit_queues(struct ublk_device
*ub
)
1345 int nr_queues
= ub
->dev_info
.nr_hw_queues
;
1351 for (i
= 0; i
< nr_queues
; i
++)
1352 ublk_deinit_queue(ub
, i
);
1353 kfree(ub
->__queues
);
1356 static int ublk_init_queues(struct ublk_device
*ub
)
1358 int nr_queues
= ub
->dev_info
.nr_hw_queues
;
1359 int depth
= ub
->dev_info
.queue_depth
;
1360 int ubq_size
= sizeof(struct ublk_queue
) + depth
* sizeof(struct ublk_io
);
1361 int i
, ret
= -ENOMEM
;
1363 ub
->queue_size
= ubq_size
;
1364 ub
->__queues
= kcalloc(nr_queues
, ubq_size
, GFP_KERNEL
);
1368 for (i
= 0; i
< nr_queues
; i
++) {
1369 if (ublk_init_queue(ub
, i
))
1373 init_completion(&ub
->completion
);
1377 ublk_deinit_queues(ub
);
1381 static int ublk_alloc_dev_number(struct ublk_device
*ub
, int idx
)
1386 spin_lock(&ublk_idr_lock
);
1387 /* allocate id, if @id >= 0, we're requesting that specific id */
1389 err
= idr_alloc(&ublk_index_idr
, ub
, i
, i
+ 1, GFP_NOWAIT
);
1393 err
= idr_alloc(&ublk_index_idr
, ub
, 0, 0, GFP_NOWAIT
);
1395 spin_unlock(&ublk_idr_lock
);
1398 ub
->ub_number
= err
;
1403 static void ublk_free_dev_number(struct ublk_device
*ub
)
1405 spin_lock(&ublk_idr_lock
);
1406 idr_remove(&ublk_index_idr
, ub
->ub_number
);
1407 wake_up_all(&ublk_idr_wq
);
1408 spin_unlock(&ublk_idr_lock
);
1411 static void ublk_cdev_rel(struct device
*dev
)
1413 struct ublk_device
*ub
= container_of(dev
, struct ublk_device
, cdev_dev
);
1415 blk_mq_free_tag_set(&ub
->tag_set
);
1416 ublk_deinit_queues(ub
);
1417 ublk_free_dev_number(ub
);
1418 mutex_destroy(&ub
->mutex
);
1422 static int ublk_add_chdev(struct ublk_device
*ub
)
1424 struct device
*dev
= &ub
->cdev_dev
;
1425 int minor
= ub
->ub_number
;
1428 dev
->parent
= ublk_misc
.this_device
;
1429 dev
->devt
= MKDEV(MAJOR(ublk_chr_devt
), minor
);
1430 dev
->class = ublk_chr_class
;
1431 dev
->release
= ublk_cdev_rel
;
1432 device_initialize(dev
);
1434 ret
= dev_set_name(dev
, "ublkc%d", minor
);
1438 cdev_init(&ub
->cdev
, &ublk_ch_fops
);
1439 ret
= cdev_device_add(&ub
->cdev
, dev
);
1448 static void ublk_stop_work_fn(struct work_struct
*work
)
1450 struct ublk_device
*ub
=
1451 container_of(work
, struct ublk_device
, stop_work
);
1456 /* align max io buffer size with PAGE_SIZE */
1457 static void ublk_align_max_io_size(struct ublk_device
*ub
)
1459 unsigned int max_io_bytes
= ub
->dev_info
.max_io_buf_bytes
;
1461 ub
->dev_info
.max_io_buf_bytes
=
1462 round_down(max_io_bytes
, PAGE_SIZE
);
1465 static int ublk_add_tag_set(struct ublk_device
*ub
)
1467 ub
->tag_set
.ops
= &ublk_mq_ops
;
1468 ub
->tag_set
.nr_hw_queues
= ub
->dev_info
.nr_hw_queues
;
1469 ub
->tag_set
.queue_depth
= ub
->dev_info
.queue_depth
;
1470 ub
->tag_set
.numa_node
= NUMA_NO_NODE
;
1471 ub
->tag_set
.cmd_size
= sizeof(struct ublk_rq_data
);
1472 ub
->tag_set
.flags
= BLK_MQ_F_SHOULD_MERGE
;
1473 ub
->tag_set
.driver_data
= ub
;
1474 return blk_mq_alloc_tag_set(&ub
->tag_set
);
1477 static void ublk_remove(struct ublk_device
*ub
)
1480 cancel_work_sync(&ub
->stop_work
);
1481 cancel_work_sync(&ub
->quiesce_work
);
1482 cdev_device_del(&ub
->cdev
, &ub
->cdev_dev
);
1483 put_device(&ub
->cdev_dev
);
1486 static struct ublk_device
*ublk_get_device_from_id(int idx
)
1488 struct ublk_device
*ub
= NULL
;
1493 spin_lock(&ublk_idr_lock
);
1494 ub
= idr_find(&ublk_index_idr
, idx
);
1496 ub
= ublk_get_device(ub
);
1497 spin_unlock(&ublk_idr_lock
);
1502 static int ublk_ctrl_start_dev(struct io_uring_cmd
*cmd
)
1504 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1505 int ublksrv_pid
= (int)header
->data
[0];
1506 struct ublk_device
*ub
;
1507 struct gendisk
*disk
;
1510 if (ublksrv_pid
<= 0)
1513 ub
= ublk_get_device_from_id(header
->dev_id
);
1517 wait_for_completion_interruptible(&ub
->completion
);
1519 schedule_delayed_work(&ub
->monitor_work
, UBLK_DAEMON_MONITOR_PERIOD
);
1521 mutex_lock(&ub
->mutex
);
1522 if (ub
->dev_info
.state
== UBLK_S_DEV_LIVE
||
1523 test_bit(UB_STATE_USED
, &ub
->state
)) {
1528 disk
= blk_mq_alloc_disk(&ub
->tag_set
, ub
);
1530 ret
= PTR_ERR(disk
);
1533 sprintf(disk
->disk_name
, "ublkb%d", ub
->ub_number
);
1534 disk
->fops
= &ub_fops
;
1535 disk
->private_data
= ub
;
1537 ub
->dev_info
.ublksrv_pid
= ublksrv_pid
;
1540 ret
= ublk_apply_params(ub
);
1544 get_device(&ub
->cdev_dev
);
1545 ret
= add_disk(disk
);
1548 * Has to drop the reference since ->free_disk won't be
1549 * called in case of add_disk failure.
1551 ublk_put_device(ub
);
1554 set_bit(UB_STATE_USED
, &ub
->state
);
1555 ub
->dev_info
.state
= UBLK_S_DEV_LIVE
;
1560 mutex_unlock(&ub
->mutex
);
1561 ublk_put_device(ub
);
1565 static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd
*cmd
)
1567 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1568 void __user
*argp
= (void __user
*)(unsigned long)header
->addr
;
1569 struct ublk_device
*ub
;
1570 cpumask_var_t cpumask
;
1571 unsigned long queue
;
1572 unsigned int retlen
;
1576 if (header
->len
* BITS_PER_BYTE
< nr_cpu_ids
)
1578 if (header
->len
& (sizeof(unsigned long)-1))
1583 ub
= ublk_get_device_from_id(header
->dev_id
);
1587 queue
= header
->data
[0];
1588 if (queue
>= ub
->dev_info
.nr_hw_queues
)
1589 goto out_put_device
;
1592 if (!zalloc_cpumask_var(&cpumask
, GFP_KERNEL
))
1593 goto out_put_device
;
1595 for_each_possible_cpu(i
) {
1596 if (ub
->tag_set
.map
[HCTX_TYPE_DEFAULT
].mq_map
[i
] == queue
)
1597 cpumask_set_cpu(i
, cpumask
);
1601 retlen
= min_t(unsigned short, header
->len
, cpumask_size());
1602 if (copy_to_user(argp
, cpumask
, retlen
))
1603 goto out_free_cpumask
;
1604 if (retlen
!= header
->len
&&
1605 clear_user(argp
+ retlen
, header
->len
- retlen
))
1606 goto out_free_cpumask
;
1610 free_cpumask_var(cpumask
);
1612 ublk_put_device(ub
);
1616 static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info
*info
)
1618 pr_devel("%s: dev id %d flags %llx\n", __func__
,
1619 info
->dev_id
, info
->flags
);
1620 pr_devel("\t nr_hw_queues %d queue_depth %d\n",
1621 info
->nr_hw_queues
, info
->queue_depth
);
1624 static int ublk_ctrl_add_dev(struct io_uring_cmd
*cmd
)
1626 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1627 void __user
*argp
= (void __user
*)(unsigned long)header
->addr
;
1628 struct ublksrv_ctrl_dev_info info
;
1629 struct ublk_device
*ub
;
1632 if (header
->len
< sizeof(info
) || !header
->addr
)
1634 if (header
->queue_id
!= (u16
)-1) {
1635 pr_warn("%s: queue_id is wrong %x\n",
1636 __func__
, header
->queue_id
);
1639 if (copy_from_user(&info
, argp
, sizeof(info
)))
1641 ublk_dump_dev_info(&info
);
1642 if (header
->dev_id
!= info
.dev_id
) {
1643 pr_warn("%s: dev id not match %u %u\n",
1644 __func__
, header
->dev_id
, info
.dev_id
);
1648 ret
= mutex_lock_killable(&ublk_ctl_mutex
);
1653 ub
= kzalloc(sizeof(*ub
), GFP_KERNEL
);
1656 mutex_init(&ub
->mutex
);
1657 spin_lock_init(&ub
->mm_lock
);
1658 INIT_WORK(&ub
->quiesce_work
, ublk_quiesce_work_fn
);
1659 INIT_WORK(&ub
->stop_work
, ublk_stop_work_fn
);
1660 INIT_DELAYED_WORK(&ub
->monitor_work
, ublk_daemon_monitor_work
);
1662 ret
= ublk_alloc_dev_number(ub
, header
->dev_id
);
1666 memcpy(&ub
->dev_info
, &info
, sizeof(info
));
1668 /* update device id */
1669 ub
->dev_info
.dev_id
= ub
->ub_number
;
1672 * 64bit flags will be copied back to userspace as feature
1673 * negotiation result, so have to clear flags which driver
1674 * doesn't support yet, then userspace can get correct flags
1675 * (features) to handle.
1677 ub
->dev_info
.flags
&= UBLK_F_ALL
;
1679 if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK
))
1680 ub
->dev_info
.flags
|= UBLK_F_URING_CMD_COMP_IN_TASK
;
1682 /* We are not ready to support zero copy */
1683 ub
->dev_info
.flags
&= ~UBLK_F_SUPPORT_ZERO_COPY
;
1685 ub
->dev_info
.nr_hw_queues
= min_t(unsigned int,
1686 ub
->dev_info
.nr_hw_queues
, nr_cpu_ids
);
1687 ublk_align_max_io_size(ub
);
1689 ret
= ublk_init_queues(ub
);
1691 goto out_free_dev_number
;
1693 ret
= ublk_add_tag_set(ub
);
1695 goto out_deinit_queues
;
1698 if (copy_to_user(argp
, &ub
->dev_info
, sizeof(info
)))
1699 goto out_free_tag_set
;
1702 * Add the char dev so that ublksrv daemon can be setup.
1703 * ublk_add_chdev() will cleanup everything if it fails.
1705 ret
= ublk_add_chdev(ub
);
1709 blk_mq_free_tag_set(&ub
->tag_set
);
1711 ublk_deinit_queues(ub
);
1712 out_free_dev_number
:
1713 ublk_free_dev_number(ub
);
1715 mutex_destroy(&ub
->mutex
);
1718 mutex_unlock(&ublk_ctl_mutex
);
1722 static inline bool ublk_idr_freed(int id
)
1726 spin_lock(&ublk_idr_lock
);
1727 ptr
= idr_find(&ublk_index_idr
, id
);
1728 spin_unlock(&ublk_idr_lock
);
1733 static int ublk_ctrl_del_dev(int idx
)
1735 struct ublk_device
*ub
;
1738 ret
= mutex_lock_killable(&ublk_ctl_mutex
);
1742 ub
= ublk_get_device_from_id(idx
);
1745 ublk_put_device(ub
);
1752 * Wait until the idr is removed, then it can be reused after
1753 * DEL_DEV command is returned.
1756 wait_event(ublk_idr_wq
, ublk_idr_freed(idx
));
1757 mutex_unlock(&ublk_ctl_mutex
);
1762 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd
*cmd
)
1764 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1766 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
1767 __func__
, cmd
->cmd_op
, header
->dev_id
, header
->queue_id
,
1768 header
->data
[0], header
->addr
, header
->len
);
1771 static int ublk_ctrl_stop_dev(struct io_uring_cmd
*cmd
)
1773 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1774 struct ublk_device
*ub
;
1776 ub
= ublk_get_device_from_id(header
->dev_id
);
1781 cancel_work_sync(&ub
->stop_work
);
1782 cancel_work_sync(&ub
->quiesce_work
);
1784 ublk_put_device(ub
);
1788 static int ublk_ctrl_get_dev_info(struct io_uring_cmd
*cmd
)
1790 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1791 void __user
*argp
= (void __user
*)(unsigned long)header
->addr
;
1792 struct ublk_device
*ub
;
1795 if (header
->len
< sizeof(struct ublksrv_ctrl_dev_info
) || !header
->addr
)
1798 ub
= ublk_get_device_from_id(header
->dev_id
);
1802 if (copy_to_user(argp
, &ub
->dev_info
, sizeof(ub
->dev_info
)))
1804 ublk_put_device(ub
);
1809 static int ublk_ctrl_get_params(struct io_uring_cmd
*cmd
)
1811 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1812 void __user
*argp
= (void __user
*)(unsigned long)header
->addr
;
1813 struct ublk_params_header ph
;
1814 struct ublk_device
*ub
;
1817 if (header
->len
<= sizeof(ph
) || !header
->addr
)
1820 if (copy_from_user(&ph
, argp
, sizeof(ph
)))
1823 if (ph
.len
> header
->len
|| !ph
.len
)
1826 if (ph
.len
> sizeof(struct ublk_params
))
1827 ph
.len
= sizeof(struct ublk_params
);
1829 ub
= ublk_get_device_from_id(header
->dev_id
);
1833 mutex_lock(&ub
->mutex
);
1834 if (copy_to_user(argp
, &ub
->params
, ph
.len
))
1838 mutex_unlock(&ub
->mutex
);
1840 ublk_put_device(ub
);
1844 static int ublk_ctrl_set_params(struct io_uring_cmd
*cmd
)
1846 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1847 void __user
*argp
= (void __user
*)(unsigned long)header
->addr
;
1848 struct ublk_params_header ph
;
1849 struct ublk_device
*ub
;
1852 if (header
->len
<= sizeof(ph
) || !header
->addr
)
1855 if (copy_from_user(&ph
, argp
, sizeof(ph
)))
1858 if (ph
.len
> header
->len
|| !ph
.len
|| !ph
.types
)
1861 if (ph
.len
> sizeof(struct ublk_params
))
1862 ph
.len
= sizeof(struct ublk_params
);
1864 ub
= ublk_get_device_from_id(header
->dev_id
);
1868 /* parameters can only be changed when device isn't live */
1869 mutex_lock(&ub
->mutex
);
1870 if (ub
->dev_info
.state
== UBLK_S_DEV_LIVE
) {
1872 } else if (copy_from_user(&ub
->params
, argp
, ph
.len
)) {
1875 /* clear all we don't support yet */
1876 ub
->params
.types
&= UBLK_PARAM_TYPE_ALL
;
1877 ret
= ublk_validate_params(ub
);
1879 mutex_unlock(&ub
->mutex
);
1880 ublk_put_device(ub
);
1885 static void ublk_queue_reinit(struct ublk_device
*ub
, struct ublk_queue
*ubq
)
1889 WARN_ON_ONCE(!(ubq
->ubq_daemon
&& ubq_daemon_is_dying(ubq
)));
1890 /* All old ioucmds have to be completed */
1891 WARN_ON_ONCE(ubq
->nr_io_ready
);
1892 /* old daemon is PF_EXITING, put it now */
1893 put_task_struct(ubq
->ubq_daemon
);
1894 /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
1895 ubq
->ubq_daemon
= NULL
;
1897 for (i
= 0; i
< ubq
->q_depth
; i
++) {
1898 struct ublk_io
*io
= &ubq
->ios
[i
];
1900 /* forget everything now and be ready for new FETCH_REQ */
1907 static int ublk_ctrl_start_recovery(struct io_uring_cmd
*cmd
)
1909 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1910 struct ublk_device
*ub
;
1914 ub
= ublk_get_device_from_id(header
->dev_id
);
1918 mutex_lock(&ub
->mutex
);
1919 if (!ublk_can_use_recovery(ub
))
1922 * START_RECOVERY is only allowd after:
1924 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
1925 * and related io_uring ctx is freed so file struct of /dev/ublkcX is
1928 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
1929 * (a)has quiesced request queue
1930 * (b)has requeued every inflight rqs whose io_flags is ACTIVE
1931 * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
1932 * (d)has completed/camceled all ioucmds owned by ther dying process
1934 if (test_bit(UB_STATE_OPEN
, &ub
->state
) ||
1935 ub
->dev_info
.state
!= UBLK_S_DEV_QUIESCED
) {
1939 pr_devel("%s: start recovery for dev id %d.\n", __func__
, header
->dev_id
);
1940 for (i
= 0; i
< ub
->dev_info
.nr_hw_queues
; i
++)
1941 ublk_queue_reinit(ub
, ublk_get_queue(ub
, i
));
1942 /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
1944 ub
->nr_queues_ready
= 0;
1945 init_completion(&ub
->completion
);
1948 mutex_unlock(&ub
->mutex
);
1949 ublk_put_device(ub
);
1953 static int ublk_ctrl_end_recovery(struct io_uring_cmd
*cmd
)
1955 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
1956 int ublksrv_pid
= (int)header
->data
[0];
1957 struct ublk_device
*ub
;
1960 ub
= ublk_get_device_from_id(header
->dev_id
);
1964 pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
1965 __func__
, ub
->dev_info
.nr_hw_queues
, header
->dev_id
);
1966 /* wait until new ubq_daemon sending all FETCH_REQ */
1967 wait_for_completion_interruptible(&ub
->completion
);
1968 pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
1969 __func__
, ub
->dev_info
.nr_hw_queues
, header
->dev_id
);
1971 mutex_lock(&ub
->mutex
);
1972 if (!ublk_can_use_recovery(ub
))
1975 if (ub
->dev_info
.state
!= UBLK_S_DEV_QUIESCED
) {
1979 ub
->dev_info
.ublksrv_pid
= ublksrv_pid
;
1980 pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
1981 __func__
, ublksrv_pid
, header
->dev_id
);
1982 blk_mq_unquiesce_queue(ub
->ub_disk
->queue
);
1983 pr_devel("%s: queue unquiesced, dev id %d.\n",
1984 __func__
, header
->dev_id
);
1985 blk_mq_kick_requeue_list(ub
->ub_disk
->queue
);
1986 ub
->dev_info
.state
= UBLK_S_DEV_LIVE
;
1987 schedule_delayed_work(&ub
->monitor_work
, UBLK_DAEMON_MONITOR_PERIOD
);
1990 mutex_unlock(&ub
->mutex
);
1991 ublk_put_device(ub
);
1995 static int ublk_ctrl_uring_cmd(struct io_uring_cmd
*cmd
,
1996 unsigned int issue_flags
)
1998 struct ublksrv_ctrl_cmd
*header
= (struct ublksrv_ctrl_cmd
*)cmd
->cmd
;
2001 ublk_ctrl_cmd_dump(cmd
);
2003 if (!(issue_flags
& IO_URING_F_SQE128
))
2007 if (!capable(CAP_SYS_ADMIN
))
2011 switch (cmd
->cmd_op
) {
2012 case UBLK_CMD_START_DEV
:
2013 ret
= ublk_ctrl_start_dev(cmd
);
2015 case UBLK_CMD_STOP_DEV
:
2016 ret
= ublk_ctrl_stop_dev(cmd
);
2018 case UBLK_CMD_GET_DEV_INFO
:
2019 ret
= ublk_ctrl_get_dev_info(cmd
);
2021 case UBLK_CMD_ADD_DEV
:
2022 ret
= ublk_ctrl_add_dev(cmd
);
2024 case UBLK_CMD_DEL_DEV
:
2025 ret
= ublk_ctrl_del_dev(header
->dev_id
);
2027 case UBLK_CMD_GET_QUEUE_AFFINITY
:
2028 ret
= ublk_ctrl_get_queue_affinity(cmd
);
2030 case UBLK_CMD_GET_PARAMS
:
2031 ret
= ublk_ctrl_get_params(cmd
);
2033 case UBLK_CMD_SET_PARAMS
:
2034 ret
= ublk_ctrl_set_params(cmd
);
2036 case UBLK_CMD_START_USER_RECOVERY
:
2037 ret
= ublk_ctrl_start_recovery(cmd
);
2039 case UBLK_CMD_END_USER_RECOVERY
:
2040 ret
= ublk_ctrl_end_recovery(cmd
);
2046 io_uring_cmd_done(cmd
, ret
, 0);
2047 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
2048 __func__
, ret
, cmd
->cmd_op
, header
->dev_id
, header
->queue_id
);
2049 return -EIOCBQUEUED
;
2052 static const struct file_operations ublk_ctl_fops
= {
2053 .open
= nonseekable_open
,
2054 .uring_cmd
= ublk_ctrl_uring_cmd
,
2055 .owner
= THIS_MODULE
,
2056 .llseek
= noop_llseek
,
2059 static struct miscdevice ublk_misc
= {
2060 .minor
= MISC_DYNAMIC_MINOR
,
2061 .name
= "ublk-control",
2062 .fops
= &ublk_ctl_fops
,
2065 static int __init
ublk_init(void)
2069 init_waitqueue_head(&ublk_idr_wq
);
2071 ret
= misc_register(&ublk_misc
);
2075 ret
= alloc_chrdev_region(&ublk_chr_devt
, 0, UBLK_MINORS
, "ublk-char");
2077 goto unregister_mis
;
2079 ublk_chr_class
= class_create(THIS_MODULE
, "ublk-char");
2080 if (IS_ERR(ublk_chr_class
)) {
2081 ret
= PTR_ERR(ublk_chr_class
);
2082 goto free_chrdev_region
;
2087 unregister_chrdev_region(ublk_chr_devt
, UBLK_MINORS
);
2089 misc_deregister(&ublk_misc
);
2093 static void __exit
ublk_exit(void)
2095 struct ublk_device
*ub
;
2098 class_destroy(ublk_chr_class
);
2100 misc_deregister(&ublk_misc
);
2102 idr_for_each_entry(&ublk_index_idr
, ub
, id
)
2105 idr_destroy(&ublk_index_idr
);
2106 unregister_chrdev_region(ublk_chr_devt
, UBLK_MINORS
);
2109 module_init(ublk_init
);
2110 module_exit(ublk_exit
);
2112 MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
2113 MODULE_LICENSE("GPL");