From: Kiyoshi Ueda
Subject: Block layer fixes for request-based multipathing
References: FATE#302108

This is a combined patch from linux-2.6.git. Commit-IDs:

d6c578ec08b3f07050401ed83193b3f21729213b
afac32f0c9c68698eaf7688d52de859301a0539f
ebd2bf40e9cfa4ebfa614703944f4eafdf0d2c64
509395182b6b7cf7e3c1ca2cd669506d8f43ee01
88171cad9ace4b67c5298e6504d70454296afb76

Signed-off-by: Hannes Reinecke
---
 block/blk-core.c        |  169 +++++++++++++++++++++++++++++++++++++++++++---
 block/blk-settings.c    |    6 +
 drivers/scsi/scsi_lib.c |   32 +++
 include/linux/blkdev.h  |   12 +++
 4 files changed, 209 insertions(+), 10 deletions(-)

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -592,7 +592,8 @@ blk_init_queue_node(request_fn_proc *rfn
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
-	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER);
+	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER |
+				   1 << QUEUE_FLAG_STACKABLE);
 	q->queue_lock		= lock;
 
 	blk_queue_segment_boundary(q, 0xffffffff);
@@ -1586,6 +1587,87 @@ void blkdev_dequeue_request(struct reque
 EXPORT_SYMBOL(blkdev_dequeue_request);
 
 /**
+ * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * @q:  the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ *    @rq may have been made based on weaker limitations of upper-level queues
+ *    in request stacking drivers, and it may violate the limitation of @q.
+ *    Since the block layer and the underlying device driver trust @rq
+ *    after it is inserted to @q, it should be checked against @q before
+ *    the insertion using this generic function.
+ *
+ *    This function should also be useful for request stacking drivers
+ *    in some cases below, so export this function.
+ *    Request stacking drivers like request-based dm may change the queue
+ *    limits while requests are in the queue (e.g. dm's table swapping).
+ *    Such request stacking drivers should check those requests against
+ *    the new queue limits again when they dispatch those requests,
+ *    although such checks are also done against the old queue limits
+ *    when submitting requests.
+ */
+int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+{
+	if (rq->nr_sectors > q->max_sectors ||
+	    rq->data_len > q->max_hw_sectors << 9) {
+		printk(KERN_ERR "%s: over max size limit.\n", __func__);
+		return -EIO;
+	}
+
+	/*
+	 * queue's settings related to segment counting like q->bounce_pfn
+	 * may differ from that of other stacking queues.
+	 * Recalculate it to check the request correctly on this queue's
+	 * limitation.
+	 */
+	blk_recalc_rq_segments(rq);
+	if (rq->nr_phys_segments > q->max_phys_segments ||
+	    rq->nr_phys_segments > q->max_hw_segments) {
+		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_rq_check_limits);
+
+/**
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @q:  the queue to submit the request
+ * @rq: the request being queued
+ */
+int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+{
+	unsigned long flags;
+
+	if (blk_rq_check_limits(q, rq))
+		return -EIO;
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+	if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
+	    should_fail(&fail_make_request, blk_rq_bytes(rq)))
+		return -EIO;
+#endif
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/*
+	 * Submitting request must be dequeued before calling this function
+	 * because it will be linked to another request_queue
+	 */
+	BUG_ON(blk_queued_rq(rq));
+
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+
+/**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
  * @error:    %0 for success, < %0 for error
@@ -1857,6 +1939,22 @@ void end_request(struct request *req, in
 }
 EXPORT_SYMBOL(end_request);
 
+static int end_that_request_data(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
+{
+	if (rq->bio) {
+		if (__end_that_request_first(rq, error, nr_bytes))
+			return 1;
+
+		/* Bidi request must be completed as a whole */
+		if (blk_bidi_rq(rq) &&
+		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
+			return 1;
+	}
+
+	return 0;
+}
+
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
@@ -1883,15 +1981,8 @@ static int blk_end_io(struct request *rq
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
+	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
+		return 1;
 
 	/* Special feature for tricky drivers */
 	if (drv_callback && drv_callback(rq))
@@ -1974,6 +2065,36 @@ int blk_end_bidi_request(struct request
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
+ * blk_update_request - Special helper function for request stacking drivers
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete @rq
+ *
+ * Description:
+ *    Ends I/O on a number of bytes attached to @rq, but doesn't complete
+ *    the request structure even if @rq doesn't have leftover.
+ *    If @rq has leftover, sets it up for the next range of segments.
+ *
+ *    This special helper function is only for request stacking drivers
+ *    (e.g. request-based dm) so that they can handle partial completion.
+ *    Actual device drivers should use blk_end_request instead.
+ */
+void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	if (!end_that_request_data(rq, error, nr_bytes, 0)) {
+		/*
+		 * These members are not updated in end_that_request_data()
+		 * when all bios are completed.
+		 * Update them so that the request stacking driver can find
+		 * how many bytes remain in the request later.
+		 */
+		rq->nr_sectors = rq->hard_nr_sectors = 0;
+		rq->current_nr_sectors = rq->hard_cur_sectors = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
+/**
  * blk_end_request_callback - Special helper function for tricky drivers
  * @rq:           the request being processed
  * @error:        %0 for success, < %0 for error
@@ -2028,6 +2149,34 @@ void blk_rq_bio_prep(struct request_queu
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
+/**
+ * blk_lld_busy - Check if underlying low-level drivers of a device are busy
+ * @q : the queue of the device being checked
+ *
+ * Description:
+ *    Check if underlying low-level drivers of a device are busy.
+ *    If the drivers want to export their busy state, they must set their
+ *    own exporting function using blk_queue_lld_busy() first.
+ *
+ *    Basically, this function is used only by request stacking drivers
+ *    to stop dispatching requests to underlying devices when underlying
+ *    devices are busy.  This behavior improves I/O merging on the queue
+ *    of the request stacking driver and prevents I/O throughput regression
+ *    under burst I/O load.
+ *
+ * Return:
+ *    0 - Not busy (The request stacking driver should dispatch request)
+ *    1 - Busy (The request stacking driver should stop dispatching request)
+ */
+int blk_lld_busy(struct request_queue *q)
+{
+	if (q->lld_busy_fn)
+		return q->lld_busy_fn(q);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_lld_busy);
+
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -89,6 +89,12 @@ void blk_queue_rq_timed_out(struct reque
 }
 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
 
+void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
+{
+	q->lld_busy_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1465,6 +1465,37 @@ static inline int scsi_host_queue_ready(
 }
 
 /*
+ * Busy state exporting function for request stacking drivers.
+ *
+ * For efficiency, no lock is taken to check the busy state of
+ * shost/starget/sdev, since the returned value is not guaranteed and
+ * may be changed after request stacking drivers call the function,
+ * regardless of taking lock or not.
+ *
+ * When scsi can't dispatch I/Os anymore and needs to kill I/Os
+ * (e.g. !sdev), scsi needs to return 'not busy'.
+ * Otherwise, request stacking drivers may hold requests forever.
+ */
+static int scsi_lld_busy(struct request_queue *q)
+{
+	struct scsi_device *sdev = q->queuedata;
+	struct Scsi_Host *shost;
+	struct scsi_target *starget;
+
+	if (!sdev)
+		return 0;
+
+	shost = sdev->host;
+	starget = scsi_target(sdev);
+
+	if (scsi_host_in_recovery(shost) || scsi_host_is_busy(shost) ||
+	    scsi_target_is_busy(starget) || scsi_device_is_busy(sdev))
+		return 1;
+
+	return 0;
+}
+
+/*
  * Kill a request for a dead device
  */
 static void scsi_kill_request(struct request *req, struct request_queue *q)
@@ -1778,6 +1809,7 @@ struct request_queue *scsi_alloc_queue(s
 	blk_queue_prep_rq(q, scsi_prep_fn);
 	blk_queue_softirq_done(q, scsi_softirq_done);
 	blk_queue_rq_timed_out(q, scsi_times_out);
+	blk_queue_lld_busy(q, scsi_lld_busy);
 	return q;
 }
 
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -272,6 +272,7 @@ typedef int (merge_bvec_fn) (struct requ
 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
+typedef int (lld_busy_fn) (struct request_queue *q);
 
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@ -328,6 +329,7 @@ struct request_queue
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
+	lld_busy_fn		*lld_busy_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -443,6 +445,7 @@ struct request_queue
 #define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES	10	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	11	/* force complete on same CPU */
+#define QUEUE_FLAG_STACKABLE	13	/* supports request stacking */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -549,6 +552,8 @@ enum {
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_flushing(q)	((q)->ordseq)
+#define blk_queue_stackable(q)	\
+	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 
 #define blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
@@ -695,6 +700,10 @@ extern void __blk_put_request(struct req
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
+extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
+extern int blk_lld_busy(struct request_queue *q);
+extern int blk_insert_cloned_request(struct request_queue *q,
+				     struct request *rq);
 extern void blk_plug_device(struct request_queue *);
 extern void blk_plug_device_unlocked(struct request_queue *);
 extern int blk_remove_plug(struct request_queue *);
@@ -792,6 +801,8 @@ extern void blk_complete_request(struct
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
+extern void blk_update_request(struct request *rq, int error,
+			       unsigned int nr_bytes);
 
 /*
  * blk_end_request() takes bytes instead of sectors as a complete size.
@@ -821,6 +832,7 @@ extern void blk_queue_update_dma_pad(str
 extern int blk_queue_dma_drain(struct request_queue *q,
			       dma_drain_needed_fn *dma_drain_needed,
			       void *buf, unsigned int size);
+extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
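
Usage note (not part of the patch): a minimal sketch of how a request stacking
driver such as request-based dm might drive the new exports from its
request_fn.  The helpers choose_underlying_queue(), clone_request() and
handle_dispatch_error(), and the stacking_request_fn() name itself, are
hypothetical and only illustrate the intended calling convention; the real
driver-side code is not part of this patch.

/*
 * Hypothetical request_fn of a request stacking driver.  Called with
 * the stacking queue's queue_lock held, as usual for a request_fn.
 */
static void stacking_request_fn(struct request_queue *q)
{
	struct request *rq, *clone;
	struct request_queue *lower_q;

	while ((rq = elv_next_request(q)) != NULL) {
		/* assumed helper: pick the underlying device's queue */
		lower_q = choose_underlying_queue(rq);

		/*
		 * If the low-level driver is busy (blk_lld_busy() asks the
		 * queue's lld_busy_fn, e.g. scsi_lld_busy()), leave the
		 * request on the stacking queue so more merging can happen.
		 */
		if (blk_lld_busy(lower_q))
			break;

		blkdev_dequeue_request(rq);
		spin_unlock_irq(q->queue_lock);

		/* assumed helper: allocate and set up a clone of rq */
		clone = clone_request(rq, lower_q);

		/*
		 * blk_insert_cloned_request() re-checks the clone against
		 * lower_q's limits via blk_rq_check_limits() and then adds
		 * it to the back of lower_q's dispatch queue.
		 */
		if (blk_insert_cloned_request(lower_q, clone))
			handle_dispatch_error(rq, clone);	/* assumed */

		spin_lock_irq(q->queue_lock);
	}
}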
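
On the completion side, blk_update_request() lets the stacking driver mirror a
clone's partial progress onto the original request without completing or
freeing it, so any remainder can be retried on another path.  The sketch below
is likewise hypothetical; stacking_end_clone_io() and original_request() are
made-up names, not interfaces added by this patch.

/*
 * Hypothetical completion hook, called when a clone dispatched to an
 * underlying queue has transferred nr_bytes (or failed with 'error').
 */
static void stacking_end_clone_io(struct request *clone, int error,
				  unsigned int nr_bytes)
{
	/* assumed helper: map the clone back to the original request */
	struct request *orig = original_request(clone);

	/*
	 * Advance the original request by what the clone finished.
	 * Unlike blk_end_request(), this never completes or frees the
	 * request structure, even when no bytes are left.
	 */
	blk_update_request(orig, error, nr_bytes);

	if (orig->nr_sectors)
		return;	/* leftover: requeue or retry it on another path */

	/* all data transferred; now really finish the original request */
	blk_end_request(orig, error, 0);
}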