From: Jens Axboe Subject: Block layer fixes for 2.6.28 Patch-Mainline: 2.6.28 This is a combined patchset with block layer fixes from 2.6.28. Commit IDs: 97dee27d1c4d6041ff1cc8150db95fe3eab6be5a 00bbda44114e70fc9879731be3c888122b1de8b1 7452d2a2be657becb2f385d0e0864ba51f1ae694 075a108f7d4dd24b8b69e59edcdf1a0fd84e6541 7a1b6029bf9ff3d0636e318d2482031dc493df16 b3a5faf3cefbff4b69ca181767b882bbd6189aaf 8fe902de23b4f4012db91f538cafd864c63308e7 dfef13dad8d34d0a9e83adee3e8cd9f94cca465e d2629dd70132f90f9d1bca07572197e9adea25b1 1f08a4484a223cb337e0466042005421cd55d22b fcdc7361d2925596d69d0538d738c08c221a69c9 cd93bcfa9ca9b15051220614160131c53d7f33f0 d371ca6b8a21a617b8607d23f7202197ad40482a 910ee03b1e61d5cfb121dfb1ee7c127f18bdae01 Signed-off-by: Hannes Reinecke --- Documentation/DocBook/kernel-api.tmpl | 4 Documentation/block/deadline-iosched.txt | 14 +- block/Makefile | 4 block/blk-core.c | 166 +++++++------------------------ block/blk-exec.c | 6 - block/blk-integrity.c | 4 block/blk-map.c | 16 +- block/blk-merge.c | 100 ------------------ block/blk-settings.c | 8 - block/blk-softirq.c | 103 +++++++++++++++++++ block/blk-tag.c | 8 - block/cfq-iosched.c | 47 +++++++- block/deadline-iosched.c | 40 ++----- block/elevator.c | 5 block/genhd.c | 5 drivers/block/ps3disk.c | 9 + drivers/block/virtio_blk.c | 4 drivers/md/raid1.c | 4 drivers/md/raid10.c | 4 drivers/md/raid5.c | 66 +++++++++--- fs/bio.c | 16 -- include/linux/bio.h | 33 ------ include/linux/blkdev.h | 18 +-- 23 files changed, 310 insertions(+), 374 deletions(-) --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -364,6 +364,10 @@ X!Edrivers/pnp/system.c !Eblock/blk-barrier.c !Eblock/blk-tag.c !Iblock/blk-tag.c +!Eblock/blk-integrity.c +!Iblock/blktrace.c +!Iblock/genhd.c +!Eblock/genhd.c --- a/Documentation/block/deadline-iosched.txt +++ b/Documentation/block/deadline-iosched.txt @@ -30,12 +30,18 @@ write_expire (in ms) Similar to read_expire mentioned above, but for writes. 
-fifo_batch +fifo_batch (number of requests) ---------- -When a read request expires its deadline, we must move some requests from -the sorted io scheduler list to the block device dispatch queue. fifo_batch -controls how many requests we move. +Requests are grouped into ``batches'' of a particular data direction (read or +write) which are serviced in increasing sector order. To limit extra seeking, +deadline expiries are only checked between batches. fifo_batch controls the +maximum number of requests per batch. + +This parameter tunes the balance between per-request latency and aggregate +throughput. When low latency is the primary concern, smaller is better (where +a value of 1 yields first-come first-served behaviour). Increasing fifo_batch +generally improves throughput, at the cost of latency variation. writes_starved (number of dispatches) --- a/block/Makefile +++ b/block/Makefile @@ -4,8 +4,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ - cmd-filter.o + blk-exec.o blk-merge.o blk-softirq.o ioctl.o genhd.o \ + scsi_ioctl.o cmd-filter.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o --- a/block/blk-core.c +++ b/block/blk-core.c @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include @@ -50,8 +48,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - static void drive_stat_acct(struct request *rq, int new_io) { struct hd_struct *part; @@ -531,7 +527,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node); * request queue; this lock will be taken also from interrupt context, so irq * disabling is needed for it. * - * Function returns a pointer to the initialized request queue, or NULL if + * Function returns a pointer to the initialized request queue, or %NULL if * it didn't succeed. 
* * Note: @@ -913,7 +909,7 @@ void blk_requeue_request(struct request_ EXPORT_SYMBOL(blk_requeue_request); /** - * blk_insert_request - insert a special request in to a request queue + * blk_insert_request - insert a special request into a request queue * @q: request queue where request should be inserted * @rq: request to be inserted * @at_head: insert request at head or tail of queue @@ -923,8 +919,8 @@ EXPORT_SYMBOL(blk_requeue_request); * Many block devices need to execute commands asynchronously, so they don't * block the whole kernel from preemption during request execution. This is * accomplished normally by inserting aritficial requests tagged as - * REQ_SPECIAL in to the corresponding request queue, and letting them be - * scheduled for actual execution by the request queue. + * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them + * be scheduled for actual execution by the request queue. * * We have the option of inserting the head or the tail of the queue. * Typically we use the tail for new ioctls and so forth. We use the head @@ -1322,7 +1318,7 @@ static inline int bio_check_eod(struct b } /** - * generic_make_request: hand a buffer to its device driver for I/O + * generic_make_request - hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * generic_make_request() is used to make I/O requests of block @@ -1480,13 +1476,13 @@ void generic_make_request(struct bio *bi EXPORT_SYMBOL(generic_make_request); /** - * submit_bio: submit a bio to the block device layer for I/O + * submit_bio - submit a bio to the block device layer for I/O * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bio: The &struct bio which describes the I/O * * submit_bio() is very similar in purpose to generic_make_request(), and * uses that function to do most of the work. Both are fairly rough - * interfaces, @bio must be presetup and ready for I/O. 
+ * interfaces; @bio must be presetup and ready for I/O. * */ void submit_bio(int rw, struct bio *bio) @@ -1524,7 +1520,7 @@ EXPORT_SYMBOL(submit_bio); /** * __end_that_request_first - end I/O on a request * @req: the request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: @@ -1532,8 +1528,8 @@ EXPORT_SYMBOL(submit_bio); * for the next range of segments (if any) in the cluster. * * Return: - * 0 - we are done with this request, call end_that_request_last() - * 1 - still buffers pending for this request + * %0 - we are done with this request, call end_that_request_last() + * %1 - still buffers pending for this request **/ static int __end_that_request_first(struct request *req, int error, int nr_bytes) @@ -1544,7 +1540,7 @@ static int __end_that_request_first(stru blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); /* - * for a REQ_BLOCK_PC request, we want to carry any eventual + * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual * sense key with us all the way through */ if (!blk_pc_request(req)) @@ -1646,82 +1642,6 @@ static int __end_that_request_first(stru } /* - * splice the completion data to a local structure and hand off to - * process_completion_queue() to complete the requests - */ -static void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = &__get_cpu_var(blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, donelist); - list_del_init(&rq->donelist); - rq->q->softirq_done_fn(rq); - } -} - -static int __cpuinit blk_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == 
CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - &__get_cpu_var(blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - - -static struct notifier_block blk_cpu_notifier __cpuinitdata = { - .notifier_call = blk_cpu_notify, -}; - -/** - * blk_complete_request - end I/O on a request - * @req: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions, - * unless the driver actually implements this in its completion callback - * through requeueing. The actual completion happens out-of-order, - * through a softirq handler. The user must have registered a completion - * callback through blk_queue_softirq_done(). - **/ - -void blk_complete_request(struct request *req) -{ - struct list_head *cpu_list; - unsigned long flags; - - BUG_ON(!req->q->softirq_done_fn); - - local_irq_save(flags); - - cpu_list = &__get_cpu_var(blk_cpu_done); - list_add_tail(&req->donelist, cpu_list); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_complete_request); - -/* * queue lock must be held */ static void end_that_request_last(struct request *req, int error) @@ -1810,11 +1730,11 @@ EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); /** * end_queued_request - end all I/O on a queued request * @rq: the request being processed - * @uptodate: error value or 0/1 uptodate flag + * @uptodate: error value or %0/%1 uptodate flag * * Description: * Ends all I/O on a request, and removes it from the block layer queues. - * Not suitable for normal IO completion, unless the driver still has + * Not suitable for normal I/O completion, unless the driver still has * the request attached to the block layer. 
* **/ @@ -1827,7 +1747,7 @@ EXPORT_SYMBOL(end_queued_request); /** * end_dequeued_request - end all I/O on a dequeued request * @rq: the request being processed - * @uptodate: error value or 0/1 uptodate flag + * @uptodate: error value or %0/%1 uptodate flag * * Description: * Ends all I/O on a request. The request must already have been @@ -1845,14 +1765,14 @@ EXPORT_SYMBOL(end_dequeued_request); /** * end_request - end I/O on the current segment of the request * @req: the request being processed - * @uptodate: error value or 0/1 uptodate flag + * @uptodate: error value or %0/%1 uptodate flag * * Description: * Ends I/O on the current segment of a request. If that is the only * remaining segment, the request is also completed and freed. * - * This is a remnant of how older block drivers handled IO completions. - * Modern drivers typically end IO on the full request in one go, unless + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless * they have a residual value to account for. For that case this function * isn't really useful, unless the residual just happens to be the * full current segment. In other words, don't use this function in new @@ -1870,12 +1790,12 @@ EXPORT_SYMBOL(end_request); /** * blk_end_io - Generic end_io function to complete a request. * @rq: the request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @drv_callback: function called between completion of bios in the request * and completion of the request. - * If the callback returns non 0, this helper returns without + * If the callback returns non %0, this helper returns without * completion of the request. * * Description: @@ -1883,8 +1803,8 @@ EXPORT_SYMBOL(end_request); * If @rq has leftover, sets it up for the next range of segments. 
* * Return: - * 0 - we are done with this request - * 1 - this request is not freed yet, it still has pending buffers. + * %0 - we are done with this request + * %1 - this request is not freed yet, it still has pending buffers. **/ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes, @@ -1893,7 +1813,7 @@ static int blk_end_io(struct request *rq struct request_queue *q = rq->q; unsigned long flags = 0UL; - if (bio_has_data(rq->bio) || blk_discard_rq(rq)) { + if (rq->bio) { if (__end_that_request_first(rq, error, nr_bytes)) return 1; @@ -1919,7 +1839,7 @@ static int blk_end_io(struct request *rq /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: @@ -1927,8 +1847,8 @@ static int blk_end_io(struct request *rq * If @rq has leftover, sets it up for the next range of segments. * * Return: - * 0 - we are done with this request - * 1 - still buffers pending for this request + * %0 - we are done with this request + * %1 - still buffers pending for this request **/ int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { @@ -1939,20 +1859,19 @@ EXPORT_SYMBOL_GPL(blk_end_request); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * * Description: * Must be called with queue lock held unlike blk_end_request(). 
* * Return: - * 0 - we are done with this request - * 1 - still buffers pending for this request + * %0 - we are done with this request + * %1 - still buffers pending for this request **/ int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - if ((bio_has_data(rq->bio) || blk_discard_rq(rq)) && - __end_that_request_first(rq, error, nr_bytes)) + if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) return 1; add_disk_randomness(rq->rq_disk); @@ -1966,7 +1885,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request); /** * blk_end_bidi_request - Helper function for drivers to complete bidi request. * @rq: the bidi request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * @@ -1974,8 +1893,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request); * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. * * Return: - * 0 - we are done with this request - * 1 - still buffers pending for this request + * %0 - we are done with this request + * %1 - still buffers pending for this request **/ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) @@ -1987,11 +1906,11 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request); /** * blk_end_request_callback - Special helper function for tricky drivers * @rq: the request being processed - * @error: 0 for success, < 0 for error + * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete * @drv_callback: function called between completion of bios in the request * and completion of the request. - * If the callback returns non 0, this helper returns without + * If the callback returns non %0, this helper returns without * completion of the request. * * Description: @@ -2004,10 +1923,10 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request); * Don't use this interface in other places anymore. 
* * Return: - * 0 - we are done with this request - * 1 - this request is not freed yet. - * this request still has pending buffers or - * the driver doesn't want to finish this request yet. + * %0 - we are done with this request + * %1 - this request is not freed yet. + * this request still has pending buffers or + * the driver doesn't want to finish this request yet. **/ int blk_end_request_callback(struct request *rq, int error, unsigned int nr_bytes, @@ -2026,7 +1945,6 @@ void blk_rq_bio_prep(struct request_queu if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); - rq->nr_hw_segments = bio_hw_segments(q, bio); rq->buffer = bio_data(bio); } rq->current_nr_sectors = bio_cur_sectors(bio); @@ -2054,8 +1972,6 @@ EXPORT_SYMBOL(kblockd_flush_work); int __init blk_dev_init(void) { - int i; - kblockd_workqueue = create_workqueue("kblockd"); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); @@ -2066,12 +1982,6 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("blkdev_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - register_hotcpu_notifier(&blk_cpu_notifier); - return 0; } --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -16,7 +16,7 @@ /** * blk_end_sync_rq - executes a completion event on a request * @rq: request to complete - * @error: end io status of the request + * @error: end I/O status of the request */ static void blk_end_sync_rq(struct request *rq, int error) { @@ -41,7 +41,7 @@ static void blk_end_sync_rq(struct reque * @done: I/O completion handler * * Description: - * Insert a fully prepared request at the back of the io scheduler queue + * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. 
*/ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait) * @at_head: insert request at head or tail of queue * * Description: - * Insert a fully prepared request at the back of the io scheduler queue + * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. */ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -109,8 +109,8 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); /** * blk_integrity_compare - Compare integrity profile of two block devices - * @b1: Device to compare - * @b2: Device to compare + * @bd1: Device to compare + * @bd2: Device to compare * * Description: Meta-devices like DM and MD need to verify that all * sub-devices use the same integrity format before advertising to --- a/block/blk-map.c +++ b/block/blk-map.c @@ -85,17 +85,17 @@ static int __blk_rq_map_user(struct requ } /** - * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage + * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted * @rq: request structure to fill * @ubuf: the user buffer * @len: length of user data * * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise + * Data will be mapped directly for zero copy I/O, if possible. Otherwise * a kernel bounce buffer is used. * - * A matching blk_rq_unmap_user() must be issued at the end of io, while + * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. 
* * Note: The mapped bio may need to be bounced through blk_queue_bounce() @@ -154,7 +154,7 @@ unmap_rq: EXPORT_SYMBOL(blk_rq_map_user); /** - * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage + * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted * @rq: request to map data to * @iov: pointer to the iovec @@ -162,10 +162,10 @@ EXPORT_SYMBOL(blk_rq_map_user); * @len: I/O byte count * * Description: - * Data will be mapped directly for zero copy io, if possible. Otherwise + * Data will be mapped directly for zero copy I/O, if possible. Otherwise * a kernel bounce buffer is used. * - * A matching blk_rq_unmap_user() must be issued at the end of io, while + * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. * * Note: The mapped bio may need to be bounced through blk_queue_bounce() @@ -224,7 +224,7 @@ int blk_rq_map_user_iov(struct request_q * Description: * Unmap a rq previously mapped by blk_rq_map_user(). The caller must * supply the original rq->bio from the blk_rq_map_user() return, since - * the io completion may have changed rq->bio. + * the I/O completion may have changed rq->bio. 
*/ int blk_rq_unmap_user(struct bio *bio) { @@ -250,7 +250,7 @@ int blk_rq_unmap_user(struct bio *bio) EXPORT_SYMBOL(blk_rq_unmap_user); /** - * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage + * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -41,12 +41,9 @@ void blk_recalc_rq_sectors(struct reques void blk_recalc_rq_segments(struct request *rq) { int nr_phys_segs; - int nr_hw_segs; unsigned int phys_size; - unsigned int hw_size; struct bio_vec *bv, *bvprv = NULL; int seg_size; - int hw_seg_size; int cluster; struct req_iterator iter; int high, highprv = 1; @@ -56,8 +53,8 @@ void blk_recalc_rq_segments(struct reque return; cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); - hw_seg_size = seg_size = 0; - phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; + seg_size = 0; + phys_size = nr_phys_segs = 0; rq_for_each_segment(bv, rq, iter) { /* * the trick here is making sure that a high page is never @@ -66,7 +63,7 @@ void blk_recalc_rq_segments(struct reque */ high = page_to_pfn(bv->bv_page) > q->bounce_pfn; if (high || highprv) - goto new_hw_segment; + goto new_segment; if (cluster) { if (seg_size + bv->bv_len > q->max_segment_size) goto new_segment; @@ -74,27 +71,12 @@ void blk_recalc_rq_segments(struct reque goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) goto new_segment; - if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) - goto new_hw_segment; seg_size += bv->bv_len; - hw_seg_size += bv->bv_len; bvprv = bv; continue; } new_segment: - if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && - !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) - hw_seg_size += bv->bv_len; - else { -new_hw_segment: - if (nr_hw_segs == 1 && - hw_seg_size > rq->bio->bi_hw_front_size) - rq->bio->bi_hw_front_size = hw_seg_size; - hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + 
bv->bv_len; - nr_hw_segs++; - } - if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) rq->bio->bi_seg_front_size = seg_size; @@ -104,17 +86,11 @@ new_hw_segment: highprv = high; } - if (nr_hw_segs == 1 && - hw_seg_size > rq->bio->bi_hw_front_size) - rq->bio->bi_hw_front_size = hw_seg_size; - if (hw_seg_size > rq->biotail->bi_hw_back_size) - rq->biotail->bi_hw_back_size = hw_seg_size; if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) rq->bio->bi_seg_front_size = seg_size; if (seg_size > rq->biotail->bi_seg_back_size) rq->biotail->bi_seg_back_size = seg_size; rq->nr_phys_segments = nr_phys_segs; - rq->nr_hw_segments = nr_hw_segs; } void blk_recount_segments(struct request_queue *q, struct bio *bio) @@ -127,7 +103,6 @@ void blk_recount_segments(struct request blk_recalc_rq_segments(&rq); bio->bi_next = nxt; bio->bi_phys_segments = rq.nr_phys_segments; - bio->bi_hw_segments = rq.nr_hw_segments; bio->bi_flags |= (1 << BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -158,23 +133,6 @@ static int blk_phys_contig_segment(struc return 0; } -static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, - struct bio *nxt) -{ - if (!bio_flagged(bio, BIO_SEG_VALID)) - blk_recount_segments(q, bio); - if (!bio_flagged(nxt, BIO_SEG_VALID)) - blk_recount_segments(q, nxt); - if (bio_has_data(bio) && - (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || - BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))) - return 0; - if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) - return 0; - - return 1; -} - /* * map a request to scatterlist, return number of sg entries setup. 
Caller * must make sure sg can hold rq->nr_phys_segments entries @@ -288,10 +246,9 @@ static inline int ll_new_hw_segment(stru struct request *req, struct bio *bio) { - int nr_hw_segs = bio_hw_segments(q, bio); int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments + if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) @@ -303,7 +260,6 @@ static inline int ll_new_hw_segment(stru * This will form the start of a new hw segment. Bump both * counters. */ - req->nr_hw_segments += nr_hw_segs; req->nr_phys_segments += nr_phys_segs; return 1; } @@ -312,7 +268,6 @@ int ll_back_merge_fn(struct request_queu struct bio *bio) { unsigned short max_sectors; - int len; if (unlikely(blk_pc_request(req))) max_sectors = q->max_hw_sectors; @@ -329,20 +284,6 @@ int ll_back_merge_fn(struct request_queu blk_recount_segments(q, req->biotail); if (!bio_flagged(bio, BIO_SEG_VALID)) blk_recount_segments(q, bio); - len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; - if (!bio_has_data(bio) || - (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) - && !BIOVEC_VIRT_OVERSIZE(len))) { - int mergeable = ll_new_mergeable(q, req, bio); - - if (mergeable) { - if (req->nr_hw_segments == 1) - req->bio->bi_hw_front_size = len; - if (bio->bi_hw_segments == 1) - bio->bi_hw_back_size = len; - } - return mergeable; - } return ll_new_hw_segment(q, req, bio); } @@ -351,7 +292,6 @@ int ll_front_merge_fn(struct request_que struct bio *bio) { unsigned short max_sectors; - int len; if (unlikely(blk_pc_request(req))) max_sectors = q->max_hw_sectors; @@ -365,24 +305,10 @@ int ll_front_merge_fn(struct request_que q->last_merge = NULL; return 0; } - len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; if (!bio_flagged(bio, BIO_SEG_VALID)) blk_recount_segments(q, bio); if (!bio_flagged(req->bio, 
BIO_SEG_VALID)) blk_recount_segments(q, req->bio); - if (!bio_has_data(bio) || - (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && - !BIOVEC_VIRT_OVERSIZE(len))) { - int mergeable = ll_new_mergeable(q, req, bio); - - if (mergeable) { - if (bio->bi_hw_segments == 1) - bio->bi_hw_front_size = len; - if (req->nr_hw_segments == 1) - req->biotail->bi_hw_back_size = len; - } - return mergeable; - } return ll_new_hw_segment(q, req, bio); } @@ -391,7 +317,6 @@ static int ll_merge_requests_fn(struct r struct request *next) { int total_phys_segments; - int total_hw_segments; unsigned int seg_size = req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; @@ -420,26 +345,11 @@ static int ll_merge_requests_fn(struct r if (total_phys_segments > q->max_phys_segments) return 0; - total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; - if (blk_hw_contig_segment(q, req->biotail, next->bio)) { - int len = req->biotail->bi_hw_back_size + - next->bio->bi_hw_front_size; - /* - * propagate the combined length to the end of the requests - */ - if (req->nr_hw_segments == 1) - req->bio->bi_hw_front_size = len; - if (next->nr_hw_segments == 1) - next->biotail->bi_hw_back_size = len; - total_hw_segments--; - } - - if (total_hw_segments > q->max_hw_segments) + if (total_phys_segments > q->max_hw_segments) return 0; /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; - req->nr_hw_segments = total_hw_segments; return 1; } --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -144,7 +144,7 @@ EXPORT_SYMBOL(blk_queue_make_request); * Different hardware can have different requirements as to what pages * it can do I/O directly to. A low level driver can call * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @page. + * buffers for doing I/O to pages residing above @dma_addr. 
**/ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) { @@ -229,7 +229,7 @@ EXPORT_SYMBOL(blk_queue_max_phys_segment * Description: * Enables a low level driver to set an upper limit on the number of * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give as once + * address/length pairs the host adapter can actually give at once * to the device. **/ void blk_queue_max_hw_segments(struct request_queue *q, @@ -410,7 +410,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary * @mask: alignment mask * * description: - * set required memory and length aligment for direct dma transactions. + * set required memory and length alignment for direct dma transactions. * this is used when buiding direct io requests for the queue. * **/ @@ -426,7 +426,7 @@ EXPORT_SYMBOL(blk_queue_dma_alignment); * @mask: alignment mask * * description: - * update required memory and length aligment for direct dma transactions. + * update required memory and length alignment for direct dma transactions. * If the requested alignment is larger than the current alignment, then * the current queue alignment is updated to the new value, otherwise it * is left alone. 
The design of this is to allow multiple objects --- /dev/null +++ b/block/blk-softirq.c @@ -0,0 +1,103 @@ +/* + * Functions related to softirq rq completions + */ +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + +static int __cpuinit blk_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + &__get_cpu_var(blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + } + + return NOTIFY_OK; +} + + +static struct notifier_block blk_cpu_notifier __cpuinitdata = { + .notifier_call = blk_cpu_notify, +}; + +/* + * splice the completion data to a local structure and hand off to + * process_completion_queue() to complete the requests + */ +static void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = &__get_cpu_var(blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, donelist); + list_del_init(&rq->donelist); + rq->q->softirq_done_fn(rq); + } +} + +/** + * blk_complete_request - end I/O on a request + * @req: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions, + * unless the driver actually implements this in its completion callback + * through requeueing. The actual completion happens out-of-order, + * through a softirq handler. The user must have registered a completion + * callback through blk_queue_softirq_done(). 
+ **/ + +void blk_complete_request(struct request *req) +{ + struct list_head *cpu_list; + unsigned long flags; + + BUG_ON(!req->q->softirq_done_fn); + + local_irq_save(flags); + + cpu_list = &__get_cpu_var(blk_cpu_done); + list_add_tail(&req->donelist, cpu_list); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_complete_request); + +int __init blk_softirq_init(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + register_hotcpu_notifier(&blk_cpu_notifier); + return 0; +} +subsys_initcall(blk_softirq_init); --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -29,7 +29,7 @@ EXPORT_SYMBOL(blk_queue_find_tag); * __blk_free_tags - release a given set of tag maintenance info * @bqt: the tag map to free * - * Tries to free the specified @bqt@. Returns true if it was + * Tries to free the specified @bqt. Returns true if it was * actually freed and false if there are still references using it */ static int __blk_free_tags(struct blk_queue_tag *bqt) @@ -78,7 +78,7 @@ void __blk_queue_free_tags(struct reques * blk_free_tags - release a given set of tag maintenance info * @bqt: the tag map to free * - * For externally managed @bqt@ frees the map. Callers of this + * For externally managed @bqt frees the map. Callers of this * function must guarantee to have released all the queues that * might have been using this tag map. */ @@ -94,7 +94,7 @@ EXPORT_SYMBOL(blk_free_tags); * @q: the request queue for the device * * Notes: - * This is used to disabled tagged queuing to a device, yet leave + * This is used to disable tagged queuing to a device, yet leave * queue in function. 
**/ void blk_queue_free_tags(struct request_queue *q) @@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_queue_resize_tags); * @rq: the request that has completed * * Description: - * Typically called when end_that_request_first() returns 0, meaning + * Typically called when end_that_request_first() returns %0, meaning * all transfers have been done for a request. It's important to call * this function before end_that_request_last(), as that will put the * request back on the free list thus corrupting the internal tag list. --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -39,6 +39,7 @@ static int cfq_slice_idle = HZ / 125; #define CFQ_MIN_TT (2) #define CFQ_SLICE_SCALE (5) +#define CFQ_HW_QUEUE_MIN (5) #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -86,7 +87,14 @@ struct cfq_data { int rq_in_driver; int sync_flight; + + /* + * queue-depth detection + */ + int rq_queued; int hw_tag; + int hw_tag_samples; + int rq_in_driver_peak; /* * idle window management @@ -654,15 +662,6 @@ static void cfq_activate_request(struct cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", cfqd->rq_in_driver); - /* - * If the depth is larger 1, it really could be queueing. But lets - * make the mark a little higher - idling could still be good for - * low queueing, and a low queueing number could also just indicate - * a SCSI mid layer like behaviour where limit+1 is often seen. 
- */ - if (!cfqd->hw_tag && cfqd->rq_in_driver > 4) - cfqd->hw_tag = 1; - cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; } @@ -686,6 +685,7 @@ static void cfq_remove_request(struct re list_del_init(&rq->queuelist); cfq_del_rq_rb(rq); + cfqq->cfqd->rq_queued--; if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; @@ -1833,6 +1833,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, s { struct cfq_io_context *cic = RQ_CIC(rq); + cfqd->rq_queued++; if (rq_is_meta(rq)) cfqq->meta_pending++; @@ -1880,6 +1881,31 @@ static void cfq_insert_request(struct re cfq_rq_enqueued(cfqd, cfqq, rq); } +/* + * Update hw_tag based on peak queue depth over 50 samples under + * sufficient load. + */ +static void cfq_update_hw_tag(struct cfq_data *cfqd) +{ + if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + + if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && + cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + return; + + if (cfqd->hw_tag_samples++ < 50) + return; + + if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) + cfqd->hw_tag = 1; + else + cfqd->hw_tag = 0; + + cfqd->hw_tag_samples = 0; + cfqd->rq_in_driver_peak = 0; +} + static void cfq_completed_request(struct request_queue *q, struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -1890,6 +1916,8 @@ static void cfq_completed_request(struct now = jiffies; cfq_log_cfqq(cfqd, cfqq, "complete"); + cfq_update_hw_tag(cfqd); + WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); cfqd->rq_in_driver--; @@ -2200,6 +2228,7 @@ static void *cfq_init_queue(struct reque cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->hw_tag = 1; return cfqd; } --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -33,7 +33,7 @@ struct deadline_data { */ struct rb_root sort_list[2]; struct list_head fifo_list[2]; - + /* * next in sort order. 
read, write or both are NULL */ @@ -53,7 +53,11 @@ struct deadline_data { static void deadline_move_request(struct deadline_data *, struct request *); -#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))]) +static inline struct rb_root * +deadline_rb_root(struct deadline_data *dd, struct request *rq) +{ + return &dd->sort_list[rq_data_dir(rq)]; +} /* * get the request after `rq' in sector-sorted order @@ -72,15 +76,11 @@ deadline_latter_request(struct request * static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { - struct rb_root *root = RQ_RB_ROOT(dd, rq); + struct rb_root *root = deadline_rb_root(dd, rq); struct request *__alias; -retry: - __alias = elv_rb_add(root, rq); - if (unlikely(__alias)) { + while (unlikely(__alias = elv_rb_add(root, rq))) deadline_move_request(dd, __alias); - goto retry; - } } static inline void @@ -91,7 +91,7 @@ deadline_del_rq_rb(struct deadline_data if (dd->next_rq[data_dir] == rq) dd->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(RQ_RB_ROOT(dd, rq), rq); + elv_rb_del(deadline_rb_root(dd, rq), rq); } /* @@ -106,7 +106,7 @@ deadline_add_request(struct request_queu deadline_add_rq_rb(dd, rq); /* - * set expire time (only used for reads) and add to fifo list + * set expire time and add to fifo list */ rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); @@ -162,7 +162,7 @@ static void deadline_merged_request(stru * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(RQ_RB_ROOT(dd, req), req); + elv_rb_del(deadline_rb_root(dd, req), req); deadline_add_rq_rb(dd, req); } } @@ -212,7 +212,7 @@ deadline_move_request(struct deadline_da dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq->sector + rq->nr_sectors; + dd->last_sector = rq_end_sector(rq); /* * take it off the sort and fifo list, move @@ -222,7 +222,7 @@ 
deadline_move_request(struct deadline_da } /* - * deadline_check_fifo returns 0 if there are no expired reads on the fifo, + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) @@ -258,17 +258,9 @@ static int deadline_dispatch_requests(st else rq = dd->next_rq[READ]; - if (rq) { - /* we have a "next request" */ - - if (dd->last_sector != rq->sector) - /* end the batch on a non sequential request */ - dd->batching += dd->fifo_batch; - - if (dd->batching < dd->fifo_batch) - /* we are still entitled to batch */ - goto dispatch_request; - } + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request and are still entitled to batch */ + goto dispatch_request; /* * at this point we are not running a batch. select the appropriate --- a/block/elevator.c +++ b/block/elevator.c @@ -34,8 +34,7 @@ #include #include #include - -#include +#include static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -790,7 +789,6 @@ struct request *elv_next_request(struct * device can handle */ rq->nr_phys_segments++; - rq->nr_hw_segments++; } if (!q->prep_rq_fn) @@ -813,7 +811,6 @@ struct request *elv_next_request(struct * so that we don't add it again */ --rq->nr_phys_segments; - --rq->nr_hw_segments; } rq = NULL; --- a/block/genhd.c +++ b/block/genhd.c @@ -211,10 +211,11 @@ void unlink_gendisk(struct gendisk *disk /** * get_gendisk - get partitioning information for a given device - * @dev: device to get partitioning information for + * @devt: device to get partitioning information for + * @part: returned partition index * * This function gets the structure containing partitioning - * information for the given device @dev. + * information for the given device @devt.
*/ struct gendisk *get_gendisk(dev_t devt, int *part) { --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -199,7 +199,8 @@ static void ps3disk_do_request(struct ps if (blk_fs_request(req)) { if (ps3disk_submit_request_sg(dev, req)) break; - } else if (req->cmd_type == REQ_TYPE_FLUSH) { + } else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && + req->cmd[0] == REQ_LB_OP_FLUSH) { if (ps3disk_submit_flush_request(dev, req)) break; } else { @@ -257,7 +258,8 @@ static irqreturn_t ps3disk_interrupt(int return IRQ_HANDLED; } - if (req->cmd_type == REQ_TYPE_FLUSH) { + if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && + req->cmd[0] == REQ_LB_OP_FLUSH) { read = 0; num_sectors = req->hard_cur_sectors; op = "flush"; @@ -405,7 +407,8 @@ static void ps3disk_prepare_flush(struct dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); - req->cmd_type = REQ_TYPE_FLUSH; + req->cmd_type = REQ_TYPE_LINUX_BLOCK; + req->cmd[0] = REQ_LB_OP_FLUSH; } static unsigned long ps3disk_mask; --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -84,11 +84,11 @@ static bool do_req(struct request_queue if (blk_fs_request(vbr->req)) { vbr->out_hdr.type = 0; vbr->out_hdr.sector = vbr->req->sector; - vbr->out_hdr.ioprio = vbr->req->ioprio; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); } else if (blk_pc_request(vbr->req)) { vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = vbr->req->ioprio; + vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); } else { /* We don't put anything else in the queue. 
*/ BUG(); --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1303,9 +1303,6 @@ static void sync_request_write(mddev_t * sbio->bi_size = r1_bio->sectors << 9; sbio->bi_idx = 0; sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; sbio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bi_flags |= 1 << BIO_UPTODATE; sbio->bi_next = NULL; @@ -1791,7 +1788,6 @@ static sector_t sync_request(mddev_t *md bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1346,9 +1346,6 @@ static void sync_request_write(mddev_t * tbio->bi_size = r10_bio->sectors << 9; tbio->bi_idx = 0; tbio->bi_phys_segments = 0; - tbio->bi_hw_segments = 0; - tbio->bi_hw_front_size = 0; - tbio->bi_hw_back_size = 0; tbio->bi_flags &= ~(BIO_POOL_MASK - 1); tbio->bi_flags |= 1 << BIO_UPTODATE; tbio->bi_next = NULL; @@ -1948,7 +1945,6 @@ static sector_t sync_request(mddev_t *md bio->bi_vcnt = 0; bio->bi_idx = 0; bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; bio->bi_size = 0; } --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -101,6 +101,40 @@ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); #endif +/* + * We maintain a biased count of active stripes in the bottom 16 bits of + * bi_phys_segments, and a count of processed stripes in the upper 16 bits + */ +static inline int raid5_bi_phys_segments(struct bio *bio) +{ + return bio->bi_phys_segments & 0xffff; +} + +static inline int raid5_bi_hw_segments(struct bio *bio) +{ + return (bio->bi_phys_segments >> 16) & 0xffff; +} + +static inline int raid5_dec_bi_phys_segments(struct bio *bio) +{ + --bio->bi_phys_segments; + return raid5_bi_phys_segments(bio); +} + +static inline int raid5_dec_bi_hw_segments(struct bio *bio) +{ + unsigned short val = raid5_bi_hw_segments(bio); + + --val; + bio->bi_phys_segments = (val << 
16) | raid5_bi_phys_segments(bio); + return val; +} + +static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) +{ + bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); +} + static inline int raid6_next_disk(int disk, int raid_disks) { disk++; @@ -507,7 +541,7 @@ static void ops_complete_biofill(void *s while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -1725,7 +1759,7 @@ static int add_stripe_bio(struct stripe_ if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments ++; + bi->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); spin_unlock(&sh->lock); @@ -1819,7 +1853,7 @@ handle_failed_stripe(raid5_conf_t *conf, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1834,7 +1868,7 @@ handle_failed_stripe(raid5_conf_t *conf, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1858,7 +1892,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(bi)) { bi->bi_next = *return_bi; *return_bi = bi; } @@ -2033,7 +2067,7 @@ static void handle_stripe_clean_event(ra while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { + if
(!raid5_dec_bi_phys_segments(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; @@ -2814,7 +2848,7 @@ static bool handle_stripe6(struct stripe copy_data(0, rbi, dev->page, dev->sector); rbi2 = r5_next_bio(rbi, dev->sector); spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_phys_segments(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -3155,8 +3189,11 @@ static struct bio *remove_bio_from_retry if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; + /* + * this sets the active stripe count to 1 and the processed + * stripe count to zero (upper 16 bits) + */ bi->bi_phys_segments = 1; /* biased count of active stripes */ - bi->bi_hw_segments = 0; /* count of processed stripes */ } return bi; @@ -3206,8 +3243,7 @@ static int bio_fits_rdev(struct bio *bi) if ((bi->bi_size>>9) > q->max_sectors) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments || - bi->bi_hw_segments > q->max_hw_segments) + if (bi->bi_phys_segments > q->max_phys_segments) return 0; if (q->merge_bvec_fn) @@ -3469,7 +3505,7 @@ static int make_request(struct request_q } spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(bi); spin_unlock_irq(&conf->device_lock); if (remaining == 0) { @@ -3753,7 +3789,7 @@ static int retry_aligned_read(raid5_con sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid_bio->bi_hw_segments) + if (scnt < raid5_bi_hw_segments(raid_bio)) /* already done this stripe */ continue; @@ -3761,7 +3797,7 @@ static int retry_aligned_read(raid5_con if (!sh) { /* failed to get a stripe - must wait */ - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3769,7 +3805,7 @@ static int retry_aligned_read(raid5_con set_bit(R5_ReadError, &sh->dev[dd_idx].flags); if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
release_stripe(sh); - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_hw_segments(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } @@ -3779,7 +3815,7 @@ static int retry_aligned_read(raid5_con handled++; } spin_lock_irq(&conf->device_lock); - remaining = --raid_bio->bi_phys_segments; + remaining = raid5_dec_bi_phys_segments(raid_bio); spin_unlock_irq(&conf->device_lock); if (remaining == 0) bio_endio(raid_bio, 0); --- a/fs/bio.c +++ b/fs/bio.c @@ -208,14 +208,6 @@ inline int bio_phys_segments(struct requ return bio->bi_phys_segments; } -inline int bio_hw_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_hw_segments; -} - /** * __bio_clone - clone a bio * @bio: destination bio @@ -350,8 +342,7 @@ static int __bio_add_page(struct request */ while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_hw_segments >= q->max_hw_segments - || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) { + || bio->bi_phys_segments >= q->max_hw_segments) { if (retried_segments) return 0; @@ -395,13 +386,11 @@ static int __bio_add_page(struct request } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || - BIOVEC_VIRT_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); bio->bi_vcnt++; bio->bi_phys_segments++; - bio->bi_hw_segments++; done: bio->bi_size += len; return len; @@ -1393,7 +1382,6 @@ EXPORT_SYMBOL(bio_init); EXPORT_SYMBOL(__bio_clone); EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -26,21 +26,8 @@ #ifdef CONFIG_BLOCK -/* Platforms may set this to teach the BIO layer about IOMMU hardware. 
*/ #include -#if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY) -#define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1)) -#define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE) -#else -#define BIOVEC_VIRT_START_SIZE(x) 0 -#define BIOVEC_VIRT_OVERSIZE(x) 0 -#endif - -#ifndef BIO_VMERGE_BOUNDARY -#define BIO_VMERGE_BOUNDARY 0 -#endif - #define BIO_DEBUG #ifdef BIO_DEBUG @@ -88,12 +75,7 @@ struct bio { /* Number of segments in this BIO after * physical address coalescing is performed. */ - unsigned short bi_phys_segments; - - /* Number of segments after physical and DMA remapping - * hardware coalescing is performed. - */ - unsigned short bi_hw_segments; + unsigned int bi_phys_segments; unsigned int bi_size; /* residual I/O count */ @@ -104,14 +86,6 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - /* - * To keep track of the max hw size, we account for the - * sizes of the first and last virtually mergeable segments - * in this bio - */ - unsigned int bi_hw_front_size; - unsigned int bi_hw_back_size; - unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -133,7 +107,7 @@ struct bio { #define BIO_UPTODATE 0 /* ok after I/O completion */ #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ #define BIO_EOF 2 /* out-out-bounds error */ -#define BIO_SEG_VALID 3 /* nr_hw_seg valid */ +#define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ @@ -247,8 +221,6 @@ static inline void *bio_data(struct bio ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) #endif -#define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \ - ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0) #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == 
(((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ @@ -346,7 +318,6 @@ extern void bio_free(struct bio *, struc extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); -extern int bio_hw_segments(struct request_queue *, struct bio *); extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone(struct bio *, gfp_t); --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,7 +54,6 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_FLUSH, /* flush request */ REQ_TYPE_SPECIAL, /* driver defined type */ REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ /* @@ -76,11 +75,8 @@ enum rq_cmd_type_bits { * */ enum { - /* - * just examples for now - */ REQ_LB_OP_EJECT = 0x40, /* eject request */ - REQ_LB_OP_FLUSH = 0x41, /* flush device */ + REQ_LB_OP_FLUSH = 0x41, /* flush request */ REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; @@ -193,13 +189,6 @@ struct request { */ unsigned short nr_phys_segments; - /* Number of scatter-gather addr+len pairs after - * physical and DMA remapping hardware coalescing is performed. - * This is the number of scatter-gather entries the driver - * will actually have to deal with after DMA mapping is done. - */ - unsigned short nr_hw_segments; - unsigned short ioprio; void *special; @@ -236,6 +225,11 @@ struct request { struct request *next_rq; }; +static inline unsigned short req_get_ioprio(struct request *req) +{ + return req->ioprio; +} + /* * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME * requests. Some step values could eventually be made generic.