]>
Commit | Line | Data |
---|---|---|
427cc94b GKH |
1 | From 2bc13b83e6298486371761de503faeffd15b7534 Mon Sep 17 00:00:00 2001 |
2 | From: NeilBrown <neilb@suse.com> | |
3 | Date: Fri, 29 Mar 2019 10:46:17 -0700 | |
4 | Subject: md: batch flush requests. | |
5 | ||
6 | From: NeilBrown <neilb@suse.com> | |
7 | ||
8 | commit 2bc13b83e6298486371761de503faeffd15b7534 upstream. | |
9 | ||
10 | Currently if many flush requests are submitted to an md device in quick | |
11 | succession, they are serialized and can take a long time to process them all. | |
12 | We don't really need to call flush all those times - a single flush call | |
13 | can satisfy all requests submitted before it started. | |
14 | So keep track of when the current flush started and when it finished, | |
15 | allow any pending flush that was requested before the flush started | |
16 | to complete without waiting any more. | |
17 | ||
18 | Test results from Xiao: | |
19 | ||
20 | Test is done on a raid10 device which is created by 4 SSDs. The tool is | |
21 | dbench. | |
22 | ||
23 | 1. The latest linux stable kernel | |
24 | Operation Count AvgLat MaxLat | |
25 | -------------------------------------------------- | |
26 | Deltree 768 10.509 78.305 | |
27 | Flush 2078376 0.013 10.094 | |
28 | Close 21787697 0.019 18.821 | |
29 | LockX 96580 0.007 3.184 | |
30 | Mkdir 384 0.008 0.062 | |
31 | Rename 1255883 0.191 23.534 | |
32 | ReadX 46495589 0.020 14.230 | |
33 | WriteX 14790591 7.123 60.706 | |
34 | Unlink 5989118 0.440 54.551 | |
35 | UnlockX 96580 0.005 2.736 | |
36 | FIND_FIRST 10393845 0.042 12.079 | |
37 | SET_FILE_INFORMATION 2415558 0.129 10.088 | |
38 | QUERY_FILE_INFORMATION 4711725 0.005 8.462 | |
39 | QUERY_PATH_INFORMATION 26883327 0.032 21.715 | |
40 | QUERY_FS_INFORMATION 4929409 0.010 8.238 | |
41 | NTCreateX 29660080 0.100 53.268 | |
42 | ||
43 | Throughput 1034.88 MB/sec (sync open) 128 clients 128 procs | |
44 | max_latency=60.712 ms | |
45 | ||
46 | 2. With patch1 "Revert "MD: fix lock contention for flush bios"" | |
47 | Operation Count AvgLat MaxLat | |
48 | -------------------------------------------------- | |
49 | Deltree 256 8.326 36.761 | |
50 | Flush 693291 3.974 180.269 | |
51 | Close 7266404 0.009 36.929 | |
52 | LockX 32160 0.006 0.840 | |
53 | Mkdir 128 0.008 0.021 | |
54 | Rename 418755 0.063 29.945 | |
55 | ReadX 15498708 0.007 7.216 | |
56 | WriteX 4932310 22.482 267.928 | |
57 | Unlink 1997557 0.109 47.553 | |
58 | UnlockX 32160 0.004 1.110 | |
59 | FIND_FIRST 3465791 0.036 7.320 | |
60 | SET_FILE_INFORMATION 805825 0.015 1.561 | |
61 | QUERY_FILE_INFORMATION 1570950 0.005 2.403 | |
62 | QUERY_PATH_INFORMATION 8965483 0.013 14.277 | |
63 | QUERY_FS_INFORMATION 1643626 0.009 3.314 | |
64 | NTCreateX 9892174 0.061 41.278 | |
65 | ||
66 | Throughput 345.009 MB/sec (sync open) 128 clients 128 procs | |
67 | max_latency=267.939 ms | |
68 | ||
69 | 3. With patch1 and patch2 | |
70 | Operation Count AvgLat MaxLat | |
71 | -------------------------------------------------- | |
72 | Deltree 768 9.570 54.588 | |
73 | Flush 2061354 0.666 15.102 | |
74 | Close 21604811 0.012 25.697 | |
75 | LockX 95770 0.007 1.424 | |
76 | Mkdir 384 0.008 0.053 | |
77 | Rename 1245411 0.096 12.263 | |
78 | ReadX 46103198 0.011 12.116 | |
79 | WriteX 14667988 7.375 60.069 | |
80 | Unlink 5938936 0.173 30.905 | |
81 | UnlockX 95770 0.005 4.147 | |
82 | FIND_FIRST 10306407 0.041 11.715 | |
83 | SET_FILE_INFORMATION 2395987 0.048 7.640 | |
84 | QUERY_FILE_INFORMATION 4672371 0.005 9.291 | |
85 | QUERY_PATH_INFORMATION 26656735 0.018 19.719 | |
86 | QUERY_FS_INFORMATION 4887940 0.010 7.654 | |
87 | NTCreateX 29410811 0.059 28.551 | |
88 | ||
89 | Throughput 1026.21 MB/sec (sync open) 128 clients 128 procs | |
90 | max_latency=60.075 ms | |
91 | ||
92 | Cc: <stable@vger.kernel.org> # v4.19+ | |
93 | Tested-by: Xiao Ni <xni@redhat.com> | |
94 | Signed-off-by: NeilBrown <neilb@suse.com> | |
95 | Signed-off-by: Song Liu <songliubraving@fb.com> | |
96 | Signed-off-by: Jens Axboe <axboe@kernel.dk> | |
97 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
98 | ||
99 | --- | |
100 | drivers/md/md.c | 27 +++++++++++++++++++++++---- | |
101 | drivers/md/md.h | 3 +++ | |
102 | 2 files changed, 26 insertions(+), 4 deletions(-) | |
103 | ||
104 | --- a/drivers/md/md.c | |
105 | +++ b/drivers/md/md.c | |
106 | @@ -427,6 +427,7 @@ static void submit_flushes(struct work_s | |
107 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); | |
108 | struct md_rdev *rdev; | |
109 | ||
110 | + mddev->start_flush = ktime_get_boottime(); | |
111 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); | |
112 | atomic_set(&mddev->flush_pending, 1); | |
113 | rcu_read_lock(); | |
114 | @@ -467,6 +468,7 @@ static void md_submit_flush_data(struct | |
115 | * could wait for this and below md_handle_request could wait for those | |
116 | * bios because of suspend check | |
117 | */ | |
118 | + mddev->last_flush = mddev->start_flush; | |
119 | mddev->flush_bio = NULL; | |
120 | wake_up(&mddev->sb_wait); | |
121 | ||
122 | @@ -481,15 +483,32 @@ static void md_submit_flush_data(struct | |
123 | ||
124 | void md_flush_request(struct mddev *mddev, struct bio *bio) | |
125 | { | |
126 | + ktime_t start = ktime_get_boottime(); | |
127 | spin_lock_irq(&mddev->lock); | |
128 | wait_event_lock_irq(mddev->sb_wait, | |
129 | - !mddev->flush_bio, | |
130 | + !mddev->flush_bio || | |
131 | + ktime_after(mddev->last_flush, start), | |
132 | mddev->lock); | |
133 | - mddev->flush_bio = bio; | |
134 | + if (!ktime_after(mddev->last_flush, start)) { | |
135 | + WARN_ON(mddev->flush_bio); | |
136 | + mddev->flush_bio = bio; | |
137 | + bio = NULL; | |
138 | + } | |
139 | spin_unlock_irq(&mddev->lock); | |
140 | ||
141 | - INIT_WORK(&mddev->flush_work, submit_flushes); | |
142 | - queue_work(md_wq, &mddev->flush_work); | |
143 | + if (!bio) { | |
144 | + INIT_WORK(&mddev->flush_work, submit_flushes); | |
145 | + queue_work(md_wq, &mddev->flush_work); | |
146 | + } else { | |
147 | + /* flush was performed for some other bio while we waited. */ | |
148 | + if (bio->bi_iter.bi_size == 0) | |
149 | + /* an empty barrier - all done */ | |
150 | + bio_endio(bio); | |
151 | + else { | |
152 | + bio->bi_opf &= ~REQ_PREFLUSH; | |
153 | + mddev->pers->make_request(mddev, bio); | |
154 | + } | |
155 | + } | |
156 | } | |
157 | EXPORT_SYMBOL(md_flush_request); | |
158 | ||
159 | --- a/drivers/md/md.h | |
160 | +++ b/drivers/md/md.h | |
161 | @@ -463,6 +463,9 @@ struct mddev { | |
162 | */ | |
163 | struct bio *flush_bio; | |
164 | atomic_t flush_pending; | |
165 | + ktime_t start_flush, last_flush; /* last_flush is when the last completed | |
166 | + * flush was started. | |
167 | + */ | |
168 | struct work_struct flush_work; | |
169 | struct work_struct event_work; /* used by dm to report failure event */ | |
170 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); |