]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> |
2 | Subject: dm-mpath: add service-time oriented dynamic load balancer | |
3 | References: FATE#303862,FATE#302108 | |
4 | ||
5 | This patch adds a service time oriented dynamic load balancer, | |
6 | dm-service-time. | |
7 | ||
8 | ||
9 | Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> | |
10 | Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> | |
11 | Signed-off-by: Hannes Reinecke <hare@suse.de> | |
12 | ||
13 | Index: linux-2.6.27/drivers/md/Makefile | |
14 | =================================================================== | |
15 | --- linux-2.6.27.orig/drivers/md/Makefile | |
16 | +++ linux-2.6.27/drivers/md/Makefile | |
17 | @@ -33,7 +33,8 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |
18 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | |
19 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | |
20 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o \ | |
21 | - dm-least-pending.o dm-queue-length.o | |
22 | + dm-least-pending.o dm-queue-length.o \ | |
23 | + dm-service-time.o | |
24 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | |
25 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o | |
26 | obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \ | |
27 | Index: linux-2.6.27/drivers/md/dm-service-time.c | |
28 | =================================================================== | |
29 | --- /dev/null | |
30 | +++ linux-2.6.27/drivers/md/dm-service-time.c | |
31 | @@ -0,0 +1,312 @@ | |
32 | +/* | |
33 | + * Copyright (C) 2007-2008 NEC Corporation. All Rights Reserved. | |
34 | + * | |
35 | + * Module Author: Kiyoshi Ueda | |
36 | + * | |
37 | + * This file is released under the GPL. | |
38 | + * | |
39 | + * Throughput oriented path selector. | |
40 | + */ | |
41 | + | |
42 | +#include "dm.h" | |
43 | +#include "dm-path-selector.h" | |
44 | + | |
45 | +#define DM_MSG_PREFIX "multipath service-time" | |
46 | +#define ST_MIN_IO 2 | |
47 | +#define ST_VERSION "0.1.0" | |
48 | + | |
49 | +struct selector { | |
50 | + struct list_head valid_paths; | |
51 | + struct list_head failed_paths; | |
52 | +}; | |
53 | + | |
54 | +struct path_info { | |
55 | + struct list_head list; | |
56 | + struct dm_path *path; | |
57 | + unsigned int repeat_count; | |
58 | + | |
59 | + atomic_t in_flight; /* Total size of in-flight I/Os */ | |
60 | + size_t perf; /* Recent performance of the path */ | |
61 | + sector_t last_sectors; /* Total sectors of the last disk_stat_read */ | |
62 | + size_t last_io_ticks; /* io_ticks of the last disk_stat_read */ | |
63 | +}; | |
64 | + | |
65 | +static struct selector *alloc_selector(void) | |
66 | +{ | |
67 | + struct selector *s = kzalloc(sizeof(*s), GFP_KERNEL); | |
68 | + | |
69 | + if (s) { | |
70 | + INIT_LIST_HEAD(&s->valid_paths); | |
71 | + INIT_LIST_HEAD(&s->failed_paths); | |
72 | + } | |
73 | + | |
74 | + return s; | |
75 | +} | |
76 | + | |
77 | +static int st_create(struct path_selector *ps, unsigned argc, char **argv) | |
78 | +{ | |
79 | + struct selector *s = alloc_selector(); | |
80 | + | |
81 | + if (!s) | |
82 | + return -ENOMEM; | |
83 | + | |
84 | + ps->context = s; | |
85 | + return 0; | |
86 | +} | |
87 | + | |
88 | +static void free_paths(struct list_head *paths) | |
89 | +{ | |
90 | + struct path_info *pi, *next; | |
91 | + | |
92 | + list_for_each_entry_safe(pi, next, paths, list) { | |
93 | + list_del(&pi->list); | |
94 | + pi->path->pscontext = NULL; | |
95 | + kfree(pi); | |
96 | + } | |
97 | +} | |
98 | + | |
99 | +static void st_destroy(struct path_selector *ps) | |
100 | +{ | |
101 | + struct selector *s = (struct selector *) ps->context; | |
102 | + | |
103 | + free_paths(&s->valid_paths); | |
104 | + free_paths(&s->failed_paths); | |
105 | + kfree(s); | |
106 | + ps->context = NULL; | |
107 | +} | |
108 | + | |
109 | +static int st_status(struct path_selector *ps, struct dm_path *path, | |
110 | + status_type_t type, char *result, unsigned int maxlen) | |
111 | +{ | |
112 | + int sz = 0; | |
113 | + struct path_info *pi; | |
114 | + | |
115 | + if (!path) | |
116 | + DMEMIT("0 "); | |
117 | + else { | |
118 | + pi = path->pscontext; | |
119 | + | |
120 | + switch (type) { | |
121 | + case STATUSTYPE_INFO: | |
122 | + DMEMIT("if:%08lu pf:%06lu ", | |
123 | + (unsigned long) atomic_read(&pi->in_flight), | |
124 | + pi->perf); | |
125 | + break; | |
126 | + case STATUSTYPE_TABLE: | |
127 | + DMEMIT("%u ", pi->repeat_count); | |
128 | + break; | |
129 | + } | |
130 | + } | |
131 | + | |
132 | + return sz; | |
133 | +} | |
134 | + | |
135 | +static int st_add_path(struct path_selector *ps, struct dm_path *path, | |
136 | + int argc, char **argv, char **error) | |
137 | +{ | |
138 | + struct selector *s = (struct selector *) ps->context; | |
139 | + struct path_info *pi; | |
140 | + unsigned int repeat_count = ST_MIN_IO; | |
141 | + struct gendisk *disk = path->dev->bdev->bd_disk; | |
142 | + | |
143 | + if (argc > 1) { | |
144 | + *error = "service-time ps: incorrect number of arguments"; | |
145 | + return -EINVAL; | |
146 | + } | |
147 | + | |
148 | + /* First path argument is number of I/Os before switching path. */ | |
149 | + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | |
150 | + *error = "service-time ps: invalid repeat count"; | |
151 | + return -EINVAL; | |
152 | + } | |
153 | + | |
154 | + /* allocate the path */ | |
155 | + pi = kmalloc(sizeof(*pi), GFP_KERNEL); | |
156 | + if (!pi) { | |
157 | + *error = "service-time ps: Error allocating path context"; | |
158 | + return -ENOMEM; | |
159 | + } | |
160 | + | |
161 | + pi->path = path; | |
162 | + pi->repeat_count = repeat_count; | |
163 | + | |
164 | + pi->perf = 0; | |
165 | + pi->last_sectors = disk_stat_read(disk, sectors[READ]) | |
166 | + + disk_stat_read(disk, sectors[WRITE]); | |
167 | + pi->last_io_ticks = disk_stat_read(disk, io_ticks); | |
168 | + atomic_set(&pi->in_flight, 0); | |
169 | + | |
170 | + path->pscontext = pi; | |
171 | + | |
172 | + list_add_tail(&pi->list, &s->valid_paths); | |
173 | + | |
174 | + return 0; | |
175 | +} | |
176 | + | |
177 | +static void st_fail_path(struct path_selector *ps, struct dm_path *path) | |
178 | +{ | |
179 | + struct selector *s = (struct selector *) ps->context; | |
180 | + struct path_info *pi = path->pscontext; | |
181 | + | |
182 | + list_move(&pi->list, &s->failed_paths); | |
183 | +} | |
184 | + | |
185 | +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | |
186 | +{ | |
187 | + struct selector *s = (struct selector *) ps->context; | |
188 | + struct path_info *pi = path->pscontext; | |
189 | + | |
190 | + list_move_tail(&pi->list, &s->valid_paths); | |
191 | + | |
192 | + return 0; | |
193 | +} | |
194 | + | |
195 | +static void stats_update(struct path_info *pi) | |
196 | +{ | |
197 | + sector_t sectors; | |
198 | + size_t io_ticks, tmp; | |
199 | + struct gendisk *disk = pi->path->dev->bdev->bd_disk; | |
200 | + | |
201 | + sectors = disk_stat_read(disk, sectors[READ]) | |
202 | + + disk_stat_read(disk, sectors[WRITE]); | |
203 | + io_ticks = disk_stat_read(disk, io_ticks); | |
204 | + | |
205 | + if ((sectors != pi->last_sectors) && (io_ticks != pi->last_io_ticks)) { | |
206 | + tmp = (sectors - pi->last_sectors) << 9; | |
207 | + do_div(tmp, jiffies_to_msecs((io_ticks - pi->last_io_ticks))); | |
208 | + pi->perf = tmp; | |
209 | + | |
210 | + pi->last_sectors = sectors; | |
211 | + pi->last_io_ticks = io_ticks; | |
212 | + } | |
213 | +} | |
214 | + | |
215 | +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | |
216 | + size_t new_io) | |
217 | +{ | |
218 | + size_t if1, if2; | |
219 | + | |
220 | + if1 = atomic_read(&pi1->in_flight); | |
221 | + if2 = atomic_read(&pi2->in_flight); | |
222 | + | |
223 | + /* | |
224 | + * Case 1: No performance data available. Choose less loaded path. | |
225 | + */ | |
226 | + if (!pi1->perf || !pi2->perf) | |
227 | + return if1 - if2; | |
228 | + | |
229 | + /* | |
230 | + * Case 2: Calculate service time. Choose faster path. | |
231 | + * if ((if1+new_io)/pi1->perf < (if2+new_io)/pi2->perf) pi1. | |
232 | + * if ((if1+new_io)/pi1->perf > (if2+new_io)/pi2->perf) pi2. | |
233 | + * The code below evaluates these quotients with do_div() after | |
234 | + * scaling each numerator by 1024 (<< 10), presumably to retain | |
235 | + * precision in the integer division. | |
236 | + */ | |
237 | + if1 = (if1 + new_io) << 10; | |
238 | + if2 = (if2 + new_io) << 10; | |
239 | + do_div(if1, pi1->perf); | |
240 | + do_div(if2, pi2->perf); | |
241 | + | |
242 | + if (if1 != if2) | |
243 | + return if1 - if2; | |
244 | + | |
245 | + /* | |
246 | + * Case 3: Service time is equal. Choose faster path. | |
247 | + */ | |
248 | + return pi2->perf - pi1->perf; | |
249 | +} | |
250 | + | |
251 | +static struct dm_path *st_select_path(struct path_selector *ps, | |
252 | + unsigned *repeat_count, size_t nr_bytes) | |
253 | +{ | |
254 | + struct selector *s = (struct selector *) ps->context; | |
255 | + struct path_info *pi = NULL, *best = NULL; | |
256 | + | |
257 | + if (list_empty(&s->valid_paths)) | |
258 | + return NULL; | |
259 | + | |
260 | + /* Change preferred (first in list) path to evenly balance. */ | |
261 | + list_move_tail(s->valid_paths.next, &s->valid_paths); | |
262 | + | |
263 | + /* Update performance information before best path selection */ | |
264 | + list_for_each_entry(pi, &s->valid_paths, list) | |
265 | + stats_update(pi); | |
266 | + | |
267 | + list_for_each_entry(pi, &s->valid_paths, list) { | |
268 | + if (!best) | |
269 | + best = pi; | |
270 | + else if (st_compare_load(pi, best, nr_bytes) < 0) | |
271 | + best = pi; | |
272 | + } | |
273 | + | |
274 | + if (best) { | |
275 | + *repeat_count = best->repeat_count; | |
276 | + return best->path; | |
277 | + } | |
278 | + | |
279 | + return NULL; | |
280 | +} | |
281 | + | |
282 | +static int st_start_io(struct path_selector *ps, struct dm_path *path, | |
283 | + size_t nr_bytes) | |
284 | +{ | |
285 | + struct path_info *pi = path->pscontext; | |
286 | + | |
287 | + atomic_add(nr_bytes, &pi->in_flight); | |
288 | + | |
289 | + return 0; | |
290 | +} | |
291 | + | |
292 | +static int st_end_io(struct path_selector *ps, struct dm_path *path, | |
293 | + size_t nr_bytes) | |
294 | +{ | |
295 | + struct path_info *pi = path->pscontext; | |
296 | + | |
297 | + atomic_sub(nr_bytes, &pi->in_flight); | |
298 | + | |
299 | + return 0; | |
300 | +} | |
301 | + | |
302 | +static struct path_selector_type st_ps = { | |
303 | + .name = "service-time", | |
304 | + .module = THIS_MODULE, | |
305 | + .table_args = 1, | |
306 | + .info_args = 2, | |
307 | + .create = st_create, | |
308 | + .destroy = st_destroy, | |
309 | + .status = st_status, | |
310 | + .add_path = st_add_path, | |
311 | + .fail_path = st_fail_path, | |
312 | + .reinstate_path = st_reinstate_path, | |
313 | + .select_path = st_select_path, | |
314 | + .start_io = st_start_io, | |
315 | + .end_io = st_end_io, | |
316 | +}; | |
317 | + | |
318 | +static int __init dm_st_init(void) | |
319 | +{ | |
320 | + int r = dm_register_path_selector(&st_ps); | |
321 | + | |
322 | + if (r < 0) | |
323 | + DMERR("register failed %d", r); | |
324 | + | |
325 | + DMINFO("version " ST_VERSION " loaded"); | |
326 | + | |
327 | + return r; | |
328 | +} | |
329 | + | |
330 | +static void __exit dm_st_exit(void) | |
331 | +{ | |
332 | + int r = dm_unregister_path_selector(&st_ps); | |
333 | + | |
334 | + if (r < 0) | |
335 | + DMERR("unregister failed %d", r); | |
336 | +} | |
337 | + | |
338 | +module_init(dm_st_init); | |
339 | +module_exit(dm_st_exit); | |
340 | + | |
341 | +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | |
342 | +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | |
343 | +MODULE_LICENSE("GPL"); |