1 From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
2 Subject: dm-mpath: add service-time oriented dynamic load balancer
3 References: FATE#303862,FATE#302108
5 This patch adds a service time oriented dynamic load balancer,
9 Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
10 Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
11 Signed-off-by: Hannes Reinecke <hare@suse.de>
13 Index: linux-2.6.27/drivers/md/Makefile
14 ===================================================================
15 --- linux-2.6.27.orig/drivers/md/Makefile
16 +++ linux-2.6.27/drivers/md/Makefile
17 @@ -33,7 +33,8 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
18 obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
19 obj-$(CONFIG_DM_DELAY) += dm-delay.o
20 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o \
21 - dm-least-pending.o dm-queue-length.o
22 + dm-least-pending.o dm-queue-length.o \
24 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
25 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o
26 obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
27 Index: linux-2.6.27/drivers/md/dm-service-time.c
28 ===================================================================
30 +++ linux-2.6.27/drivers/md/dm-service-time.c
33 + * Copyright (C) 2007-2008 NEC Corporation. All Rights Reserved.
35 + * Module Author: Kiyoshi Ueda
37 + * This file is released under the GPL.
39 + * Throughput oriented path selector.
43 +#include "dm-path-selector.h"
45 +#define DM_MSG_PREFIX "multipath service-time"
47 +#define ST_VERSION "0.1.0"
50 + struct list_head valid_paths;
51 + struct list_head failed_paths;
55 + struct list_head list;
56 + struct dm_path *path;
57 + unsigned int repeat_count;
59 + atomic_t in_flight; /* Total size of in-flight I/Os */
60 + size_t perf; /* Recent performance of the path */
61 + sector_t last_sectors; /* Total sectors of the last disk_stat_read */
62 + size_t last_io_ticks; /* io_ticks of the last disk_stat_read */
65 +static struct selector *alloc_selector(void)
67 + struct selector *s = kzalloc(sizeof(*s), GFP_KERNEL);
70 + INIT_LIST_HEAD(&s->valid_paths);
71 + INIT_LIST_HEAD(&s->failed_paths);
77 +static int st_create(struct path_selector *ps, unsigned argc, char **argv)
79 + struct selector *s = alloc_selector();
88 +static void free_paths(struct list_head *paths)
90 + struct path_info *pi, *next;
92 + list_for_each_entry_safe(pi, next, paths, list) {
93 + list_del(&pi->list);
94 + pi->path->pscontext = NULL;
99 +static void st_destroy(struct path_selector *ps)
101 + struct selector *s = (struct selector *) ps->context;
103 + free_paths(&s->valid_paths);
104 + free_paths(&s->failed_paths);
106 + ps->context = NULL;
109 +static int st_status(struct path_selector *ps, struct dm_path *path,
110 + status_type_t type, char *result, unsigned int maxlen)
113 + struct path_info *pi;
118 + pi = path->pscontext;
121 + case STATUSTYPE_INFO:
122 + DMEMIT("if:%08lu pf:%06lu ",
123 + (unsigned long) atomic_read(&pi->in_flight),
126 + case STATUSTYPE_TABLE:
127 + DMEMIT("%u ", pi->repeat_count);
135 +static int st_add_path(struct path_selector *ps, struct dm_path *path,
136 + int argc, char **argv, char **error)
138 + struct selector *s = (struct selector *) ps->context;
139 + struct path_info *pi;
140 + unsigned int repeat_count = ST_MIN_IO;
141 + struct gendisk *disk = path->dev->bdev->bd_disk;
144 + *error = "service-time ps: incorrect number of arguments";
148 + /* First path argument is number of I/Os before switching path. */
149 + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
150 + *error = "service-time ps: invalid repeat count";
154 + /* allocate the path */
155 + pi = kmalloc(sizeof(*pi), GFP_KERNEL);
157 + *error = "service-time ps: Error allocating path context";
162 + pi->repeat_count = repeat_count;
165 + pi->last_sectors = disk_stat_read(disk, sectors[READ])
166 + + disk_stat_read(disk, sectors[WRITE]);
167 + pi->last_io_ticks = disk_stat_read(disk, io_ticks);
168 + atomic_set(&pi->in_flight, 0);
170 + path->pscontext = pi;
172 + list_add_tail(&pi->list, &s->valid_paths);
177 +static void st_fail_path(struct path_selector *ps, struct dm_path *path)
179 + struct selector *s = (struct selector *) ps->context;
180 + struct path_info *pi = path->pscontext;
182 + list_move(&pi->list, &s->failed_paths);
185 +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
187 + struct selector *s = (struct selector *) ps->context;
188 + struct path_info *pi = path->pscontext;
190 + list_move_tail(&pi->list, &s->valid_paths);
195 +static void stats_update(struct path_info *pi)
198 + size_t io_ticks, tmp;
199 + struct gendisk *disk = pi->path->dev->bdev->bd_disk;
201 + sectors = disk_stat_read(disk, sectors[READ])
202 + + disk_stat_read(disk, sectors[WRITE]);
203 + io_ticks = disk_stat_read(disk, io_ticks);
205 + if ((sectors != pi->last_sectors) && (io_ticks != pi->last_io_ticks)) {
206 + tmp = (sectors - pi->last_sectors) << 9;
207 + do_div(tmp, jiffies_to_msecs((io_ticks - pi->last_io_ticks)));
210 + pi->last_sectors = sectors;
211 + pi->last_io_ticks = io_ticks;
215 +static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
220 + if1 = atomic_read(&pi1->in_flight);
221 + if2 = atomic_read(&pi2->in_flight);
224 + * Case 1: No performance data available. Choose less loaded path.
226 + if (!pi1->perf || !pi2->perf)
230 + * Case 2: Calculate service time. Choose faster path.
231 + * if ((if1+new_io)/pi1->perf < (if2+new_io)/pi2->perf) pi1.
232 + * if ((if1+new_io)/pi1->perf > (if2+new_io)/pi2->perf) pi2.
233 + * Equivalent without do_div() (not what the code below does):
234 + * if ((if1+new_io)*pi2->perf < (if2+new_io)*pi1->perf) pi1.
235 + * if ((if1+new_io)*pi2->perf > (if2+new_io)*pi1->perf) pi2.
237 + if1 = (if1 + new_io) << 10;
238 + if2 = (if2 + new_io) << 10;
239 + do_div(if1, pi1->perf);
240 + do_div(if2, pi2->perf);
246 + * Case 3: Service time is equal. Choose faster path.
248 + return pi2->perf - pi1->perf;
251 +static struct dm_path *st_select_path(struct path_selector *ps,
252 + unsigned *repeat_count, size_t nr_bytes)
254 + struct selector *s = (struct selector *) ps->context;
255 + struct path_info *pi = NULL, *best = NULL;
257 + if (list_empty(&s->valid_paths))
260 + /* Change preferred (first in list) path to evenly balance. */
261 + list_move_tail(s->valid_paths.next, &s->valid_paths);
263 + /* Update performance information before best path selection */
264 + list_for_each_entry(pi, &s->valid_paths, list)
267 + list_for_each_entry(pi, &s->valid_paths, list) {
270 + else if (st_compare_load(pi, best, nr_bytes) < 0)
275 + *repeat_count = best->repeat_count;
282 +static int st_start_io(struct path_selector *ps, struct dm_path *path,
285 + struct path_info *pi = path->pscontext;
287 + atomic_add(nr_bytes, &pi->in_flight);
292 +static int st_end_io(struct path_selector *ps, struct dm_path *path,
295 + struct path_info *pi = path->pscontext;
297 + atomic_sub(nr_bytes, &pi->in_flight);
302 +static struct path_selector_type st_ps = {
303 + .name = "service-time",
304 + .module = THIS_MODULE,
307 + .create = st_create,
308 + .destroy = st_destroy,
309 + .status = st_status,
310 + .add_path = st_add_path,
311 + .fail_path = st_fail_path,
312 + .reinstate_path = st_reinstate_path,
313 + .select_path = st_select_path,
314 + .start_io = st_start_io,
315 + .end_io = st_end_io,
318 +static int __init dm_st_init(void)
320 + int r = dm_register_path_selector(&st_ps);
323 + DMERR("register failed %d", r);
325 + DMINFO("version " ST_VERSION " loaded");
330 +static void __exit dm_st_exit(void)
332 + int r = dm_unregister_path_selector(&st_ps);
335 + DMERR("unregister failed %d", r);
338 +module_init(dm_st_init);
339 +module_exit(dm_st_exit);
341 +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
342 +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
343 +MODULE_LICENSE("GPL");