1 | From: "Heinz Mauelshagen" <hjm@redhat.de> |
2 | Subject: DMRAID45 module | |
3 | X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/ | |
4 | ||
5 | DM-RAID45 module. | |
6 | ||
7 | This driver is used for "Fake RAID" devices. | |
8 | ||
9 | Acked-by: Jeff Mahoney <jeffm@suse.com> | |
10 | ||
11 | --- | |
12 | ||
13 | drivers/md/Kconfig | 8 | |
14 | drivers/md/Makefile | 4 | |
15 | drivers/md/dm-memcache.c | 301 ++ | |
16 | drivers/md/dm-memcache.h | 68 | |
17 | drivers/md/dm-message.c | 182 + | |
18 | drivers/md/dm-message.h | 91 | |
19 | drivers/md/dm-raid45.c | 4516 +++++++++++++++++++++++++++++++++++++++++++++ | |
20 | drivers/md/dm-raid45.h | 28 | |
21 | drivers/md/dm-regions.c | 723 +++++++ | |
22 | drivers/md/dm.c | 1 | |
23 | include/linux/dm-regions.h | 115 + | |
24 | 11 files changed, 6036 insertions(+), 1 deletion(-) | |
25 | ||
26 | --- a/drivers/md/dm.c | |
27 | +++ b/drivers/md/dm.c | |
28 | @@ -1680,6 +1680,7 @@ struct gendisk *dm_disk(struct mapped_de | |
29 | { | |
30 | return md->disk; | |
31 | } | |
32 | +EXPORT_SYMBOL_GPL(dm_disk); | |
33 | ||
34 | int dm_suspended(struct mapped_device *md) | |
35 | { | |
36 | --- /dev/null | |
37 | +++ b/drivers/md/dm-memcache.c | |
38 | @@ -0,0 +1,301 @@ | |
39 | +/* | |
40 | + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | |
41 | + * | |
42 | + * Module Author: Heinz Mauelshagen <heinzm@redhat.com> | |
43 | + * | |
44 | + * Device-mapper memory object handling: | |
45 | + * | |
46 | + * o allocate/free total_pages in a per client page pool. | |
47 | + * | |
48 | + * o allocate/free memory objects with chunks (1..n) of | |
49 | + * pages_per_chunk pages hanging off. | |
50 | + * | |
51 | + * This file is released under the GPL. | |
52 | + */ | |
53 | + | |
54 | +#define DM_MEM_CACHE_VERSION "0.2" | |
55 | + | |
56 | +#include "dm.h" | |
57 | +#include "dm-memcache.h" | |
58 | +#include <linux/dm-io.h> | |
59 | + | |
60 | +struct dm_mem_cache_client { | |
61 | + spinlock_t lock; | |
62 | + mempool_t *objs_pool; | |
63 | + struct page_list *free_list; | |
64 | + unsigned objects; | |
65 | + unsigned chunks; | |
66 | + unsigned pages_per_chunk; | |
67 | + unsigned free_pages; | |
68 | + unsigned total_pages; | |
69 | +}; | |
70 | + | |
71 | +/* | |
72 | + * Free pages and page_list elements of client. | |
73 | + */ | |
74 | +static void free_cache_pages(struct page_list *list) | |
75 | +{ | |
76 | + while (list) { | |
77 | + struct page_list *pl = list; | |
78 | + | |
79 | + list = pl->next; | |
80 | + BUG_ON(!pl->page); | |
81 | + __free_page(pl->page); | |
82 | + kfree(pl); | |
83 | + } | |
84 | +} | |
85 | + | |
86 | +/* | |
87 | + * Alloc number of pages and page_list elements as required by client. | |
88 | + */ | |
89 | +static struct page_list *alloc_cache_pages(unsigned pages) | |
90 | +{ | |
91 | + struct page_list *pl, *ret = NULL; | |
92 | + struct page *page; | |
93 | + | |
94 | + while (pages--) { | |
95 | + page = alloc_page(GFP_NOIO); | |
96 | + if (!page) | |
97 | + goto err; | |
98 | + | |
99 | + pl = kmalloc(sizeof(*pl), GFP_NOIO); | |
100 | + if (!pl) { | |
101 | + __free_page(page); | |
102 | + goto err; | |
103 | + } | |
104 | + | |
105 | + pl->page = page; | |
106 | + pl->next = ret; | |
107 | + ret = pl; | |
108 | + } | |
109 | + | |
110 | + return ret; | |
111 | + | |
112 | +err: | |
113 | + free_cache_pages(ret); | |
114 | + return NULL; | |
115 | +} | |
116 | + | |
117 | +/* | |
118 | + * Allocate page_list elements from the pool to chunks of the memory object. | |
119 | + */ | |
120 | +static void alloc_chunks(struct dm_mem_cache_client *cl, | |
121 | + struct dm_mem_cache_object *obj) | |
122 | +{ | |
123 | + unsigned chunks = cl->chunks; | |
124 | + unsigned long flags; | |
125 | + | |
126 | + local_irq_save(flags); | |
127 | + local_irq_disable(); | |
128 | + while (chunks--) { | |
129 | + unsigned p = cl->pages_per_chunk; | |
130 | + | |
131 | + obj[chunks].pl = NULL; | |
132 | + | |
133 | + while (p--) { | |
134 | + struct page_list *pl; | |
135 | + | |
136 | + /* Take next element from free list */ | |
137 | + spin_lock(&cl->lock); | |
138 | + pl = cl->free_list; | |
139 | + BUG_ON(!pl); | |
140 | + cl->free_list = pl->next; | |
141 | + spin_unlock(&cl->lock); | |
142 | + | |
143 | + pl->next = obj[chunks].pl; | |
144 | + obj[chunks].pl = pl; | |
145 | + } | |
146 | + } | |
147 | + | |
148 | + local_irq_restore(flags); | |
149 | +} | |
150 | + | |
151 | +/* | |
152 | + * Free page_list elements putting them back onto free list | |
153 | + */ | |
154 | +static void free_chunks(struct dm_mem_cache_client *cl, | |
155 | + struct dm_mem_cache_object *obj) | |
156 | +{ | |
157 | + unsigned chunks = cl->chunks; | |
158 | + unsigned long flags; | |
159 | + struct page_list *next, *pl; | |
160 | + | |
161 | + local_irq_save(flags); | |
162 | + local_irq_disable(); | |
163 | + while (chunks--) { | |
164 | + for (pl = obj[chunks].pl; pl; pl = next) { | |
165 | + next = pl->next; | |
166 | + | |
167 | + spin_lock(&cl->lock); | |
168 | + pl->next = cl->free_list; | |
169 | + cl->free_list = pl; | |
170 | + cl->free_pages++; | |
171 | + spin_unlock(&cl->lock); | |
172 | + } | |
173 | + } | |
174 | + | |
175 | + local_irq_restore(flags); | |
176 | +} | |
177 | + | |
178 | +/* | |
179 | + * Create/destroy dm memory cache client resources. | |
180 | + */ | |
181 | +struct dm_mem_cache_client * | |
182 | +dm_mem_cache_client_create(unsigned objects, unsigned chunks, | |
183 | + unsigned pages_per_chunk) | |
184 | +{ | |
185 | + unsigned total_pages = objects * chunks * pages_per_chunk; | |
186 | + struct dm_mem_cache_client *client; | |
187 | + | |
188 | + BUG_ON(!total_pages); | |
189 | + client = kzalloc(sizeof(*client), GFP_KERNEL); | |
190 | + if (!client) | |
191 | + return ERR_PTR(-ENOMEM); | |
192 | + | |
193 | + client->objs_pool = mempool_create_kmalloc_pool(objects, | |
194 | + chunks * sizeof(struct dm_mem_cache_object)); | |
195 | + if (!client->objs_pool) | |
196 | + goto err; | |
197 | + | |
198 | + client->free_list = alloc_cache_pages(total_pages); | |
199 | + if (!client->free_list) | |
200 | + goto err1; | |
201 | + | |
202 | + spin_lock_init(&client->lock); | |
203 | + client->objects = objects; | |
204 | + client->chunks = chunks; | |
205 | + client->pages_per_chunk = pages_per_chunk; | |
206 | + client->free_pages = client->total_pages = total_pages; | |
207 | + return client; | |
208 | + | |
209 | +err1: | |
210 | + mempool_destroy(client->objs_pool); | |
211 | +err: | |
212 | + kfree(client); | |
213 | + return ERR_PTR(-ENOMEM); | |
214 | +} | |
215 | +EXPORT_SYMBOL(dm_mem_cache_client_create); | |
216 | + | |
217 | +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl) | |
218 | +{ | |
219 | + BUG_ON(cl->free_pages != cl->total_pages); | |
220 | + free_cache_pages(cl->free_list); | |
221 | + mempool_destroy(cl->objs_pool); | |
222 | + kfree(cl); | |
223 | +} | |
224 | +EXPORT_SYMBOL(dm_mem_cache_client_destroy); | |
225 | + | |
226 | +/* | |
227 | + * Grow a client's cache by a number of objects. | |
228 | + * | |
229 | + * Don't call from interrupt context! | |
230 | + */ | |
231 | +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects) | |
232 | +{ | |
233 | + unsigned pages = objects * cl->chunks * cl->pages_per_chunk; | |
234 | + struct page_list *pl, *last; | |
235 | + | |
236 | + BUG_ON(!pages); | |
237 | + pl = alloc_cache_pages(pages); | |
238 | + if (!pl) | |
239 | + return -ENOMEM; | |
240 | + | |
241 | + last = pl; | |
242 | + while (last->next) | |
243 | + last = last->next; | |
244 | + | |
245 | + spin_lock_irq(&cl->lock); | |
246 | + last->next = cl->free_list; | |
247 | + cl->free_list = pl; | |
248 | + cl->free_pages += pages; | |
249 | + cl->total_pages += pages; | |
250 | + cl->objects += objects; | |
251 | + spin_unlock_irq(&cl->lock); | |
252 | + | |
253 | + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO); | |
254 | + return 0; | |
255 | +} | |
256 | +EXPORT_SYMBOL(dm_mem_cache_grow); | |
257 | + | |
258 | +/* Shrink a client's cache by a number of objects. */ | |
259 | +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects) | |
260 | +{ | |
261 | + int r; | |
262 | + unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages; | |
263 | + unsigned long flags; | |
264 | + struct page_list *last = NULL, *pl, *pos; | |
265 | + | |
266 | + BUG_ON(!pages); | |
267 | + | |
268 | + spin_lock_irqsave(&cl->lock, flags); | |
269 | + pl = pos = cl->free_list; | |
270 | + while (p-- && pos->next) { | |
271 | + last = pos; | |
272 | + pos = pos->next; | |
273 | + } | |
274 | + | |
275 | + if (++p) | |
276 | + r = -ENOMEM; | |
277 | + else { | |
278 | + r = 0; | |
279 | + cl->free_list = pos; | |
280 | + cl->free_pages -= pages; | |
281 | + cl->total_pages -= pages; | |
282 | + cl->objects -= objects; | |
283 | + last->next = NULL; | |
284 | + } | |
285 | + spin_unlock_irqrestore(&cl->lock, flags); | |
286 | + | |
287 | + if (!r) { | |
288 | + free_cache_pages(pl); | |
289 | + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO); | |
290 | + } | |
291 | + | |
292 | + return r; | |
293 | +} | |
294 | +EXPORT_SYMBOL(dm_mem_cache_shrink); | |
295 | + | |
296 | +/* | |
297 | + * Allocate/free a memory object | |
298 | + * | |
299 | + * Can be called from interrupt context | |
300 | + */ | |
301 | +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl) | |
302 | +{ | |
303 | + int r = 0; | |
304 | + unsigned pages = cl->chunks * cl->pages_per_chunk; | |
305 | + unsigned long flags; | |
306 | + struct dm_mem_cache_object *obj; | |
307 | + | |
308 | + obj = mempool_alloc(cl->objs_pool, GFP_NOIO); | |
309 | + if (!obj) | |
310 | + return ERR_PTR(-ENOMEM); | |
311 | + | |
312 | + spin_lock_irqsave(&cl->lock, flags); | |
313 | + if (pages > cl->free_pages) | |
314 | + r = -ENOMEM; | |
315 | + else | |
316 | + cl->free_pages -= pages; | |
317 | + spin_unlock_irqrestore(&cl->lock, flags); | |
318 | + | |
319 | + if (r) { | |
320 | + mempool_free(obj, cl->objs_pool); | |
321 | + return ERR_PTR(r); | |
322 | + } | |
323 | + | |
324 | + alloc_chunks(cl, obj); | |
325 | + return obj; | |
326 | +} | |
327 | +EXPORT_SYMBOL(dm_mem_cache_alloc); | |
328 | + | |
329 | +void dm_mem_cache_free(struct dm_mem_cache_client *cl, | |
330 | + struct dm_mem_cache_object *obj) | |
331 | +{ | |
332 | + free_chunks(cl, obj); | |
333 | + mempool_free(obj, cl->objs_pool); | |
334 | +} | |
335 | +EXPORT_SYMBOL(dm_mem_cache_free); | |
336 | + | |
337 | +MODULE_DESCRIPTION(DM_NAME " dm memory cache"); | |
338 | +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>"); | |
339 | +MODULE_LICENSE("GPL"); | |
340 | --- /dev/null | |
341 | +++ b/drivers/md/dm-memcache.h | |
342 | @@ -0,0 +1,68 @@ | |
343 | +/* | |
344 | + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | |
345 | + * | |
346 | + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com> | |
347 | + * | |
348 | + * Device-mapper memory object handling: | |
349 | + * | |
350 | + * o allocate/free total_pages in a per client page pool. | |
351 | + * | |
352 | + * o allocate/free memory objects with chunks (1..n) of | |
353 | + * pages_per_chunk pages hanging off. | |
354 | + * | |
355 | + * This file is released under the GPL. | |
356 | + */ | |
357 | + | |
358 | +#ifndef _DM_MEM_CACHE_H | |
359 | +#define _DM_MEM_CACHE_H | |
360 | + | |
361 | +#define DM_MEM_CACHE_H_VERSION "0.1" | |
362 | + | |
363 | +#include "dm.h" | |
364 | +#include <linux/dm-io.h> | |
365 | + | |
366 | +static inline struct page_list *pl_elem(struct page_list *pl, unsigned p) | |
367 | +{ | |
368 | + while (pl && p--) | |
369 | + pl = pl->next; | |
370 | + | |
371 | + return pl; | |
372 | +} | |
373 | + | |
374 | +struct dm_mem_cache_object { | |
375 | + struct page_list *pl; /* Dynamically allocated array */ | |
376 | + void *private; /* Caller context reference */ | |
377 | +}; | |
378 | + | |
379 | +struct dm_mem_cache_client; | |
380 | + | |
381 | +/* | |
382 | + * Create/destroy dm memory cache client resources. | |
383 | + * | |
384 | + * On creation, a number of @objects with @chunks of | |
385 | + * @pages_per_chunk pages will be allocated. | |
386 | + */ | |
387 | +struct dm_mem_cache_client * | |
388 | +dm_mem_cache_client_create(unsigned objects, unsigned chunks, | |
389 | + unsigned pages_per_chunk); | |
390 | +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client); | |
391 | + | |
392 | +/* | |
393 | + * Grow/shrink a dm memory cache client's resources | |
394 | + * by @objects objects. | |
395 | + */ | |
396 | +int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects); | |
397 | +int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects); | |
398 | + | |
399 | +/* | |
400 | + * Allocate/free a memory object | |
401 | + * | |
402 | + * On allocation one object with an amount of chunks and | |
403 | + * an amount of pages per chunk will be returned on success. | |
404 | + */ | |
405 | +struct dm_mem_cache_object * | |
406 | +dm_mem_cache_alloc(struct dm_mem_cache_client *client); | |
407 | +void dm_mem_cache_free(struct dm_mem_cache_client *client, | |
408 | + struct dm_mem_cache_object *object); | |
409 | + | |
410 | +#endif | |
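For orientation, a minimal usage sketch of the interface declared above. This is not part of the patch; example_memcache_use() and the sizes chosen (4 objects, 3 chunks, 2 pages per chunk) are invented, and kernel context with the usual mm/string helpers is assumed.

/*
 * Hypothetical illustration only -- not part of this patch.
 */
static int example_memcache_use(void)
{
	struct dm_mem_cache_client *client;
	struct dm_mem_cache_object *obj;
	struct page_list *pl;

	/* 4 objects, 3 chunks each, 2 pages per chunk = 24 pages total. */
	client = dm_mem_cache_client_create(4, 3, 2);
	if (IS_ERR(client))
		return PTR_ERR(client);

	/* Take one object; its 3 chunks come off the client's free list. */
	obj = dm_mem_cache_alloc(client);
	if (IS_ERR(obj)) {
		dm_mem_cache_client_destroy(client);
		return PTR_ERR(obj);
	}

	/* Second page of the first chunk (GFP_NOIO pages are lowmem). */
	pl = pl_elem(obj[0].pl, 1);
	if (pl)
		memset(page_address(pl->page), 0, PAGE_SIZE);

	dm_mem_cache_free(client, obj);
	dm_mem_cache_client_destroy(client);
	return 0;
}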
411 | --- /dev/null | |
412 | +++ b/drivers/md/dm-message.c | |
413 | @@ -0,0 +1,182 @@ | |
414 | +/* | |
415 | + * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved. | |
416 | + * | |
417 | + * Module Author: Heinz Mauelshagen <heinzm@redhat.com> | |
418 | + * | |
419 | + * General device-mapper message interface argument parser. | |
420 | + * | |
421 | + * This file is released under the GPL. | |
422 | + * | |
423 | + * device-mapper message parser. | |
424 | + * | |
425 | + */ | |
426 | + | |
427 | +#include "dm.h" | |
428 | +#include "dm-message.h" | |
429 | +#include <linux/kernel.h> | |
430 | + | |
431 | +#define DM_MSG_PREFIX "dm_message" | |
432 | + | |
433 | +/* Basename of a path. */ | |
434 | +static inline char * | |
435 | +basename(char *s) | |
436 | +{ | |
437 | + char *p = strrchr(s, '/'); | |
438 | + | |
439 | + return p ? p + 1 : s; | |
440 | +} | |
441 | + | |
442 | +/* Get an argument depending on type. */ | |
443 | +static void | |
444 | +message_arguments(struct dm_msg *msg, int argc, char **argv) | |
445 | +{ | |
446 | + | |
447 | + if (argc) { | |
448 | + int i; | |
449 | + struct dm_message_argument *args = msg->spec->args; | |
450 | + | |
451 | + for (i = 0; i < args->num_args; i++) { | |
452 | + int r; | |
453 | + unsigned long **ptr = args->ptr; | |
454 | + enum dm_message_argument_type type = args->types[i]; | |
455 | + | |
456 | + switch (type) { | |
457 | + case dm_msg_base_t: | |
458 | + ((char **) ptr)[i] = basename(argv[i]); | |
459 | + break; | |
460 | + | |
461 | + case dm_msg_str_t: | |
462 | + ((char **) ptr)[i] = argv[i]; | |
463 | + break; | |
464 | + | |
465 | + case dm_msg_int_t: | |
466 | + r = sscanf(argv[i], "%d", ((int **) ptr)[i]); | |
467 | + goto check; | |
468 | + | |
469 | + case dm_msg_uint_t: | |
470 | + r = sscanf(argv[i], "%u", | |
471 | + ((unsigned **) ptr)[i]); | |
472 | + goto check; | |
473 | + | |
474 | + case dm_msg_uint64_t: | |
475 | + r = sscanf(argv[i], "%llu", | |
476 | + ((unsigned long long **) ptr)[i]); | |
477 | + | |
478 | +check: | |
479 | + if (r != 1) { | |
480 | + set_bit(dm_msg_ret_undef, &msg->ret); | |
481 | + set_bit(dm_msg_ret_arg, &msg->ret); | |
482 | + } | |
483 | + } | |
484 | + } | |
485 | + } | |
486 | +} | |
487 | + | |
488 | +/* Parse message options. */ | |
489 | +static void | |
490 | +message_options_parse(struct dm_msg *msg, int argc, char **argv) | |
491 | +{ | |
492 | + int hit = 0; | |
493 | + unsigned long *action; | |
494 | + size_t l1 = strlen(*argv), l_hit = 0; | |
495 | + struct dm_message_option *o = msg->spec->options; | |
496 | + char **option, **option_end = o->options + o->num_options; | |
497 | + | |
498 | + for (option = o->options, action = o->actions; | |
499 | + option < option_end; option++, action++) { | |
500 | + size_t l2 = strlen(*option); | |
501 | + | |
502 | + if (!strnicmp(*argv, *option, min(l1, l2))) { | |
503 | + hit++; | |
504 | + l_hit = l2; | |
505 | + set_bit(*action, &msg->action); | |
506 | + } | |
507 | + } | |
508 | + | |
509 | + /* Assume error. */ | |
510 | + msg->ret = 0; | |
511 | + set_bit(dm_msg_ret_option, &msg->ret); | |
512 | + if (!hit || l1 > l_hit) | |
513 | + set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */ | |
514 | + else if (hit > 1) | |
515 | + set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/ | |
516 | + else { | |
517 | + clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */ | |
518 | + message_arguments(msg, --argc, ++argv); | |
519 | + } | |
520 | +} | |
521 | + | |
522 | +static inline void | |
523 | +print_ret(const char *caller, unsigned long ret) | |
524 | +{ | |
525 | + struct { | |
526 | + unsigned long err; | |
527 | + const char *err_str; | |
528 | + } static err_msg[] = { | |
529 | + { dm_msg_ret_ambiguous, "message ambiguous" }, | |
530 | + { dm_msg_ret_inval, "message invalid" }, | |
531 | + { dm_msg_ret_undef, "message undefined" }, | |
532 | + { dm_msg_ret_arg, "message argument" }, | |
533 | + { dm_msg_ret_argcount, "message argument count" }, | |
534 | + { dm_msg_ret_option, "option" }, | |
535 | + }, *e = ARRAY_END(err_msg); | |
536 | + | |
537 | + while (e-- > err_msg) { | |
538 | + if (test_bit(e->err, &ret)) | |
539 | + DMERR("%s %s", caller, e->err_str); | |
540 | + } | |
541 | +} | |
542 | + | |
543 | +/* Parse a message action. */ | |
544 | +int | |
545 | +dm_message_parse(const char *caller, struct dm_msg *msg, void *context, | |
546 | + int argc, char **argv) | |
547 | +{ | |
548 | + int hit = 0; | |
549 | + size_t l1 = strlen(*argv), l_hit = 0; | |
550 | + struct dm_msg_spec *s, *s_hit = NULL, | |
551 | + *s_end = msg->specs + msg->num_specs; | |
552 | + | |
553 | + if (argc < 2) | |
554 | + return -EINVAL; | |
555 | + | |
556 | + for (s = msg->specs; s < s_end; s++) { | |
557 | + size_t l2 = strlen(s->cmd); | |
558 | + | |
559 | + if (!strnicmp(*argv, s->cmd, min(l1, l2))) { | |
560 | + hit++; | |
561 | + l_hit = l2; | |
562 | + s_hit = s; | |
563 | + } | |
564 | + } | |
565 | + | |
566 | + msg->ret = 0; | |
567 | + if (!hit || l1 > l_hit) /* No hit or message string too long. */ | |
568 | + set_bit(dm_msg_ret_undef, &msg->ret); | |
569 | + else if (hit > 1) /* Ambiguous message. */ | |
570 | + set_bit(dm_msg_ret_ambiguous, &msg->ret); | |
571 | + else if (argc - 2 != s_hit->args->num_args) { | |
572 | + set_bit(dm_msg_ret_undef, &msg->ret); | |
573 | + set_bit(dm_msg_ret_argcount, &msg->ret); | |
574 | + } | |
575 | + | |
576 | + if (msg->ret) | |
577 | + goto bad; | |
578 | + | |
579 | + msg->action = 0; | |
580 | + msg->spec = s_hit; | |
581 | + set_bit(s_hit->action, &msg->action); | |
582 | + message_options_parse(msg, --argc, ++argv); | |
583 | + | |
584 | + if (!msg->ret) | |
585 | + return msg->spec->f(msg, context); | |
586 | + | |
587 | +bad: | |
588 | + print_ret(caller, msg->ret); | |
589 | + return -EINVAL; | |
590 | +} | |
591 | +EXPORT_SYMBOL(dm_message_parse); | |
592 | + | |
593 | +MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser"); | |
594 | +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>"); | |
595 | +MODULE_LICENSE("GPL"); | |
596 | --- /dev/null | |
597 | +++ b/drivers/md/dm-message.h | |
598 | @@ -0,0 +1,91 @@ | |
599 | +/* | |
600 | + * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved. | |
601 | + * | |
602 | + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de> | |
603 | + * | |
604 | + * General device-mapper message interface argument parser. | |
605 | + * | |
606 | + * This file is released under the GPL. | |
607 | + * | |
608 | + */ | |
609 | + | |
610 | +#ifndef DM_MESSAGE_H | |
611 | +#define DM_MESSAGE_H | |
612 | + | |
613 | +/* Factor out to dm.h. */ | |
614 | +/* Reference to array end. */ | |
615 | +#define ARRAY_END(a) ((a) + ARRAY_SIZE(a)) | |
616 | + | |
617 | +/* Message return bits. */ | |
618 | +enum dm_message_return { | |
619 | + dm_msg_ret_ambiguous, /* Action ambiguous. */ | |
620 | + dm_msg_ret_inval, /* Action invalid. */ | |
621 | + dm_msg_ret_undef, /* Action undefined. */ | |
622 | + | |
623 | + dm_msg_ret_option, /* Option error. */ | |
624 | + dm_msg_ret_arg, /* Argument error. */ | |
625 | + dm_msg_ret_argcount, /* Argument count error. */ | |
626 | +}; | |
627 | + | |
628 | +/* Message argument type conversions. */ | |
629 | +enum dm_message_argument_type { | |
630 | + dm_msg_base_t, /* Basename string. */ | |
631 | + dm_msg_str_t, /* String. */ | |
632 | + dm_msg_int_t, /* Signed int. */ | |
633 | + dm_msg_uint_t, /* Unsigned int. */ | |
634 | + dm_msg_uint64_t, /* Unsigned int 64. */ | |
635 | +}; | |
636 | + | |
637 | +/* A message option. */ | |
638 | +struct dm_message_option { | |
639 | + unsigned num_options; | |
640 | + char **options; | |
641 | + unsigned long *actions; | |
642 | +}; | |
643 | + | |
644 | +/* Message arguments and types. */ | |
645 | +struct dm_message_argument { | |
646 | + unsigned num_args; | |
647 | + unsigned long **ptr; | |
648 | + enum dm_message_argument_type types[]; | |
649 | +}; | |
650 | + | |
651 | +/* Client message. */ | |
652 | +struct dm_msg { | |
653 | + unsigned long action; /* Identified action. */ | |
654 | + unsigned long ret; /* Return bits. */ | |
655 | + unsigned num_specs; /* # of specifications listed. */ | |
656 | + struct dm_msg_spec *specs; /* Specification list. */ | |
657 | + struct dm_msg_spec *spec; /* Specification selected. */ | |
658 | +}; | |
659 | + | |
660 | +/* Specification of the message. */ | |
661 | +struct dm_msg_spec { | |
662 | + const char *cmd; /* Name of the command (i.e. 'bandwidth'). */ | |
663 | + unsigned long action; | |
664 | + struct dm_message_option *options; | |
665 | + struct dm_message_argument *args; | |
666 | + unsigned long parm; /* Parameter to pass through to callback. */ | |
667 | + /* Function to process for action. */ | |
668 | + int (*f) (struct dm_msg *msg, void *context); | |
669 | +}; | |
670 | + | |
671 | +/* Parameter access macros. */ | |
672 | +#define DM_MSG_PARM(msg) ((msg)->spec->parm) | |
673 | + | |
674 | +#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx]) | |
675 | +#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx]) | |
676 | +#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARGS(msg, idx)) | |
677 | +#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx]) | |
678 | + | |
679 | +#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0) | |
680 | +#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0) | |
681 | +#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0) | |
682 | +#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0) | |
683 | + | |
684 | + | |
685 | +/* Parse a message and its options and optionally call a function back. */ | |
686 | +int dm_message_parse(const char *caller, struct dm_msg *msg, void *context, | |
687 | + int argc, char **argv); | |
688 | + | |
689 | +#endif | |
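To make the parsing flow concrete, here is a hypothetical sketch of how a target's .message method could drive dm_message_parse(). None of this is in the patch: every example_* name is invented, and the message it implements ("bandwidth set <uint>") simply follows the cmd/option/argument layout of the structures above. Static initialization of the flexible types[] array relies on a GCC extension.

/* Hypothetical illustration only -- not part of this patch. */
static unsigned example_bandwidth;
static unsigned long *example_arg_ptrs[] = {
	(unsigned long *) &example_bandwidth,
};
static struct dm_message_argument example_args = {
	1, example_arg_ptrs, { dm_msg_uint_t },
};

static char *example_option_names[] = { "set" };
static unsigned long example_option_actions[] = { 0 };
static struct dm_message_option example_options = {
	1, example_option_names, example_option_actions,
};

static int example_bandwidth_action(struct dm_msg *msg, void *context)
{
	/* The parser has already filled in example_bandwidth here. */
	return 0;
}

static struct dm_msg_spec example_specs[] = {
	{ "bandwidth", 1, &example_options, &example_args, 0,
	  example_bandwidth_action },
};

static struct dm_msg example_msg = {
	.num_specs = ARRAY_SIZE(example_specs),
	.specs = example_specs,
};

/* Suitable for a target's .message method; parses "bandwidth set <uint>". */
static int example_message(struct dm_target *ti, unsigned argc, char **argv)
{
	return dm_message_parse("example", &example_msg, ti->private,
				argc, argv);
}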
690 | --- /dev/null | |
691 | +++ b/drivers/md/dm-raid45.c | |
692 | @@ -0,0 +1,4516 @@ | |
693 | +/* | |
694 | + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. | |
695 | + * | |
696 | + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com> | |
697 | + * | |
698 | + * This file is released under the GPL. | |
699 | + * | |
700 | + * | |
701 | + * Linux 2.6 Device Mapper RAID4 and RAID5 target. | |
702 | + * | |
703 | + * Supports: | |
704 | + * o RAID4 with dedicated and selectable parity device | |
705 | + * o RAID5 with rotating parity (left+right, symmetric+asymmetric) | |
706 | + * o run time optimization of xor algorithm used to calculate parity | |
707 | + * | |
708 | + * | |
709 | + * Thanks to MD for: | |
710 | + * o the raid address calculation algorithm | |
711 | + * o the base of the biovec <-> page list copier. | |
712 | + * | |
713 | + * | |
714 | + * Uses region hash to keep track of how many writes are in flight to | |
715 | + * regions in order to use dirty log to keep state of regions to recover: | |
716 | + * | |
717 | + * o clean regions (those which are synchronized | |
718 | + * and don't have write io in flight) | |
719 | + * o dirty regions (those with write io in flight) | |
720 | + * | |
721 | + * | |
722 | + * On startup, any dirty regions are migrated to the 'nosync' state | |
723 | + * and are subject to recovery by the daemon. | |
724 | + * | |
725 | + * See raid_ctr() for table definition. | |
726 | + * | |
727 | + * | |
728 | + * FIXME: | |
729 | + * o add virtual interface for locking | |
730 | + * o remove instrumentation (REMOVEME:) | |
731 | + * | |
732 | + */ | |
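As a rough illustration of the parity placement the listed algorithms imply (the conventional MD-style layout this comment credits), a small sketch follows. It is not the target's own raid_address(), which appears later in this file.

/*
 * Illustration only: parity device index for the "left" layouts,
 * where parity rotates downwards from the last device.  For the
 * symmetric variants, data chunks then start on the device
 * following the parity device and wrap around.
 */
static unsigned example_left_parity_index(sector_t stripe_nr, unsigned raid_devs)
{
	/* sector_div() divides stripe_nr in place and returns the remainder. */
	unsigned rem = sector_div(stripe_nr, raid_devs);

	return raid_devs - 1 - rem;
}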
733 | + | |
734 | +static const char *version = "v0.2431"; | |
735 | + | |
736 | +#include "dm.h" | |
737 | +#include "dm-bio-list.h" | |
738 | +#include "dm-memcache.h" | |
739 | +#include "dm-message.h" | |
740 | +#include "dm-raid45.h" | |
741 | + | |
742 | +#include <linux/kernel.h> | |
743 | +#include <linux/vmalloc.h> | |
744 | + | |
745 | +#include <linux/dm-io.h> | |
746 | +#include <linux/dm-dirty-log.h> | |
747 | +#include <linux/dm-regions.h> | |
748 | + | |
749 | +/* # of parallel recovered regions */ | |
750 | +/* FIXME: cope with multiple recovery stripes in raid_set struct. */ | |
751 | +#define MAX_RECOVER 1 /* needs to be 1! */ | |
752 | + | |
753 | +/* | |
754 | + * Configurable parameters | |
755 | + */ | |
756 | +#define INLINE | |
757 | + | |
758 | +/* Default # of stripes if not set in constructor. */ | |
759 | +#define STRIPES 64 | |
760 | + | |
761 | +/* Minimum/maximum # of selectable stripes. */ | |
762 | +#define STRIPES_MIN 8 | |
763 | +#define STRIPES_MAX 16384 | |
764 | + | |
765 | +/* Default chunk size in sectors if not set in constructor. */ | |
766 | +#define CHUNK_SIZE 64 | |
767 | + | |
768 | +/* Default io size in sectors if not set in constructor. */ | |
769 | +#define IO_SIZE_MIN SECTORS_PER_PAGE | |
770 | +#define IO_SIZE IO_SIZE_MIN | |
771 | + | |
772 | +/* Maximum settable chunk size in sectors. */ | |
773 | +#define CHUNK_SIZE_MAX 16384 | |
774 | + | |
775 | +/* Recover io size default in sectors. */ | |
776 | +#define RECOVER_IO_SIZE_MIN 64 | |
777 | +#define RECOVER_IO_SIZE 256 | |
778 | + | |
779 | +/* Default percentage recover io bandwidth. */ | |
780 | +#define BANDWIDTH 10 | |
781 | +#define BANDWIDTH_MIN 1 | |
782 | +#define BANDWIDTH_MAX 100 | |
783 | +/* | |
784 | + * END Configurable parameters | |
785 | + */ | |
786 | + | |
787 | +#define TARGET "dm-raid45" | |
788 | +#define DAEMON "kraid45d" | |
789 | +#define DM_MSG_PREFIX TARGET | |
790 | + | |
791 | +#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT) | |
792 | + | |
793 | +/* Amount/size for __xor(). */ | |
794 | +#define SECTORS_PER_XOR SECTORS_PER_PAGE | |
795 | +#define XOR_SIZE PAGE_SIZE | |
796 | + | |
797 | +/* Derive raid_set from stripe_cache pointer. */ | |
798 | +#define RS(x) container_of(x, struct raid_set, sc) | |
799 | + | |
800 | +/* Check value in range. */ | |
801 | +#define range_ok(i, min, max) ((i) >= (min) && (i) <= (max)) | |
802 | + | |
803 | +/* Page reference. */ | |
804 | +#define PAGE(stripe, p) ((stripe)->obj[p].pl->page) | |
805 | + | |
806 | +/* Bio list reference. */ | |
807 | +#define BL(stripe, p, rw) (stripe->ss[p].bl + rw) | |
808 | + | |
809 | +/* Page list reference. */ | |
810 | +#define PL(stripe, p) (stripe->obj[p].pl) | |
811 | + | |
812 | +/* Check argument is power of 2. */ | |
813 | +#define POWER_OF_2(a) (!((a) & ((a) - 1))) | |
814 | + | |
815 | +/* Factor out to dm-bio-list.h */ | |
816 | +static inline void bio_list_push(struct bio_list *bl, struct bio *bio) | |
817 | +{ | |
818 | + bio->bi_next = bl->head; | |
819 | + bl->head = bio; | |
820 | + | |
821 | + if (!bl->tail) | |
822 | + bl->tail = bio; | |
823 | +} | |
824 | + | |
825 | +/* Factor out to dm.h */ | |
826 | +#define TI_ERR_RET(str, ret) \ | |
827 | + do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0); | |
828 | +#define TI_ERR(str) TI_ERR_RET(str, -EINVAL) | |
829 | + | |
830 | +/*----------------------------------------------------------------- | |
831 | + * Stripe cache | |
832 | + * | |
833 | + * Cache for all reads and writes to raid sets (operational or degraded) | |
834 | + * | |
835 | + * We need to run all data to and from a RAID set through this cache, | |
836 | + * because parity chunks need to get calculated from data chunks | |
837 | + * or, in the degraded/resynchronization case, missing chunks need | |
838 | + * to be reconstructed using the other chunks of the stripe. | |
839 | + *---------------------------------------------------------------*/ | |
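The reconstruction mentioned above is plain XOR parity. A standalone illustration (not the target's __xor() path, which follows later in this file):

/*
 * Illustration only: a missing chunk is recovered by XORing the
 * corresponding words of all remaining chunks of the stripe.
 */
static void example_xor_reconstruct(unsigned long *missing,
				    unsigned long **others,
				    unsigned count, unsigned words)
{
	unsigned w, c;

	for (w = 0; w < words; w++) {
		unsigned long word = 0;

		for (c = 0; c < count; c++)
			word ^= others[c][w];

		missing[w] = word;
	}
}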
840 | +/* Protect kmem cache # counter. */ | |
841 | +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */ | |
842 | + | |
843 | +/* A stripe set (holds bios hanging off). */ | |
844 | +struct stripe_set { | |
845 | + struct stripe *stripe; /* Backpointer to stripe for endio(). */ | |
846 | + struct bio_list bl[3]; /* Reads, writes, and writes merged. */ | |
847 | +#define WRITE_MERGED 2 | |
848 | +}; | |
849 | + | |
850 | +#if READ != 0 || WRITE != 1 | |
851 | +#error dm-raid45: READ/WRITE != 0/1 used as index!!! | |
852 | +#endif | |
853 | + | |
854 | +/* | |
855 | + * Stripe linked list indexes. Keep order, because the stripe | |
856 | + * and the stripe cache rely on the first 3! | |
857 | + */ | |
858 | +enum list_types { | |
859 | + LIST_IO = 0, /* Stripes with io pending. */ | |
860 | + LIST_ENDIO, /* Stripes to endio. */ | |
861 | + LIST_LRU, /* Least recently used stripes. */ | |
862 | + LIST_HASH, /* Hashed stripes. */ | |
863 | + LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */ | |
864 | + NR_LISTS, /* To size array in struct stripe. */ | |
865 | +}; | |
866 | + | |
867 | +enum lock_types { | |
868 | + LOCK_ENDIO = 0, /* Protect endio list. */ | |
869 | + LOCK_LRU, /* Protect lru list. */ | |
870 | + NR_LOCKS, /* To size array in struct stripe_cache. */ | |
871 | +}; | |
872 | + | |
873 | +/* A stripe: the io object to handle all reads and writes to a RAID set. */ | |
874 | +struct stripe { | |
875 | + struct stripe_cache *sc; /* Backpointer to stripe cache. */ | |
876 | + | |
877 | + sector_t key; /* Hash key. */ | |
878 | + sector_t region; /* Region stripe is mapped to. */ | |
879 | + | |
880 | + /* Reference count. */ | |
881 | + atomic_t cnt; | |
882 | + | |
883 | + struct { | |
884 | + unsigned long flags; /* flags (see below). */ | |
885 | + | |
886 | + /* | |
887 | + * Pending ios in flight: | |
888 | + * | |
889 | + * used as a 'lock' to control move of stripe to endio list | |
890 | + */ | |
891 | + atomic_t pending; /* Pending ios in flight. */ | |
892 | + | |
893 | + /* Sectors to read and write for multi page stripe sets. */ | |
894 | + unsigned size; | |
895 | + } io; | |
896 | + | |
897 | + /* Lock on stripe (for clustering). */ | |
898 | + void *lock; | |
899 | + | |
900 | + /* | |
901 | + * 4 linked lists: | |
902 | + * o io list to flush io | |
903 | + * o endio list | |
904 | + * o LRU list to put stripes w/o reference count on | |
905 | + * o stripe cache hash | |
906 | + */ | |
907 | + struct list_head lists[NR_LISTS]; | |
908 | + | |
909 | + struct { | |
910 | + unsigned short parity; /* Parity chunk index. */ | |
911 | + short recover; /* Recovery chunk index. */ | |
912 | + } idx; | |
913 | + | |
914 | + /* This stripe's memory cache object (dm-mem-cache). */ | |
915 | + struct dm_mem_cache_object *obj; | |
916 | + | |
917 | + /* Array of stripe sets (dynamically allocated). */ | |
918 | + struct stripe_set ss[0]; | |
919 | +}; | |
920 | + | |
921 | +/* States stripes can be in (flags field). */ | |
922 | +enum stripe_states { | |
923 | + STRIPE_ACTIVE, /* Active io on stripe. */ | |
924 | + STRIPE_ERROR, /* io error on stripe. */ | |
925 | + STRIPE_MERGED, /* Writes got merged. */ | |
926 | + STRIPE_READ, /* Read. */ | |
927 | + STRIPE_RBW, /* Read-before-write. */ | |
928 | + STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */ | |
929 | + STRIPE_RECOVER, /* Stripe used for RAID set recovery. */ | |
930 | +}; | |
931 | + | |
932 | +/* ... and macros to access them. */ | |
933 | +#define BITOPS(name, what, var, flag) \ | |
934 | +static inline int TestClear ## name ## what(struct var *v) \ | |
935 | +{ return test_and_clear_bit(flag, &v->io.flags); } \ | |
936 | +static inline int TestSet ## name ## what(struct var *v) \ | |
937 | +{ return test_and_set_bit(flag, &v->io.flags); } \ | |
938 | +static inline void Clear ## name ## what(struct var *v) \ | |
939 | +{ clear_bit(flag, &v->io.flags); } \ | |
940 | +static inline void Set ## name ## what(struct var *v) \ | |
941 | +{ set_bit(flag, &v->io.flags); } \ | |
942 | +static inline int name ## what(struct var *v) \ | |
943 | +{ return test_bit(flag, &v->io.flags); } | |
944 | + | |
945 | + | |
946 | +BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE) | |
947 | +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED) | |
948 | +BITOPS(Stripe, Error, stripe, STRIPE_ERROR) | |
949 | +BITOPS(Stripe, Read, stripe, STRIPE_READ) | |
950 | +BITOPS(Stripe, RBW, stripe, STRIPE_RBW) | |
951 | +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT) | |
952 | +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER) | |
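For readability of the following code, the first invocation above expands roughly to the accessors below; the remaining invocations generate the equivalent Stripe* helpers for their respective flags.

/* What BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE) generates: */
static inline int TestClearStripeActive(struct stripe *v)
{ return test_and_clear_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline int TestSetStripeActive(struct stripe *v)
{ return test_and_set_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline void ClearStripeActive(struct stripe *v)
{ clear_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline void SetStripeActive(struct stripe *v)
{ set_bit(STRIPE_ACTIVE, &v->io.flags); }
static inline int StripeActive(struct stripe *v)
{ return test_bit(STRIPE_ACTIVE, &v->io.flags); }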
953 | + | |
954 | +/* A stripe hash. */ | |
955 | +struct stripe_hash { | |
956 | + struct list_head *hash; | |
957 | + unsigned buckets; | |
958 | + unsigned mask; | |
959 | + unsigned prime; | |
960 | + unsigned shift; | |
961 | +}; | |
962 | + | |
963 | +/* A stripe cache. */ | |
964 | +struct stripe_cache { | |
965 | + /* Stripe hash. */ | |
966 | + struct stripe_hash hash; | |
967 | + | |
968 | + /* Stripes with io to flush, stripes to endio and LRU lists. */ | |
969 | + struct list_head lists[3]; | |
970 | + | |
971 | + /* Locks to protect endio and lru lists. */ | |
972 | + spinlock_t locks[NR_LOCKS]; | |
973 | + | |
974 | + /* Slab cache to allocate stripes from. */ | |
975 | + struct { | |
976 | + struct kmem_cache *cache; /* Cache itself. */ | |
977 | + char name[32]; /* Unique name. */ | |
978 | + } kc; | |
979 | + | |
980 | + struct dm_io_client *dm_io_client; /* dm-io client resource context. */ | |
981 | + | |
982 | + /* dm-mem-cache client resource context. */ | |
983 | + struct dm_mem_cache_client *mem_cache_client; | |
984 | + | |
985 | + int stripes_parm; /* # stripes parameter from constructor. */ | |
986 | + atomic_t stripes; /* actual # of stripes in cache. */ | |
987 | + atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */ | |
988 | + atomic_t stripes_last; /* last # of stripes in cache. */ | |
989 | + atomic_t active_stripes; /* actual # of active stripes in cache. */ | |
990 | + | |
991 | + /* REMOVEME: */ | |
992 | + atomic_t max_active_stripes; /* actual # of active stripes in cache. */ | |
993 | +}; | |
994 | + | |
995 | +/* Flag specs for raid_dev. */ | |
996 | +enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED }; | |
997 | + | |
998 | +/* The raid device in a set. */ | |
999 | +struct raid_dev { | |
1000 | + struct dm_dev *dev; | |
1001 | + unsigned long flags; /* raid_dev_flags. */ | |
1002 | + sector_t start; /* offset to map to. */ | |
1003 | +}; | |
1004 | + | |
1005 | +/* Flags spec for raid_set. */ | |
1006 | +enum raid_set_flags { | |
1007 | + RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */ | |
1008 | + RS_DEAD, /* RAID set inoperational. */ | |
1009 | + RS_DEVEL_STATS, /* REMOVEME: display status information. */ | |
1010 | + RS_IO_ERROR, /* io error on set. */ | |
1011 | + RS_RECOVER, /* Do recovery. */ | |
1012 | + RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */ | |
1013 | + RS_REGION_GET, /* get a region to recover. */ | |
1014 | + RS_SC_BUSY, /* stripe cache busy -> send an event. */ | |
1015 | + RS_SUSPENDED, /* RAID set suspended. */ | |
1016 | +}; | |
1017 | + | |
1018 | +/* REMOVEME: devel stats counters. */ | |
1019 | +enum stats_types { | |
1020 | + S_BIOS_READ, | |
1021 | + S_BIOS_ADDED_READ, | |
1022 | + S_BIOS_ENDIO_READ, | |
1023 | + S_BIOS_WRITE, | |
1024 | + S_BIOS_ADDED_WRITE, | |
1025 | + S_BIOS_ENDIO_WRITE, | |
1026 | + S_CAN_MERGE, | |
1027 | + S_CANT_MERGE, | |
1028 | + S_CONGESTED, | |
1029 | + S_DM_IO_READ, | |
1030 | + S_DM_IO_WRITE, | |
1031 | + S_ACTIVE_READS, | |
1032 | + S_BANDWIDTH, | |
1033 | + S_BARRIER, | |
1034 | + S_BIO_COPY_PL_NEXT, | |
1035 | + S_DEGRADED, | |
1036 | + S_DELAYED_BIOS, | |
1037 | + S_EVICT, | |
1038 | + S_FLUSHS, | |
1039 | + S_HITS_1ST, | |
1040 | + S_IOS_POST, | |
1041 | + S_INSCACHE, | |
1042 | + S_MAX_LOOKUP, | |
1043 | + S_MERGE_PAGE_LOCKED, | |
1044 | + S_NO_BANDWIDTH, | |
1045 | + S_NOT_CONGESTED, | |
1046 | + S_NO_RW, | |
1047 | + S_NOSYNC, | |
1048 | + S_PROHIBITPAGEIO, | |
1049 | + S_RECONSTRUCT_EI, | |
1050 | + S_RECONSTRUCT_DEV, | |
1051 | + S_REDO, | |
1052 | + S_REQUEUE, | |
1053 | + S_STRIPE_ERROR, | |
1054 | + S_SUM_DELAYED_BIOS, | |
1055 | + S_XORS, | |
1056 | + S_NR_STATS, /* # of stats counters. */ | |
1057 | +}; | |
1058 | + | |
1059 | +/* Status type -> string mappings. */ | |
1060 | +struct stats_map { | |
1061 | + const enum stats_types type; | |
1062 | + const char *str; | |
1063 | +}; | |
1064 | + | |
1065 | +static struct stats_map stats_map[] = { | |
1066 | + { S_BIOS_READ, "r=" }, | |
1067 | + { S_BIOS_ADDED_READ, "/" }, | |
1068 | + { S_BIOS_ENDIO_READ, "/" }, | |
1069 | + { S_BIOS_WRITE, " w=" }, | |
1070 | + { S_BIOS_ADDED_WRITE, "/" }, | |
1071 | + { S_BIOS_ENDIO_WRITE, "/" }, | |
1072 | + { S_DM_IO_READ, " rc=" }, | |
1073 | + { S_DM_IO_WRITE, " wc=" }, | |
1074 | + { S_ACTIVE_READS, " active_reads=" }, | |
1075 | + { S_BANDWIDTH, " bandwidth=" }, | |
1076 | + { S_NO_BANDWIDTH, " no_bandwidth=" }, | |
1077 | + { S_BARRIER, " barrier=" }, | |
1078 | + { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" }, | |
1079 | + { S_CAN_MERGE, " can_merge=" }, | |
1080 | + { S_MERGE_PAGE_LOCKED, "/page_locked=" }, | |
1081 | + { S_CANT_MERGE, "/cant_merge=" }, | |
1082 | + { S_CONGESTED, " congested=" }, | |
1083 | + { S_NOT_CONGESTED, "/not_congested=" }, | |
1084 | + { S_DEGRADED, " degraded=" }, | |
1085 | + { S_DELAYED_BIOS, " delayed_bios=" }, | |
1086 | + { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" }, | |
1087 | + { S_EVICT, " evict=" }, | |
1088 | + { S_FLUSHS, " flushs=" }, | |
1089 | + { S_HITS_1ST, " hits_1st=" }, | |
1090 | + { S_IOS_POST, " ios_post=" }, | |
1091 | + { S_INSCACHE, " inscache=" }, | |
1092 | + { S_MAX_LOOKUP, " max_lookup=" }, | |
1093 | + { S_NO_RW, " no_rw=" }, | |
1094 | + { S_NOSYNC, " nosync=" }, | |
1095 | + { S_PROHIBITPAGEIO, " ProhibitPageIO=" }, | |
1096 | + { S_RECONSTRUCT_EI, " reconstruct_ei=" }, | |
1097 | + { S_RECONSTRUCT_DEV, " reconstruct_dev=" }, | |
1098 | + { S_REDO, " redo=" }, | |
1099 | + { S_REQUEUE, " requeue=" }, | |
1100 | + { S_STRIPE_ERROR, " stripe_error=" }, | |
1101 | + { S_XORS, " xors=" }, | |
1102 | +}; | |
1103 | + | |
1104 | +/* | |
1105 | + * A RAID set. | |
1106 | + */ | |
1107 | +typedef void (*xor_function_t)(unsigned count, unsigned long **data); | |
1108 | +struct raid_set { | |
1109 | + struct dm_target *ti; /* Target pointer. */ | |
1110 | + | |
1111 | + struct { | |
1112 | + unsigned long flags; /* State flags. */ | |
1113 | + spinlock_t in_lock; /* Protects central input list below. */ | |
1114 | + struct bio_list in; /* Pending ios (central input list). */ | |
1115 | + struct bio_list work; /* ios work set. */ | |
1116 | + wait_queue_head_t suspendq; /* suspend synchronization. */ | |
1117 | + atomic_t in_process; /* counter of queued bios (suspendq). */ | |
1118 | + atomic_t in_process_max;/* counter of queued bios max. */ | |
1119 | + | |
1120 | + /* io work. */ | |
1121 | + struct workqueue_struct *wq; | |
1122 | + struct delayed_work dws; | |
1123 | + } io; | |
1124 | + | |
1125 | + /* External locking. */ | |
1126 | + struct dm_raid45_locking_type *locking; | |
1127 | + | |
1128 | + struct stripe_cache sc; /* Stripe cache for this set. */ | |
1129 | + | |
1130 | + /* Xor optimization. */ | |
1131 | + struct { | |
1132 | + struct xor_func *f; | |
1133 | + unsigned chunks; | |
1134 | + unsigned speed; | |
1135 | + } xor; | |
1136 | + | |
1137 | + /* Recovery parameters. */ | |
1138 | + struct recover { | |
1139 | + struct dm_dirty_log *dl; /* Dirty log. */ | |
1140 | + struct dm_rh_client *rh; /* Region hash. */ | |
1141 | + | |
1142 | + /* dm-mem-cache client resource context for recovery stripes. */ | |
1143 | + struct dm_mem_cache_client *mem_cache_client; | |
1144 | + | |
1145 | + struct list_head stripes; /* List of recovery stripes. */ | |
1146 | + | |
1147 | + region_t nr_regions; | |
1148 | + region_t nr_regions_to_recover; | |
1149 | + region_t nr_regions_recovered; | |
1150 | + unsigned long start_jiffies; | |
1151 | + unsigned long end_jiffies; | |
1152 | + | |
1153 | + unsigned bandwidth; /* Recovery bandwidth [%]. */ | |
1154 | + unsigned bandwidth_work; /* Recovery bandwidth [factor]. */ | |
1155 | + unsigned bandwidth_parm; /* " constructor parm. */ | |
1156 | + unsigned io_size; /* io size <= chunk size. */ | |
1157 | + unsigned io_size_parm; /* io size ctr parameter. */ | |
1158 | + | |
1159 | + /* recovery io throttling. */ | |
1160 | + atomic_t io_count[2]; /* counter recover/regular io. */ | |
1161 | + unsigned long last_jiffies; | |
1162 | + | |
1163 | + struct dm_region *reg; /* Actual region to recover. */ | |
1164 | + sector_t pos; /* Position within region to recover. */ | |
1165 | + sector_t end; /* End of region to recover. */ | |
1166 | + } recover; | |
1167 | + | |
1168 | + /* RAID set parameters. */ | |
1169 | + struct { | |
1170 | + struct raid_type *raid_type; /* RAID type (eg, RAID4). */ | |
1171 | + unsigned raid_parms; /* # variable raid parameters. */ | |
1172 | + | |
1173 | + unsigned chunk_size; /* Sectors per chunk. */ | |
1174 | + unsigned chunk_size_parm; | |
1175 | + unsigned chunk_mask; /* Mask for amount. */ | |
1176 | + unsigned chunk_shift; /* rsector chunk size shift. */ | |
1177 | + | |
1178 | + unsigned io_size; /* Sectors per io. */ | |
1179 | + unsigned io_size_parm; | |
1180 | + unsigned io_mask; /* Mask for amount. */ | |
1181 | + unsigned io_shift_mask; /* Mask for raid_address(). */ | |
1182 | + unsigned io_shift; /* rsector io size shift. */ | |
1183 | + unsigned pages_per_io; /* Pages per io. */ | |
1184 | + | |
1185 | + sector_t sectors_per_dev; /* Sectors per device. */ | |
1186 | + | |
1187 | + atomic_t failed_devs; /* Number of failed devices. */ | |
1188 | + | |
1189 | + /* Index of device to initialize. */ | |
1190 | + int dev_to_init; | |
1191 | + int dev_to_init_parm; | |
1192 | + | |
1193 | + /* Raid devices dynamically allocated. */ | |
1194 | + unsigned raid_devs; /* # of RAID devices below. */ | |
1195 | + unsigned data_devs; /* # of RAID data devices. */ | |
1196 | + | |
1197 | + int ei; /* index of failed RAID device. */ | |
1198 | + | |
1199 | + /* index of dedicated parity device (i.e. RAID4). */ | |
1200 | + int pi; | |
1201 | + int pi_parm; /* constructor parm for status output. */ | |
1202 | + } set; | |
1203 | + | |
1204 | + /* REMOVEME: devel stats counters. */ | |
1205 | + atomic_t stats[S_NR_STATS]; | |
1206 | + | |
1207 | + /* Dynamically allocated temporary pointers for xor(). */ | |
1208 | + unsigned long **data; | |
1209 | + | |
1210 | + /* Dynamically allocated RAID devices. Alignment? */ | |
1211 | + struct raid_dev dev[0]; | |
1212 | +}; | |
1213 | + | |
1214 | + | |
1215 | +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH) | |
1216 | +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE) | |
1217 | +BITOPS(RS, Dead, raid_set, RS_DEAD) | |
1218 | +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS) | |
1219 | +BITOPS(RS, IoError, raid_set, RS_IO_ERROR) | |
1220 | +BITOPS(RS, Recover, raid_set, RS_RECOVER) | |
1221 | +BITOPS(RS, RegionGet, raid_set, RS_REGION_GET) | |
1222 | +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY) | |
1223 | +BITOPS(RS, Suspended, raid_set, RS_SUSPENDED) | |
1224 | +#undef BITOPS | |
1225 | + | |
1226 | +#define PageIO(page) PageChecked(page) | |
1227 | +#define AllowPageIO(page) SetPageChecked(page) | |
1228 | +#define ProhibitPageIO(page) ClearPageChecked(page) | |
1229 | + | |
1230 | +/*----------------------------------------------------------------- | |
1231 | + * Raid-4/5 set structures. | |
1232 | + *---------------------------------------------------------------*/ | |
1233 | +/* RAID level definitions. */ | |
1234 | +enum raid_level { | |
1235 | + raid4, | |
1236 | + raid5, | |
1237 | +}; | |
1238 | + | |
1239 | +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */ | |
1240 | +enum raid_algorithm { | |
1241 | + none, | |
1242 | + left_asym, | |
1243 | + right_asym, | |
1244 | + left_sym, | |
1245 | + right_sym, | |
1246 | +}; | |
1247 | + | |
1248 | +struct raid_type { | |
1249 | + const char *name; /* RAID algorithm. */ | |
1250 | + const char *descr; /* Descriptor text for logging. */ | |
1251 | + const unsigned parity_devs; /* # of parity devices. */ | |
1252 | + const unsigned minimal_devs; /* minimal # of devices in set. */ | |
1253 | + const enum raid_level level; /* RAID level. */ | |
1254 | + const enum raid_algorithm algorithm; /* RAID algorithm. */ | |
1255 | +}; | |
1256 | + | |
1257 | +/* Supported raid types and properties. */ | |
1258 | +static struct raid_type raid_types[] = { | |
1259 | + {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none}, | |
1260 | + {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym}, | |
1261 | + {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym}, | |
1262 | + {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym}, | |
1263 | + {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym}, | |
1264 | +}; | |
1265 | + | |
1266 | +/* Address as calculated by raid_address(). */ | |
1267 | +struct address { | |
1268 | + sector_t key; /* Hash key (start address of stripe). */ | |
1269 | + unsigned di, pi; /* Data and parity disks index. */ | |
1270 | +}; | |
1271 | + | |
1272 | +/* REMOVEME: reset statistics counters. */ | |
1273 | +static void stats_reset(struct raid_set *rs) | |
1274 | +{ | |
1275 | + unsigned s = S_NR_STATS; | |
1276 | + | |
1277 | + while (s--) | |
1278 | + atomic_set(rs->stats + s, 0); | |
1279 | +} | |
1280 | + | |
1281 | +/*---------------------------------------------------------------- | |
1282 | + * RAID set management routines. | |
1283 | + *--------------------------------------------------------------*/ | |
1284 | +/* | |
1285 | + * Begin small helper functions. | |
1286 | + */ | |
1287 | +/* Queue (optionally delayed) io work. */ | |
1288 | +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay) | |
1289 | +{ | |
1290 | + struct delayed_work *dws = &rs->io.dws; | |
1291 | + | |
1292 | + cancel_delayed_work(dws); | |
1293 | + queue_delayed_work(rs->io.wq, dws, delay); | |
1294 | +} | |
1295 | + | |
1296 | +/* Queue io work immediately (called from region hash too). */ | |
1297 | +static INLINE void wake_do_raid(void *context) | |
1298 | +{ | |
1299 | + wake_do_raid_delayed(context, 0); | |
1300 | +} | |
1301 | + | |
1302 | +/* Wait until all io has been processed. */ | |
1303 | +static INLINE void wait_ios(struct raid_set *rs) | |
1304 | +{ | |
1305 | + wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process)); | |
1306 | +} | |
1307 | + | |
1308 | +/* Declare io queued to device. */ | |
1309 | +static INLINE void io_dev_queued(struct raid_dev *dev) | |
1310 | +{ | |
1311 | + set_bit(IO_QUEUED, &dev->flags); | |
1312 | +} | |
1313 | + | |
1314 | +/* Test and reset io queued to device. */ | |
1315 | +static inline int io_dev_clear(struct raid_dev *dev) | |
1316 | +{ | |
1317 | + return test_and_clear_bit(IO_QUEUED, &dev->flags); | |
1318 | +} | |
1319 | + | |
1320 | +/* Get an io reference. */ | |
1321 | +static INLINE void io_get(struct raid_set *rs) | |
1322 | +{ | |
1323 | + int p = atomic_inc_return(&rs->io.in_process); | |
1324 | + | |
1325 | + if (p > atomic_read(&rs->io.in_process_max)) | |
1326 | + atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */ | |
1327 | +} | |
1328 | + | |
1329 | +/* Put the io reference and conditionally wake io waiters. */ | |
1330 | +static INLINE void io_put(struct raid_set *rs) | |
1331 | +{ | |
1332 | + /* Intel: rebuild data corrupter? */ | |
1333 | + if (!atomic_read(&rs->io.in_process)) { | |
1334 | + DMERR("%s would go negative!!!", __func__); | |
1335 | + return; | |
1336 | + } | |
1337 | + | |
1338 | + if (atomic_dec_and_test(&rs->io.in_process)) | |
1339 | + wake_up(&rs->io.suspendq); | |
1340 | +} | |
1341 | + | |
1342 | +/* Calculate device sector offset. */ | |
1343 | +static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio) | |
1344 | +{ | |
1345 | + sector_t sector = bio->bi_sector; | |
1346 | + | |
1347 | + sector_div(sector, rs->set.data_devs); | |
1348 | + return sector; | |
1349 | +} | |
1350 | + | |
1351 | +/* Test device operational. */ | |
1352 | +static INLINE int dev_operational(struct raid_set *rs, unsigned p) | |
1353 | +{ | |
1354 | + return !test_bit(DEVICE_FAILED, &rs->dev[p].flags); | |
1355 | +} | |
1356 | + | |
1357 | +/* Return # of active stripes in stripe cache. */ | |
1358 | +static INLINE int sc_active(struct stripe_cache *sc) | |
1359 | +{ | |
1360 | + return atomic_read(&sc->active_stripes); | |
1361 | +} | |
1362 | + | |
1363 | +/* Test io pending on stripe. */ | |
1364 | +static INLINE int stripe_io(struct stripe *stripe) | |
1365 | +{ | |
1366 | + return atomic_read(&stripe->io.pending); | |
1367 | +} | |
1368 | + | |
1369 | +static INLINE void stripe_io_inc(struct stripe *stripe) | |
1370 | +{ | |
1371 | + atomic_inc(&stripe->io.pending); | |
1372 | +} | |
1373 | + | |
1374 | +static INLINE void stripe_io_dec(struct stripe *stripe) | |
1375 | +{ | |
1376 | + atomic_dec(&stripe->io.pending); | |
1377 | +} | |
1378 | + | |
1379 | +/* Wrapper needed by for_each_io_dev(). */ | |
1380 | +static void _stripe_io_inc(struct stripe *stripe, unsigned p) | |
1381 | +{ | |
1382 | + stripe_io_inc(stripe); | |
1383 | +} | |
1384 | + | |
1385 | +/* Error a stripe. */ | |
1386 | +static INLINE void stripe_error(struct stripe *stripe, struct page *page) | |
1387 | +{ | |
1388 | + SetStripeError(stripe); | |
1389 | + SetPageError(page); | |
1390 | + atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR); | |
1391 | +} | |
1392 | + | |
1393 | +/* Page IOed ok. */ | |
1394 | +enum dirty_type { CLEAN, DIRTY }; | |
1395 | +static INLINE void page_set(struct page *page, enum dirty_type type) | |
1396 | +{ | |
1397 | + switch (type) { | |
1398 | + case DIRTY: | |
1399 | + SetPageDirty(page); | |
1400 | + AllowPageIO(page); | |
1401 | + break; | |
1402 | + | |
1403 | + case CLEAN: | |
1404 | + ClearPageDirty(page); | |
1405 | + break; | |
1406 | + | |
1407 | + default: | |
1408 | + BUG(); | |
1409 | + } | |
1410 | + | |
1411 | + SetPageUptodate(page); | |
1412 | + ClearPageError(page); | |
1413 | +} | |
1414 | + | |
1415 | +/* Return region state for a sector. */ | |
1416 | +static INLINE int | |
1417 | +region_state(struct raid_set *rs, sector_t sector, unsigned long state) | |
1418 | +{ | |
1419 | + struct dm_rh_client *rh = rs->recover.rh; | |
1420 | + | |
1421 | + return RSRecover(rs) ? | |
1422 | + (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) & | |
1423 | + state) : 0; | |
1424 | +} | |
1425 | + | |
1426 | +/* Check maximum devices which may fail in a raid set. */ | |
1427 | +static inline int raid_set_degraded(struct raid_set *rs) | |
1428 | +{ | |
1429 | + return RSIoError(rs); | |
1430 | +} | |
1431 | + | |
1432 | +/* Check # of devices which may fail in a raid set. */ | |
1433 | +static INLINE int raid_set_operational(struct raid_set *rs) | |
1434 | +{ | |
1435 | + /* Too many failed devices -> BAD. */ | |
1436 | + return atomic_read(&rs->set.failed_devs) <= | |
1437 | + rs->set.raid_type->parity_devs; | |
1438 | +} | |
1439 | + | |
1440 | +/* | |
1441 | + * Return true in case a page_list should be read/written | |
1442 | + * | |
1443 | + * Conditions to read/write: | |
1444 | + * o 1st page in list not uptodate | |
1445 | + * o 1st page in list dirty | |
1446 | + * o if we optimized io away, we flag it using the pages checked bit. | |
1447 | + */ | |
1448 | +static INLINE unsigned page_io(struct page *page) | |
1449 | +{ | |
1450 | + /* Optimization: page was flagged to need io during first run. */ | |
1451 | + if (PagePrivate(page)) { | |
1452 | + ClearPagePrivate(page); | |
1453 | + return 1; | |
1454 | + } | |
1455 | + | |
1456 | + /* Avoid io if prohibited or a locked page. */ | |
1457 | + if (!PageIO(page) || PageLocked(page)) | |
1458 | + return 0; | |
1459 | + | |
1460 | + if (!PageUptodate(page) || PageDirty(page)) { | |
1461 | + /* Flag page needs io for second run optimization. */ | |
1462 | + SetPagePrivate(page); | |
1463 | + return 1; | |
1464 | + } | |
1465 | + | |
1466 | + return 0; | |
1467 | +} | |
1468 | + | |
1469 | +/* Call a function on each page list needing io. */ | |
1470 | +static INLINE unsigned | |
1471 | +for_each_io_dev(struct raid_set *rs, struct stripe *stripe, | |
1472 | + void (*f_io)(struct stripe *stripe, unsigned p)) | |
1473 | +{ | |
1474 | + unsigned p = rs->set.raid_devs, r = 0; | |
1475 | + | |
1476 | + while (p--) { | |
1477 | + if (page_io(PAGE(stripe, p))) { | |
1478 | + f_io(stripe, p); | |
1479 | + r++; | |
1480 | + } | |
1481 | + } | |
1482 | + | |
1483 | + return r; | |
1484 | +} | |
1485 | + | |
1486 | +/* Reconstruct a particular device? */ | |
1487 | +static INLINE int dev_to_init(struct raid_set *rs) | |
1488 | +{ | |
1489 | + return rs->set.dev_to_init > -1; | |
1490 | +} | |
1491 | + | |
1492 | +/* | |
1493 | + * Index of device to calculate parity on. | |
1494 | + * Either the parity device index *or* the selected device to init | |
1495 | + * after a spare replacement. | |
1496 | + */ | |
1497 | +static INLINE unsigned dev_for_parity(struct stripe *stripe) | |
1498 | +{ | |
1499 | + struct raid_set *rs = RS(stripe->sc); | |
1500 | + | |
1501 | + return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity; | |
1502 | +} | |
1503 | + | |
1504 | +/* Return the index of the device to be recovered. */ | |
1505 | +static int idx_get(struct raid_set *rs) | |
1506 | +{ | |
1507 | + /* Avoid reading in pages that will be reconstructed anyway. */ | |
1508 | + if (dev_to_init(rs)) | |
1509 | + return rs->set.dev_to_init; | |
1510 | + else if (rs->set.raid_type->level == raid4) | |
1511 | + return rs->set.pi; | |
1512 | + | |
1513 | + return -1; | |
1514 | +} | |
1515 | + | |
1516 | +/* RAID set congested function. */ | |
1517 | +static int raid_set_congested(void *congested_data, int bdi_bits) | |
1518 | +{ | |
1519 | + struct raid_set *rs = congested_data; | |
1520 | + int r = 0; /* Assume uncongested. */ | |
1521 | + unsigned p = rs->set.raid_devs; | |
1522 | + | |
1523 | + /* If any of our component devices are overloaded. */ | |
1524 | + while (p--) { | |
1525 | + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev); | |
1526 | + | |
1527 | + r |= bdi_congested(&q->backing_dev_info, bdi_bits); | |
1528 | + } | |
1529 | + | |
1530 | + /* REMOVEME: statistics. */ | |
1531 | + atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED)); | |
1532 | + return r; | |
1533 | +} | |
1534 | + | |
1535 | +/* Display RAID set dead message once. */ | |
1536 | +static void raid_set_dead(struct raid_set *rs) | |
1537 | +{ | |
1538 | + if (!TestSetRSDead(rs)) { | |
1539 | + unsigned p; | |
1540 | + char buf[BDEVNAME_SIZE]; | |
1541 | + | |
1542 | + DMERR("FATAL: too many devices failed -> RAID set dead"); | |
1543 | + | |
1544 | + for (p = 0; p < rs->set.raid_devs; p++) { | |
1545 | + if (!dev_operational(rs, p)) | |
1546 | + DMERR("device /dev/%s failed", | |
1547 | + bdevname(rs->dev[p].dev->bdev, buf)); | |
1548 | + } | |
1549 | + } | |
1550 | +} | |
1551 | + | |
1552 | +/* RAID set degrade check. */ | |
1553 | +static INLINE int | |
1554 | +raid_set_check_and_degrade(struct raid_set *rs, | |
1555 | + struct stripe *stripe, unsigned p) | |
1556 | +{ | |
1557 | + if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags)) | |
1558 | + return -EPERM; | |
1559 | + | |
1560 | + /* Throw an event in case of member device errors. */ | |
1561 | + dm_table_event(rs->ti->table); | |
1562 | + atomic_inc(&rs->set.failed_devs); | |
1563 | + | |
1564 | + /* Only log the first member error. */ | |
1565 | + if (!TestSetRSIoError(rs)) { | |
1566 | + char buf[BDEVNAME_SIZE]; | |
1567 | + | |
1568 | + /* Store index for recovery. */ | |
1569 | + mb(); | |
1570 | + rs->set.ei = p; | |
1571 | + mb(); | |
1572 | + | |
1573 | + DMERR("CRITICAL: %sio error on device /dev/%s " | |
1574 | + "in region=%llu; DEGRADING RAID set", | |
1575 | + stripe ? "" : "FAKED ", | |
1576 | + bdevname(rs->dev[p].dev->bdev, buf), | |
1577 | + (unsigned long long) (stripe ? stripe->key : 0)); | |
1578 | + DMERR("further device error messages suppressed"); | |
1579 | + } | |
1580 | + | |
1581 | + return 0; | |
1582 | +} | |
1583 | + | |
1584 | +static void | |
1585 | +raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe) | |
1586 | +{ | |
1587 | + unsigned p = rs->set.raid_devs; | |
1588 | + | |
1589 | + while (p--) { | |
1590 | + struct page *page = PAGE(stripe, p); | |
1591 | + | |
1592 | + if (PageError(page)) { | |
1593 | + ClearPageError(page); | |
1594 | + raid_set_check_and_degrade(rs, stripe, p); | |
1595 | + } | |
1596 | + } | |
1597 | +} | |
1598 | + | |
1599 | +/* RAID set upgrade check. */ | |
1600 | +static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p) | |
1601 | +{ | |
1602 | + if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags)) | |
1603 | + return -EPERM; | |
1604 | + | |
1605 | + if (atomic_dec_and_test(&rs->set.failed_devs)) { | |
1606 | + ClearRSIoError(rs); | |
1607 | + rs->set.ei = -1; | |
1608 | + } | |
1609 | + | |
1610 | + return 0; | |
1611 | +} | |
1612 | + | |
1613 | +/* Lookup a RAID device by name or by major:minor number. */ | |
1614 | +union dev_lookup { | |
1615 | + const char *dev_name; | |
1616 | + struct raid_dev *dev; | |
1617 | +}; | |
1618 | +enum lookup_type { byname, bymajmin, bynumber }; | |
1619 | +static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by, | |
1620 | + union dev_lookup *dl) | |
1621 | +{ | |
1622 | + unsigned p; | |
1623 | + | |
1624 | + /* | |
1625 | + * Must be an incremental loop, because the device array | |
1626 | + * can still have empty slots on calls from raid_ctr(). | |
1627 | + */ | |
1628 | + for (p = 0; p < rs->set.raid_devs; p++) { | |
1629 | + char buf[BDEVNAME_SIZE]; | |
1630 | + struct raid_dev *dev = rs->dev + p; | |
1631 | + | |
1632 | + if (!dev->dev) | |
1633 | + break; | |
1634 | + | |
1635 | + /* Format dev string appropriately if necessary. */ | |
1636 | + if (by == byname) | |
1637 | + bdevname(dev->dev->bdev, buf); | |
1638 | + else if (by == bymajmin) | |
1639 | + format_dev_t(buf, dev->dev->bdev->bd_dev); | |
1640 | + | |
1641 | + /* Do the actual check. */ | |
1642 | + if (by == bynumber) { | |
1643 | + if (dl->dev->dev->bdev->bd_dev == | |
1644 | + dev->dev->bdev->bd_dev) | |
1645 | + return p; | |
1646 | + } else if (!strcmp(dl->dev_name, buf)) | |
1647 | + return p; | |
1648 | + } | |
1649 | + | |
1650 | + return -ENODEV; | |
1651 | +} | |
1652 | + | |
1653 | +/* End io wrapper. */ | |
1654 | +static INLINE void | |
1655 | +_bio_endio(struct raid_set *rs, struct bio *bio, int error) | |
1656 | +{ | |
1657 | + /* REMOVEME: statistics. */ | |
1658 | + atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ? | |
1659 | + S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ)); | |
1660 | + bio_endio(bio, error); | |
1661 | + io_put(rs); /* Wake any suspend waiters. */ | |
1662 | +} | |
1663 | + | |
1664 | +/* | |
1665 | + * End small helper functions. | |
1666 | + */ | |
1667 | + | |
1668 | + | |
1669 | +/* | |
1670 | + * Stripe hash functions | |
1671 | + */ | |
1672 | +/* Initialize/destroy stripe hash. */ | |
1673 | +static int hash_init(struct stripe_hash *hash, unsigned stripes) | |
1674 | +{ | |
1675 | + unsigned buckets = 2, max_buckets = stripes / 4; | |
1676 | + unsigned hash_primes[] = { | |
1677 | + /* Table of primes for hash_fn/table size optimization. */ | |
1678 | + 3, 7, 13, 27, 53, 97, 193, 389, 769, | |
1679 | + 1543, 3079, 6151, 12289, 24593, | |
1680 | + }; | |
1681 | + | |
1682 | + /* Calculate number of buckets (2^n <= stripes / 4). */ | |
1683 | + while (buckets < max_buckets) | |
1684 | + buckets <<= 1; | |
1685 | + | |
1686 | + /* Allocate stripe hash. */ | |
1687 | + hash->hash = vmalloc(buckets * sizeof(*hash->hash)); | |
1688 | + if (!hash->hash) | |
1689 | + return -ENOMEM; | |
1690 | + | |
1691 | + hash->buckets = buckets; | |
1692 | + hash->mask = buckets - 1; | |
1693 | + hash->shift = ffs(buckets); | |
1694 | + if (hash->shift > ARRAY_SIZE(hash_primes) + 1) | |
1695 | + hash->shift = ARRAY_SIZE(hash_primes) + 1; | |
1696 | + | |
1697 | + BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1); | |
1698 | + hash->prime = hash_primes[hash->shift - 2]; | |
1699 | + | |
1700 | + /* Initialize buckets. */ | |
1701 | + while (buckets--) | |
1702 | + INIT_LIST_HEAD(hash->hash + buckets); | |
1703 | + | |
1704 | + return 0; | |
1705 | +} | |
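
A quick worked example of the sizing above may help; the sketch below is illustrative only (not part of the patch) and simply replays the same arithmetic in userspace C, assuming a cache of 256 stripes and the hash_primes[] table shown in hash_init().

/* Illustrative sketch of the hash sizing in hash_init() and hash_fn(). */
#include <stdio.h>

int main(void)
{
	unsigned hash_primes[] = { 3, 7, 13, 27, 53, 97, 193, 389, 769,
				   1543, 3079, 6151, 12289, 24593 };
	unsigned stripes = 256, buckets = 2, max_buckets = stripes / 4;
	unsigned shift, mask, prime;

	while (buckets < max_buckets)	/* 2^n <= stripes / 4 -> 64 buckets */
		buckets <<= 1;

	mask = buckets - 1;		/* 63 */
	shift = __builtin_ffs(buckets);	/* ffs(64) = 7 */
	prime = hash_primes[shift - 2];	/* hash_primes[5] = 97 */

	/* hash_fn(): bucket = ((key * prime) >> shift) & mask */
	printf("key 1000 -> bucket %u\n",
	       (unsigned)(((1000UL * prime) >> shift) & mask));
	return 0;
}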
1706 | + | |
1707 | +static INLINE void hash_exit(struct stripe_hash *hash) | |
1708 | +{ | |
1709 | + if (hash->hash) { | |
1710 | + vfree(hash->hash); | |
1711 | + hash->hash = NULL; | |
1712 | + } | |
1713 | +} | |
1714 | + | |
1715 | +/* List add (head/tail/locked/unlocked) inlines. */ | |
1716 | +enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED }; | |
1717 | +#define LIST_DEL(name, list) \ | |
1718 | +static void stripe_ ## name ## _del(struct stripe *stripe, \ | |
1719 | + enum list_lock_type lock) { \ | |
1720 | + struct list_head *lh = stripe->lists + (list); \ | |
1721 | + spinlock_t *l = NULL; \ | |
1722 | +\ | |
1723 | + if (lock == LIST_LOCKED) { \ | |
1724 | + l = stripe->sc->locks + LOCK_LRU; \ | |
1725 | + spin_lock_irq(l); \ | |
1726 | + } \ | |
1727 | +\ | |
1728 | +\ | |
1729 | + if (!list_empty(lh)) \ | |
1730 | + list_del_init(lh); \ | |
1731 | +\ | |
1732 | + if (lock == LIST_LOCKED) \ | |
1733 | + spin_unlock_irq(l); \ | |
1734 | +} | |
1735 | + | |
1736 | +LIST_DEL(hash, LIST_HASH) | |
1737 | +LIST_DEL(lru, LIST_LRU) | |
1738 | +#undef LIST_DEL | |
1739 | + | |
1740 | +enum list_pos_type { POS_HEAD, POS_TAIL }; | |
1741 | +#define LIST_ADD(name, list) \ | |
1742 | +static void stripe_ ## name ## _add(struct stripe *stripe, \ | |
1743 | + enum list_pos_type pos, \ | |
1744 | + enum list_lock_type lock) { \ | |
1745 | + struct list_head *lh = stripe->lists + (list); \ | |
1746 | + struct stripe_cache *sc = stripe->sc; \ | |
1747 | + spinlock_t *l = NULL; \ | |
1748 | +\ | |
1749 | + if (lock == LIST_LOCKED) { \ | |
1750 | + l = sc->locks + LOCK_LRU; \ | |
1751 | + spin_lock_irq(l); \ | |
1752 | + } \ | |
1753 | +\ | |
1754 | + if (list_empty(lh)) { \ | |
1755 | + if (pos == POS_HEAD) \ | |
1756 | + list_add(lh, sc->lists + (list)); \ | |
1757 | + else \ | |
1758 | + list_add_tail(lh, sc->lists + (list)); \ | |
1759 | + } \ | |
1760 | +\ | |
1761 | + if (lock == LIST_LOCKED) \ | |
1762 | + spin_unlock_irq(l); \ | |
1763 | +} | |
1764 | + | |
1765 | +LIST_ADD(endio, LIST_ENDIO) | |
1766 | +LIST_ADD(io, LIST_IO) | |
1767 | +LIST_ADD(lru, LIST_LRU) | |
1768 | +#undef LIST_ADD | |
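
For readers unfamiliar with this style of macro generator: each LIST_ADD()/LIST_DEL() invocation above emits one small helper, and the rest of the driver calls the generated names, e.g. stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED) and stripe_hash_del(stripe, LIST_UNLOCKED). Hand-expanding LIST_ADD(lru, LIST_LRU) gives roughly the function below; this is a sketch for illustration, the real body comes from the macro.

/* Illustrative hand expansion of LIST_ADD(lru, LIST_LRU). */
static void stripe_lru_add(struct stripe *stripe, enum list_pos_type pos,
			   enum list_lock_type lock)
{
	struct list_head *lh = stripe->lists + LIST_LRU;
	struct stripe_cache *sc = stripe->sc;
	spinlock_t *l = NULL;

	if (lock == LIST_LOCKED) {
		l = sc->locks + LOCK_LRU;
		spin_lock_irq(l);
	}

	/* Only add if not already on a list. */
	if (list_empty(lh)) {
		if (pos == POS_HEAD)
			list_add(lh, sc->lists + LIST_LRU);
		else
			list_add_tail(lh, sc->lists + LIST_LRU);
	}

	if (lock == LIST_LOCKED)
		spin_unlock_irq(l);
}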
1769 | + | |
1770 | +#define POP(list) \ | |
1771 | + do { \ | |
1772 | + if (list_empty(sc->lists + list)) \ | |
1773 | + stripe = NULL; \ | |
1774 | + else { \ | |
1775 | + stripe = list_first_entry(&sc->lists[list], \ | |
1776 | + struct stripe, \ | |
1777 | + lists[list]); \ | |
1778 | + list_del_init(&stripe->lists[list]); \ | |
1779 | + } \ | |
1780 | + } while (0); | |
1781 | + | |
1782 | +/* Pop an available stripe off the lru list. */ | |
1783 | +static struct stripe *stripe_lru_pop(struct stripe_cache *sc) | |
1784 | +{ | |
1785 | + struct stripe *stripe; | |
1786 | + spinlock_t *lock = sc->locks + LOCK_LRU; | |
1787 | + | |
1788 | + spin_lock_irq(lock); | |
1789 | + POP(LIST_LRU); | |
1790 | + spin_unlock_irq(lock); | |
1791 | + | |
1792 | + if (stripe) | |
1793 | + /* Remove from hash before reuse. */ | |
1794 | + stripe_hash_del(stripe, LIST_UNLOCKED); | |
1795 | + | |
1796 | + return stripe; | |
1797 | +} | |
1798 | + | |
1799 | +static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key) | |
1800 | +{ | |
1801 | + return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask); | |
1802 | +} | |
1803 | + | |
1804 | +static inline struct list_head * | |
1805 | +hash_bucket(struct stripe_hash *hash, sector_t key) | |
1806 | +{ | |
1807 | + return hash->hash + hash_fn(hash, key); | |
1808 | +} | |
1809 | + | |
1810 | +/* Insert an entry into a hash. */ | |
1811 | +static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe) | |
1812 | +{ | |
1813 | + list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key)); | |
1814 | +} | |
1815 | + | |
1816 | +/* Insert an entry into the stripe hash. */ | |
1817 | +static inline void | |
1818 | +sc_insert(struct stripe_cache *sc, struct stripe *stripe) | |
1819 | +{ | |
1820 | + hash_insert(&sc->hash, stripe); | |
1821 | +} | |
1822 | + | |
1823 | +/* Lookup an entry in the stripe hash. */ | |
1824 | +static inline struct stripe * | |
1825 | +stripe_lookup(struct stripe_cache *sc, sector_t key) | |
1826 | +{ | |
1827 | + unsigned c = 0; | |
1828 | + struct stripe *stripe; | |
1829 | + struct list_head *bucket = hash_bucket(&sc->hash, key); | |
1830 | + | |
1831 | + list_for_each_entry(stripe, bucket, lists[LIST_HASH]) { | |
1832 | + /* REMOVEME: statistics. */ | |
1833 | + if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP)) | |
1834 | + atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c); | |
1835 | + | |
1836 | + if (stripe->key == key) | |
1837 | + return stripe; | |
1838 | + } | |
1839 | + | |
1840 | + return NULL; | |
1841 | +} | |
1842 | + | |
1843 | +/* Resize the stripe cache hash on size changes. */ | |
1844 | +static int hash_resize(struct stripe_cache *sc) | |
1845 | +{ | |
1846 | + /* Resize threshold reached? */ | |
1847 | + if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last) | |
1848 | + || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) { | |
1849 | + int r; | |
1850 | + struct stripe_hash hash, hash_tmp; | |
1851 | + spinlock_t *lock; | |
1852 | + | |
1853 | + r = hash_init(&hash, atomic_read(&sc->stripes)); | |
1854 | + if (r) | |
1855 | + return r; | |
1856 | + | |
1857 | + lock = sc->locks + LOCK_LRU; | |
1858 | + spin_lock_irq(lock); | |
1859 | + if (sc->hash.hash) { | |
1860 | + unsigned b = sc->hash.buckets; | |
1861 | + struct list_head *pos, *tmp; | |
1862 | + | |
1863 | + /* Walk old buckets and insert into new. */ | |
1864 | + while (b--) { | |
1865 | + list_for_each_safe(pos, tmp, sc->hash.hash + b) | |
1866 | + hash_insert(&hash, | |
1867 | + list_entry(pos, struct stripe, | |
1868 | + lists[LIST_HASH])); | |
1869 | + } | |
1870 | + | |
1871 | + } | |
1872 | + | |
1873 | + memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp)); | |
1874 | + memcpy(&sc->hash, &hash, sizeof(sc->hash)); | |
1875 | + atomic_set(&sc->stripes_last, atomic_read(&sc->stripes)); | |
1876 | + spin_unlock_irq(lock); | |
1877 | + | |
1878 | + hash_exit(&hash_tmp); | |
1879 | + } | |
1880 | + | |
1881 | + return 0; | |
1882 | +} | |
1883 | + | |
1884 | +/* | |
1885 | + * Stripe cache locking functions | |
1886 | + */ | |
1887 | +/* Dummy lock function for local RAID4+5. */ | |
1888 | +static void *no_lock(sector_t key, enum dm_lock_type type) | |
1889 | +{ | |
1890 | + return &no_lock; | |
1891 | +} | |
1892 | + | |
1893 | +/* Dummy unlock function for local RAID4+5. */ | |
1894 | +static void no_unlock(void *lock_handle) | |
1895 | +{ | |
1896 | +} | |
1897 | + | |
1898 | +/* No locking (for local RAID 4+5). */ | |
1899 | +static struct dm_raid45_locking_type locking_none = { | |
1900 | + .lock = no_lock, | |
1901 | + .unlock = no_unlock, | |
1902 | +}; | |
1903 | + | |
1904 | +/* Clustered RAID 4+5. */ | |
1905 | +/* FIXME: code this. */ | |
1906 | +static struct dm_raid45_locking_type locking_cluster = { | |
1907 | + .lock = no_lock, | |
1908 | + .unlock = no_unlock, | |
1909 | +}; | |
1910 | + | |
1911 | +/* Lock a stripe (for clustering). */ | |
1912 | +static int | |
1913 | +stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key) | |
1914 | +{ | |
1915 | + stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED : | |
1916 | + DM_RAID45_EX); | |
1917 | + return stripe->lock ? 0 : -EPERM; | |
1918 | +} | |
1919 | + | |
1920 | +/* Unlock a stripe (for clustering). */ | |
1921 | +static void stripe_unlock(struct raid_set *rs, struct stripe *stripe) | |
1922 | +{ | |
1923 | + rs->locking->unlock(stripe->lock); | |
1924 | + stripe->lock = NULL; | |
1925 | +} | |
1926 | + | |
1927 | +/* | |
1928 | + * Stripe cache functions. | |
1929 | + */ | |
1930 | +/* | |
1931 | + * Invalidate all page list pages of a stripe. | |
1932 | + * | |
1933 | + * I only keep state for the whole list in the first page. | |
1934 | + */ | |
1935 | +static INLINE void | |
1936 | +stripe_pages_invalidate(struct stripe *stripe) | |
1937 | +{ | |
1938 | + unsigned p = RS(stripe->sc)->set.raid_devs; | |
1939 | + | |
1940 | + while (p--) { | |
1941 | + struct page *page = PAGE(stripe, p); | |
1942 | + | |
1943 | + ProhibitPageIO(page); | |
1944 | + ClearPageChecked(page); | |
1945 | + ClearPageDirty(page); | |
1946 | + ClearPageError(page); | |
1947 | + clear_page_locked(page); | |
1948 | + ClearPagePrivate(page); | |
1949 | + ClearPageUptodate(page); | |
1950 | + } | |
1951 | +} | |
1952 | + | |
1953 | +/* Prepare stripe for (re)use. */ | |
1954 | +static INLINE void stripe_invalidate(struct stripe *stripe) | |
1955 | +{ | |
1956 | + stripe->io.flags = 0; | |
1957 | + stripe_pages_invalidate(stripe); | |
1958 | +} | |
1959 | + | |
1960 | +/* Allow io on all chunks of a stripe. */ | |
1961 | +static INLINE void stripe_allow_io(struct stripe *stripe) | |
1962 | +{ | |
1963 | + unsigned p = RS(stripe->sc)->set.raid_devs; | |
1964 | + | |
1965 | + while (p--) | |
1966 | + AllowPageIO(PAGE(stripe, p)); | |
1967 | +} | |
1968 | + | |
1969 | +/* Initialize a stripe. */ | |
1970 | +static void | |
1971 | +stripe_init(struct stripe_cache *sc, struct stripe *stripe) | |
1972 | +{ | |
1973 | + unsigned p = RS(sc)->set.raid_devs; | |
1974 | + unsigned i; | |
1975 | + | |
1976 | + /* Work all io chunks. */ | |
1977 | + while (p--) { | |
1978 | + struct stripe_set *ss = stripe->ss + p; | |
1979 | + | |
1980 | + stripe->obj[p].private = ss; | |
1981 | + ss->stripe = stripe; | |
1982 | + | |
1983 | + i = ARRAY_SIZE(ss->bl); | |
1984 | + while (i--) | |
1985 | + bio_list_init(ss->bl + i); | |
1986 | + } | |
1987 | + | |
1988 | + stripe->sc = sc; | |
1989 | + | |
1990 | + i = ARRAY_SIZE(stripe->lists); | |
1991 | + while (i--) | |
1992 | + INIT_LIST_HEAD(stripe->lists + i); | |
1993 | + | |
1994 | + atomic_set(&stripe->cnt, 0); | |
1995 | + atomic_set(&stripe->io.pending, 0); | |
1996 | + | |
1997 | + stripe_invalidate(stripe); | |
1998 | +} | |
1999 | + | |
2000 | +/* Number of pages per chunk. */ | |
2001 | +static inline unsigned chunk_pages(unsigned io_size) | |
2002 | +{ | |
2003 | + return dm_div_up(io_size, SECTORS_PER_PAGE); | |
2004 | +} | |
2005 | + | |
2006 | +/* Number of pages per stripe. */ | |
2007 | +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size) | |
2008 | +{ | |
2009 | + return chunk_pages(io_size) * rs->set.raid_devs; | |
2010 | +} | |
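
As a small illustration (assuming 4 KiB pages, i.e. SECTORS_PER_PAGE == 8, purely for the sake of the example): an io_size of 16 sectors needs chunk_pages(16) == 2 pages per chunk, so a 4-device set needs stripe_pages() == 2 * 4 == 8 pages per stripe.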
2011 | + | |
2012 | +/* Initialize part of page_list (recovery). */ | |
2013 | +static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p, | |
2014 | + unsigned start, unsigned count) | |
2015 | +{ | |
2016 | + unsigned pages = chunk_pages(count); | |
2017 | + /* Get offset into the page_list. */ | |
2018 | + struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE); | |
2019 | + | |
2020 | + BUG_ON(!pl); | |
2021 | + while (pl && pages--) { | |
2022 | + BUG_ON(!pl->page); | |
2023 | + memset(page_address(pl->page), 0, PAGE_SIZE); | |
2024 | + pl = pl->next; | |
2025 | + } | |
2026 | +} | |
2027 | + | |
2028 | +/* Initialize parity chunk of stripe. */ | |
2029 | +static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p) | |
2030 | +{ | |
2031 | + stripe_zero_pl_part(stripe, p, 0, stripe->io.size); | |
2032 | +} | |
2033 | + | |
2034 | +/* Return dynamic stripe structure size. */ | |
2035 | +static INLINE size_t stripe_size(struct raid_set *rs) | |
2036 | +{ | |
2037 | + return sizeof(struct stripe) + | |
2038 | + rs->set.raid_devs * sizeof(struct stripe_set); | |
2039 | +} | |
2040 | + | |
2041 | +/* Allocate a stripe and its memory object. */ | |
2042 | +/* XXX adjust to cope with stripe cache and recovery stripe caches. */ | |
2043 | +enum grow { SC_GROW, SC_KEEP }; | |
2044 | +static struct stripe *stripe_alloc(struct stripe_cache *sc, | |
2045 | + struct dm_mem_cache_client *mc, | |
2046 | + enum grow grow) | |
2047 | +{ | |
2048 | + int r; | |
2049 | + struct stripe *stripe; | |
2050 | + | |
2051 | + stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL); | |
2052 | + if (stripe) { | |
2053 | + /* Grow the dm-mem-cache by one object. */ | |
2054 | + if (grow == SC_GROW) { | |
2055 | + r = dm_mem_cache_grow(mc, 1); | |
2056 | + if (r) | |
2057 | + goto err_free; | |
2058 | + } | |
2059 | + | |
2060 | + stripe->obj = dm_mem_cache_alloc(mc); | |
2061 | + if (!stripe->obj) | |
2062 | + goto err_shrink; | |
2063 | + | |
2064 | + stripe_init(sc, stripe); | |
2065 | + } | |
2066 | + | |
2067 | + return stripe; | |
2068 | + | |
2069 | +err_shrink: | |
2070 | + if (grow == SC_GROW) | |
2071 | + dm_mem_cache_shrink(mc, 1); | |
2072 | +err_free: | |
2073 | + kmem_cache_free(sc->kc.cache, stripe); | |
2074 | + return NULL; | |
2075 | +} | |
2076 | + | |
2077 | +/* | |
2078 | + * Free a stripe's memory object, shrink the | |
2079 | + * memory cache and free the stripe itself. | |
2080 | + */ | |
2081 | +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc) | |
2082 | +{ | |
2083 | + dm_mem_cache_free(mc, stripe->obj); | |
2084 | + dm_mem_cache_shrink(mc, 1); | |
2085 | + kmem_cache_free(stripe->sc->kc.cache, stripe); | |
2086 | +} | |
2087 | + | |
2088 | +/* Free the recovery stripes. */ | |
2089 | +static void stripe_recover_free(struct raid_set *rs) | |
2090 | +{ | |
2091 | + struct recover *rec = &rs->recover; | |
2092 | + struct list_head *stripes = &rec->stripes; | |
2093 | + | |
2094 | + while (!list_empty(stripes)) { | |
2095 | + struct stripe *stripe = list_first_entry(stripes, struct stripe, | |
2096 | + lists[LIST_RECOVER]); | |
2097 | + list_del(stripe->lists + LIST_RECOVER); | |
2098 | + stripe_free(stripe, rec->mem_cache_client); | |
2099 | + } | |
2100 | +} | |
2101 | + | |
2102 | +/* Push a stripe safely onto the endio list to be handled by do_endios(). */ | |
2103 | +static INLINE void stripe_endio_push(struct stripe *stripe) | |
2104 | +{ | |
2105 | + int wake; | |
2106 | + unsigned long flags; | |
2107 | + struct stripe_cache *sc = stripe->sc; | |
2108 | + spinlock_t *lock = sc->locks + LOCK_ENDIO; | |
2109 | + | |
2110 | + spin_lock_irqsave(lock, flags); | |
2111 | + wake = list_empty(sc->lists + LIST_ENDIO); | |
2112 | + stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED); | |
2113 | + spin_unlock_irqrestore(lock, flags); | |
2114 | + | |
2115 | + if (wake) | |
2116 | + wake_do_raid(RS(sc)); | |
2117 | +} | |
2118 | + | |
2119 | +/* Protected check for stripe cache endio list empty. */ | |
2120 | +static INLINE int stripe_endio_empty(struct stripe_cache *sc) | |
2121 | +{ | |
2122 | + int r; | |
2123 | + spinlock_t *lock = sc->locks + LOCK_ENDIO; | |
2124 | + | |
2125 | + spin_lock_irq(lock); | |
2126 | + r = list_empty(sc->lists + LIST_ENDIO); | |
2127 | + spin_unlock_irq(lock); | |
2128 | + | |
2129 | + return r; | |
2130 | +} | |
2131 | + | |
2132 | +/* Pop a stripe safely off the endio list. */ | |
2133 | +static struct stripe *stripe_endio_pop(struct stripe_cache *sc) | |
2134 | +{ | |
2135 | + struct stripe *stripe; | |
2136 | + spinlock_t *lock = sc->locks + LOCK_ENDIO; | |
2137 | + | |
2138 | + /* This runs in parallel with endio(). */ | |
2139 | + spin_lock_irq(lock); | |
2140 | + POP(LIST_ENDIO) | |
2141 | + spin_unlock_irq(lock); | |
2142 | + return stripe; | |
2143 | +} | |
2144 | + | |
2145 | +#undef POP | |
2146 | + | |
2147 | +/* Evict stripe from cache. */ | |
2148 | +static void stripe_evict(struct stripe *stripe) | |
2149 | +{ | |
2150 | + struct raid_set *rs = RS(stripe->sc); | |
2151 | + stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */ | |
2152 | + | |
2153 | + if (list_empty(stripe->lists + LIST_LRU)) { | |
2154 | + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED); | |
2155 | + atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */ | |
2156 | + } | |
2157 | +} | |
2158 | + | |
2159 | +/* Grow stripe cache. */ | |
2160 | +static int | |
2161 | +sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow) | |
2162 | +{ | |
2163 | + int r = 0; | |
2164 | + struct raid_set *rs = RS(sc); | |
2165 | + | |
2166 | + /* Try to allocate this many (additional) stripes. */ | |
2167 | + while (stripes--) { | |
2168 | + struct stripe *stripe = | |
2169 | + stripe_alloc(sc, sc->mem_cache_client, grow); | |
2170 | + | |
2171 | + if (likely(stripe)) { | |
2172 | + stripe->io.size = rs->set.io_size; | |
2173 | + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED); | |
2174 | + atomic_inc(&sc->stripes); | |
2175 | + } else { | |
2176 | + r = -ENOMEM; | |
2177 | + break; | |
2178 | + } | |
2179 | + } | |
2180 | + | |
2181 | + ClearRSScBusy(rs); | |
2182 | + return r ? r : hash_resize(sc); | |
2183 | +} | |
2184 | + | |
2185 | +/* Shrink stripe cache. */ | |
2186 | +static int sc_shrink(struct stripe_cache *sc, unsigned stripes) | |
2187 | +{ | |
2188 | + int r = 0; | |
2189 | + | |
2190 | + /* Try to get unused stripe from LRU list. */ | |
2191 | + while (stripes--) { | |
2192 | + struct stripe *stripe; | |
2193 | + | |
2194 | + stripe = stripe_lru_pop(sc); | |
2195 | + if (stripe) { | |
2196 | + /* An lru stripe may never have ios pending! */ | |
2197 | + BUG_ON(stripe_io(stripe)); | |
2198 | + stripe_free(stripe, sc->mem_cache_client); | |
2199 | + atomic_dec(&sc->stripes); | |
2200 | + } else { | |
2201 | + r = -ENOENT; | |
2202 | + break; | |
2203 | + } | |
2204 | + } | |
2205 | + | |
2206 | + /* Check if stats are still sane. */ | |
2207 | + if (atomic_read(&sc->max_active_stripes) > | |
2208 | + atomic_read(&sc->stripes)) | |
2209 | + atomic_set(&sc->max_active_stripes, 0); | |
2210 | + | |
2211 | + if (r) | |
2212 | + return r; | |
2213 | + | |
2214 | + ClearRSScBusy(RS(sc)); | |
2215 | + return hash_resize(sc); | |
2216 | +} | |
2217 | + | |
2218 | +/* Create stripe cache. */ | |
2219 | +static int sc_init(struct raid_set *rs, unsigned stripes) | |
2220 | +{ | |
2221 | + unsigned i, nr; | |
2222 | + struct stripe_cache *sc = &rs->sc; | |
2223 | + struct stripe *stripe; | |
2224 | + struct recover *rec = &rs->recover; | |
2225 | + | |
2226 | + /* Initialize lists and locks. */ | |
2227 | + i = ARRAY_SIZE(sc->lists); | |
2228 | + while (i--) | |
2229 | + INIT_LIST_HEAD(sc->lists + i); | |
2230 | + | |
2231 | + i = NR_LOCKS; | |
2232 | + while (i--) | |
2233 | + spin_lock_init(sc->locks + i); | |
2234 | + | |
2235 | + /* Initialize atomic variables. */ | |
2236 | + atomic_set(&sc->stripes, 0); | |
2237 | + atomic_set(&sc->stripes_last, 0); | |
2238 | + atomic_set(&sc->stripes_to_shrink, 0); | |
2239 | + atomic_set(&sc->active_stripes, 0); | |
2240 | + atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */ | |
2241 | + | |
2242 | + /* | |
2243 | + * We need a runtime unique # to suffix the kmem cache name | |
2244 | + * because we'll have one for each active RAID set. | |
2245 | + */ | |
2246 | + nr = atomic_inc_return(&_stripe_sc_nr); | |
2247 | + sprintf(sc->kc.name, "%s_%d", TARGET, nr); | |
2248 | + sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs), | |
2249 | + 0, 0, NULL); | |
2250 | + if (!sc->kc.cache) | |
2251 | + return -ENOMEM; | |
2252 | + | |
2253 | + /* Create memory cache client context for RAID stripe cache. */ | |
2254 | + sc->mem_cache_client = | |
2255 | + dm_mem_cache_client_create(stripes, rs->set.raid_devs, | |
2256 | + chunk_pages(rs->set.io_size)); | |
2257 | + if (IS_ERR(sc->mem_cache_client)) | |
2258 | + return PTR_ERR(sc->mem_cache_client); | |
2259 | + | |
2260 | + /* Create memory cache client context for RAID recovery stripe(s). */ | |
2261 | + rec->mem_cache_client = | |
2262 | + dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs, | |
2263 | + chunk_pages(rec->io_size)); | |
2264 | + if (IS_ERR(rec->mem_cache_client)) | |
2265 | + return PTR_ERR(rec->mem_cache_client); | |
2266 | + | |
2267 | + /* Allocate stripe for set recovery. */ | |
2268 | + /* XXX: cope with MAX_RECOVERY. */ | |
2269 | + INIT_LIST_HEAD(&rec->stripes); | |
2270 | + for (i = 0; i < MAX_RECOVER; i++) { | |
2271 | + stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP); | |
2272 | + if (!stripe) | |
2273 | + return -ENOMEM; | |
2274 | + | |
2275 | + SetStripeRecover(stripe); | |
2276 | + stripe->io.size = rec->io_size; | |
2277 | + list_add(stripe->lists + LIST_RECOVER, &rec->stripes); | |
2278 | + } | |
2279 | + | |
2280 | + /* | |
2281 | + * Allocate the stripe objects from the | |
2282 | + * cache and add them to the LRU list. | |
2283 | + */ | |
2284 | + return sc_grow(sc, stripes, SC_KEEP); | |
2285 | +} | |
2286 | + | |
2287 | +/* Destroy the stripe cache. */ | |
2288 | +static void sc_exit(struct stripe_cache *sc) | |
2289 | +{ | |
2290 | + if (sc->kc.cache) { | |
2291 | + BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes))); | |
2292 | + kmem_cache_destroy(sc->kc.cache); | |
2293 | + } | |
2294 | + | |
2295 | + if (sc->mem_cache_client) | |
2296 | + dm_mem_cache_client_destroy(sc->mem_cache_client); | |
2297 | + | |
2298 | + ClearRSRecover(RS(sc)); | |
2299 | + stripe_recover_free(RS(sc)); | |
2300 | + if (RS(sc)->recover.mem_cache_client) | |
2301 | + dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client); | |
2302 | + | |
2303 | + hash_exit(&sc->hash); | |
2304 | +} | |
2305 | + | |
2306 | +/* | |
2307 | + * Calculate RAID address | |
2308 | + * | |
2309 | + * Delivers tuple with the index of the data disk holding the chunk | |
2310 | + * Delivers a tuple with the index of the data disk holding the chunk | |
2311 | + * in the set, the parity disk's index and the start of the stripe | |
2312 | + */ | |
2313 | +/* thx MD. */ | |
2314 | +static struct address * | |
2315 | +raid_address(struct raid_set *rs, sector_t sector, struct address *addr) | |
2316 | +{ | |
2317 | + unsigned data_devs = rs->set.data_devs, di, pi, | |
2318 | + raid_devs = rs->set.raid_devs; | |
2319 | + sector_t stripe, tmp; | |
2320 | + | |
2321 | + /* | |
2322 | + * chunk_number = sector / chunk_size | |
2323 | + * stripe = chunk_number / data_devs | |
2324 | + * di = stripe % data_devs; | |
2325 | + */ | |
2326 | + stripe = sector >> rs->set.chunk_shift; | |
2327 | + di = sector_div(stripe, data_devs); | |
2328 | + | |
2329 | + switch (rs->set.raid_type->level) { | |
2330 | + case raid5: | |
2331 | + tmp = stripe; | |
2332 | + pi = sector_div(tmp, raid_devs); | |
2333 | + | |
2334 | + switch (rs->set.raid_type->algorithm) { | |
2335 | + case left_asym: /* Left asymmetric. */ | |
2336 | + pi = data_devs - pi; | |
2337 | + case right_asym: /* Right asymmetric. */ | |
2338 | + if (di >= pi) | |
2339 | + di++; | |
2340 | + break; | |
2341 | + | |
2342 | + case left_sym: /* Left symmetric. */ | |
2343 | + pi = data_devs - pi; | |
2344 | + case right_sym: /* Right symmetric. */ | |
2345 | + di = (pi + di + 1) % raid_devs; | |
2346 | + break; | |
2347 | + | |
2348 | + default: | |
2349 | + DMERR("Unknown RAID algorithm %d", | |
2350 | + rs->set.raid_type->algorithm); | |
2351 | + goto out; | |
2352 | + } | |
2353 | + | |
2354 | + break; | |
2355 | + | |
2356 | + case raid4: | |
2357 | + pi = rs->set.pi; | |
2358 | + if (di >= pi) | |
2359 | + di++; | |
2360 | + break; | |
2361 | + | |
2362 | + default: | |
2363 | + DMERR("Unknown RAID level %d", rs->set.raid_type->level); | |
2364 | + goto out; | |
2365 | + } | |
2366 | + | |
2367 | + /* | |
2368 | + * Hash key = start offset on any single device of the RAID set; | |
2369 | + * adjusted in case io size differs from chunk size. | |
2370 | + */ | |
2371 | + addr->key = (stripe << rs->set.chunk_shift) + | |
2372 | + (sector & rs->set.io_shift_mask); | |
2373 | + addr->di = di; | |
2374 | + addr->pi = pi; | |
2375 | + | |
2376 | +out: | |
2377 | + return addr; | |
2378 | +} | |
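
A worked example may make the mapping clearer. The sketch below replays only the left-symmetric RAID5 branch of raid_address() for a hypothetical 4-drive set (3 data devices) with 128-sector chunks; the values are illustrative, and the io_shift_mask adjustment of the hash key is left out for brevity.

/* Illustrative sketch of the left-symmetric RAID5 mapping, made-up values. */
#include <stdio.h>

int main(void)
{
	unsigned raid_devs = 4, data_devs = 3, chunk_shift = 7; /* 128 sectors */
	unsigned long long sector = 1000, stripe;
	unsigned di, pi;

	stripe = sector >> chunk_shift;		/* chunk number: 7 */
	di = stripe % data_devs;		/* sector_div(): di = 1 */
	stripe /= data_devs;			/* stripe row: 2 */

	pi = stripe % raid_devs;		/* rotating parity: 2 */
	pi = data_devs - pi;			/* left ... : pi = 1 */
	di = (pi + di + 1) % raid_devs;		/* ... symmetric: di = 3 */

	/* Hash key is the row start on a single device (io offset omitted). */
	printf("sector %llu -> data dev %u, parity dev %u, key %llu\n",
	       sector, di, pi, stripe << chunk_shift);
	return 0;
}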
2379 | + | |
2380 | +/* | |
2381 | + * Copy data across between stripe pages and bio vectors. | |
2382 | + * | |
2383 | + * Pay attention to data alignment in stripe and bio pages. | |
2384 | + */ | |
2385 | +static void | |
2386 | +bio_copy_page_list(int rw, struct stripe *stripe, | |
2387 | + struct page_list *pl, struct bio *bio) | |
2388 | +{ | |
2389 | + unsigned i, page_offset; | |
2390 | + void *page_addr; | |
2391 | + struct raid_set *rs = RS(stripe->sc); | |
2392 | + struct bio_vec *bv; | |
2393 | + | |
2394 | + /* Get start page in page list for this sector. */ | |
2395 | + i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE; | |
2396 | + pl = pl_elem(pl, i); | |
2397 | + | |
2398 | + page_addr = page_address(pl->page); | |
2399 | + page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1)); | |
2400 | + | |
2401 | + /* Walk all segments and copy data across between bio_vecs and pages. */ | |
2402 | + bio_for_each_segment(bv, bio, i) { | |
2403 | + int len = bv->bv_len, size; | |
2404 | + unsigned bio_offset = 0; | |
2405 | + void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0); | |
2406 | +redo: | |
2407 | + size = (page_offset + len > PAGE_SIZE) ? | |
2408 | + PAGE_SIZE - page_offset : len; | |
2409 | + | |
2410 | + if (rw == READ) | |
2411 | + memcpy(bio_addr + bio_offset, | |
2412 | + page_addr + page_offset, size); | |
2413 | + else | |
2414 | + memcpy(page_addr + page_offset, | |
2415 | + bio_addr + bio_offset, size); | |
2416 | + | |
2417 | + page_offset += size; | |
2418 | + if (page_offset == PAGE_SIZE) { | |
2419 | + /* | |
2420 | + * We reached the end of the chunk page -> | |
2421 | + * need to refer to the next one to copy more data. | |
2422 | + */ | |
2423 | + len -= size; | |
2424 | + if (len) { | |
2425 | + /* Get next page. */ | |
2426 | + pl = pl->next; | |
2427 | + BUG_ON(!pl); | |
2428 | + page_addr = page_address(pl->page); | |
2429 | + page_offset = 0; | |
2430 | + bio_offset += size; | |
2431 | + /* REMOVEME: statistics. */ | |
2432 | + atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT); | |
2433 | + goto redo; | |
2434 | + } | |
2435 | + } | |
2436 | + | |
2437 | + __bio_kunmap_atomic(bio_addr, KM_USER0); | |
2438 | + } | |
2439 | +} | |
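
A short hypothetical trace of the offset handling (assuming 4 KiB pages and 512-byte sectors, purely illustrative): if bio->bi_sector lands 2 sectors into the chunk, then i = 2 / 8 = 0 and page_offset = 1024 bytes. A 4 KiB bio_vec is then copied in two steps: 3072 bytes fill the rest of page 0, page_offset reaches PAGE_SIZE, the code advances to pl->next and the remaining 1024 bytes go to the start of page 1 via the redo path.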
2440 | + | |
2441 | +/* | |
2442 | + * Xor optimization macros. | |
2443 | + */ | |
2444 | +/* Xor data pointer declaration and initialization macros. */ | |
2445 | +#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1] | |
2446 | +#define DECLARE_3 DECLARE_2, *d2 = data[2] | |
2447 | +#define DECLARE_4 DECLARE_3, *d3 = data[3] | |
2448 | +#define DECLARE_5 DECLARE_4, *d4 = data[4] | |
2449 | +#define DECLARE_6 DECLARE_5, *d5 = data[5] | |
2450 | +#define DECLARE_7 DECLARE_6, *d6 = data[6] | |
2451 | +#define DECLARE_8 DECLARE_7, *d7 = data[7] | |
2452 | + | |
2453 | +/* Xor unroll macros. */ | |
2454 | +#define D2(n) d0[n] = d0[n] ^ d1[n] | |
2455 | +#define D3(n) D2(n) ^ d2[n] | |
2456 | +#define D4(n) D3(n) ^ d3[n] | |
2457 | +#define D5(n) D4(n) ^ d4[n] | |
2458 | +#define D6(n) D5(n) ^ d5[n] | |
2459 | +#define D7(n) D6(n) ^ d6[n] | |
2460 | +#define D8(n) D7(n) ^ d7[n] | |
2461 | + | |
2462 | +#define X_2(macro, offset) macro(offset); macro(offset + 1); | |
2463 | +#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2); | |
2464 | +#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4); | |
2465 | +#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8); | |
2466 | +#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16); | |
2467 | +#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32); | |
2468 | + | |
2469 | +/* Define a _xor_#chunks_#xors_per_run() function. */ | |
2470 | +#define _XOR(chunks, xors_per_run) \ | |
2471 | +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \ | |
2472 | +{ \ | |
2473 | + unsigned end = XOR_SIZE / sizeof(data[0]), i; \ | |
2474 | + DECLARE_ ## chunks; \ | |
2475 | +\ | |
2476 | + for (i = 0; i < end; i += xors_per_run) { \ | |
2477 | + X_ ## xors_per_run(D ## chunks, i); \ | |
2478 | + } \ | |
2479 | +} | |
2480 | + | |
2481 | +/* Define xor functions for 2 - 8 chunks. */ | |
2482 | +#define MAKE_XOR_PER_RUN(xors_per_run) \ | |
2483 | + _XOR(2, xors_per_run); _XOR(3, xors_per_run); \ | |
2484 | + _XOR(4, xors_per_run); _XOR(5, xors_per_run); \ | |
2485 | + _XOR(6, xors_per_run); _XOR(7, xors_per_run); \ | |
2486 | + _XOR(8, xors_per_run); | |
2487 | + | |
2488 | +MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */ | |
2489 | +MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */ | |
2490 | +MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */ | |
2491 | +MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */ | |
2492 | + | |
2493 | +#define MAKE_XOR(xors_per_run) \ | |
2494 | +struct { \ | |
2495 | + void (*f)(unsigned long **); \ | |
2496 | +} static xor_funcs ## xors_per_run[] = { \ | |
2497 | + { NULL }, \ | |
2498 | + { NULL }, \ | |
2499 | + { _xor2_ ## xors_per_run }, \ | |
2500 | + { _xor3_ ## xors_per_run }, \ | |
2501 | + { _xor4_ ## xors_per_run }, \ | |
2502 | + { _xor5_ ## xors_per_run }, \ | |
2503 | + { _xor6_ ## xors_per_run }, \ | |
2504 | + { _xor7_ ## xors_per_run }, \ | |
2505 | + { _xor8_ ## xors_per_run }, \ | |
2506 | +}; \ | |
2507 | +\ | |
2508 | +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \ | |
2509 | +{ \ | |
2510 | + /* Call respective function for amount of chunks. */ \ | |
2511 | + xor_funcs ## xors_per_run[n].f(data); \ | |
2512 | +} | |
2513 | + | |
2514 | +/* Define xor_8() - xor_64 functions. */ | |
2515 | +MAKE_XOR(8) | |
2516 | +MAKE_XOR(16) | |
2517 | +MAKE_XOR(32) | |
2518 | +MAKE_XOR(64) | |
2519 | + | |
2520 | +/* Maximum number of chunks, which can be xor'ed in one go. */ | |
2521 | +#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1) | |
2522 | + | |
2523 | +struct xor_func { | |
2524 | + xor_function_t f; | |
2525 | + const char *name; | |
2526 | +} static xor_funcs[] = { | |
2527 | + {xor_8, "xor_8"}, | |
2528 | + {xor_16, "xor_16"}, | |
2529 | + {xor_32, "xor_32"}, | |
2530 | + {xor_64, "xor_64"}, | |
2531 | +}; | |
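
To illustrate the generated dispatch (a sketch, not part of the patch): xor_8(3, data) indexes xor_funcs8[3] and calls _xor3_8(data), which is functionally equivalent to the straightforward loop below, just unrolled eight XOR statements per iteration.

/*
 * Reference version of what _xor3_8() computes, without the unrolling;
 * "words" corresponds to XOR_SIZE / sizeof(unsigned long) in the driver.
 */
static void xor3_reference(unsigned long **data, unsigned words)
{
	unsigned long *d0 = data[0], *d1 = data[1], *d2 = data[2];
	unsigned i;

	for (i = 0; i < words; i++)
		d0[i] ^= d1[i] ^ d2[i];	/* result accumulates into data[0] */
}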
2532 | + | |
2533 | +/* | |
2534 | + * Calculate parity. | |
2535 | + * | |
2536 | + * This indexes into the page list of the stripe. | |
2537 | + * | |
2538 | + * All chunks will be xored into the parity chunk | |
2539 | + * in maximum groups of xor.chunks. | |
2540 | + * | |
2541 | + * FIXME: try mapping the pages on discontiguous memory. | |
2542 | + */ | |
2543 | +static void xor(struct stripe *stripe, unsigned pi, unsigned sector) | |
2544 | +{ | |
2545 | + struct raid_set *rs = RS(stripe->sc); | |
2546 | + unsigned max_chunks = rs->xor.chunks, n, p; | |
2547 | + unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */ | |
2548 | + unsigned long **d = rs->data; | |
2549 | + xor_function_t xor_f = rs->xor.f->f; | |
2550 | + | |
2551 | + /* Address of parity page to xor into. */ | |
2552 | + d[0] = page_address(pl_elem(PL(stripe, pi), o)->page); | |
2553 | + | |
2554 | + /* Preset pointers to data pages. */ | |
2555 | + for (n = 1, p = rs->set.raid_devs; p--; ) { | |
2556 | + if (p != pi && PageIO(PAGE(stripe, p))) | |
2557 | + d[n++] = page_address(pl_elem(PL(stripe, p), o)->page); | |
2558 | + | |
2559 | + /* If max chunks -> xor. */ | |
2560 | + if (n == max_chunks) { | |
2561 | + xor_f(n, d); | |
2562 | + n = 1; | |
2563 | + } | |
2564 | + } | |
2565 | + | |
2566 | + /* If chunks -> xor. */ | |
2567 | + if (n > 1) | |
2568 | + xor_f(n, d); | |
2569 | + | |
2570 | + /* Set parity page uptodate and clean. */ | |
2571 | + page_set(PAGE(stripe, pi), CLEAN); | |
2572 | +} | |
2573 | + | |
2574 | +/* Common xor loop through all stripe page lists. */ | |
2575 | +static void common_xor(struct stripe *stripe, sector_t count, | |
2576 | + unsigned off, unsigned p) | |
2577 | +{ | |
2578 | + unsigned sector; | |
2579 | + | |
2580 | + for (sector = off; sector < count; sector += SECTORS_PER_XOR) | |
2581 | + xor(stripe, p, sector); | |
2582 | + | |
2583 | + atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */ | |
2584 | +} | |
2585 | + | |
2586 | +/* | |
2587 | + * Calculate parity sectors on intact stripes. | |
2588 | + * | |
2589 | + * Need to calculate the raid address for the recover stripe, because its | |
2590 | + * chunk size differs and is typically larger than the io chunk size. | |
2591 | + */ | |
2592 | +static void parity_xor(struct stripe *stripe) | |
2593 | +{ | |
2594 | + struct raid_set *rs = RS(stripe->sc); | |
2595 | + unsigned chunk_size = rs->set.chunk_size, | |
2596 | + io_size = stripe->io.size, | |
2597 | + xor_size = chunk_size > io_size ? io_size : chunk_size; | |
2598 | + sector_t off; | |
2599 | + | |
2600 | + /* This can be the recover stripe with a larger io size. */ | |
2601 | + for (off = 0; off < io_size; off += xor_size) { | |
2602 | + unsigned pi; | |
2603 | + | |
2604 | + /* | |
2605 | + * The recover stripe is likely bigger than regular io | |
2606 | + * ones and has no precalculated parity disk index -> | |
2607 | + * need to calculate RAID address. | |
2608 | + */ | |
2609 | + if (unlikely(StripeRecover(stripe))) { | |
2610 | + struct address addr; | |
2611 | + | |
2612 | + raid_address(rs, | |
2613 | + (stripe->key + off) * rs->set.data_devs, | |
2614 | + &addr); | |
2615 | + pi = addr.pi; | |
2616 | + stripe_zero_pl_part(stripe, pi, off, | |
2617 | + rs->set.chunk_size); | |
2618 | + } else | |
2619 | + pi = stripe->idx.parity; | |
2620 | + | |
2621 | + common_xor(stripe, xor_size, off, pi); | |
2622 | + page_set(PAGE(stripe, pi), DIRTY); | |
2623 | + } | |
2624 | +} | |
2625 | + | |
2626 | +/* Reconstruct missing chunk. */ | |
2627 | +static void reconstruct_xor(struct stripe *stripe) | |
2628 | +{ | |
2629 | + struct raid_set *rs = RS(stripe->sc); | |
2630 | + int p = stripe->idx.recover; | |
2631 | + | |
2632 | + BUG_ON(p < 0); | |
2633 | + | |
2634 | + /* REMOVEME: statistics. */ | |
2635 | + atomic_inc(rs->stats + (raid_set_degraded(rs) ? | |
2636 | + S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV)); | |
2637 | + | |
2638 | + /* Zero chunk to be reconstructed. */ | |
2639 | + stripe_zero_chunk(stripe, p); | |
2640 | + common_xor(stripe, stripe->io.size, 0, p); | |
2641 | +} | |
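
Both paths rest on the usual RAID parity identity: the parity chunk is the xor of all data chunks, so xoring parity with the surviving data chunks yields the missing one. A minimal self-contained illustration with made-up byte values:

/* Illustrative sketch: parity and reconstruction over three data chunks. */
#include <assert.h>

int main(void)
{
	unsigned char d0 = 0xA5, d1 = 0x3C, d2 = 0x0F;
	unsigned char parity = d0 ^ d1 ^ d2;		/* as in parity_xor()      */
	unsigned char rebuilt = parity ^ d0 ^ d2;	/* as in reconstruct_xor() */

	assert(rebuilt == d1);	/* xor of all surviving chunks restores d1 */
	return 0;
}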
2642 | + | |
2643 | +/* | |
2644 | + * Try getting a stripe either from the hash or from the lru list | |
2645 | + */ | |
2646 | +static inline void _stripe_get(struct stripe *stripe) | |
2647 | +{ | |
2648 | + atomic_inc(&stripe->cnt); | |
2649 | +} | |
2650 | + | |
2651 | +static struct stripe *stripe_get(struct raid_set *rs, struct address *addr) | |
2652 | +{ | |
2653 | + struct stripe_cache *sc = &rs->sc; | |
2654 | + struct stripe *stripe; | |
2655 | + | |
2656 | + stripe = stripe_lookup(sc, addr->key); | |
2657 | + if (stripe) { | |
2658 | + _stripe_get(stripe); | |
2659 | + /* Remove from the lru list if on. */ | |
2660 | + stripe_lru_del(stripe, LIST_LOCKED); | |
2661 | + atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */ | |
2662 | + } else { | |
2663 | + /* Second try to get an LRU stripe. */ | |
2664 | + stripe = stripe_lru_pop(sc); | |
2665 | + if (stripe) { | |
2666 | + _stripe_get(stripe); | |
2667 | + /* Invalidate before reinserting with changed key. */ | |
2668 | + stripe_invalidate(stripe); | |
2669 | + stripe->key = addr->key; | |
2670 | + stripe->region = dm_rh_sector_to_region(rs->recover.rh, | |
2671 | + addr->key); | |
2672 | + stripe->idx.parity = addr->pi; | |
2673 | + sc_insert(sc, stripe); | |
2674 | + /* REMOVEME: statistics. */ | |
2675 | + atomic_inc(rs->stats + S_INSCACHE); | |
2676 | + } | |
2677 | + } | |
2678 | + | |
2679 | + return stripe; | |
2680 | +} | |
2681 | + | |
2682 | +/* | |
2683 | + * Decrement reference count on a stripe. | |
2684 | + * | |
2685 | + * Move it to list of LRU stripes if zero. | |
2686 | + */ | |
2687 | +static void stripe_put(struct stripe *stripe) | |
2688 | +{ | |
2689 | + if (atomic_dec_and_test(&stripe->cnt)) { | |
2690 | + if (TestClearStripeActive(stripe)) | |
2691 | + atomic_dec(&stripe->sc->active_stripes); | |
2692 | + | |
2693 | + /* Put stripe onto the LRU list. */ | |
2694 | + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED); | |
2695 | + } | |
2696 | + | |
2697 | + BUG_ON(atomic_read(&stripe->cnt) < 0); | |
2698 | +} | |
2699 | + | |
2700 | +/* | |
2701 | + * Process end io | |
2702 | + * | |
2703 | + * I need to do it here because I can't in interrupt context. | |
2704 | + * | |
2705 | + * Read and write functions are split in order to avoid | |
2706 | + * conditionals in the main loop for performance reasons. | |
2707 | + */ | |
2708 | + | |
2709 | +/* Helper read bios on a page list. */ | |
2710 | +static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl, | |
2711 | + struct bio *bio) | |
2712 | +{ | |
2713 | + bio_copy_page_list(READ, stripe, pl, bio); | |
2714 | +} | |
2715 | + | |
2716 | +/* Helper write bios on a page list. */ | |
2717 | +static void _rh_dec(struct stripe *stripe, struct page_list *pl, | |
2718 | + struct bio *bio) | |
2719 | +{ | |
2720 | + dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region); | |
2721 | +} | |
2722 | + | |
2723 | +/* End io all bios on a page list. */ | |
2724 | +static inline int | |
2725 | +page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count) | |
2726 | +{ | |
2727 | + int r = 0; | |
2728 | + struct bio_list *bl = BL(stripe, p, rw); | |
2729 | + | |
2730 | + if (!bio_list_empty(bl)) { | |
2731 | + struct page_list *pl = PL(stripe, p); | |
2732 | + struct page *page = pl->page; | |
2733 | + | |
2734 | + if (PageLocked(page)) | |
2735 | + r = -EBUSY; | |
2736 | + /* | |
2737 | + * FIXME: PageUptodate() not cleared | |
2738 | + * properly for missing chunks ? | |
2739 | + */ | |
2740 | + else if (PageUptodate(page)) { | |
2741 | + struct bio *bio; | |
2742 | + struct raid_set *rs = RS(stripe->sc); | |
2743 | + void (*h_f)(struct stripe *, struct page_list *, | |
2744 | + struct bio *) = | |
2745 | + (rw == READ) ? _bio_copy_page_list : _rh_dec; | |
2746 | + | |
2747 | + while ((bio = bio_list_pop(bl))) { | |
2748 | + h_f(stripe, pl, bio); | |
2749 | + _bio_endio(rs, bio, 0); | |
2750 | + stripe_put(stripe); | |
2751 | + if (count) | |
2752 | + (*count)++; | |
2753 | + } | |
2754 | + } else | |
2755 | + r = -EAGAIN; | |
2756 | + } | |
2757 | + | |
2758 | + return r; | |
2759 | +} | |
2760 | + | |
2761 | +/* | |
2762 | + * End io all reads/writes on a stripe copying | |
2763 | + * read data across from stripe to bios. | |
2764 | + */ | |
2765 | +static int stripe_endio(int rw, struct stripe *stripe, unsigned *count) | |
2766 | +{ | |
2767 | + int r = 0; | |
2768 | + unsigned p = RS(stripe->sc)->set.raid_devs; | |
2769 | + | |
2770 | + while (p--) { | |
2771 | + int rr = page_list_endio(rw, stripe, p, count); | |
2772 | + | |
2773 | + if (rr && r != -EIO) | |
2774 | + r = rr; | |
2775 | + } | |
2776 | + | |
2777 | + return r; | |
2778 | +} | |
2779 | + | |
2780 | +/* Fail all ios on a bio list and return # of bios. */ | |
2781 | +static unsigned | |
2782 | +bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl) | |
2783 | +{ | |
2784 | + unsigned r; | |
2785 | + struct bio *bio; | |
2786 | + | |
2787 | + raid_set_dead(rs); | |
2788 | + | |
2789 | + /* Update region counters. */ | |
2790 | + if (stripe) { | |
2791 | + struct dm_rh_client *rh = rs->recover.rh; | |
2792 | + | |
2793 | + bio_list_for_each(bio, bl) { | |
2794 | + if (bio_data_dir(bio) == WRITE) | |
2795 | + dm_rh_dec(rh, stripe->region); | |
2796 | + } | |
2797 | + } | |
2798 | + | |
2799 | + /* Error end io all bios. */ | |
2800 | + for (r = 0; (bio = bio_list_pop(bl)); r++) | |
2801 | + _bio_endio(rs, bio, -EIO); | |
2802 | + | |
2803 | + return r; | |
2804 | +} | |
2805 | + | |
2806 | +/* Fail all ios of a bio list of a stripe and drop io pending count. */ | |
2807 | +static void | |
2808 | +stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe, | |
2809 | + struct bio_list *bl) | |
2810 | +{ | |
2811 | + unsigned put = bio_list_fail(rs, stripe, bl); | |
2812 | + | |
2813 | + while (put--) | |
2814 | + stripe_put(stripe); | |
2815 | +} | |
2816 | + | |
2817 | +/* Fail all ios hanging off all bio lists of a stripe. */ | |
2818 | +static void stripe_fail_io(struct stripe *stripe) | |
2819 | +{ | |
2820 | + struct raid_set *rs = RS(stripe->sc); | |
2821 | + unsigned p = rs->set.raid_devs; | |
2822 | + | |
2823 | + stripe_evict(stripe); | |
2824 | + | |
2825 | + while (p--) { | |
2826 | + struct stripe_set *ss = stripe->ss + p; | |
2827 | + int i = ARRAY_SIZE(ss->bl); | |
2828 | + | |
2829 | + while (i--) | |
2830 | + stripe_bio_list_fail(rs, stripe, ss->bl + i); | |
2831 | + } | |
2832 | +} | |
2833 | + | |
2834 | +/* | |
2835 | + * Handle all stripes by handing them to the daemon, because we can't | |
2836 | + * map their pages to copy the data in interrupt context. | |
2837 | + * | |
2838 | + * We don't want to handle them here either, while interrupts are disabled. | |
2839 | + */ | |
2840 | + | |
2841 | +/* Read/write endio function for dm-io (interrupt context). */ | |
2842 | +static void endio(unsigned long error, void *context) | |
2843 | +{ | |
2844 | + struct dm_mem_cache_object *obj = context; | |
2845 | + struct stripe_set *ss = obj->private; | |
2846 | + struct stripe *stripe = ss->stripe; | |
2847 | + struct page *page = obj->pl->page; | |
2848 | + | |
2849 | + if (unlikely(error)) | |
2850 | + stripe_error(stripe, page); | |
2851 | + else | |
2852 | + page_set(page, CLEAN); | |
2853 | + | |
2854 | + clear_page_locked(page); | |
2855 | + stripe_io_dec(stripe); | |
2856 | + | |
2857 | + /* Add stripe to endio list and wake daemon. */ | |
2858 | + stripe_endio_push(stripe); | |
2859 | +} | |
2860 | + | |
2861 | +/* | |
2862 | + * Recovery io throttling | |
2863 | + */ | |
2864 | +/* Conditionally reset io counters. */ | |
2865 | +enum count_type { IO_WORK = 0, IO_RECOVER }; | |
2866 | +static int recover_io_reset(struct raid_set *rs) | |
2867 | +{ | |
2868 | + unsigned long j = jiffies; | |
2869 | + | |
2870 | + /* Pay attention to jiffies overflows. */ | |
2871 | + if (j > rs->recover.last_jiffies + HZ | |
2872 | + || j < rs->recover.last_jiffies) { | |
2873 | + rs->recover.last_jiffies = j; | |
2874 | + atomic_set(rs->recover.io_count + IO_WORK, 0); | |
2875 | + atomic_set(rs->recover.io_count + IO_RECOVER, 0); | |
2876 | + return 1; | |
2877 | + } | |
2878 | + | |
2879 | + return 0; | |
2880 | +} | |
2881 | + | |
2882 | +/* Count ios. */ | |
2883 | +static INLINE void | |
2884 | +recover_io_count(struct raid_set *rs, struct stripe *stripe) | |
2885 | +{ | |
2886 | + if (RSRecover(rs)) { | |
2887 | + recover_io_reset(rs); | |
2888 | + atomic_inc(rs->recover.io_count + | |
2889 | + (StripeRecover(stripe) ? IO_RECOVER : IO_WORK)); | |
2890 | + } | |
2891 | +} | |
2892 | + | |
2893 | +/* Read/Write a page_list asynchronously. */ | |
2894 | +static void page_list_rw(struct stripe *stripe, unsigned p) | |
2895 | +{ | |
2896 | + struct stripe_cache *sc = stripe->sc; | |
2897 | + struct raid_set *rs = RS(sc); | |
2898 | + struct dm_mem_cache_object *obj = stripe->obj + p; | |
2899 | + struct page_list *pl = obj->pl; | |
2900 | + struct page *page = pl->page; | |
2901 | + struct raid_dev *dev = rs->dev + p; | |
2902 | + struct dm_io_region io = { | |
2903 | + .bdev = dev->dev->bdev, | |
2904 | + .sector = stripe->key, | |
2905 | + .count = stripe->io.size, | |
2906 | + }; | |
2907 | + struct dm_io_request control = { | |
2908 | + .bi_rw = PageDirty(page) ? WRITE : READ, | |
2909 | + .mem.type = DM_IO_PAGE_LIST, | |
2910 | + .mem.ptr.pl = pl, | |
2911 | + .mem.offset = 0, | |
2912 | + .notify.fn = endio, | |
2913 | + .notify.context = obj, | |
2914 | + .client = sc->dm_io_client, | |
2915 | + }; | |
2916 | + | |
2917 | + BUG_ON(PageLocked(page)); | |
2918 | + | |
2919 | + /* | |
2920 | + * Don't rw past end of device, which can happen, because | |
2921 | + * typically sectors_per_dev isn't divisible by io_size. | |
2922 | + */ | |
2923 | + if (unlikely(io.sector + io.count > rs->set.sectors_per_dev)) | |
2924 | + io.count = rs->set.sectors_per_dev - io.sector; | |
2925 | + | |
2926 | + io.sector += dev->start; /* Add <offset>. */ | |
2927 | + recover_io_count(rs, stripe); /* Recovery io accounting. */ | |
2928 | + | |
2929 | + /* REMOVEME: statistics. */ | |
2930 | + atomic_inc(rs->stats + | |
2931 | + (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ)); | |
2932 | + | |
2933 | + ClearPageError(page); | |
2934 | + set_page_locked(page); | |
2935 | + io_dev_queued(dev); | |
2936 | + BUG_ON(dm_io(&control, 1, &io, NULL)); | |
2937 | +} | |
2938 | + | |
2939 | +/* | |
2940 | + * Write dirty / read not uptodate page lists of a stripe. | |
2941 | + */ | |
2942 | +static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe) | |
2943 | +{ | |
2944 | + unsigned r; | |
2945 | + | |
2946 | + /* | |
2947 | + * Increment the pending count on the stripe | |
2948 | + * first, so that we don't race in endio(). | |
2949 | + * | |
2950 | + * An inc (IO) is needed for any page: | |
2951 | + * | |
2952 | + * o not uptodate | |
2953 | + * o dirtied by writes merged | |
2954 | + * o dirtied by parity calculations | |
2955 | + */ | |
2956 | + r = for_each_io_dev(rs, stripe, _stripe_io_inc); | |
2957 | + if (r) { | |
2958 | + /* io needed: chunks are not uptodate/dirty. */ | |
2959 | + int max; /* REMOVEME: */ | |
2960 | + struct stripe_cache *sc = &rs->sc; | |
2961 | + | |
2962 | + if (!TestSetStripeActive(stripe)) | |
2963 | + atomic_inc(&sc->active_stripes); | |
2964 | + | |
2965 | + /* Take off the lru list in case it got added there. */ | |
2966 | + stripe_lru_del(stripe, LIST_LOCKED); | |
2967 | + | |
2968 | + /* Submit actual io. */ | |
2969 | + for_each_io_dev(rs, stripe, page_list_rw); | |
2970 | + | |
2971 | + /* REMOVEME: statistics */ | |
2972 | + max = sc_active(sc); | |
2973 | + if (atomic_read(&sc->max_active_stripes) < max) | |
2974 | + atomic_set(&sc->max_active_stripes, max); | |
2975 | + | |
2976 | + atomic_inc(rs->stats + S_FLUSHS); | |
2977 | + /* END REMOVEME: statistics */ | |
2978 | + } | |
2979 | + | |
2980 | + return r; | |
2981 | +} | |
2982 | + | |
2983 | +/* Work in all pending writes. */ | |
2984 | +static INLINE void _writes_merge(struct stripe *stripe, unsigned p) | |
2985 | +{ | |
2986 | + struct bio_list *write = BL(stripe, p, WRITE); | |
2987 | + | |
2988 | + if (!bio_list_empty(write)) { | |
2989 | + struct page_list *pl = stripe->obj[p].pl; | |
2990 | + struct bio *bio; | |
2991 | + struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED); | |
2992 | + | |
2993 | + /* | |
2994 | + * We can play with the lists without holding a lock, | |
2995 | + * because it is just us accessing them anyway. | |
2996 | + */ | |
2997 | + bio_list_for_each(bio, write) | |
2998 | + bio_copy_page_list(WRITE, stripe, pl, bio); | |
2999 | + | |
3000 | + bio_list_merge(write_merged, write); | |
3001 | + bio_list_init(write); | |
3002 | + page_set(pl->page, DIRTY); | |
3003 | + } | |
3004 | +} | |
3005 | + | |
3006 | +/* Merge in all writes hence dirtying respective pages. */ | |
3007 | +static INLINE void writes_merge(struct stripe *stripe) | |
3008 | +{ | |
3009 | + unsigned p = RS(stripe->sc)->set.raid_devs; | |
3010 | + | |
3011 | + while (p--) | |
3012 | + _writes_merge(stripe, p); | |
3013 | +} | |
3014 | + | |
3015 | +/* Check, if a chunk gets completely overwritten. */ | |
3016 | +static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p) | |
3017 | +{ | |
3018 | + unsigned sectors = 0; | |
3019 | + struct bio *bio; | |
3020 | + struct bio_list *bl = BL(stripe, p, WRITE); | |
3021 | + | |
3022 | + bio_list_for_each(bio, bl) | |
3023 | + sectors += bio_sectors(bio); | |
3024 | + | |
3025 | + return sectors == RS(stripe->sc)->set.io_size; | |
3026 | +} | |
3027 | + | |
3028 | +/* | |
3029 | + * Prepare stripe to avoid io on broken/reconstructed | |
3030 | + * drive in order to reconstruct data on endio. | |
3031 | + */ | |
3032 | +enum prepare_type { IO_ALLOW, IO_PROHIBIT }; | |
3033 | +static void stripe_prepare(struct stripe *stripe, unsigned p, | |
3034 | + enum prepare_type type) | |
3035 | +{ | |
3036 | + struct page *page = PAGE(stripe, p); | |
3037 | + | |
3038 | + switch (type) { | |
3039 | + case IO_PROHIBIT: | |
3040 | + /* | |
3041 | + * In case we prohibit, we have to make sure that | |
3042 | + * io on all chunks other than the one which failed | |
3043 | + * or is being reconstructed is allowed and that it | |
3044 | + * does not have state uptodate. | |
3045 | + */ | |
3046 | + stripe_allow_io(stripe); | |
3047 | + ClearPageUptodate(page); | |
3048 | + ProhibitPageIO(page); | |
3049 | + | |
3050 | + /* REMOVEME: statistics. */ | |
3051 | + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO); | |
3052 | + stripe->idx.recover = p; | |
3053 | + SetStripeReconstruct(stripe); | |
3054 | + break; | |
3055 | + | |
3056 | + case IO_ALLOW: | |
3057 | + AllowPageIO(page); | |
3058 | + stripe->idx.recover = -1; | |
3059 | + ClearStripeReconstruct(stripe); | |
3060 | + break; | |
3061 | + | |
3062 | + default: | |
3063 | + BUG(); | |
3064 | + } | |
3065 | +} | |
3066 | + | |
3067 | +/* | |
3068 | + * Degraded/reconstruction mode. | |
3069 | + * | |
3070 | + * Check stripe state to figure which chunks don't need IO. | |
3071 | + */ | |
3072 | +static INLINE void stripe_check_reconstruct(struct stripe *stripe, | |
3073 | + int prohibited) | |
3074 | +{ | |
3075 | + struct raid_set *rs = RS(stripe->sc); | |
3076 | + | |
3077 | + /* | |
3078 | + * Degraded mode (device(s) failed) -> | |
3079 | + * avoid io on the failed device. | |
3080 | + */ | |
3081 | + if (unlikely(raid_set_degraded(rs))) { | |
3082 | + /* REMOVEME: statistics. */ | |
3083 | + atomic_inc(rs->stats + S_DEGRADED); | |
3084 | + stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT); | |
3085 | + return; | |
3086 | + } else { | |
3087 | + /* | |
3088 | + * Reconstruction mode (i.e. a particular device or | |
3089 | + * some (rotating) parity chunk is being resynchronized) -> | |
3090 | + * o make sure all needed pages are read in | |
3091 | + * o writes are allowed to go through | |
3092 | + */ | |
3093 | + int r = region_state(rs, stripe->key, DM_RH_NOSYNC); | |
3094 | + | |
3095 | + if (r) { | |
3096 | + /* REMOVEME: statistics. */ | |
3097 | + atomic_inc(rs->stats + S_NOSYNC); | |
3098 | + stripe_prepare(stripe, dev_for_parity(stripe), | |
3099 | + IO_PROHIBIT); | |
3100 | + return; | |
3101 | + } | |
3102 | + } | |
3103 | + | |
3104 | + /* | |
3105 | + * All disks good. Avoid reading parity chunk and reconstruct it | |
3106 | + * unless we have prohibited io to chunk(s). | |
3107 | + */ | |
3108 | + if (!prohibited) { | |
3109 | + if (StripeMerged(stripe)) | |
3110 | + stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW); | |
3111 | + else { | |
3112 | + stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT); | |
3113 | + | |
3114 | + /* | |
3115 | + * Overrule stripe_prepare to reconstruct the | |
3116 | + * parity chunk, because it'll be created new anyway. | |
3117 | + */ | |
3118 | + ClearStripeReconstruct(stripe); | |
3119 | + } | |
3120 | + } | |
3121 | +} | |
3122 | + | |
3123 | +/* Check, if stripe is ready to merge writes. */ | |
3124 | +static INLINE int stripe_check_merge(struct stripe *stripe) | |
3125 | +{ | |
3126 | + struct raid_set *rs = RS(stripe->sc); | |
3127 | + int prohibited = 0; | |
3128 | + unsigned chunks = 0, p = rs->set.raid_devs; | |
3129 | + | |
3130 | + /* Walk all chunks. */ | |
3131 | + while (p--) { | |
3132 | + struct page *page = PAGE(stripe, p); | |
3133 | + | |
3134 | + /* Can't merge active chunks. */ | |
3135 | + if (PageLocked(page)) { | |
3136 | + /* REMOVEME: statistics. */ | |
3137 | + atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED); | |
3138 | + break; | |
3139 | + } | |
3140 | + | |
3141 | + /* Can merge uptodate chunks and have to count parity chunk. */ | |
3142 | + if (PageUptodate(page) || p == stripe->idx.parity) { | |
3143 | + chunks++; | |
3144 | + continue; | |
3145 | + } | |
3146 | + | |
3147 | + /* Read before write ordering. */ | |
3148 | + if (RSCheckOverwrite(rs) && | |
3149 | + bio_list_empty(BL(stripe, p, READ))) { | |
3150 | + int r = stripe_check_overwrite(stripe, p); | |
3151 | + | |
3152 | + if (r) { | |
3153 | + chunks++; | |
3154 | + /* REMOVEME: statistics. */ | |
3155 | + atomic_inc(RS(stripe->sc)->stats + | |
3156 | + S_PROHIBITPAGEIO); | |
3157 | + ProhibitPageIO(page); | |
3158 | + prohibited = 1; | |
3159 | + } | |
3160 | + } | |
3161 | + } | |
3162 | + | |
3163 | + if (chunks == rs->set.raid_devs) { | |
3164 | + /* All pages are uptodate, get written over, or a mixture of both. */ | |
3165 | + /* REMOVEME: statistics. */ | |
3166 | + atomic_inc(rs->stats + S_CAN_MERGE); | |
3167 | + return 0; | |
3168 | + } else | |
3169 | + /* REMOVEME: statistics.*/ | |
3170 | + atomic_inc(rs->stats + S_CANT_MERGE); | |
3171 | + | |
3172 | + return prohibited ? 1 : -EPERM; | |
3173 | +} | |
3174 | + | |
3175 | +/* Prohibit io on chunks which have no reads queued; returns 1 if any chunk io got prohibited. */ | |
3176 | +static INLINE int stripe_check_read(struct stripe *stripe) | |
3177 | +{ | |
3178 | + int r = 0; | |
3179 | + unsigned p = RS(stripe->sc)->set.raid_devs; | |
3180 | + | |
3181 | + /* Walk all chunks. */ | |
3182 | + while (p--) { | |
3183 | + struct page *page = PAGE(stripe, p); | |
3184 | + | |
3185 | + if (!PageLocked(page) && | |
3186 | + bio_list_empty(BL(stripe, p, READ))) { | |
3187 | + ProhibitPageIO(page); | |
3188 | + r = 1; | |
3189 | + } | |
3190 | + } | |
3191 | + | |
3192 | + return r; | |
3193 | +} | |
3194 | + | |
3195 | +/* | |
3196 | + * Read/write a stripe. | |
3197 | + * | |
3198 | + * All stripe read/write activity goes through this function. | |
3199 | + * | |
3200 | + * States to cover: | |
3201 | + * o stripe to read and/or write | |
3202 | + * o stripe with error to reconstruct | |
3203 | + */ | |
3204 | +static int stripe_rw(struct stripe *stripe) | |
3205 | +{ | |
3206 | + struct raid_set *rs = RS(stripe->sc); | |
3207 | + int prohibited = 0, r; | |
3208 | + | |
3209 | + /* | |
3210 | + * Check the state of the RAID set and if degraded (or | |
3211 | + * resynchronizing for reads), read in all other chunks but | |
3212 | + * the one on the dead/resynchronizing device in order to be | |
3213 | + * able to reconstruct the missing one. | |
3214 | + * | |
3215 | + * Merge all writes hanging off uptodate pages of the stripe. | |
3216 | + */ | |
3217 | + | |
3218 | + /* Initially allow io on all chunks and prohibit below, if necessary. */ | |
3219 | + stripe_allow_io(stripe); | |
3220 | + | |
3221 | + if (StripeRBW(stripe)) { | |
3222 | + r = stripe_check_merge(stripe); | |
3223 | + if (!r) { | |
3224 | + /* | |
3225 | + * If I could rely on valid parity (which would only | |
3226 | + * be sure in case of a full synchronization), | |
3227 | + * I could xor a fraction of chunks out of | |
3228 | + * parity and back in. | |
3229 | + * | |
3230 | + * For the time being, I got to redo parity... | |
3231 | + */ | |
3232 | + /* parity_xor(stripe); */ /* Xor chunks out. */ | |
3233 | + stripe_zero_chunk(stripe, stripe->idx.parity); | |
3234 | + writes_merge(stripe); /* Merge writes in. */ | |
3235 | + parity_xor(stripe); /* Update parity. */ | |
3236 | + ClearStripeRBW(stripe); /* Disable RBW. */ | |
3237 | + SetStripeMerged(stripe); /* Writes merged. */ | |
3238 | + } | |
3239 | + | |
3240 | + if (r > 0) | |
3241 | + prohibited = 1; | |
3242 | + } else if (!raid_set_degraded(rs)) | |
3243 | + /* Only allow for read avoidance if not degraded. */ | |
3244 | + prohibited = stripe_check_read(stripe); | |
3245 | + | |
3246 | + /* | |
3247 | + * Check whether io needs to be allowed/prohibited on certain chunks | |
3248 | + * because of a degraded set or reconstruction on a region. | |
3249 | + */ | |
3250 | + stripe_check_reconstruct(stripe, prohibited); | |
3251 | + | |
3252 | + /* Now submit any reads/writes. */ | |
3253 | + r = stripe_page_lists_rw(rs, stripe); | |
3254 | + if (!r) { | |
3255 | + /* | |
3256 | + * No io submitted because of chunk io prohibited or | |
3257 | + * locked pages -> push to end io list for processing. | |
3258 | + */ | |
3259 | + atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */ | |
3260 | + stripe_endio_push(stripe); | |
3261 | + wake_do_raid(rs); /* Wake myself. */ | |
3262 | + } | |
3263 | + | |
3264 | + return 0; | |
3265 | +} | |
3266 | + | |
3267 | +/* Flush stripe either via flush list or immediately. */ | |
3268 | +enum flush_type { FLUSH_DELAY, FLUSH_NOW }; | |
3269 | +static int stripe_flush(struct stripe *stripe, enum flush_type type) | |
3270 | +{ | |
3271 | + int r = 0; | |
3272 | + | |
3273 | + stripe_lru_del(stripe, LIST_LOCKED); | |
3274 | + | |
3275 | + /* Immediately flush. */ | |
3276 | + if (type == FLUSH_NOW) { | |
3277 | + if (likely(raid_set_operational(RS(stripe->sc)))) | |
3278 | + r = stripe_rw(stripe); /* Read/write stripe. */ | |
3279 | + else | |
3280 | + /* Optimization: Fail early on failed sets. */ | |
3281 | + stripe_fail_io(stripe); | |
3282 | + /* Delay flush by putting it on io list for later processing. */ | |
3283 | + } else if (type == FLUSH_DELAY) | |
3284 | + stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED); | |
3285 | + else | |
3286 | + BUG(); | |
3287 | + | |
3288 | + return r; | |
3289 | +} | |
3290 | + | |
3291 | +/* | |
3292 | + * Queue reads and writes to a stripe by hanging | |
3293 | + * their bios off the stripe set's read/write lists. | |
3294 | + * | |
3295 | + * Endio reads on uptodate chunks. | |
3296 | + */ | |
3297 | +static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio, | |
3298 | + struct bio_list *reject) | |
3299 | +{ | |
3300 | + int r = 0; | |
3301 | + struct address addr; | |
3302 | + struct stripe *stripe = | |
3303 | + stripe_get(rs, raid_address(rs, bio->bi_sector, &addr)); | |
3304 | + | |
3305 | + if (stripe) { | |
3306 | + int rr, rw = bio_data_dir(bio); | |
3307 | + | |
3308 | + rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */ | |
3309 | + if (rr) { | |
3310 | + stripe_put(stripe); | |
3311 | + goto out; | |
3312 | + } | |
3313 | + | |
3314 | + /* Distinguish read and write cases. */ | |
3315 | + bio_list_add(BL(stripe, addr.di, rw), bio); | |
3316 | + | |
3317 | + /* REMOVEME: statistics */ | |
3318 | + atomic_inc(rs->stats + (rw == WRITE ? | |
3319 | + S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ)); | |
3320 | + | |
3321 | + if (rw == READ) | |
3322 | + SetStripeRead(stripe); | |
3323 | + else { | |
3324 | + SetStripeRBW(stripe); | |
3325 | + | |
3326 | + /* Increment pending write count on region. */ | |
3327 | + dm_rh_inc(rs->recover.rh, stripe->region); | |
3328 | + r = 1; /* Region hash needs a flush. */ | |
3329 | + } | |
3330 | + | |
3331 | + /* | |
3332 | + * Optimize stripe flushing: | |
3333 | + * | |
3334 | + * o directly start io for read stripes. | |
3335 | + * | |
3336 | + * o put stripe onto stripe cache's io_list for RBW, | |
3337 | + * so that do_flush() can belabour it after we put | |
3338 | + * more bios to the stripe for overwrite optimization. | |
3339 | + */ | |
3340 | + stripe_flush(stripe, | |
3341 | + StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY); | |
3342 | + | |
3343 | + /* Got no stripe from cache -> reject bio. */ | |
3344 | + } else { | |
3345 | +out: | |
3346 | + bio_list_add(reject, bio); | |
3347 | + /* REMOVEME: statistics. */ | |
3348 | + atomic_inc(rs->stats + S_IOS_POST); | |
3349 | + } | |
3350 | + | |
3351 | + return r; | |
3352 | +} | |
3353 | + | |
3354 | +/* | |
3355 | + * Recovery functions | |
3356 | + */ | |
3357 | +/* Read a stripe off a raid set for recovery. */ | |
3358 | +static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx) | |
3359 | +{ | |
3360 | + /* Invalidate all pages so that they get read in. */ | |
3361 | + stripe_pages_invalidate(stripe); | |
3362 | + | |
3363 | + /* Allow io on all recovery chunks. */ | |
3364 | + stripe_allow_io(stripe); | |
3365 | + | |
3366 | + if (idx > -1) | |
3367 | + ProhibitPageIO(PAGE(stripe, idx)); | |
3368 | + | |
3369 | + stripe->key = rs->recover.pos; | |
3370 | + return stripe_page_lists_rw(rs, stripe); | |
3371 | +} | |
3372 | + | |
3373 | +/* Write a stripe to a raid set for recovery. */ | |
3374 | +static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx) | |
3375 | +{ | |
3376 | + /* | |
3377 | + * If this is a reconstruct of a particular device, then | |
3378 | + * reconstruct the respective page(s), else create parity page(s). | |
3379 | + */ | |
3380 | + if (idx > -1) { | |
3381 | + struct page *page = PAGE(stripe, idx); | |
3382 | + | |
3383 | + AllowPageIO(page); | |
3384 | + stripe_zero_chunk(stripe, idx); | |
3385 | + common_xor(stripe, stripe->io.size, 0, idx); | |
3386 | + page_set(page, DIRTY); | |
3387 | + } else | |
3388 | + parity_xor(stripe); | |
3389 | + | |
3390 | + return stripe_page_lists_rw(rs, stripe); | |
3391 | +} | |
3392 | + | |
3393 | +/* Recovery bandwidth available? */ | |
3394 | +static int recover_bandwidth(struct raid_set *rs) | |
3395 | +{ | |
3396 | + int r, work; | |
3397 | + | |
3398 | + /* On reset -> allow recovery. */ | |
3399 | + r = recover_io_reset(rs); | |
3400 | + if (r || RSBandwidth(rs)) | |
3401 | + goto out; | |
3402 | + | |
3403 | + work = atomic_read(rs->recover.io_count + IO_WORK); | |
3404 | + if (work) { | |
3405 | + /* Pay attention to larger recover stripe size. */ | |
3406 | + int recover = | |
3407 | + atomic_read(rs->recover.io_count + IO_RECOVER) * | |
3408 | + rs->recover.io_size / | |
3409 | + rs->set.io_size; | |
3410 | + | |
3411 | + /* | |
3412 | + * Don't use more than given bandwidth of | |
3413 | + * the work io for recovery. | |
3414 | + */ | |
3415 | + if (recover > work / rs->recover.bandwidth_work) { | |
3416 | + /* REMOVEME: statistics. */ | |
3417 | + atomic_inc(rs->stats + S_NO_BANDWIDTH); | |
3418 | + return 0; | |
3419 | + } | |
3420 | + } | |
3421 | + | |
3422 | +out: | |
3423 | + atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */ | |
3424 | + return 1; | |
3425 | +} | |
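As a worked example of the throttle above (all numbers hypothetical, not defaults taken from this patch): a recovery bandwidth of 25% makes recover_set_bandwidth() set bandwidth_work = 100 / 25 = 4; with 400 outstanding work ios (IO_WORK) and a recovery stripe io size four times the regular io size, recovery ios (IO_RECOVER) are deferred as soon as more than (400 / 4) / 4 = 25 of them are in flight.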
3426 | + | |
3427 | +/* Try to get a region to recover. */ | |
3428 | +static int recover_get_region(struct raid_set *rs) | |
3429 | +{ | |
3430 | + struct recover *rec = &rs->recover; | |
3431 | + struct dm_rh_client *rh = rec->rh; | |
3432 | + | |
3433 | + /* Start quiescing some regions. */ | |
3434 | + if (!RSRegionGet(rs)) { | |
3435 | + int r = recover_bandwidth(rs); /* Enough bandwidth? */ | |
3436 | + | |
3437 | + if (r) { | |
3438 | + r = dm_rh_recovery_prepare(rh); | |
3439 | + if (r < 0) { | |
3440 | + DMINFO("No %sregions to recover", | |
3441 | + rec->nr_regions_to_recover ? | |
3442 | + "more " : ""); | |
3443 | + return -ENOENT; | |
3444 | + } | |
3445 | + } else | |
3446 | + return -EAGAIN; | |
3447 | + | |
3448 | + SetRSRegionGet(rs); | |
3449 | + } | |
3450 | + | |
3451 | + if (!rec->reg) { | |
3452 | + rec->reg = dm_rh_recovery_start(rh); | |
3453 | + if (rec->reg) { | |
3454 | + /* | |
3455 | + * A reference for the region I'll | |
3456 | + * keep till I've completely synced it. | |
3457 | + */ | |
3458 | + io_get(rs); | |
3459 | + rec->pos = dm_rh_region_to_sector(rh, | |
3460 | + dm_rh_get_region_key(rec->reg)); | |
3461 | + rec->end = rec->pos + dm_rh_get_region_size(rh); | |
3462 | + return 1; | |
3463 | + } else | |
3464 | + return -EAGAIN; | |
3465 | + } | |
3466 | + | |
3467 | + return 0; | |
3468 | +} | |
3469 | + | |
3470 | +/* Read/write a recovery stripe. */ | |
3471 | +static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe) | |
3472 | +{ | |
3473 | + /* Read/write flip-flop. */ | |
3474 | + if (TestClearStripeRBW(stripe)) { | |
3475 | + SetStripeRead(stripe); | |
3476 | + return recover_read(rs, stripe, idx_get(rs)); | |
3477 | + } else if (TestClearStripeRead(stripe)) | |
3478 | + return recover_write(rs, stripe, idx_get(rs)); | |
3479 | + | |
3480 | + return 0; | |
3481 | +} | |
3482 | + | |
3483 | +/* Reset recovery variables. */ | |
3484 | +static void recovery_region_reset(struct raid_set *rs) | |
3485 | +{ | |
3486 | + rs->recover.reg = NULL; | |
3487 | + ClearRSRegionGet(rs); | |
3488 | +} | |
3489 | + | |
3490 | +/* Update region hash state. */ | |
3491 | +static void recover_rh_update(struct raid_set *rs, int error) | |
3492 | +{ | |
3493 | + struct recover *rec = &rs->recover; | |
3494 | + struct dm_rh_client *rh = rec->rh; | |
3495 | + struct dm_region *reg = rec->reg; | |
3496 | + | |
3497 | + if (reg) { | |
3498 | + dm_rh_recovery_end(rh, reg, error); | |
3499 | + if (!error) | |
3500 | + rec->nr_regions_recovered++; | |
3501 | + | |
3502 | + recovery_region_reset(rs); | |
3503 | + } | |
3504 | + | |
3505 | + dm_rh_update_states(rh, 1); | |
3506 | + dm_rh_flush(rh); | |
3507 | + io_put(rs); /* Release the io reference for the region. */ | |
3508 | +} | |
3509 | + | |
3510 | +/* Called by main io daemon to recover regions. */ | |
3511 | +/* FIXME: cope with MAX_RECOVER > 1. */ | |
3512 | +static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe) | |
3513 | +{ | |
3514 | + int r; | |
3515 | + struct recover *rec = &rs->recover; | |
3516 | + | |
3517 | + /* If the recovery stripe is still active -> return. */ | |
3518 | + if (StripeActive(stripe)) | |
3519 | + return; | |
3520 | + | |
3521 | + /* io error is fatal for recovery -> stop it. */ | |
3522 | + if (unlikely(StripeError(stripe))) | |
3523 | + goto err; | |
3524 | + | |
3525 | + /* Get a region to recover. */ | |
3526 | + r = recover_get_region(rs); | |
3527 | + switch (r) { | |
3528 | + case 1: /* Got a new region. */ | |
3529 | + /* Flag read before write. */ | |
3530 | + ClearStripeRead(stripe); | |
3531 | + SetStripeRBW(stripe); | |
3532 | + break; | |
3533 | + | |
3534 | + case 0: | |
3535 | + /* Got a region in the works. */ | |
3536 | + r = recover_bandwidth(rs); | |
3537 | + if (r) /* Got enough bandwidth. */ | |
3538 | + break; | |
3539 | + | |
3540 | + case -EAGAIN: | |
3541 | + /* No bandwidth/quiesced region yet, try later. */ | |
3542 | + wake_do_raid_delayed(rs, HZ / 10); | |
3543 | + return; | |
3544 | + | |
3545 | + case -ENOENT: /* No more regions. */ | |
3546 | + dm_table_event(rs->ti->table); | |
3547 | + goto free; | |
3548 | + } | |
3549 | + | |
3550 | + /* Read/write a recover stripe. */ | |
3551 | + r = recover_stripe_rw(rs, stripe); | |
3552 | + if (r) { | |
3553 | + /* IO initiated, get another reference for the IO. */ | |
3554 | + io_get(rs); | |
3555 | + return; | |
3556 | + } | |
3557 | + | |
3558 | + /* Update recovery position within region. */ | |
3559 | + rec->pos += stripe->io.size; | |
3560 | + | |
3561 | + /* If we're at end of region, update region hash. */ | |
3562 | + if (rec->pos >= rec->end || | |
3563 | + rec->pos >= rs->set.sectors_per_dev) | |
3564 | + recover_rh_update(rs, 0); | |
3565 | + else | |
3566 | + SetStripeRBW(stripe); | |
3567 | + | |
3568 | + /* Schedule myself for another round... */ | |
3569 | + wake_do_raid(rs); | |
3570 | + return; | |
3571 | + | |
3572 | +err: | |
3573 | + raid_set_check_degrade(rs, stripe); | |
3574 | + | |
3575 | + { | |
3576 | + char buf[BDEVNAME_SIZE]; | |
3577 | + | |
3578 | + DMERR("stopping recovery due to " | |
3579 | + "ERROR on /dev/%s, stripe at offset %llu", | |
3580 | + bdevname(rs->dev[rs->set.ei].dev->bdev, buf), | |
3581 | + (unsigned long long) stripe->key); | |
3582 | + | |
3583 | + } | |
3584 | + | |
3585 | + /* Make sure that all quiesced regions get released. */ | |
3586 | + do { | |
3587 | + if (rec->reg) | |
3588 | + dm_rh_recovery_end(rec->rh, rec->reg, -EIO); | |
3589 | + | |
3590 | + rec->reg = dm_rh_recovery_start(rec->rh); | |
3591 | + } while (rec->reg); | |
3592 | + | |
3593 | + recover_rh_update(rs, -EIO); | |
3594 | +free: | |
3595 | + rs->set.dev_to_init = -1; | |
3596 | + | |
3597 | + /* Check for jiffies overrun. */ | |
3598 | + rs->recover.end_jiffies = jiffies; | |
3599 | + if (rs->recover.end_jiffies < rs->recover.start_jiffies) | |
3600 | + rs->recover.end_jiffies = ~0; | |
3601 | + | |
3602 | + ClearRSRecover(rs); | |
3603 | +} | |
3604 | + | |
3605 | +static INLINE void do_recovery(struct raid_set *rs) | |
3606 | +{ | |
3607 | + struct stripe *stripe; | |
3608 | + | |
3609 | + list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER]) | |
3610 | + _do_recovery(rs, stripe); | |
3611 | + | |
3612 | + if (!RSRecover(rs)) | |
3613 | + stripe_recover_free(rs); | |
3614 | +} | |
3615 | + | |
3616 | +/* | |
3617 | + * END recovery functions | |
3618 | + */ | |
3619 | + | |
3620 | +/* End io process all stripes handed in by endio() callback. */ | |
3621 | +static void do_endios(struct raid_set *rs) | |
3622 | +{ | |
3623 | + struct stripe_cache *sc = &rs->sc; | |
3624 | + struct stripe *stripe; | |
3625 | + | |
3626 | + while ((stripe = stripe_endio_pop(sc))) { | |
3627 | + unsigned count; | |
3628 | + | |
3629 | + /* Recovery stripe special case. */ | |
3630 | + if (unlikely(StripeRecover(stripe))) { | |
3631 | + if (stripe_io(stripe)) | |
3632 | + continue; | |
3633 | + | |
3634 | + io_put(rs); /* Release region io reference. */ | |
3635 | + ClearStripeActive(stripe); | |
3636 | + | |
3637 | + /* REMOVEME: statistics*/ | |
3638 | + atomic_dec(&sc->active_stripes); | |
3639 | + continue; | |
3640 | + } | |
3641 | + | |
3642 | + /* Early end io all reads on any uptodate chunks. */ | |
3643 | + stripe_endio(READ, stripe, (count = 0, &count)); | |
3644 | + if (stripe_io(stripe)) { | |
3645 | + if (count) /* REMOVEME: statistics. */ | |
3646 | + atomic_inc(rs->stats + S_ACTIVE_READS); | |
3647 | + | |
3648 | + continue; | |
3649 | + } | |
3650 | + | |
3651 | + /* Set stripe inactive after all io got processed. */ | |
3652 | + if (TestClearStripeActive(stripe)) | |
3653 | + atomic_dec(&sc->active_stripes); | |
3654 | + | |
3655 | + /* Unlock stripe (for clustering). */ | |
3656 | + stripe_unlock(rs, stripe); | |
3657 | + | |
3658 | + /* | |
3659 | + * If an io error on a stripe occurred and the RAID set | |
3660 | + * is still operational, requeue the stripe for io. | |
3661 | + */ | |
3662 | + if (TestClearStripeError(stripe)) { | |
3663 | + raid_set_check_degrade(rs, stripe); | |
3664 | + ClearStripeReconstruct(stripe); | |
3665 | + | |
3666 | + if (!StripeMerged(stripe) && | |
3667 | + raid_set_operational(rs)) { | |
3668 | + stripe_pages_invalidate(stripe); | |
3669 | + stripe_flush(stripe, FLUSH_DELAY); | |
3670 | + /* REMOVEME: statistics. */ | |
3671 | + atomic_inc(rs->stats + S_REQUEUE); | |
3672 | + continue; | |
3673 | + } | |
3674 | + } | |
3675 | + | |
3676 | + /* If the RAID set is inoperational, error the ios. */ | |
3677 | + if (!raid_set_operational(rs)) { | |
3678 | + ClearStripeReconstruct(stripe); | |
3679 | + stripe_fail_io(stripe); | |
3680 | + BUG_ON(atomic_read(&stripe->cnt)); | |
3681 | + continue; | |
3682 | + } | |
3683 | + | |
3684 | + /* Got to reconstruct a missing chunk. */ | |
3685 | + if (TestClearStripeReconstruct(stripe)) | |
3686 | + reconstruct_xor(stripe); | |
3687 | + | |
3688 | + /* | |
3689 | + * Now that we've got a complete stripe, we can | |
3690 | + * process the rest of the end ios on reads. | |
3691 | + */ | |
3692 | + BUG_ON(stripe_endio(READ, stripe, NULL)); | |
3693 | + ClearStripeRead(stripe); | |
3694 | + | |
3695 | + /* | |
3696 | + * Read-before-write stripes need to be flushed again in | |
3697 | + * order to work the write data into the pages *after* | |
3698 | + * they were read in. | |
3699 | + */ | |
3700 | + if (TestClearStripeMerged(stripe)) | |
3701 | + /* End io all bios which got merged already. */ | |
3702 | + BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL)); | |
3703 | + | |
3704 | + /* Got to put on flush list because of new writes. */ | |
3705 | + if (StripeRBW(stripe)) | |
3706 | + stripe_flush(stripe, FLUSH_DELAY); | |
3707 | + } | |
3708 | +} | |
3709 | + | |
3710 | +/* | |
3711 | + * Stripe cache shrinking. | |
3712 | + */ | |
3713 | +static INLINE void do_sc_shrink(struct raid_set *rs) | |
3714 | +{ | |
3715 | + unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink); | |
3716 | + | |
3717 | + if (shrink) { | |
3718 | + unsigned cur = atomic_read(&rs->sc.stripes); | |
3719 | + | |
3720 | + sc_shrink(&rs->sc, shrink); | |
3721 | + shrink -= cur - atomic_read(&rs->sc.stripes); | |
3722 | + atomic_set(&rs->sc.stripes_to_shrink, shrink); | |
3723 | + | |
3724 | + /* | |
3725 | + * Wake myself up in case we failed to shrink the | |
3726 | + * requested amount in order to try again later. | |
3727 | + */ | |
3728 | + if (shrink) | |
3729 | + wake_do_raid(rs); | |
3730 | + } | |
3731 | +} | |
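For illustration (hypothetical numbers): if 32 stripes were requested to be shrunk but sc_shrink() could only free 20 of, say, 100 cached stripes, stripes_to_shrink is rewritten to 32 - (100 - 80) = 12 and the daemon wakes itself to retry the remainder later.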
3732 | + | |
3733 | + | |
3734 | +/* | |
3735 | + * Process all ios | |
3736 | + * | |
3737 | + * We do different things with the io depending on the | |
3738 | + * state of the region that it's in: | |
3739 | + * | |
3740 | + * o reads: hang off stripe cache or postpone if full | |
3741 | + * | |
3742 | + * o writes: | |
3743 | + * | |
3744 | + * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set. | |
3745 | + * In case stripe cache is full or busy, postpone the io. | |
3746 | + * | |
3747 | + * RECOVERING: delay the io until recovery of the region completes. | |
3748 | + * | |
3749 | + */ | |
3750 | +static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios) | |
3751 | +{ | |
3752 | + int r; | |
3753 | + unsigned flush = 0; | |
3754 | + struct dm_rh_client *rh = rs->recover.rh; | |
3755 | + struct bio *bio; | |
3756 | + struct bio_list delay, reject; | |
3757 | + | |
3758 | + bio_list_init(&delay); | |
3759 | + bio_list_init(&reject); | |
3760 | + | |
3761 | + /* | |
3762 | + * Classify each io: | |
3763 | + * o delay to recovering regions | |
3764 | + * o queue to all other regions | |
3765 | + */ | |
3766 | + while ((bio = bio_list_pop(ios))) { | |
3767 | + /* | |
3768 | + * In case we get a barrier bio, push it back onto | |
3769 | + * the input queue unless all work queues are empty | |
3770 | + * and the stripe cache is inactive. | |
3771 | + */ | |
3772 | + if (unlikely(bio_barrier(bio))) { | |
3773 | + /* REMOVEME: statistics. */ | |
3774 | + atomic_inc(rs->stats + S_BARRIER); | |
3775 | + if (!list_empty(rs->sc.lists + LIST_IO) || | |
3776 | + !bio_list_empty(&delay) || | |
3777 | + !bio_list_empty(&reject) || | |
3778 | + sc_active(&rs->sc)) { | |
3779 | + bio_list_push(ios, bio); | |
3780 | + break; | |
3781 | + } | |
3782 | + } | |
3783 | + | |
3784 | + r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING); | |
3785 | + if (unlikely(r)) { | |
3786 | + /* Got to wait for recovering regions. */ | |
3787 | + bio_list_add(&delay, bio); | |
3788 | + SetRSBandwidth(rs); | |
3789 | + } else { | |
3790 | + /* | |
3791 | + * Process ios to non-recovering regions by queueing | |
3792 | + * them to stripes (does rh_inc() for writes). | |
3793 | + */ | |
3794 | + flush += stripe_queue_bio(rs, bio, &reject); | |
3795 | + } | |
3796 | + } | |
3797 | + | |
3798 | + if (flush) { | |
3799 | + r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */ | |
3800 | + if (r) | |
3801 | + DMERR("dirty log flush"); | |
3802 | + } | |
3803 | + | |
3804 | + /* Delay ios to regions which are recovering. */ | |
3805 | + while ((bio = bio_list_pop(&delay))) { | |
3806 | + /* REMOVEME: statistics.*/ | |
3807 | + atomic_inc(rs->stats + S_DELAYED_BIOS); | |
3808 | + atomic_inc(rs->stats + S_SUM_DELAYED_BIOS); | |
3809 | + dm_rh_delay_by_region(rh, bio, | |
3810 | + dm_rh_sector_to_region(rh, _sector(rs, bio))); | |
3811 | + | |
3812 | + } | |
3813 | + | |
3814 | + /* Merge any rejected bios back to the head of the input list. */ | |
3815 | + bio_list_merge_head(ios, &reject); | |
3816 | +} | |
3817 | + | |
3818 | +/* Flush any stripes on the io list. */ | |
3819 | +static INLINE void do_flush(struct raid_set *rs) | |
3820 | +{ | |
3821 | + struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp; | |
3822 | + | |
3823 | + list_for_each_safe(pos, tmp, list) { | |
3824 | + int r = stripe_flush(list_entry(pos, struct stripe, | |
3825 | + lists[LIST_IO]), FLUSH_NOW); | |
3826 | + | |
3827 | + /* Remove from the list only if the stripe got processed. */ | |
3828 | + if (!r) | |
3829 | + list_del_init(pos); | |
3830 | + } | |
3831 | +} | |
3832 | + | |
3833 | +/* Send an event in case we're getting too busy. */ | |
3834 | +static INLINE void do_busy_event(struct raid_set *rs) | |
3835 | +{ | |
3836 | + if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) { | |
3837 | + if (!TestSetRSScBusy(rs)) | |
3838 | + dm_table_event(rs->ti->table); | |
3839 | + } else | |
3840 | + ClearRSScBusy(rs); | |
3841 | +} | |
3842 | + | |
3843 | +/* Unplug: let the io roll on the set's devices. */ | |
3844 | +static INLINE void do_unplug(struct raid_set *rs) | |
3845 | +{ | |
3846 | + struct raid_dev *dev = rs->dev + rs->set.raid_devs; | |
3847 | + | |
3848 | + while (dev-- > rs->dev) { | |
3849 | + /* Only call any device unplug function, if io got queued. */ | |
3850 | + if (io_dev_clear(dev)) | |
3851 | + blk_unplug(bdev_get_queue(dev->dev->bdev)); | |
3852 | + } | |
3853 | +} | |
3854 | + | |
3855 | +/*----------------------------------------------------------------- | |
3856 | + * RAID daemon | |
3857 | + *---------------------------------------------------------------*/ | |
3858 | +/* | |
3859 | + * o belabour all end ios | |
3860 | + * o optionally shrink the stripe cache | |
3861 | + * o update the region hash states | |
3862 | + * o optionally do recovery | |
3863 | + * o grab the input queue | |
3864 | + * o work on all requeued or new ios and perform stripe cache flushes | |
3865 | + * unless the RAID set is inoperational (when we error ios) | |
3866 | + * o check, if the stripe cache gets too busy and throw an event if so | |
3867 | + * o unplug any component raid devices with queued bios | |
3868 | + */ | |
3869 | +static void do_raid(struct work_struct *ws) | |
3870 | +{ | |
3871 | + struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work); | |
3872 | + struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in; | |
3873 | + spinlock_t *lock = &rs->io.in_lock; | |
3874 | + | |
3875 | + /* | |
3876 | + * We always need to end io, so that ios | |
3877 | + * can get errored in case the set failed | |
3878 | + * and the region counters get decremented | |
3879 | + * before we update the region hash states. | |
3880 | + */ | |
3881 | +redo: | |
3882 | + do_endios(rs); | |
3883 | + | |
3884 | + /* | |
3885 | + * Now that we've end io'd, which may have put stripes on | |
3886 | + * the LRU list, we shrink the stripe cache if requested. | |
3887 | + */ | |
3888 | + do_sc_shrink(rs); | |
3889 | + | |
3890 | + /* Update region hash states before we go any further. */ | |
3891 | + dm_rh_update_states(rs->recover.rh, 1); | |
3892 | + | |
3893 | + /* Try to recover regions. */ | |
3894 | + if (RSRecover(rs)) | |
3895 | + do_recovery(rs); | |
3896 | + | |
3897 | + /* More endios -> process. */ | |
3898 | + if (!stripe_endio_empty(&rs->sc)) { | |
3899 | + atomic_inc(rs->stats + S_REDO); | |
3900 | + goto redo; | |
3901 | + } | |
3902 | + | |
3903 | + /* Quickly grab all new ios queued and add them to the work list. */ | |
3904 | + spin_lock_irq(lock); | |
3905 | + bio_list_merge(ios, ios_in); | |
3906 | + bio_list_init(ios_in); | |
3907 | + spin_unlock_irq(lock); | |
3908 | + | |
3909 | + /* Let's assume we're operational most of the time ;-). */ | |
3910 | + if (likely(raid_set_operational(rs))) { | |
3911 | + /* If we got ios, work them into the cache. */ | |
3912 | + if (!bio_list_empty(ios)) { | |
3913 | + do_ios(rs, ios); | |
3914 | + do_unplug(rs); /* Unplug the sets device queues. */ | |
3915 | + } | |
3916 | + | |
3917 | + do_flush(rs); /* Flush any stripes on io list. */ | |
3918 | + do_unplug(rs); /* Unplug the sets device queues. */ | |
3919 | + do_busy_event(rs); /* Check if we got too busy. */ | |
3920 | + | |
3921 | + /* More endios -> process. */ | |
3922 | + if (!stripe_endio_empty(&rs->sc)) { | |
3923 | + atomic_inc(rs->stats + S_REDO); | |
3924 | + goto redo; | |
3925 | + } | |
3926 | + } else | |
3927 | + /* No way to reconstruct data with too many devices failed. */ | |
3928 | + bio_list_fail(rs, NULL, ios); | |
3929 | +} | |
3930 | + | |
3931 | +/* | |
3932 | + * Callback for region hash to dispatch | |
3933 | + * delayed bios queued to recovered regions | |
3934 | + * (Gets called via rh_update_states()). | |
3935 | + */ | |
3936 | +static void dispatch_delayed_bios(void *context, struct bio_list *bl, int dummy) | |
3937 | +{ | |
3938 | + struct raid_set *rs = context; | |
3939 | + struct bio *bio; | |
3940 | + | |
3941 | + /* REMOVEME: decrement pending delayed bios counter. */ | |
3942 | + bio_list_for_each(bio, bl) | |
3943 | + atomic_dec(rs->stats + S_DELAYED_BIOS); | |
3944 | + | |
3945 | + /* Merge region hash private list to work list. */ | |
3946 | + bio_list_merge_head(&rs->io.work, bl); | |
3947 | + bio_list_init(bl); | |
3948 | + ClearRSBandwidth(rs); | |
3949 | +} | |
3950 | + | |
3951 | +/************************************************************* | |
3952 | + * Constructor helpers | |
3953 | + *************************************************************/ | |
3954 | +/* Calculate MB/sec. */ | |
3955 | +static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed) | |
3956 | +{ | |
3957 | + return to_bytes(speed * rs->set.data_devs * | |
3958 | + rs->recover.io_size * HZ >> 10) >> 10; | |
3959 | +} | |
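mbpers() only converts the loops-per-tick figure measured by xor_speed() below into MB/s. A hypothetical example (none of these numbers are defaults from this patch): speed = 100 loops/tick, 4 data devices, a recovery io size of 64 sectors and HZ = 250 give 100 * 4 * 64 * 250 = 6,400,000 sectors/s, and to_bytes(6,400,000 >> 10) >> 10 evaluates to 3125 MB/s.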
3960 | + | |
3961 | +/* | |
3962 | + * Discover fastest xor algorithm and # of chunks combination. | |
3963 | + */ | |
3964 | +/* Calculate speed for algorithm and # of chunks. */ | |
3965 | +static INLINE unsigned xor_speed(struct stripe *stripe) | |
3966 | +{ | |
3967 | + unsigned r = 0; | |
3968 | + unsigned long j; | |
3969 | + | |
3970 | + /* Wait for next tick. */ | |
3971 | + for (j = jiffies; j == jiffies;) | |
3972 | + ; | |
3973 | + | |
3974 | + /* Do xors for a full tick. */ | |
3975 | + for (j = jiffies; j == jiffies;) { | |
3976 | + mb(); | |
3977 | + common_xor(stripe, stripe->io.size, 0, 0); | |
3978 | + mb(); | |
3979 | + r++; | |
3980 | + mb(); | |
3981 | + } | |
3982 | + | |
3983 | + return r; | |
3984 | +} | |
3985 | + | |
3986 | +/* Optimize xor algorithm for this RAID set. */ | |
3987 | +static unsigned xor_optimize(struct raid_set *rs) | |
3988 | +{ | |
3989 | + unsigned chunks_max = 2, speed_max = 0; | |
3990 | + struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL; | |
3991 | + struct stripe *stripe; | |
3992 | + | |
3993 | + BUG_ON(list_empty(&rs->recover.stripes)); | |
3994 | + stripe = list_first_entry(&rs->recover.stripes, struct stripe, | |
3995 | + lists[LIST_RECOVER]); | |
3996 | + | |
3997 | + /* | |
3998 | + * Got to allow io on all chunks, so that | |
3999 | + * xor() will actually work on them. | |
4000 | + */ | |
4001 | + stripe_allow_io(stripe); | |
4002 | + | |
4003 | + /* Try all xor functions. */ | |
4004 | + while (f-- > xor_funcs) { | |
4005 | + unsigned speed; | |
4006 | + | |
4007 | + /* Set actual xor function for common_xor(). */ | |
4008 | + rs->xor.f = f; | |
4009 | + rs->xor.chunks = XOR_CHUNKS_MAX + 1; | |
4010 | + | |
4011 | + while (rs->xor.chunks-- > 2) { | |
4012 | + speed = xor_speed(stripe); | |
4013 | + if (speed > speed_max) { | |
4014 | + speed_max = speed; | |
4015 | + chunks_max = rs->xor.chunks; | |
4016 | + f_max = f; | |
4017 | + } | |
4018 | + } | |
4019 | + } | |
4020 | + | |
4021 | + /* Memorize optimum parameters. */ | |
4022 | + rs->xor.f = f_max; | |
4023 | + rs->xor.chunks = chunks_max; | |
4024 | + return speed_max; | |
4025 | +} | |
4026 | + | |
4027 | +/* | |
4028 | + * Allocate a RAID context (a RAID set) | |
4029 | + */ | |
4030 | +static int | |
4031 | +context_alloc(struct raid_set **raid_set, struct raid_type *raid_type, | |
4032 | + unsigned stripes, unsigned chunk_size, unsigned io_size, | |
4033 | + unsigned recover_io_size, unsigned raid_devs, | |
4034 | + sector_t sectors_per_dev, | |
4035 | + struct dm_target *ti, unsigned dl_parms, char **argv) | |
4036 | +{ | |
4037 | + int r; | |
4038 | + unsigned p; | |
4039 | + size_t len; | |
4040 | + sector_t region_size, ti_len; | |
4041 | + struct raid_set *rs = NULL; | |
4042 | + struct dm_dirty_log *dl; | |
4043 | + struct recover *rec; | |
4044 | + | |
4045 | + /* | |
4046 | + * Create the dirty log | |
4047 | + * | |
4048 | + * We need to change length for the dirty log constructor, | |
4049 | + * because we want the number of regions for all stripes derived | |
4050 | + * from the single device size, so that we can keep region | |
4051 | + * size = 2^^n independent of the number of devices | |
4052 | + */ | |
4053 | + ti_len = ti->len; | |
4054 | + ti->len = sectors_per_dev; | |
4055 | + dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2); | |
4056 | + ti->len = ti_len; | |
4057 | + if (!dl) | |
4058 | + goto bad_dirty_log; | |
4059 | + | |
4060 | + /* Chunk size *must* be smaller than region size. */ | |
4061 | + region_size = dl->type->get_region_size(dl); | |
4062 | + if (chunk_size > region_size) | |
4063 | + goto bad_chunk_size; | |
4064 | + | |
4065 | + /* Recover io size *must* be smaller than region size as well. */ | |
4066 | + if (recover_io_size > region_size) | |
4067 | + goto bad_recover_io_size; | |
4068 | + | |
4069 | + /* Size and allocate the RAID set structure. */ | |
4070 | + len = sizeof(*rs->data) + sizeof(*rs->dev); | |
4071 | + if (array_too_big(sizeof(*rs), len, raid_devs)) | |
4072 | + goto bad_array; | |
4073 | + | |
4074 | + len = sizeof(*rs) + raid_devs * len; | |
4075 | + rs = kzalloc(len, GFP_KERNEL); | |
4076 | + if (!rs) | |
4077 | + goto bad_alloc; | |
4078 | + | |
4079 | + rec = &rs->recover; | |
4080 | + atomic_set(&rs->io.in_process, 0); | |
4081 | + atomic_set(&rs->io.in_process_max, 0); | |
4082 | + rec->io_size = recover_io_size; | |
4083 | + | |
4084 | + /* Pointer to data array. */ | |
4085 | + rs->data = (unsigned long **) | |
4086 | + ((void *) rs->dev + raid_devs * sizeof(*rs->dev)); | |
4087 | + rec->dl = dl; | |
4088 | + rs->set.raid_devs = p = raid_devs; | |
4089 | + rs->set.data_devs = raid_devs - raid_type->parity_devs; | |
4090 | + rs->set.raid_type = raid_type; | |
4091 | + | |
4092 | + /* | |
4093 | + * Set chunk and io size and respective shifts | |
4094 | + * (used to avoid divisions) | |
4095 | + */ | |
4096 | + rs->set.chunk_size = chunk_size; | |
4097 | + rs->set.chunk_mask = chunk_size - 1; | |
4098 | + rs->set.chunk_shift = ffs(chunk_size) - 1; | |
4099 | + | |
4100 | + rs->set.io_size = io_size; | |
4101 | + rs->set.io_mask = io_size - 1; | |
4102 | + rs->set.io_shift = ffs(io_size) - 1; | |
4103 | + rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask; | |
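	/*
	 * With the power-of-2 chunk and io sizes enforced by the constructor,
	 * "sector & chunk_mask" == sector % chunk_size and
	 * "sector >> chunk_shift" == sector / chunk_size, so the hot paths
	 * get by without divisions.
	 */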
4104 | + | |
4105 | + rs->set.pages_per_io = chunk_pages(io_size); | |
4106 | + rs->set.sectors_per_dev = sectors_per_dev; | |
4107 | + | |
4108 | + rs->set.ei = -1; /* Indicate no failed device. */ | |
4109 | + atomic_set(&rs->set.failed_devs, 0); | |
4110 | + | |
4111 | + rs->ti = ti; | |
4112 | + | |
4113 | + atomic_set(rec->io_count + IO_WORK, 0); | |
4114 | + atomic_set(rec->io_count + IO_RECOVER, 0); | |
4115 | + | |
4116 | + /* Initialize io lock and queues. */ | |
4117 | + spin_lock_init(&rs->io.in_lock); | |
4118 | + bio_list_init(&rs->io.in); | |
4119 | + bio_list_init(&rs->io.work); | |
4120 | + | |
4121 | + init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */ | |
4122 | + | |
4123 | + rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size); | |
4124 | + rec->rh = dm_rh_client_create(MAX_RECOVER, dispatch_delayed_bios, rs, | |
4125 | + wake_do_raid, rs, dl, region_size, | |
4126 | + rs->recover.nr_regions); | |
4127 | + if (IS_ERR(rec->rh)) | |
4128 | + goto bad_rh; | |
4129 | + | |
4130 | + /* Initialize stripe cache. */ | |
4131 | + r = sc_init(rs, stripes); | |
4132 | + if (r) | |
4133 | + goto bad_sc; | |
4134 | + | |
4135 | + /* Create dm-io client context. */ | |
4136 | + rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs * | |
4137 | + rs->set.pages_per_io); | |
4138 | + if (IS_ERR(rs->sc.dm_io_client)) | |
4139 | + goto bad_dm_io_client; | |
4140 | + | |
4141 | + /* REMOVEME: statistics. */ | |
4142 | + stats_reset(rs); | |
4143 | + ClearRSDevelStats(rs); /* Disable development status. */ | |
4144 | + | |
4145 | + *raid_set = rs; | |
4146 | + return 0; | |
4147 | + | |
4148 | +bad_dirty_log: | |
4149 | + TI_ERR_RET("Error creating dirty log", -ENOMEM); | |
4150 | + | |
4151 | + | |
4152 | +bad_chunk_size: | |
4153 | + dm_dirty_log_destroy(dl); | |
4154 | + TI_ERR("Chunk size larger than region size"); | |
4155 | + | |
4156 | +bad_recover_io_size: | |
4157 | + dm_dirty_log_destroy(dl); | |
4158 | + TI_ERR("Recover stripe io size larger than region size"); | |
4159 | + | |
4160 | +bad_array: | |
4161 | + dm_dirty_log_destroy(dl); | |
4162 | + TI_ERR("Array too big"); | |
4163 | + | |
4164 | +bad_alloc: | |
4165 | + dm_dirty_log_destroy(dl); | |
4166 | + TI_ERR_RET("Cannot allocate raid context", -ENOMEM); | |
4167 | + | |
4168 | +bad_rh: | |
4169 | + dm_dirty_log_destroy(dl); | |
4170 | + ti->error = DM_MSG_PREFIX "Error creating dirty region hash"; | |
4171 | + goto free_rs; | |
4172 | + | |
4173 | +bad_sc: | |
4174 | + ti->error = DM_MSG_PREFIX "Error creating stripe cache"; | |
4175 | + goto free; | |
4176 | + | |
4177 | +bad_dm_io_client: | |
4178 | + ti->error = DM_MSG_PREFIX "Error allocating dm-io resources"; | |
4179 | +free: | |
4180 | + sc_exit(&rs->sc); | |
4181 | + dm_rh_client_destroy(rec->rh); /* Destroys dirty log as well. */ | |
4183 | +free_rs: | |
4184 | + kfree(rs); | |
4185 | + return -ENOMEM; | |
4186 | +} | |
4187 | + | |
4188 | +/* Free a RAID context (a RAID set). */ | |
4189 | +static void | |
4190 | +context_free(struct raid_set *rs, struct dm_target *ti, unsigned r) | |
4191 | +{ | |
4192 | + while (r--) | |
4193 | + dm_put_device(ti, rs->dev[r].dev); | |
4194 | + | |
4195 | + dm_io_client_destroy(rs->sc.dm_io_client); | |
4196 | + sc_exit(&rs->sc); | |
4197 | + dm_rh_client_destroy(rs->recover.rh); | |
4198 | + dm_dirty_log_destroy(rs->recover.dl); | |
4199 | + kfree(rs); | |
4200 | +} | |
4201 | + | |
4202 | +/* Create work queue and initialize work. */ | |
4203 | +static int rs_workqueue_init(struct raid_set *rs) | |
4204 | +{ | |
4205 | + struct dm_target *ti = rs->ti; | |
4206 | + | |
4207 | + rs->io.wq = create_singlethread_workqueue(DAEMON); | |
4208 | + if (!rs->io.wq) | |
4209 | + TI_ERR_RET("failed to create " DAEMON, -ENOMEM); | |
4210 | + | |
4211 | + INIT_DELAYED_WORK(&rs->io.dws, do_raid); | |
4212 | + return 0; | |
4213 | +} | |
4214 | + | |
4215 | +/* Return pointer to raid_type structure for raid name. */ | |
4216 | +static struct raid_type *get_raid_type(char *name) | |
4217 | +{ | |
4218 | + struct raid_type *r = ARRAY_END(raid_types); | |
4219 | + | |
4220 | + while (r-- > raid_types) { | |
4221 | + if (!strnicmp(STR_LEN(r->name, name))) | |
4222 | + return r; | |
4223 | + } | |
4224 | + | |
4225 | + return NULL; | |
4226 | +} | |
4227 | + | |
4228 | +/* FIXME: factor out to dm core. */ | |
4229 | +static int multiple(sector_t a, sector_t b, sector_t *n) | |
4230 | +{ | |
4231 | + sector_t r = a; | |
4232 | + | |
4233 | + sector_div(r, b); | |
4234 | + *n = r; | |
4235 | + return a == r * b; | |
4236 | +} | |
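A quick illustration with hypothetical values: multiple(1024, 8, &n) sets n = 128 and returns true, while multiple(1025, 8, &n) also sets n = 128 but returns false; the constructor relies on exactly this "divisible without rest" test below.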
4237 | + | |
4238 | +/* Log RAID set information to kernel log. */ | |
4239 | +static void raid_set_log(struct raid_set *rs, unsigned speed) | |
4240 | +{ | |
4241 | + unsigned p; | |
4242 | + char buf[BDEVNAME_SIZE]; | |
4243 | + | |
4244 | + for (p = 0; p < rs->set.raid_devs; p++) | |
4245 | + DMINFO("/dev/%s is raid disk %u", | |
4246 | + bdevname(rs->dev[p].dev->bdev, buf), p); | |
4247 | + | |
4248 | + DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes", | |
4249 | + rs->set.chunk_size, rs->set.io_size, rs->recover.io_size, | |
4250 | + atomic_read(&rs->sc.stripes)); | |
4251 | + DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name, | |
4252 | + rs->xor.chunks, mbpers(rs, speed)); | |
4253 | + DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr, | |
4254 | + rs->set.data_devs, rs->set.raid_devs); | |
4255 | +} | |
4256 | + | |
4257 | +/* Get all devices and offsets. */ | |
4258 | +static int | |
4259 | +dev_parms(struct dm_target *ti, struct raid_set *rs, | |
4260 | + char **argv, int *p) | |
4261 | +{ | |
4262 | + for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) { | |
4263 | + int r; | |
4264 | + unsigned long long tmp; | |
4265 | + struct raid_dev *dev = rs->dev + *p; | |
4266 | + union dev_lookup dl = {.dev = dev }; | |
4267 | + | |
4268 | + /* Get offset and device. */ | |
4269 | + r = sscanf(argv[1], "%llu", &tmp); | |
4270 | + if (r != 1) | |
4271 | + TI_ERR("Invalid RAID device offset parameter"); | |
4272 | + | |
4273 | + dev->start = tmp; | |
4274 | + r = dm_get_device(ti, argv[0], dev->start, | |
4275 | + rs->set.sectors_per_dev, | |
4276 | + dm_table_get_mode(ti->table), &dev->dev); | |
4277 | + if (r) | |
4278 | + TI_ERR_RET("RAID device lookup failure", r); | |
4279 | + | |
4280 | + r = raid_dev_lookup(rs, bynumber, &dl); | |
4281 | + if (r != -ENODEV && r < *p) { | |
4282 | + (*p)++; /* Ensure dm_put_device() on actual device. */ | |
4283 | + TI_ERR_RET("Duplicate RAID device", -ENXIO); | |
4284 | + } | |
4285 | + } | |
4286 | + | |
4287 | + return 0; | |
4288 | +} | |
4289 | + | |
4290 | +/* Set recovery bandwidth. */ | |
4291 | +static INLINE void | |
4292 | +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth) | |
4293 | +{ | |
4294 | + rs->recover.bandwidth = bandwidth; | |
4295 | + rs->recover.bandwidth_work = 100 / bandwidth; | |
4296 | +} | |
4297 | + | |
4298 | +/* Handle variable number of RAID parameters. */ | |
4299 | +static int | |
4300 | +raid_variable_parms(struct dm_target *ti, char **argv, | |
4301 | + unsigned i, int *raid_parms, | |
4302 | + int *chunk_size, int *chunk_size_parm, | |
4303 | + int *stripes, int *stripes_parm, | |
4304 | + int *io_size, int *io_size_parm, | |
4305 | + int *recover_io_size, int *recover_io_size_parm, | |
4306 | + int *bandwidth, int *bandwidth_parm) | |
4307 | +{ | |
4308 | + /* Fetch # of variable raid parameters. */ | |
4309 | + if (sscanf(argv[i++], "%d", raid_parms) != 1 || | |
4310 | + !range_ok(*raid_parms, 0, 5)) | |
4311 | + TI_ERR("Bad variable raid parameters number"); | |
4312 | + | |
4313 | + if (*raid_parms) { | |
4314 | + /* | |
4315 | + * If we've got variable RAID parameters, | |
4316 | + * chunk size is the first one | |
4317 | + */ | |
4318 | + if (sscanf(argv[i++], "%d", chunk_size) != 1 || | |
4319 | + (*chunk_size != -1 && | |
4320 | + (!POWER_OF_2(*chunk_size) || | |
4321 | + !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX)))) | |
4322 | + TI_ERR("Invalid chunk size; must be 2^^n and <= 16384"); | |
4323 | + | |
4324 | + *chunk_size_parm = *chunk_size; | |
4325 | + if (*chunk_size == -1) | |
4326 | + *chunk_size = CHUNK_SIZE; | |
4327 | + | |
4328 | + /* | |
4329 | + * In case we've got 2 or more variable raid | |
4330 | + * parameters, the number of stripes is the second one | |
4331 | + */ | |
4332 | + if (*raid_parms > 1) { | |
4333 | + if (sscanf(argv[i++], "%d", stripes) != 1 || | |
4334 | + (*stripes != -1 && | |
4335 | + !range_ok(*stripes, STRIPES_MIN, | |
4336 | + STRIPES_MAX))) | |
4337 | + TI_ERR("Invalid number of stripes: must " | |
4338 | + "be >= 8 and <= 8192"); | |
4339 | + } | |
4340 | + | |
4341 | + *stripes_parm = *stripes; | |
4342 | + if (*stripes == -1) | |
4343 | + *stripes = STRIPES; | |
4344 | + | |
4345 | + /* | |
4346 | + * In case we've got 3 or more variable raid | |
4347 | + * parameters, the io size is the third one. | |
4348 | + */ | |
4349 | + if (*raid_parms > 2) { | |
4350 | + if (sscanf(argv[i++], "%d", io_size) != 1 || | |
4351 | + (*io_size != -1 && | |
4352 | + (!POWER_OF_2(*io_size) || | |
4353 | + !range_ok(*io_size, IO_SIZE_MIN, | |
4354 | + min(BIO_MAX_SECTORS / 2, | |
4355 | + *chunk_size))))) | |
4356 | + TI_ERR("Invalid io size; must " | |
4357 | + "be 2^^n and less than or equal to " | |
4358 | + "min(BIO_MAX_SECTORS/2, chunk size)"); | |
4359 | + } else | |
4360 | + *io_size = *chunk_size; | |
4361 | + | |
4362 | + *io_size_parm = *io_size; | |
4363 | + if (*io_size == -1) | |
4364 | + *io_size = *chunk_size; | |
4365 | + | |
4366 | + /* | |
4367 | + * In case we've got 4 variable raid parameters, | |
4368 | + * the recovery stripe io_size is the fourth one | |
4369 | + */ | |
4370 | + if (*raid_parms > 3) { | |
4371 | + if (sscanf(argv[i++], "%d", recover_io_size) != 1 || | |
4372 | + (*recover_io_size != -1 && | |
4373 | + (!POWER_OF_2(*recover_io_size) || | |
4374 | + !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN, | |
4375 | + BIO_MAX_SECTORS / 2)))) | |
4376 | + TI_ERR("Invalid recovery io size; must be " | |
4377 | + "2^^n and less than or equal to BIO_MAX_SECTORS/2"); | |
4378 | + } | |
4379 | + | |
4380 | + *recover_io_size_parm = *recover_io_size; | |
4381 | + if (*recover_io_size == -1) | |
4382 | + *recover_io_size = RECOVER_IO_SIZE; | |
4383 | + | |
4384 | + /* | |
4385 | + * In case we've got 5 variable raid parameters, | |
4386 | + * the recovery io bandwidth is the fifth one | |
4387 | + */ | |
4388 | + if (*raid_parms > 4) { | |
4389 | + if (sscanf(argv[i++], "%d", bandwidth) != 1 || | |
4390 | + (*bandwidth != -1 && | |
4391 | + !range_ok(*bandwidth, BANDWIDTH_MIN, | |
4392 | + BANDWIDTH_MAX))) | |
4393 | + TI_ERR("Invalid recovery bandwidth " | |
4394 | + "percentage; must be > 0 and <= 100"); | |
4395 | + } | |
4396 | + | |
4397 | + *bandwidth_parm = *bandwidth; | |
4398 | + if (*bandwidth == -1) | |
4399 | + *bandwidth = BANDWIDTH; | |
4400 | + } | |
4401 | + | |
4402 | + return 0; | |
4403 | +} | |
4404 | + | |
4405 | +/* Parse optional locking parameters. */ | |
4406 | +static int | |
4407 | +raid_locking_parms(struct dm_target *ti, char **argv, | |
4408 | + unsigned i, int *locking_parms, | |
4409 | + struct dm_raid45_locking_type **locking_type) | |
4410 | +{ | |
4411 | + *locking_parms = 0; | |
4412 | + *locking_type = &locking_none; | |
4413 | + | |
4414 | + if (!strnicmp(argv[i], "none", strlen(argv[i]))) | |
4415 | + *locking_parms = 1; | |
4416 | + else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) { | |
4417 | + *locking_type = &locking_none; | |
4418 | + *locking_parms = 2; | |
4419 | + } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) { | |
4420 | + *locking_type = &locking_cluster; | |
4421 | + /* FIXME: namespace. */ | |
4422 | + *locking_parms = 3; | |
4423 | + } | |
4424 | + | |
4425 | + return *locking_parms == 1 ? -EINVAL : 0; | |
4426 | +} | |
4427 | + | |
4428 | +/* Set backing device information properties of RAID set. */ | |
4429 | +static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks) | |
4430 | +{ | |
4431 | + unsigned p, ra_pages; | |
4432 | + struct mapped_device *md = dm_table_get_md(rs->ti->table); | |
4433 | + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info; | |
4434 | + | |
4435 | + /* Set read-ahead for the RAID set and the component devices. */ | |
4436 | + bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size); | |
4437 | + ra_pages = chunks * chunk_pages(rs->set.io_size); | |
4438 | + for (p = rs->set.raid_devs; p--; ) { | |
4439 | + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev); | |
4440 | + | |
4441 | + q->backing_dev_info.ra_pages = ra_pages; | |
4442 | + } | |
4443 | + | |
4444 | + /* Set congested function and data. */ | |
4445 | + bdi->congested_fn = raid_set_congested; | |
4446 | + bdi->congested_data = rs; | |
4447 | + | |
4448 | + dm_put(md); | |
4449 | +} | |
4450 | + | |
4451 | +/* Get backing device information properties of RAID set. */ | |
4452 | +static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks) | |
4453 | +{ | |
4454 | + struct mapped_device *md = dm_table_get_md(rs->ti->table); | |
4455 | + | |
4456 | + *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages | |
4457 | + / stripe_pages(rs, rs->set.io_size); | |
4458 | + *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages | |
4459 | + / chunk_pages(rs->set.io_size); | |
4460 | + | |
4461 | + dm_put(md); | |
4462 | +} | |
4463 | + | |
4464 | +/* | |
4465 | + * Construct a RAID4/5 mapping: | |
4466 | + * | |
4467 | + * log_type #log_params <log_params> \ | |
4468 | + * raid_type [#parity_dev] #raid_variable_params <raid_params> \ | |
4469 | + * [locking "none"/"cluster"] | |
4470 | + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,} | |
4471 | + * | |
4472 | + * log_type = "core"/"disk", | |
4473 | + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only) | |
4474 | + * log_params = [dirty_log_path] region_size [[no]sync] | |
4475 | + * | |
4476 | + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs" | |
4477 | + * | |
4478 | + * #parity_dev = N if raid_type = "raid4" | |
4479 | + * o N = -1: pick default = last device | |
4480 | + * o N >= 0 and < #raid_devs: parity device index | |
4481 | + * | |
4482 | + * #raid_variable_params = 0-5; raid_params (-1 = default): | |
4483 | + * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]] | |
4484 | + * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8 | |
4485 | + * and <= CHUNK_SIZE_MAX) | |
4486 | + * o #stripes is number of stripes allocated to stripe cache | |
4487 | + * (must be > 1 and < STRIPES_MAX) | |
4488 | + * o io_size (io unit size per device in sectors; must be 2^^n and > 8) | |
4489 | + * o recover_io_size (io unit size per device for recovery in sectors; | |
4490 | + *    must be 2^^n, > SECTORS_PER_PAGE and <= region_size) | |
4491 | + * o %recovery_bandwidth is the maximum amount spent for recovery during | |
4492 | + * application io (1-100%) | |
4493 | + * If raid_variable_params = 0, defaults will be used. | |
4494 | + * Any raid_variable_param can be set to -1 to apply a default | |
4495 | + * | |
4496 | + * #raid_devs = N (N >= 3) | |
4497 | + * | |
4498 | + * #dev_to_initialize = N | |
4499 | + * -1: initialize parity on all devices | |
4500 | + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction | |
4501 | + * of a failed device's content after replacement | |
4502 | + * | |
4503 | + * <dev_path> = device_path (eg, /dev/sdd1) | |
4504 | + * <offset> = begin at offset on <dev_path> | |
4505 | + * | |
4506 | + */ | |
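To make the layout above concrete, a hypothetical table line for a 3-disk raid5_la set with a core dirty log and all RAID parameters defaulted might look as follows (device names, sizes and the "raid45" target name are illustrative assumptions, not taken from this patch):

    0 2097152 raid45 core 2 8192 nosync raid5_la 0 3 -1 /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0

That is: "core" dirty log with 2 log params (region size 8192, nosync), raid type raid5_la, 0 variable raid params, 3 raid devices, no device forced to initialize (-1), followed by three <dev_path> <offset> pairs.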
4507 | +#define MIN_PARMS 13 | |
4508 | +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |
4509 | +{ | |
4510 | + int bandwidth = BANDWIDTH, bandwidth_parm = -1, | |
4511 | + chunk_size = CHUNK_SIZE, chunk_size_parm = -1, | |
4512 | + dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1, | |
4513 | + i, io_size = IO_SIZE, io_size_parm = -1, | |
4514 | + r, raid_devs, raid_parms, | |
4515 | + recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1, | |
4516 | + stripes = STRIPES, stripes_parm = -1; | |
4517 | + unsigned speed; | |
4518 | + sector_t tmp, sectors_per_dev; | |
4519 | + struct dm_raid45_locking_type *locking; | |
4520 | + struct raid_set *rs; | |
4521 | + struct raid_type *raid_type; | |
4522 | + | |
4523 | + /* Ensure minimum number of parameters. */ | |
4524 | + if (argc < MIN_PARMS) | |
4525 | + TI_ERR("Not enough parameters"); | |
4526 | + | |
4527 | + /* Fetch # of dirty log parameters. */ | |
4528 | + if (sscanf(argv[1], "%d", &dl_parms) != 1 | |
4529 | + || !range_ok(dl_parms, 1, 4711)) | |
4530 | + TI_ERR("Bad dirty log parameters number"); | |
4531 | + | |
4532 | + /* Check raid_type. */ | |
4533 | + raid_type = get_raid_type(argv[dl_parms + 2]); | |
4534 | + if (!raid_type) | |
4535 | + TI_ERR("Bad raid type"); | |
4536 | + | |
4537 | + /* In case of RAID4, parity drive is selectable. */ | |
4538 | + parity_parm = !!(raid_type->level == raid4); | |
4539 | + | |
4540 | + /* Handle variable number of RAID parameters. */ | |
4541 | + r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3, | |
4542 | + &raid_parms, | |
4543 | + &chunk_size, &chunk_size_parm, | |
4544 | + &stripes, &stripes_parm, | |
4545 | + &io_size, &io_size_parm, | |
4546 | + &recover_io_size, &recover_io_size_parm, | |
4547 | + &bandwidth, &bandwidth_parm); | |
4548 | + if (r) | |
4549 | + return r; | |
4550 | + | |
4551 | + r = raid_locking_parms(ti, argv, | |
4552 | + dl_parms + parity_parm + raid_parms + 4, | |
4553 | + &locking_parms, &locking); | |
4554 | + if (r) | |
4555 | + return r; | |
4556 | + | |
4557 | + /* # of raid devices. */ | |
4558 | + i = dl_parms + parity_parm + raid_parms + locking_parms + 4; | |
4559 | + if (sscanf(argv[i], "%d", &raid_devs) != 1 || | |
4560 | + raid_devs < raid_type->minimal_devs) | |
4561 | + TI_ERR("Invalid number of raid devices"); | |
4562 | + | |
4563 | + /* In case of RAID4, check parity drive index is in limits. */ | |
4564 | + if (raid_type->level == raid4) { | |
4565 | + /* Fetch index of parity device. */ | |
4566 | + if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 || | |
4567 | + !range_ok(pi, 0, raid_devs - 1)) | |
4568 | + TI_ERR("Invalid RAID4 parity device index"); | |
4569 | + } | |
4570 | + | |
4571 | + /* | |
4572 | + * Index of device to initialize starts at 0 | |
4573 | + * | |
4574 | + * o -1 -> don't initialize a particular device, | |
4575 | + * o 0..raid_devs-1 -> initialize respective device | |
4576 | + * (used for reconstruction of a replaced device) | |
4577 | + */ | |
4578 | + if (sscanf | |
4579 | + (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5], | |
4580 | + "%d", &dev_to_init) != 1 | |
4581 | + || !range_ok(dev_to_init, -1, raid_devs - 1)) | |
4582 | + TI_ERR("Invalid number for raid device to initialize"); | |
4583 | + | |
4584 | + /* Check # of raid device arguments. */ | |
4585 | + if (argc - dl_parms - parity_parm - raid_parms - 6 != | |
4586 | + 2 * raid_devs) | |
4587 | + TI_ERR("Wrong number of raid device/offset arguments"); | |
4588 | + | |
4589 | + /* | |
4590 | + * Check that the table length is divisible | |
4591 | + * w/o rest by (raid_devs - parity_devs) | |
4592 | + */ | |
4593 | + if (!multiple(ti->len, raid_devs - raid_type->parity_devs, | |
4594 | + &sectors_per_dev)) | |
4595 | + TI_ERR | |
4596 | + ("Target length not divisible by number of data devices"); | |
4597 | + | |
4598 | + /* | |
4599 | + * Check that the device size is | |
4600 | + * divisible w/o rest by chunk size | |
4601 | + */ | |
4602 | + if (!multiple(sectors_per_dev, chunk_size, &tmp)) | |
4603 | + TI_ERR("Device length not divisible by chunk_size"); | |
4604 | + | |
4605 | + /**************************************************************** | |
4606 | + * Now that we checked the constructor arguments -> | |
4607 | + * let's allocate the RAID set | |
4608 | + ****************************************************************/ | |
4609 | + r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size, | |
4610 | + recover_io_size, raid_devs, sectors_per_dev, | |
4611 | + ti, dl_parms, argv); | |
4612 | + if (r) | |
4613 | + return r; | |
4614 | + | |
4615 | + /* | |
4616 | + * Set these here in order to avoid passing | |
4617 | + * too many arguments to context_alloc() | |
4618 | + */ | |
4619 | + rs->set.dev_to_init_parm = dev_to_init; | |
4620 | + rs->set.dev_to_init = dev_to_init; | |
4621 | + rs->set.pi_parm = pi; | |
4622 | + rs->set.pi = (pi == -1) ? rs->set.data_devs : pi; | |
4623 | + rs->set.raid_parms = raid_parms; | |
4624 | + rs->set.chunk_size_parm = chunk_size_parm; | |
4625 | + rs->set.io_size_parm = io_size_parm; | |
4626 | + rs->sc.stripes_parm = stripes_parm; | |
4627 | + rs->recover.io_size_parm = recover_io_size_parm; | |
4628 | + rs->recover.bandwidth_parm = bandwidth_parm; | |
4629 | + recover_set_bandwidth(rs, bandwidth); | |
4630 | + | |
4631 | + /* Use locking type to lock stripe access. */ | |
4632 | + rs->locking = locking; | |
4633 | + | |
4634 | + /* Get the device/offset tuples. */ | |
4635 | + argv += dl_parms + 6 + parity_parm + raid_parms; | |
4636 | + r = dev_parms(ti, rs, argv, &i); | |
4637 | + if (r) | |
4638 | + goto err; | |
4639 | + | |
4640 | + /* Initialize recovery. */ | |
4641 | + rs->recover.start_jiffies = jiffies; | |
4642 | + rs->recover.end_jiffies = 0; | |
4643 | + recovery_region_reset(rs); | |
4644 | + | |
4645 | + /* Allow for recovery of any nosync regions. */ | |
4646 | + SetRSRecover(rs); | |
4647 | + | |
4648 | + /* Set backing device information (eg. read ahead). */ | |
4649 | + rs_set_bdi(rs, chunk_size * 2, io_size * 4); | |
4650 | + SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */ | |
4651 | + | |
4652 | + speed = xor_optimize(rs); /* Select best xor algorithm. */ | |
4653 | + | |
4654 | + /* Initialize work queue to handle this RAID set's io. */ | |
4655 | + r = rs_workqueue_init(rs); | |
4656 | + if (r) | |
4657 | + goto err; | |
4658 | + | |
4659 | + raid_set_log(rs, speed); /* Log information about RAID set. */ | |
4660 | + | |
4661 | + /* | |
4662 | + * Make sure that dm core only hands maximum io size | |
4663 | + * length down and pays attention to io boundaries. | |
4664 | + */ | |
4665 | + ti->split_io = rs->set.io_size; | |
4666 | + ti->private = rs; | |
4667 | + return 0; | |
4668 | + | |
4669 | +err: | |
4670 | + context_free(rs, ti, i); | |
4671 | + return r; | |
4672 | +} | |
4673 | + | |
4674 | +/* | |
4675 | + * Destruct a raid mapping | |
4676 | + */ | |
4677 | +static void raid_dtr(struct dm_target *ti) | |
4678 | +{ | |
4679 | + struct raid_set *rs = ti->private; | |
4680 | + | |
4681 | + /* Indicate recovery end so that ios in flight drain. */ | |
4682 | + ClearRSRecover(rs); | |
4683 | + | |
4684 | + wake_do_raid(rs); /* Wake daemon. */ | |
4685 | + wait_ios(rs); /* Wait for any io still being processed. */ | |
4686 | + destroy_workqueue(rs->io.wq); | |
4687 | + context_free(rs, ti, rs->set.raid_devs); | |
4688 | +} | |
4689 | + | |
4690 | +/* Queues ios to RAID sets. */ | |
4691 | +static inline void queue_bio(struct raid_set *rs, struct bio *bio) | |
4692 | +{ | |
4693 | + int wake; | |
4694 | + struct bio_list *in = &rs->io.in; | |
4695 | + spinlock_t *in_lock = &rs->io.in_lock; | |
4696 | + | |
4697 | + spin_lock_irq(in_lock); | |
4698 | + wake = bio_list_empty(in); | |
4699 | + bio_list_add(in, bio); | |
4700 | + spin_unlock_irq(in_lock); | |
4701 | + | |
4702 | + /* Wake daemon if input list was empty. */ | |
4703 | + if (wake) | |
4704 | + wake_do_raid(rs); | |
4705 | +} | |
4706 | + | |
4707 | +/* Raid mapping function. */ | |
4708 | +static int raid_map(struct dm_target *ti, struct bio *bio, | |
4709 | + union map_info *map_context) | |
4710 | +{ | |
4711 | + /* I don't want to waste stripe cache capacity. */ | |
4712 | + if (bio_rw(bio) == READA) | |
4713 | + return -EIO; | |
4714 | + else { | |
4715 | + struct raid_set *rs = ti->private; | |
4716 | + | |
4717 | + /* REMOVEME: statistics. */ | |
4718 | + atomic_inc(rs->stats + | |
4719 | + (bio_data_dir(bio) == WRITE ? | |
4720 | + S_BIOS_WRITE : S_BIOS_READ)); | |
4721 | + | |
4722 | + /* | |
4723 | +		 * Take an io reference that we wait on to drop | 
4724 | +		 * to zero on device suspension/destruction. | 
4725 | + */ | |
4726 | + io_get(rs); | |
4727 | + bio->bi_sector -= ti->begin; /* Remap sector. */ | |
4728 | + queue_bio(rs, bio); /* Queue to the daemon. */ | |
4729 | + return DM_MAPIO_SUBMITTED; /* Handle later. */ | |
4730 | + } | |
4731 | +} | |
4732 | + | |
4733 | +/* Device suspend. */ | |
4734 | +static void raid_postsuspend(struct dm_target *ti) | |
4735 | +{ | |
4736 | + struct raid_set *rs = ti->private; | |
4737 | + struct dm_dirty_log *dl = rs->recover.dl; | |
4738 | + | |
4739 | + SetRSSuspended(rs); | |
4740 | + | |
4741 | + if (RSRecover(rs)) | |
4742 | + dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */ | |
4743 | + else | |
4744 | + wake_do_raid(rs); | |
4745 | + | |
4746 | + wait_ios(rs); /* Wait for completion of all ios being processed. */ | |
4747 | + if (dl->type->postsuspend && dl->type->postsuspend(dl)) | |
4748 | + /* Suspend dirty log. */ | |
4749 | + /* FIXME: need better error handling. */ | |
4750 | + DMWARN("log suspend failed"); | |
4751 | +} | |
4752 | + | |
4753 | +/* Device resume. */ | |
4754 | +static void raid_resume(struct dm_target *ti) | |
4755 | +{ | |
4756 | + struct raid_set *rs = ti->private; | |
4757 | + struct recover *rec = &rs->recover; | |
4758 | + struct dm_dirty_log *dl = rec->dl; | |
4759 | + | |
4760 | + if (dl->type->resume && dl->type->resume(dl)) | |
4761 | + /* Resume dirty log. */ | |
4762 | + /* FIXME: need better error handling. */ | |
4763 | + DMWARN("log resume failed"); | |
4764 | + | |
4765 | + rec->nr_regions_to_recover = | |
4766 | + rec->nr_regions - dl->type->get_sync_count(dl); | |
4767 | + | |
4768 | + ClearRSSuspended(rs); | |
4769 | + | |
4770 | + /* Reset any unfinished recovery. */ | |
4771 | + if (RSRecover(rs)) { | |
4772 | + recovery_region_reset(rs); | |
4773 | + dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */ | |
4774 | + } else | |
4775 | + wake_do_raid(rs); | |
4776 | +} | |
4777 | + | |
4778 | +static INLINE unsigned sc_size(struct raid_set *rs) | |
4779 | +{ | |
4780 | + return to_sector(atomic_read(&rs->sc.stripes) * | |
4781 | + (sizeof(struct stripe) + | |
4782 | + (sizeof(struct stripe_set) + | |
4783 | + (sizeof(struct page_list) + | |
4784 | + to_bytes(rs->set.io_size) * | |
4785 | + rs->set.raid_devs)) + | |
4786 | + (rs->recover. | |
4787 | + end_jiffies ? 0 : to_bytes(rs->set.raid_devs * | |
4788 | + rs->recover. | |
4789 | + io_size)))); | |
4790 | +} | |
4791 | + | |
4792 | +/* REMOVEME: status output for development. */ | |
4793 | +static void | |
4794 | +raid_devel_stats(struct dm_target *ti, char *result, | |
4795 | + unsigned *size, unsigned maxlen) | |
4796 | +{ | |
4797 | + unsigned chunks, stripes, sz = *size; | |
4798 | + unsigned long j; | |
4799 | + char buf[BDEVNAME_SIZE], *p; | |
4800 | + struct stats_map *sm, *sm_end = ARRAY_END(stats_map); | |
4801 | + struct raid_set *rs = ti->private; | |
4802 | + struct recover *rec = &rs->recover; | |
4803 | + struct timespec ts; | |
4804 | + | |
4805 | + DMEMIT("%s ", version); | |
4806 | + DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process)); | |
4807 | + DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max)); | |
4808 | + | |
4809 | + for (sm = stats_map; sm < sm_end; sm++) | |
4810 | + DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type)); | |
4811 | + | |
4812 | + DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off"); | |
4813 | + DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size, | |
4814 | + atomic_read(&rs->sc.stripes), rs->sc.hash.buckets, | |
4815 | + sc_size(rs)); | |
4816 | + | |
4817 | + j = (rec->end_jiffies ? rec->end_jiffies : jiffies) - | |
4818 | + rec->start_jiffies; | |
4819 | + jiffies_to_timespec(j, &ts); | |
4820 | + sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec); | |
4821 | + p = strchr(buf, '.'); | |
4822 | + p[3] = 0; | |
4823 | + | |
4824 | + DMEMIT("rg=%llu%s/%llu/%llu/%u %s ", | |
4825 | + (unsigned long long) rec->nr_regions_recovered, | |
4826 | + RSRegionGet(rs) ? "+" : "", | |
4827 | + (unsigned long long) rec->nr_regions_to_recover, | |
4828 | + (unsigned long long) rec->nr_regions, rec->bandwidth, buf); | |
4829 | + | |
4830 | + rs_get_ra(rs, &stripes, &chunks); | |
4831 | + DMEMIT("ra=%u/%u ", stripes, chunks); | |
4832 | + | |
4833 | + *size = sz; | |
4834 | +} | |
4835 | + | |
4836 | +static int | |
4837 | +raid_status(struct dm_target *ti, status_type_t type, | |
4838 | + char *result, unsigned maxlen) | |
4839 | +{ | |
4840 | + unsigned i, sz = 0; | |
4841 | + char buf[BDEVNAME_SIZE]; | |
4842 | + struct raid_set *rs = ti->private; | |
4843 | + | |
4844 | + switch (type) { | |
4845 | + case STATUSTYPE_INFO: | |
4846 | + /* REMOVEME: statistics. */ | |
4847 | + if (RSDevelStats(rs)) | |
4848 | + raid_devel_stats(ti, result, &sz, maxlen); | |
4849 | + | |
4850 | + DMEMIT("%u ", rs->set.raid_devs); | |
4851 | + | |
4852 | + for (i = 0; i < rs->set.raid_devs; i++) | |
4853 | + DMEMIT("%s ", | |
4854 | + format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev)); | |
4855 | + | |
4856 | + DMEMIT("1 "); | |
4857 | + for (i = 0; i < rs->set.raid_devs; i++) { | |
4858 | + DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D'); | |
4859 | + | |
4860 | + if (rs->set.raid_type->level == raid4 && | |
4861 | + i == rs->set.pi) | |
4862 | + DMEMIT("p"); | |
4863 | + | |
4864 | + if (rs->set.dev_to_init == i) | |
4865 | + DMEMIT("i"); | |
4866 | + } | |
4867 | + | |
4868 | + break; | |
4869 | + | |
4870 | + case STATUSTYPE_TABLE: | |
4871 | + sz = rs->recover.dl->type->status(rs->recover.dl, type, | |
4872 | + result, maxlen); | |
4873 | + DMEMIT("%s %u ", rs->set.raid_type->name, | |
4874 | + rs->set.raid_parms); | |
4875 | + | |
4876 | + if (rs->set.raid_type->level == raid4) | |
4877 | + DMEMIT("%d ", rs->set.pi_parm); | |
4878 | + | |
4879 | + if (rs->set.raid_parms) | |
4880 | + DMEMIT("%d ", rs->set.chunk_size_parm); | |
4881 | + | |
4882 | + if (rs->set.raid_parms > 1) | |
4883 | + DMEMIT("%d ", rs->sc.stripes_parm); | |
4884 | + | |
4885 | + if (rs->set.raid_parms > 2) | |
4886 | + DMEMIT("%d ", rs->set.io_size_parm); | |
4887 | + | |
4888 | + if (rs->set.raid_parms > 3) | |
4889 | + DMEMIT("%d ", rs->recover.io_size_parm); | |
4890 | + | |
4891 | + if (rs->set.raid_parms > 4) | |
4892 | + DMEMIT("%d ", rs->recover.bandwidth_parm); | |
4893 | + | |
4894 | + DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init); | |
4895 | + | |
4896 | + for (i = 0; i < rs->set.raid_devs; i++) | |
4897 | + DMEMIT("%s %llu ", | |
4898 | + format_dev_t(buf, | |
4899 | + rs->dev[i].dev->bdev->bd_dev), | |
4900 | + (unsigned long long) rs->dev[i].start); | |
4901 | + } | |
4902 | + | |
4903 | + return 0; | |
4904 | +} | |
4905 | + | |
4906 | +/* | |
4907 | + * Message interface | |
4908 | + */ | |
4909 | +enum raid_msg_actions { | |
4910 | + act_bw, /* Recovery bandwidth switch. */ | |
4911 | + act_dev, /* Device failure switch. */ | |
4912 | + act_overwrite, /* Stripe overwrite check. */ | |
4913 | + act_read_ahead, /* Set read ahead. */ | |
4914 | + act_stats, /* Development statistics switch. */ | |
4915 | + act_sc, /* Stripe cache switch. */ | |
4916 | + | |
4917 | + act_on, /* Set entity on. */ | |
4918 | + act_off, /* Set entity off. */ | |
4919 | + act_reset, /* Reset entity. */ | |
4920 | + | |
4921 | + act_set = act_on, /* Set # absolute. */ | |
4922 | + act_grow = act_off, /* Grow # by an amount. */ | |
4923 | + act_shrink = act_reset, /* Shrink # by an amount. */ | |
4924 | +}; | |
4925 | + | |
4926 | +/* Turn a delta to absolute. */ | |
4927 | +static int _absolute(unsigned long action, int act, int r) | |
4928 | +{ | |
4929 | + /* Make delta absolute. */ | |
4930 | + if (test_bit(act_set, &action)) | |
4931 | + ; | |
4932 | + else if (test_bit(act_grow, &action)) | |
4933 | + r += act; | |
4934 | + else if (test_bit(act_shrink, &action)) | |
4935 | + r = act - r; | |
4936 | + else | |
4937 | + r = -EINVAL; | |
4938 | + | |
4939 | + return r; | |
4940 | +} | |
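
[Editor's note] The delta-to-absolute handling above is easiest to see with concrete numbers; a minimal standalone userspace sketch of the same semantics (names are illustrative, not from the patch):

	#include <stdio.h>

	enum { SET, GROW, SHRINK };	/* stand-ins for act_set/act_grow/act_shrink */

	/* 'set' takes the value as-is, 'grow' adds the delta to the current value,
	 * 'shrink' subtracts it; anything else is invalid. */
	static int to_absolute(int action, int current, int value)
	{
		switch (action) {
		case SET:	return value;
		case GROW:	return current + value;
		case SHRINK:	return current - value;
		}
		return -1;
	}

	int main(void)
	{
		int bandwidth = 50;	/* assumed current recovery bandwidth in % */

		printf("%d\n", to_absolute(GROW, bandwidth, 10));	/* 60 */
		printf("%d\n", to_absolute(SHRINK, bandwidth, 10));	/* 40 */
		return 0;
	}
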
4941 | + | |
4942 | + /* Change recovery io bandwidth. */ | |
4943 | +static int bandwidth_change(struct dm_msg *msg, void *context) | |
4944 | +{ | |
4945 | + struct raid_set *rs = context; | |
4946 | + int act = rs->recover.bandwidth; | |
4947 | + int bandwidth = DM_MSG_INT_ARG(msg); | |
4948 | + | |
4949 | + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { | |
4950 | + /* Make delta bandwidth absolute. */ | |
4951 | + bandwidth = _absolute(msg->action, act, bandwidth); | |
4952 | + | |
4953 | + /* Check range. */ | |
4954 | + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { | |
4955 | + recover_set_bandwidth(rs, bandwidth); | |
4956 | + return 0; | |
4957 | + } | |
4958 | + } | |
4959 | + | |
4960 | + set_bit(dm_msg_ret_arg, &msg->ret); | |
4961 | + set_bit(dm_msg_ret_inval, &msg->ret); | |
4962 | + return -EINVAL; | |
4963 | +} | |
4964 | + | |
4965 | +/* Change state of a device (running/offline). */ | |
4966 | +/* FIXME: this only works while recovering. */ | 
4967 | +static int device_state(struct dm_msg *msg, void *context) | |
4968 | +{ | |
4969 | + int r; | |
4970 | + const char *str = "is already "; | |
4971 | + union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) }; | |
4972 | + struct raid_set *rs = context; | |
4973 | + | |
4974 | + r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ? | |
4975 | + bymajmin : byname, &dl); | |
4976 | + if (r == -ENODEV) { | |
4977 | + DMERR("device %s is no member of this set", dl.dev_name); | |
4978 | + return r; | |
4979 | + } | |
4980 | + | |
4981 | + if (test_bit(act_off, &msg->action)) { | |
4982 | + if (dev_operational(rs, r)) | |
4983 | + str = ""; | |
4984 | + } else if (!dev_operational(rs, r)) | |
4985 | + str = ""; | |
4986 | + | |
4987 | + DMINFO("/dev/%s %s%s", dl.dev_name, str, | |
4988 | + test_bit(act_off, &msg->action) ? "offline" : "running"); | |
4989 | + | |
4990 | + return test_bit(act_off, &msg->action) ? | |
4991 | + raid_set_check_and_degrade(rs, NULL, r) : | |
4992 | + raid_set_check_and_upgrade(rs, r); | |
4993 | +} | |
4994 | + | |
4995 | +/* Set/reset development feature flags. */ | |
4996 | +static int devel_flags(struct dm_msg *msg, void *context) | |
4997 | +{ | |
4998 | + struct raid_set *rs = context; | |
4999 | + | |
5000 | + if (test_bit(act_on, &msg->action)) | |
5001 | + return test_and_set_bit(msg->spec->parm, | |
5002 | + &rs->io.flags) ? -EPERM : 0; | |
5003 | + else if (test_bit(act_off, &msg->action)) | |
5004 | + return test_and_clear_bit(msg->spec->parm, | |
5005 | + &rs->io.flags) ? 0 : -EPERM; | |
5006 | + else if (test_bit(act_reset, &msg->action)) { | |
5007 | + if (test_bit(act_stats, &msg->action)) { | |
5008 | + stats_reset(rs); | |
5009 | + goto on; | |
5010 | + } else if (test_bit(act_overwrite, &msg->action)) { | |
5011 | +on: | |
5012 | + set_bit(msg->spec->parm, &rs->io.flags); | |
5013 | + return 0; | |
5014 | + } | |
5015 | + } | |
5016 | + | |
5017 | + return -EINVAL; | |
5018 | +} | |
5019 | + | |
5020 | + /* Set stripe and chunk read ahead pages. */ | |
5021 | +static int read_ahead_set(struct dm_msg *msg, void *context) | |
5022 | +{ | |
5023 | + int stripes = DM_MSG_INT_ARGS(msg, 0); | |
5024 | + int chunks = DM_MSG_INT_ARGS(msg, 1); | |
5025 | + | |
5026 | + if (range_ok(stripes, 1, 512) && | |
5027 | + range_ok(chunks, 1, 512)) { | |
5028 | + rs_set_bdi(context, stripes, chunks); | |
5029 | + return 0; | |
5030 | + } | |
5031 | + | |
5032 | + set_bit(dm_msg_ret_arg, &msg->ret); | |
5033 | + set_bit(dm_msg_ret_inval, &msg->ret); | |
5034 | + return -EINVAL; | |
5035 | +} | |
5036 | + | |
5037 | +/* Resize the stripe cache. */ | |
5038 | +static int stripecache_resize(struct dm_msg *msg, void *context) | |
5039 | +{ | |
5040 | + int act, stripes; | |
5041 | + struct raid_set *rs = context; | |
5042 | + | |
5043 | +	/* Deny permission while the daemon is still shrinking. */ | 
5044 | + if (atomic_read(&rs->sc.stripes_to_shrink)) | |
5045 | + return -EPERM; | |
5046 | + | |
5047 | + stripes = DM_MSG_INT_ARG(msg); | |
5048 | + if (stripes > 0) { | |
5049 | + act = atomic_read(&rs->sc.stripes); | |
5050 | + | |
5051 | + /* Make delta stripes absolute. */ | |
5052 | + stripes = _absolute(msg->action, act, stripes); | |
5053 | + | |
5054 | + /* | |
5055 | + * Check range and that the # of stripes changes. | |
5056 | + * We can grow from gere but need to leave any | |
5057 | +		 * We can grow from here but need to leave any | 
5058 | + */ | |
5059 | + if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) { | |
5060 | + if (stripes > act) | |
5061 | + return sc_grow(&rs->sc, stripes - act, SC_GROW); | |
5062 | + else if (stripes < act) { | |
5063 | + atomic_set(&rs->sc.stripes_to_shrink, | |
5064 | + act - stripes); | |
5065 | + wake_do_raid(rs); | |
5066 | + } | |
5067 | + | |
5068 | + return 0; | |
5069 | + } | |
5070 | + } | |
5071 | + | |
5072 | + set_bit(dm_msg_ret_arg, &msg->ret); | |
5073 | + set_bit(dm_msg_ret_inval, &msg->ret); | |
5074 | + return -EINVAL; | |
5075 | +} | |
5076 | + | |
5077 | +/* Parse the RAID message action. */ | |
5078 | +/* | |
5079 | + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'    # e.g. 'ba se 50' | 
5080 | + * 'de[vice] o[ffline]/r[unning] DevName/maj:min'    # e.g. 'device o /dev/sda' | 
5081 | + * 'o[verwrite] {on,of[f],r[eset]}'    # e.g. 'o of' | 
5082 | + * 'r[ead_ahead] set #stripes #chunks'    # e.g. 'r se 3 2' | 
5083 | + * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of' | |
5084 | + * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024' | |
5085 | + * | |
5086 | + */ | |
5087 | +static int | |
5088 | +raid_message(struct dm_target *ti, unsigned argc, char **argv) | |
5089 | +{ | |
5090 | +	/* Variables to store the parsed parameters in. */ | 
5091 | + static int i[2]; | |
5092 | + static unsigned long *i_arg[] = { | |
5093 | + (unsigned long *) i + 0, | |
5094 | + (unsigned long *) i + 1, | |
5095 | + }; | |
5096 | + static char *p; | |
5097 | + static unsigned long *p_arg[] = { (unsigned long *) &p }; | |
5098 | + | |
5099 | + /* Declare all message option strings. */ | |
5100 | + static char *str_sgs[] = { "set", "grow", "shrink" }; | |
5101 | + static char *str_dev[] = { "running", "offline" }; | |
5102 | + static char *str_oor[] = { "on", "off", "reset" }; | |
5103 | + | |
5104 | + /* Declare all actions. */ | |
5105 | + static unsigned long act_sgs[] = { act_set, act_grow, act_shrink }; | |
5106 | + static unsigned long act_oor[] = { act_on, act_off, act_reset }; | |
5107 | + | |
5108 | + /* Bandwidth option. */ | |
5109 | + static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs }; | |
5110 | + static struct dm_message_argument bw_args = { | |
5111 | + 1, i_arg, { dm_msg_int_t } | |
5112 | + }; | |
5113 | + | |
5114 | + /* Device option. */ | |
5115 | + static struct dm_message_option dev_opt = { 2, str_dev, act_oor }; | |
5116 | + static struct dm_message_argument dev_args = { | |
5117 | + 1, p_arg, { dm_msg_base_t } | |
5118 | + }; | |
5119 | + | |
5120 | + /* Read ahead option. */ | |
5121 | + static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs }; | |
5122 | + static struct dm_message_argument ra_args = { | |
5123 | + 2, i_arg, { dm_msg_int_t, dm_msg_int_t } | |
5124 | + }; | |
5125 | + | |
5126 | + static struct dm_message_argument null_args = { | |
5127 | + 0, NULL, { dm_msg_int_t } | |
5128 | + }; | |
5129 | + | |
5130 | + /* Overwrite and statistics option. */ | |
5131 | + static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor }; | |
5132 | + | |
5133 | +	/* Stripecache option. */ | 
5134 | + static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs }; | |
5135 | + | |
5136 | + /* Declare messages. */ | |
5137 | + static struct dm_msg_spec specs[] = { | |
5138 | + { "bandwidth", act_bw, &bw_opt, &bw_args, | |
5139 | + 0, bandwidth_change }, | |
5140 | + { "device", act_dev, &dev_opt, &dev_args, | |
5141 | + 0, device_state }, | |
5142 | + { "overwrite", act_overwrite, &ovr_stats_opt, &null_args, | |
5143 | + RS_CHECK_OVERWRITE, devel_flags }, | |
5144 | + { "read_ahead", act_read_ahead, &ra_opt, &ra_args, | |
5145 | + 0, read_ahead_set }, | |
5146 | + { "statistics", act_stats, &ovr_stats_opt, &null_args, | |
5147 | + RS_DEVEL_STATS, devel_flags }, | |
5148 | + { "stripecache", act_sc, &stripe_opt, &bw_args, | |
5149 | + 0, stripecache_resize }, | |
5150 | + }; | |
5151 | + | |
5152 | + /* The message for the parser. */ | |
5153 | + struct dm_msg msg = { | |
5154 | + .num_specs = ARRAY_SIZE(specs), | |
5155 | + .specs = specs, | |
5156 | + }; | |
5157 | + | |
5158 | + return dm_message_parse(TARGET, &msg, ti->private, argc, argv); | |
5159 | +} | |
5160 | +/* | |
5161 | + * END message interface | |
5162 | + */ | |
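
[Editor's note] As an illustration (a sketch, not part of the patch), these are the words a 'dmsetup message <dev> <sector> ...' call would hand to raid_message() as argc/argv, together with the handler each example reaches via specs[]:

	/* Hypothetical examples only; the handlers named are those wired up in specs[]. */
	static const char *msg_examples[][3] = {
		{ "bandwidth",   "set",     "50"   },	/* bandwidth_change(): absolute 50% */
		{ "bandwidth",   "grow",    "10"   },	/* bandwidth_change(): current + 10 */
		{ "device",      "offline", "8:16" },	/* device_state(): take that member offline */
		{ "statistics",  "reset",   ""     },	/* devel_flags(): reset development statistics */
		{ "stripecache", "grow",    "256"  },	/* stripecache_resize(): grown inline via sc_grow() */
	};
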
5163 | + | |
5164 | +static struct target_type raid_target = { | |
5165 | + .name = "raid45", | |
5166 | + .version = {1, 0, 0}, | |
5167 | + .module = THIS_MODULE, | |
5168 | + .ctr = raid_ctr, | |
5169 | + .dtr = raid_dtr, | |
5170 | + .map = raid_map, | |
5171 | + .postsuspend = raid_postsuspend, | |
5172 | + .resume = raid_resume, | |
5173 | + .status = raid_status, | |
5174 | + .message = raid_message, | |
5175 | +}; | |
5176 | + | |
5177 | +static void init_exit(const char *bad_msg, const char *good_msg, int r) | |
5178 | +{ | |
5179 | + if (r) | |
5180 | + DMERR("Failed to %sregister target [%d]", bad_msg, r); | |
5181 | + else | |
5182 | + DMINFO("%s %s", good_msg, version); | |
5183 | +} | |
5184 | + | |
5185 | +static int __init dm_raid_init(void) | |
5186 | +{ | |
5187 | + int r; | |
5188 | + | |
5189 | + r = dm_register_target(&raid_target); | |
5190 | + init_exit("", "initialized", r); | |
5191 | + return r; | |
5192 | +} | |
5193 | + | |
5194 | +static void __exit dm_raid_exit(void) | |
5195 | +{ | |
5196 | + int r; | |
5197 | + | |
5198 | + r = dm_unregister_target(&raid_target); | |
5199 | + init_exit("un", "exit", r); | |
5200 | +} | |
5201 | + | |
5202 | +/* Module hooks. */ | |
5203 | +module_init(dm_raid_init); | |
5204 | +module_exit(dm_raid_exit); | |
5205 | + | |
5206 | +MODULE_DESCRIPTION(DM_NAME " raid4/5 target"); | |
5207 | +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>"); | |
5208 | +MODULE_LICENSE("GPL"); | |
5209 | --- /dev/null | |
5210 | +++ b/drivers/md/dm-raid45.h | |
5211 | @@ -0,0 +1,28 @@ | |
5212 | +/* | |
5213 | + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | |
5214 | + * | |
5215 | + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com) | |
5216 | + * | |
5217 | + * Locking definitions for the device-mapper RAID45 target. | |
5218 | + * | |
5219 | + * This file is released under the GPL. | |
5220 | + * | |
5221 | + */ | |
5222 | + | |
5223 | +#ifndef _DM_RAID45_H | |
5224 | +#define _DM_RAID45_H | |
5225 | + | |
5226 | +/* Factor out to dm.h! */ | |
5227 | +#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr)) | |
5228 | + | |
5229 | +enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED }; | |
5230 | + | |
5231 | +struct dm_raid45_locking_type { | |
5232 | + /* Request a lock on a stripe. */ | |
5233 | + void* (*lock)(sector_t key, enum dm_lock_type type); | |
5234 | + | |
5235 | + /* Release a lock on a stripe. */ | |
5236 | + void (*unlock)(void *lock_handle); | |
5237 | +}; | |
5238 | + | |
5239 | +#endif | |
5240 | --- /dev/null | |
5241 | +++ b/drivers/md/dm-regions.c | |
5242 | @@ -0,0 +1,723 @@ | |
5243 | +/* | |
5244 | + * Copyright (C) 2003 Sistina Software Limited. | |
5245 | + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. | |
5246 | + * | |
5247 | + * This file is released under the GPL. | |
5248 | + */ | |
5249 | + | |
5250 | +#include <linux/dm-dirty-log.h> | |
5251 | +#include <linux/dm-regions.h> | |
5252 | + | |
5253 | +#include <linux/ctype.h> | |
5254 | +#include <linux/init.h> | |
5255 | +#include <linux/module.h> | |
5256 | +#include <linux/vmalloc.h> | |
5257 | + | |
5258 | +#include "dm.h" | |
5259 | +#include "dm-bio-list.h" | |
5260 | + | |
5261 | +#define DM_MSG_PREFIX "region hash" | |
5262 | + | |
5263 | +/*----------------------------------------------------------------- | |
5264 | + * Region hash | |
5265 | + * | |
5266 | + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions. | |
5267 | + * Each region can be in one of three states: | |
5268 | + * | |
5269 | + * o clean | |
5270 | + * o dirty, | |
5271 | + * o nosync. | |
5272 | + * | |
5273 | + * There is no need to put clean regions in the hash. | |
5274 | + * | |
5275 | + * | |
5276 | + * In addition to being present in the hash table a region _may_ | |
5277 | + * be present on one of three lists. | |
5278 | + * | |
5279 | + * clean_regions: Regions on this list have no io pending to | |
5280 | + * them, they are in sync, we are no longer interested in them, | |
5281 | + * they are dull. dm_rh_update_states() will remove them from the | |
5282 | + * hash table. | |
5283 | + * | |
5284 | + * quiesced_regions: These regions have been spun down, ready | |
5285 | + * for recovery. dm_rh_recovery_start() will remove regions from | |
5286 | + * this list and hand them to the caller, which will schedule the | |
5287 | + * recovery io. | |
5288 | + * | |
5289 | + * recovered_regions: Regions that the caller has successfully | |
5290 | + * recovered. dm_rh_update_states() will now schedule any delayed | |
5291 | + * io, up the recovery_count, and remove the region from the hash. | |
5292 | + * | |
5293 | + * There are 2 locks: | |
5294 | + * A rw spin lock 'hash_lock' protects just the hash table, | |
5295 | + * this is never held in write mode from interrupt context, | |
5296 | + * which I believe means that we only have to disable irqs when | |
5297 | + * doing a write lock. | |
5298 | + * | |
5299 | + * An ordinary spin lock 'region_lock' that protects the three | |
5300 | + * lists in the region_hash, with the 'state', 'list' and | |
5301 | + * 'delayed_bios' fields of the regions. This is used from irq | |
5302 | + * context, so all other uses will have to suspend local irqs. | |
5303 | + *---------------------------------------------------------------*/ | |
5304 | +struct region_hash { | |
5305 | + unsigned max_recovery; /* Max # of regions to recover in parallel */ | |
5306 | + | |
5307 | + /* Callback function to dispatch queued writes on recovered regions. */ | |
5308 | + void (*dispatch)(void *context, struct bio_list *bios, int error); | |
5309 | + void *dispatch_context; | |
5310 | + | |
5311 | + /* Callback function to wakeup callers worker thread. */ | |
5312 | + void (*wake)(void *context); | |
5313 | + void *wake_context; | |
5314 | + | |
5315 | + uint32_t region_size; | |
5316 | + unsigned region_shift; | |
5317 | + | |
5318 | + /* holds persistent region state */ | |
5319 | + struct dm_dirty_log *log; | |
5320 | + | |
5321 | + /* hash table */ | |
5322 | + rwlock_t hash_lock; | |
5323 | + mempool_t *region_pool; | |
5324 | + unsigned mask; | |
5325 | + unsigned nr_buckets; | |
5326 | + unsigned prime; | |
5327 | + unsigned shift; | |
5328 | + struct list_head *buckets; | |
5329 | + | |
5330 | + spinlock_t region_lock; | |
5331 | + atomic_t recovery_in_flight; | |
5332 | + struct semaphore recovery_count; | |
5333 | + struct list_head clean_regions; | |
5334 | + struct list_head quiesced_regions; | |
5335 | + struct list_head recovered_regions; | |
5336 | + struct list_head failed_recovered_regions; | |
5337 | +}; | |
5338 | + | |
5339 | +struct region { | |
5340 | + region_t key; | |
5341 | + enum dm_rh_region_states state; | |
5342 | + void *context; /* Caller context. */ | |
5343 | + | |
5344 | + struct list_head hash_list; | |
5345 | + struct list_head list; | |
5346 | + | |
5347 | + atomic_t pending; | |
5348 | + struct bio_list delayed_bios; | |
5349 | +}; | |
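
[Editor's note] As a consumer-side illustration of the states and lists described above (a sketch, not part of the patch), a mirror- or RAID-style target would typically wrap its write path around the hash like this; 'struct my_set' and issue_write() are hypothetical, the dm_rh_* calls are the ones implemented below:

	struct my_set {
		struct dm_rh_client *rh;	/* hypothetical consumer context */
	};

	static void issue_write(struct my_set *ms, struct bio *bio);	/* hypothetical */

	static void do_writes(struct my_set *ms, struct bio_list *writes)
	{
		struct bio *bio;

		/* Mark all affected regions dirty and bump their pending counts. */
		dm_rh_inc_pending(ms->rh, writes);

		while ((bio = bio_list_pop(writes))) {
			region_t region = dm_rh_bio_to_region(ms->rh, bio);

			/*
			 * Writes to regions under resynchronization are held back;
			 * dm_rh_update_states() dispatches them once recovery ends.
			 */
			if (dm_rh_get_state(ms->rh, region, 1) == DM_RH_RECOVERING)
				dm_rh_delay(ms->rh, bio);
			else
				issue_write(ms, bio);
		}
	}

	/* Per-bio write completion: drop the pending count; when it reaches zero
	 * the region moves back to the clean (or quiesced) list. */
	static void write_endio(struct my_set *ms, struct bio *bio)
	{
		dm_rh_dec(ms->rh, dm_rh_bio_to_region(ms->rh, bio));
	}
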
5350 | + | |
5351 | +/* | |
5352 | + * Conversion fns | |
5353 | + */ | |
5354 | +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector) | |
5355 | +{ | |
5356 | + return sector >> ((struct region_hash *) rh)->region_shift; | |
5357 | +} | |
5358 | +EXPORT_SYMBOL_GPL(dm_rh_sector_to_region); | |
5359 | + | |
5360 | +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio) | |
5361 | +{ | |
5362 | + return dm_rh_sector_to_region(rh, bio->bi_sector); | |
5363 | +} | |
5364 | +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); | |
5365 | + | |
5366 | +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region) | |
5367 | +{ | |
5368 | + return region << ((struct region_hash *) rh)->region_shift; | |
5369 | +} | |
5370 | +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); | |
5371 | + | |
5372 | +/* | |
5373 | + * Retrieval fns. | 
5374 | + */ | |
5375 | +region_t dm_rh_get_region_key(struct dm_region *reg) | |
5376 | +{ | |
5377 | + return ((struct region *) reg)->key; | |
5378 | +} | |
5379 | +EXPORT_SYMBOL_GPL(dm_rh_get_region_key); | |
5380 | + | |
5381 | +sector_t dm_rh_get_region_size(struct dm_rh_client *rh) | |
5382 | +{ | |
5383 | + return ((struct region_hash *) rh)->region_size; | |
5384 | +} | |
5385 | +EXPORT_SYMBOL_GPL(dm_rh_get_region_size); | |
5386 | + | |
5387 | +/* Squirrel a context with a region. */ | |
5388 | +void *dm_rh_reg_get_context(struct dm_region *reg) | |
5389 | +{ | |
5390 | + return ((struct region *) reg)->context; | |
5391 | +} | |
5392 | +EXPORT_SYMBOL_GPL(dm_rh_reg_get_context); | |
5393 | + | |
5394 | +void dm_rh_reg_set_context(struct dm_region *reg, void *context) | |
5395 | +{ | |
5396 | + ((struct region *) reg)->context = context; | |
5397 | +} | |
5398 | +EXPORT_SYMBOL_GPL(dm_rh_reg_set_context); | |
5399 | + | |
5400 | +/* | |
5401 | + * Create region hash client. | |
5402 | + */ | |
5403 | +#define MIN_REGIONS 64 | |
5404 | +struct dm_rh_client *dm_rh_client_create( | |
5405 | + unsigned max_recovery, | |
5406 | + void (*dispatch)(void *dispatch_context, | |
5407 | + struct bio_list *bios, int error), | |
5408 | + void *dispatch_context, | |
5409 | + void (*wake)(void *wake_context), void *wake_context, | |
5410 | + struct dm_dirty_log *log, uint32_t region_size, | |
5411 | + region_t nr_regions) | |
5412 | +{ | |
5413 | + unsigned i; | |
5414 | + unsigned nr_buckets, max_buckets; | |
5415 | + unsigned hash_primes[] = { | |
5416 | + /* Table of primes for rh_hash/table size optimization. */ | |
5417 | + 3, 7, 13, 27, 53, 97, 193, 389, 769, | |
5418 | + 1543, 3079, 6151, 12289, 24593, | |
5419 | + }; | |
5420 | + struct region_hash *rh; | |
5421 | + | |
5422 | + if (region_size & (region_size - 1)) { | |
5423 | + DMERR("region size must be 2^^n"); | |
5424 | + return ERR_PTR(-EINVAL); | |
5425 | + } | |
5426 | + | |
5427 | + /* Calculate a suitable number of buckets for our hash table. */ | |
5428 | + max_buckets = nr_regions >> 6; | |
5429 | + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | |
5430 | + ; | |
5431 | + nr_buckets >>= 1; | |
5432 | + | |
5433 | + rh = kmalloc(sizeof(*rh), GFP_KERNEL); | |
5434 | + if (!rh) { | |
5435 | + DMERR("unable to allocate region hash memory"); | |
5436 | + return ERR_PTR(-ENOMEM); | |
5437 | + } | |
5438 | + | |
5439 | + rh->max_recovery = max_recovery; | |
5440 | + rh->dispatch = dispatch; | |
5441 | + rh->dispatch_context = dispatch_context; | |
5442 | + rh->wake = wake; | |
5443 | + rh->wake_context = wake_context; | |
5444 | + rh->log = log; | |
5445 | + rh->region_size = region_size; | |
5446 | + rh->region_shift = ffs(region_size) - 1; | |
5447 | + rwlock_init(&rh->hash_lock); | |
5448 | + rh->mask = nr_buckets - 1; | |
5449 | + rh->nr_buckets = nr_buckets; | |
5450 | + rh->shift = ffs(nr_buckets); | |
5451 | + | |
5452 | + /* Check prime array limits. */ | |
5453 | + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ? | |
5454 | + ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2; | |
5455 | + rh->prime = hash_primes[i]; | |
5456 | + | |
5457 | + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | |
5458 | + if (!rh->buckets) { | |
5459 | + DMERR("unable to allocate region hash bucket memory"); | |
5460 | + kfree(rh); | |
5461 | + return ERR_PTR(-ENOMEM); | |
5462 | + } | |
5463 | + | |
5464 | + for (i = 0; i < nr_buckets; i++) | |
5465 | + INIT_LIST_HEAD(rh->buckets + i); | |
5466 | + | |
5467 | + spin_lock_init(&rh->region_lock); | |
5468 | + sema_init(&rh->recovery_count, 0); | |
5469 | + atomic_set(&rh->recovery_in_flight, 0); | |
5470 | + INIT_LIST_HEAD(&rh->clean_regions); | |
5471 | + INIT_LIST_HEAD(&rh->quiesced_regions); | |
5472 | + INIT_LIST_HEAD(&rh->recovered_regions); | |
5473 | + INIT_LIST_HEAD(&rh->failed_recovered_regions); | |
5474 | + | |
5475 | + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | |
5476 | + sizeof(struct region)); | |
5477 | + if (!rh->region_pool) { | |
5478 | + vfree(rh->buckets); | |
5479 | + kfree(rh); | |
5480 | + rh = ERR_PTR(-ENOMEM); | |
5481 | + } | |
5482 | + | |
5483 | + return (struct dm_rh_client *) rh; | |
5484 | +} | |
5485 | +EXPORT_SYMBOL_GPL(dm_rh_client_create); | |
5486 | + | |
5487 | +void dm_rh_client_destroy(struct dm_rh_client *rh_in) | |
5488 | +{ | |
5489 | + unsigned h; | |
5490 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5491 | + struct region *reg, *tmp; | |
5492 | + | |
5493 | + BUG_ON(!list_empty(&rh->quiesced_regions)); | |
5494 | + | |
5495 | + for (h = 0; h < rh->nr_buckets; h++) { | |
5496 | + list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) { | |
5497 | +			BUG_ON(atomic_read(&reg->pending)); | 
5498 | + mempool_free(reg, rh->region_pool); | |
5499 | + } | |
5500 | + } | |
5501 | + | |
5502 | + if (rh->region_pool) | |
5503 | + mempool_destroy(rh->region_pool); | |
5504 | + | |
5505 | + vfree(rh->buckets); | |
5506 | + kfree(rh); | |
5507 | +} | |
5508 | +EXPORT_SYMBOL_GPL(dm_rh_client_destroy); | |
5509 | + | |
5510 | +static inline unsigned rh_hash(struct region_hash *rh, region_t region) | |
5511 | +{ | |
5512 | + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; | |
5513 | +} | |
5514 | + | |
5515 | +static struct region *__rh_lookup(struct region_hash *rh, region_t region) | |
5516 | +{ | |
5517 | + struct region *reg; | |
5518 | + struct list_head *bucket = rh->buckets + rh_hash(rh, region); | |
5519 | + | |
5520 | + list_for_each_entry(reg, bucket, hash_list) { | |
5521 | + if (reg->key == region) | |
5522 | + return reg; | |
5523 | + } | |
5524 | + | |
5525 | + return NULL; | |
5526 | +} | |
5527 | + | |
5528 | +static void __rh_insert(struct region_hash *rh, struct region *reg) | |
5529 | +{ | |
5530 | +	list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key)); | 
5531 | +} | |
5532 | + | |
5533 | +static struct region *__rh_alloc(struct region_hash *rh, region_t region) | |
5534 | +{ | |
5535 | + struct region *reg, *nreg; | |
5536 | + | |
5537 | + read_unlock(&rh->hash_lock); | |
5538 | + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | |
5539 | + if (unlikely(!nreg)) | |
5540 | + nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | |
5541 | + | |
5542 | + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | |
5543 | + DM_RH_CLEAN : DM_RH_NOSYNC; | |
5544 | + nreg->key = region; | |
5545 | + INIT_LIST_HEAD(&nreg->list); | |
5546 | + atomic_set(&nreg->pending, 0); | |
5547 | + bio_list_init(&nreg->delayed_bios); | |
5548 | + | |
5549 | + write_lock_irq(&rh->hash_lock); | |
5550 | + reg = __rh_lookup(rh, region); | |
5551 | + if (reg) | |
5552 | + /* We lost the race. */ | |
5553 | + mempool_free(nreg, rh->region_pool); | |
5554 | + else { | |
5555 | + __rh_insert(rh, nreg); | |
5556 | + if (nreg->state == DM_RH_CLEAN) { | |
5557 | + spin_lock(&rh->region_lock); | |
5558 | + list_add(&nreg->list, &rh->clean_regions); | |
5559 | + spin_unlock(&rh->region_lock); | |
5560 | + } | |
5561 | + | |
5562 | + reg = nreg; | |
5563 | + } | |
5564 | + | |
5565 | + write_unlock_irq(&rh->hash_lock); | |
5566 | + read_lock(&rh->hash_lock); | |
5567 | + return reg; | |
5568 | +} | |
5569 | + | |
5570 | +static inline struct region *__rh_find(struct region_hash *rh, region_t region) | |
5571 | +{ | |
5572 | + struct region *reg; | |
5573 | + | |
5574 | + reg = __rh_lookup(rh, region); | |
5575 | + return reg ? reg : __rh_alloc(rh, region); | |
5576 | +} | |
5577 | + | |
5578 | +int dm_rh_get_state(struct dm_rh_client *rh_in, region_t region, int may_block) | |
5579 | +{ | |
5580 | + int r; | |
5581 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5582 | + struct region *reg; | |
5583 | + | |
5584 | + read_lock(&rh->hash_lock); | |
5585 | + reg = __rh_lookup(rh, region); | |
5586 | + read_unlock(&rh->hash_lock); | |
5587 | + | |
5588 | + if (reg) | |
5589 | + return reg->state; | |
5590 | + | |
5591 | + /* | |
5592 | + * The region wasn't in the hash, so we fall back to the dirty log. | |
5593 | + */ | |
5594 | + r = rh->log->type->in_sync(rh->log, region, may_block); | |
5595 | + | |
5596 | + /* | |
5597 | + * Any error from the dirty log (eg. -EWOULDBLOCK) | |
5598 | + * gets taken as a DM_RH_NOSYNC | |
5599 | + */ | |
5600 | + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; | |
5601 | +} | |
5602 | +EXPORT_SYMBOL_GPL(dm_rh_get_state); | |
5603 | + | |
5604 | +void dm_rh_set_state(struct dm_rh_client *rh_in, region_t region, | |
5605 | + enum dm_rh_region_states state, int may_block) | |
5606 | +{ | |
5607 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5608 | + struct region *reg; | |
5609 | + struct dm_dirty_log *log = rh->log; | |
5610 | + | |
5611 | + if (state == DM_RH_NOSYNC) | |
5612 | + log->type->set_region_sync(log, region, 0); | |
5613 | + else if (state == DM_RH_CLEAN) | |
5614 | + log->type->clear_region(log, region); | |
5615 | + else if (state == DM_RH_DIRTY) | |
5616 | + log->type->mark_region(log, region); | |
5617 | + | |
5618 | + read_lock(&rh->hash_lock); | |
5619 | + reg = __rh_find(rh, region); | |
5620 | + reg->state = state; | |
5621 | + read_unlock(&rh->hash_lock); | |
5622 | +} | |
5623 | +EXPORT_SYMBOL_GPL(dm_rh_set_state); | |
5624 | + | |
5625 | +void dm_rh_update_states(struct dm_rh_client *rh_in, int errors_handled) | |
5626 | +{ | |
5627 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5628 | + struct region *reg, *next; | |
5629 | + LIST_HEAD(clean); | |
5630 | + LIST_HEAD(recovered); | |
5631 | + LIST_HEAD(failed_recovered); | |
5632 | + | |
5633 | + /* | |
5634 | + * Quickly grab the lists and remove any regions from hash. | |
5635 | + */ | |
5636 | + write_lock_irq(&rh->hash_lock); | |
5637 | + spin_lock(&rh->region_lock); | |
5638 | + if (!list_empty(&rh->clean_regions)) { | |
5639 | + list_splice_init(&rh->clean_regions, &clean); | |
5640 | + | |
5641 | + list_for_each_entry(reg, &clean, list) | |
5642 | +			list_del(&reg->hash_list); | 
5643 | + } | |
5644 | + | |
5645 | + if (!list_empty(&rh->recovered_regions)) { | |
5646 | + list_splice_init(&rh->recovered_regions, &recovered); | |
5647 | + | |
5648 | + list_for_each_entry(reg, &recovered, list) | |
5649 | +			list_del(&reg->hash_list); | 
5650 | + } | |
5651 | + | |
5652 | + if (!list_empty(&rh->failed_recovered_regions)) { | |
5653 | + list_splice_init(&rh->failed_recovered_regions, | |
5654 | + &failed_recovered); | |
5655 | + | |
5656 | +		list_for_each_entry(reg, &failed_recovered, list) | 
5657 | +			list_del(&reg->hash_list); | 
5658 | + } | |
5659 | + | |
5660 | + spin_unlock(&rh->region_lock); | |
5661 | + write_unlock_irq(&rh->hash_lock); | |
5662 | + | |
5663 | + /* | |
5664 | + * All the regions on the recovered and clean lists have | |
5665 | + * now been pulled out of the system, so no need to do | |
5666 | + * any more locking. | |
5667 | + */ | |
5668 | + list_for_each_entry_safe(reg, next, &recovered, list) { | |
5669 | + rh->log->type->clear_region(rh->log, reg->key); | |
5670 | + rh->log->type->set_region_sync(rh->log, reg->key, 1); | |
5671 | + | |
5672 | + if (reg->delayed_bios.head) | |
5673 | + rh->dispatch(rh->dispatch_context, | |
5674 | +				     &reg->delayed_bios, 0); | 
5675 | + | |
5676 | + up(&rh->recovery_count); | |
5677 | + mempool_free(reg, rh->region_pool); | |
5678 | + } | |
5679 | + | |
5680 | + list_for_each_entry_safe(reg, next, &failed_recovered, list) { | |
5681 | + rh->log->type->set_region_sync(rh->log, reg->key, | |
5682 | + errors_handled ? 0 : 1); | |
5683 | + if (reg->delayed_bios.head) | |
5684 | + rh->dispatch(rh->dispatch_context, | |
5685 | +				     &reg->delayed_bios, -EIO); | 
5686 | + | |
5687 | + up(&rh->recovery_count); | |
5688 | + mempool_free(reg, rh->region_pool); | |
5689 | + } | |
5690 | + | |
5691 | + list_for_each_entry_safe(reg, next, &clean, list) { | |
5692 | + rh->log->type->clear_region(rh->log, reg->key); | |
5693 | + mempool_free(reg, rh->region_pool); | |
5694 | + } | |
5695 | + | |
5696 | + dm_rh_flush(rh_in); | |
5697 | +} | |
5698 | +EXPORT_SYMBOL_GPL(dm_rh_update_states); | |
5699 | + | |
5700 | +void dm_rh_inc(struct dm_rh_client *rh_in, region_t region) | |
5701 | +{ | |
5702 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5703 | + struct region *reg; | |
5704 | + | |
5705 | + read_lock(&rh->hash_lock); | |
5706 | + reg = __rh_find(rh, region); | |
5707 | + if (reg->state == DM_RH_CLEAN) { | |
5708 | + rh->log->type->mark_region(rh->log, reg->key); | |
5709 | + | |
5710 | + spin_lock_irq(&rh->region_lock); | |
5711 | + reg->state = DM_RH_DIRTY; | |
5712 | +		list_del_init(&reg->list);	/* Take off the clean list. */ | 
5713 | + spin_unlock_irq(&rh->region_lock); | |
5714 | + } | |
5715 | + | |
5716 | +	atomic_inc(&reg->pending); | 
5717 | + read_unlock(&rh->hash_lock); | |
5718 | +} | |
5719 | +EXPORT_SYMBOL_GPL(dm_rh_inc); | |
5720 | + | |
5721 | +void dm_rh_inc_pending(struct dm_rh_client *rh_in, struct bio_list *bios) | |
5722 | +{ | |
5723 | + struct bio *bio; | |
5724 | + | |
5725 | + for (bio = bios->head; bio; bio = bio->bi_next) | |
5726 | + dm_rh_inc(rh_in, dm_rh_bio_to_region(rh_in, bio)); | |
5727 | +} | |
5728 | +EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | |
5729 | + | |
5730 | +int dm_rh_dec(struct dm_rh_client *rh_in, region_t region) | |
5731 | +{ | |
5732 | + int r = 0; | |
5733 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5734 | + struct region *reg; | |
5735 | + | |
5736 | + read_lock(&rh->hash_lock); | |
5737 | + reg = __rh_lookup(rh, region); | |
5738 | + read_unlock(&rh->hash_lock); | |
5739 | + | |
5740 | + BUG_ON(!reg); | |
5741 | + | |
5742 | +	if (atomic_dec_and_test(&reg->pending)) { | 
5743 | + unsigned long flags; | |
5744 | + | |
5745 | + /* | |
5746 | + * There is no pending I/O for this region. | |
5747 | + * We can move the region to corresponding list for next action. | |
5748 | + * At this point, the region is not yet connected to any list. | |
5749 | + * | |
5750 | + * If the state is DM_RH_NOSYNC, the region should be kept off | |
5751 | + * from clean list. | |
5752 | + * The hash entry for DM_RH_NOSYNC will remain in memory | |
5753 | + * until the region is recovered or the map is reloaded. | |
5754 | + */ | |
5755 | + | |
5756 | + spin_lock_irqsave(&rh->region_lock, flags); | |
5757 | + if (reg->state == DM_RH_RECOVERING) | |
5758 | +			list_add_tail(&reg->list, &rh->quiesced_regions); | 
5759 | + else { | |
5760 | + reg->state = DM_RH_CLEAN; | |
5761 | +			list_add(&reg->list, &rh->clean_regions); | 
5762 | + } | |
5763 | + spin_unlock_irqrestore(&rh->region_lock, flags); | |
5764 | + | |
5765 | + r = 1; | |
5766 | + } | |
5767 | + | |
5768 | + return r; | |
5769 | +} | |
5770 | +EXPORT_SYMBOL_GPL(dm_rh_dec); | |
5771 | + | |
5772 | +/* | |
5773 | + * Starts quiescing a region in preparation for recovery. | |
5774 | + */ | |
5775 | +static int __rh_recovery_prepare(struct region_hash *rh) | |
5776 | +{ | |
5777 | + int r; | |
5778 | + region_t region; | |
5779 | + struct region *reg; | |
5780 | + | |
5781 | + /* | |
5782 | + * Ask the dirty log what's next. | |
5783 | + */ | |
5784 | +	r = rh->log->type->get_resync_work(rh->log, &region); | 
5785 | + if (r <= 0) | |
5786 | + return r; | |
5787 | + | |
5788 | + /* | |
5789 | + * Get this region, and start it quiescing | |
5790 | + * by setting the recovering flag. | |
5791 | + */ | |
5792 | + read_lock(&rh->hash_lock); | |
5793 | + reg = __rh_find(rh, region); | |
5794 | + read_unlock(&rh->hash_lock); | |
5795 | + | |
5796 | + spin_lock_irq(&rh->region_lock); | |
5797 | + | |
5798 | + reg->state = DM_RH_RECOVERING; | |
5799 | + | |
5800 | + /* Already quiesced ? */ | |
5801 | +	list_del_init(&reg->list); | 
5802 | +	if (!atomic_read(&reg->pending)) | 
5803 | +		list_add(&reg->list, &rh->quiesced_regions); | 
5804 | + | |
5805 | + spin_unlock_irq(&rh->region_lock); | |
5806 | + return 1; | |
5807 | +} | |
5808 | + | |
5809 | +int dm_rh_recovery_prepare(struct dm_rh_client *rh_in) | |
5810 | +{ | |
5811 | + int r = 0; | |
5812 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5813 | + | |
5814 | + /* Extra reference to avoid race with rh_stop_recovery */ | |
5815 | + atomic_inc(&rh->recovery_in_flight); | |
5816 | + | |
5817 | + while (!down_trylock(&rh->recovery_count)) { | |
5818 | + atomic_inc(&rh->recovery_in_flight); | |
5819 | + | |
5820 | + if (__rh_recovery_prepare(rh) <= 0) { | |
5821 | + atomic_dec(&rh->recovery_in_flight); | |
5822 | + up(&rh->recovery_count); | |
5823 | + r = -ENOENT; | |
5824 | + break; | |
5825 | + } | |
5826 | + } | |
5827 | + | |
5828 | + /* Drop the extra reference. */ | |
5829 | + if (atomic_dec_and_test(&rh->recovery_in_flight)) | |
5830 | + r = -ESRCH; | |
5831 | + | |
5832 | + return r; | |
5833 | +} | |
5834 | +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); | |
5835 | + | |
5836 | +/* | |
5837 | + * Returns any quiesced regions. | |
5838 | + */ | |
5839 | +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh_in) | |
5840 | +{ | |
5841 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5842 | + struct region *reg = NULL; | |
5843 | + | |
5844 | + spin_lock_irq(&rh->region_lock); | |
5845 | + if (!list_empty(&rh->quiesced_regions)) { | |
5846 | + reg = list_entry(rh->quiesced_regions.next, | |
5847 | + struct region, list); | |
5848 | +		list_del_init(&reg->list);	/* Remove from the quiesced list. */ | 
5849 | + } | |
5850 | + | |
5851 | + spin_unlock_irq(&rh->region_lock); | |
5852 | + return (struct dm_region *) reg; | |
5853 | +} | |
5854 | +EXPORT_SYMBOL_GPL(dm_rh_recovery_start); | |
5855 | + | |
5856 | +/* | |
5857 | + * Put region on list of recovered ones. | |
5858 | + */ | |
5859 | +void dm_rh_recovery_end(struct dm_rh_client *rh_in, struct dm_region *reg_in, | |
5860 | + int error) | |
5861 | +{ | |
5862 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5863 | + struct region *reg = (struct region *) reg_in; | |
5864 | + | |
5865 | + spin_lock_irq(&rh->region_lock); | |
5866 | + if (error) { | |
5867 | + reg->state = DM_RH_NOSYNC; | |
5868 | +		list_add(&reg->list, &rh->failed_recovered_regions); | 
5869 | + } else | |
5870 | +		list_add(&reg->list, &rh->recovered_regions); | 
5871 | + | |
5872 | + atomic_dec(&rh->recovery_in_flight); | |
5873 | + spin_unlock_irq(&rh->region_lock); | |
5874 | + | |
5875 | + rh->wake(rh->wake_context); | |
5876 | + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0); | |
5877 | +} | |
5878 | +EXPORT_SYMBOL_GPL(dm_rh_recovery_end); | |
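
[Editor's note] For completeness, a sketch of the worker-side recovery loop these calls are designed for (illustration only, not part of the patch; 'struct my_set' is the hypothetical context from the earlier sketch and recover_region() stands in for the actual recovery io):

	static int recover_region(struct my_set *ms, struct dm_region *reg);	/* hypothetical */

	static void do_recovery(struct my_set *ms)
	{
		struct dm_region *reg;

		/* Ask the dirty log for resync work and start quiescing those regions. */
		dm_rh_recovery_prepare(ms->rh);

		/* Pick up regions whose pending io has drained. */
		while ((reg = dm_rh_recovery_start(ms->rh))) {
			int error = recover_region(ms, reg);	/* assumed synchronous here */

			/* File on the recovered or failed list and wake the worker. */
			dm_rh_recovery_end(ms->rh, reg, error);
		}

		/* Update the dirty log and dispatch bios delayed on recovered regions. */
		dm_rh_update_states(ms->rh, 1);
	}
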
5879 | + | |
5880 | +/* Return recovery in flight count. */ | |
5881 | +int dm_rh_recovery_in_flight(struct dm_rh_client *rh_in) | |
5882 | +{ | |
5883 | + return atomic_read(&((struct region_hash *) rh_in)->recovery_in_flight); | |
5884 | +} | |
5885 | +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); | |
5886 | + | |
5887 | +int dm_rh_flush(struct dm_rh_client *rh_in) | |
5888 | +{ | |
5889 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5890 | + | |
5891 | + return rh->log->type->flush(rh->log); | |
5892 | +} | |
5893 | +EXPORT_SYMBOL_GPL(dm_rh_flush); | |
5894 | + | |
5895 | +void dm_rh_delay_by_region(struct dm_rh_client *rh_in, | |
5896 | + struct bio *bio, region_t region) | |
5897 | +{ | |
5898 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5899 | + struct region *reg; | |
5900 | + | |
5901 | + /* FIXME: locking. */ | |
5902 | + read_lock(&rh->hash_lock); | |
5903 | + reg = __rh_find(rh, region); | |
5904 | +	bio_list_add(&reg->delayed_bios, bio); | 
5905 | + read_unlock(&rh->hash_lock); | |
5906 | +} | |
5907 | +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region); | |
5908 | + | |
5909 | +void dm_rh_delay(struct dm_rh_client *rh_in, struct bio *bio) | |
5910 | +{ | |
5911 | + return dm_rh_delay_by_region(rh_in, bio, | |
5912 | + dm_rh_bio_to_region(rh_in, bio)); | |
5913 | +} | |
5914 | +EXPORT_SYMBOL_GPL(dm_rh_delay); | |
5915 | + | |
5916 | +void dm_rh_dispatch_bios(struct dm_rh_client *rh_in, | |
5917 | + region_t region, int error) | |
5918 | +{ | |
5919 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5920 | + struct region *reg; | |
5921 | + struct bio_list delayed_bios; | |
5922 | + | |
5923 | + /* FIXME: locking. */ | |
5924 | + read_lock(&rh->hash_lock); | |
5925 | + reg = __rh_find(rh, region); | |
5926 | + BUG_ON(!reg); | |
5927 | + delayed_bios = reg->delayed_bios; | |
5928 | +	bio_list_init(&reg->delayed_bios); | 
5929 | + read_unlock(&rh->hash_lock); | |
5930 | + | |
5931 | + if (delayed_bios.head) | |
5932 | + rh->dispatch(rh->dispatch_context, &delayed_bios, error); | |
5933 | + | |
5934 | + up(&rh->recovery_count); | |
5935 | +} | |
5936 | +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios); | |
5937 | + | |
5938 | +void dm_rh_stop_recovery(struct dm_rh_client *rh_in) | |
5939 | +{ | |
5940 | + int i; | |
5941 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5942 | + | |
5943 | + rh->wake(rh->wake_context); | |
5944 | + | |
5945 | + /* wait for any recovering regions */ | |
5946 | + for (i = 0; i < rh->max_recovery; i++) | |
5947 | + down(&rh->recovery_count); | |
5948 | +} | |
5949 | +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); | |
5950 | + | |
5951 | +void dm_rh_start_recovery(struct dm_rh_client *rh_in) | |
5952 | +{ | |
5953 | + int i; | |
5954 | + struct region_hash *rh = (struct region_hash *) rh_in; | |
5955 | + | |
5956 | + for (i = 0; i < rh->max_recovery; i++) | |
5957 | + up(&rh->recovery_count); | |
5958 | + | |
5959 | + rh->wake(rh->wake_context); | |
5960 | +} | |
5961 | +EXPORT_SYMBOL_GPL(dm_rh_start_recovery); | |
5962 | + | |
5963 | +MODULE_DESCRIPTION(DM_NAME " region hash"); | |
5964 | +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <hjm@redhat.com>"); | |
5965 | +MODULE_LICENSE("GPL"); | |
5966 | --- a/drivers/md/Kconfig | |
5967 | +++ b/drivers/md/Kconfig | |
5968 | @@ -269,6 +269,14 @@ config DM_DELAY | |
5969 | ||
5970 | If unsure, say N. | |
5971 | ||
5972 | +config DM_RAID45 | |
5973 | + tristate "RAID 4/5 target (EXPERIMENTAL)" | |
5974 | + depends on BLK_DEV_DM && EXPERIMENTAL | |
5975 | + ---help--- | |
5976 | + A target that supports RAID4 and RAID5 mappings. | |
5977 | + | |
5978 | + If unsure, say N. | |
5979 | + | |
5980 | config DM_UEVENT | |
5981 | bool "DM uevents (EXPERIMENTAL)" | |
5982 | depends on BLK_DEV_DM && EXPERIMENTAL | |
5983 | --- a/drivers/md/Makefile | |
5984 | +++ b/drivers/md/Makefile | |
5985 | @@ -34,7 +34,9 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | |
5986 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | |
5987 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | |
5988 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | |
5989 | -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o | |
5990 | +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o | |
5991 | +obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \ | |
5992 | + dm-regions.o dm-message.o | |
5993 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | |
5994 | ||
5995 | quiet_cmd_unroll = UNROLL $@ | |
5996 | --- /dev/null | |
5997 | +++ b/include/linux/dm-regions.h | |
5998 | @@ -0,0 +1,115 @@ | |
5999 | +/* | |
6000 | + * Copyright (C) 2003 Sistina Software Limited. | |
6001 | + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. | |
6002 | + * | |
6003 | + * Device-Mapper dirty region hash interface. | |
6004 | + * | |
6005 | + * This file is released under the GPL. | |
6006 | + */ | |
6007 | + | |
6008 | +#ifndef DM_REGION_HASH_H | |
6009 | +#define DM_REGION_HASH_H | |
6010 | + | |
6011 | +#include <linux/dm-dirty-log.h> | |
6012 | + | |
6013 | +/*----------------------------------------------------------------- | |
6014 | + * Region hash | |
6015 | + *----------------------------------------------------------------*/ | |
6016 | +struct dm_rh_client; | |
6017 | +struct dm_region; | |
6018 | + | |
6019 | +/* | |
6020 | + * States a region can have. | |
6021 | + */ | |
6022 | +enum dm_rh_region_states { | |
6023 | + DM_RH_CLEAN = 0x01, /* No writes in flight. */ | |
6024 | + DM_RH_DIRTY = 0x02, /* Writes in flight. */ | |
6025 | + DM_RH_NOSYNC = 0x04, /* Out of sync. */ | |
6026 | + DM_RH_RECOVERING = 0x08, /* Under resynchronization. */ | |
6027 | +}; | |
6028 | + | |
6029 | +/* | |
6030 | + * Region hash create/destroy. | |
6031 | + */ | |
6032 | +struct bio_list; | |
6033 | +struct dm_rh_client *dm_rh_client_create( | |
6034 | + unsigned max_recovery, | |
6035 | + void (*dispatch)(void *dispatch_context, | |
6036 | + struct bio_list *bios, int error), | |
6037 | + void *dispatch_context, | |
6038 | + void (*wake)(void *wake_context), void *wake_context, | |
6039 | + struct dm_dirty_log *log, uint32_t region_size, | |
6040 | + region_t nr_regions); | |
6041 | +void dm_rh_client_destroy(struct dm_rh_client *rh); | |
6042 | + | |
6043 | +/* | |
6044 | + * Conversion fns: | |
6045 | + * | |
6046 | + * bio -> region | |
6047 | + * sector -> region | |
6048 | + * region -> sector | |
6049 | + */ | |
6050 | +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio); | |
6051 | +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector); | |
6052 | +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region); | |
6053 | + | |
6054 | +/* | |
6055 | + * Functions to set a caller context in a region. | |
6056 | + */ | |
6057 | +void *dm_rh_reg_get_context(struct dm_region *reg); | |
6058 | +void dm_rh_reg_set_context(struct dm_region *reg, void *context); | |
6059 | + | |
6060 | +/* | |
6061 | + * Get region size and key (ie. number of the region). | |
6062 | + */ | |
6063 | +sector_t dm_rh_get_region_size(struct dm_rh_client *rh); | |
6064 | +region_t dm_rh_get_region_key(struct dm_region *reg); | 
6065 | + | |
6066 | +/* | |
6067 | + * Get/set/update region state (and dirty log). | |
6068 | + * | |
6069 | + * dm_rh_update_states | |
6070 | + * @errors_handled != 0 causes regions that failed recovery | 
6071 | + * to be kept in the NOSYNC state | 
6072 | + */ | |
6073 | +int dm_rh_get_state(struct dm_rh_client *rh, region_t region, int may_block); | |
6074 | +void dm_rh_set_state(struct dm_rh_client *rh, region_t region, | |
6075 | + enum dm_rh_region_states state, int may_block); | |
6076 | +void dm_rh_update_states(struct dm_rh_client *rh, int errors_handled); | |
6077 | + | |
6078 | +/* Flush the region hash and dirty log. */ | |
6079 | +int dm_rh_flush(struct dm_rh_client *rh); | |
6080 | + | |
6081 | +/* Inc/dec pending count on regions. */ | |
6082 | +void dm_rh_inc(struct dm_rh_client *rh, region_t region); | |
6083 | +void dm_rh_inc_pending(struct dm_rh_client *rh, struct bio_list *bios); | |
6084 | +int dm_rh_dec(struct dm_rh_client *rh, region_t region); | |
6085 | + | |
6086 | +/* Delay bios on regions. */ | |
6087 | +void dm_rh_delay(struct dm_rh_client *rh, struct bio *bio); | |
6088 | +void dm_rh_delay_by_region(struct dm_rh_client *rh, | |
6089 | + struct bio *bio, region_t region); | |
6090 | + | |
6091 | +/* | |
6092 | + * Normally, the region hash will automatically call the dispatch function. | |
6093 | + * dm_rh_dispatch_bios() is for intentional dispatching of bios. | |
6094 | + */ | |
6095 | +void dm_rh_dispatch_bios(struct dm_rh_client *rh, region_t region, int error); | |
6096 | + | |
6097 | +/* | |
6098 | + * Region recovery control. | |
6099 | + */ | |
6100 | +/* Prepare some regions for recovery by starting to quiesce them. */ | |
6101 | +int dm_rh_recovery_prepare(struct dm_rh_client *rh); | |
6102 | +/* Try fetching a quiesced region for recovery. */ | |
6103 | +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh); | |
6104 | +/* Report recovery end on a region. */ | |
6105 | +void dm_rh_recovery_end(struct dm_rh_client *rh, struct dm_region *reg, | |
6106 | + int error); | |
6107 | +/* Check for amount of recoveries in flight. */ | |
6108 | +int dm_rh_recovery_in_flight(struct dm_rh_client *rh); | |
6109 | +/* Start/stop recovery. */ | |
6110 | +void dm_rh_stop_recovery(struct dm_rh_client *rh); | |
6111 | +void dm_rh_start_recovery(struct dm_rh_client *rh); | |
6112 | + | |
6113 | +#endif /* DM_REGION_HASH_H */ |