1 From: "Heinz Mauelshagen <hjm@redhat.de>
2 Subject: DMRAID45 module
3 X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/
4
5 DM-RAID 45 module.
6
7 This driver provides the RAID4/RAID5 mapping used for "fake RAID" (BIOS/firmware RAID) devices.
8
9 Acked-by: Jeff Mahoney <jeffm@suse.com>
10
11 ---
12
13 drivers/md/Kconfig | 8
14 drivers/md/Makefile | 4
15 drivers/md/dm-memcache.c | 301 ++
16 drivers/md/dm-memcache.h | 68
17 drivers/md/dm-message.c | 182 +
18 drivers/md/dm-message.h | 91
19 drivers/md/dm-raid45.c | 4516 +++++++++++++++++++++++++++++++++++++++++++++
20 drivers/md/dm-raid45.h | 28
21 drivers/md/dm-regions.c | 723 +++++++
22 drivers/md/dm.c | 1
23 include/linux/dm-regions.h | 115 +
24 11 files changed, 6036 insertions(+), 1 deletion(-)
25
26 --- a/drivers/md/dm.c
27 +++ b/drivers/md/dm.c
28 @@ -1680,6 +1680,7 @@ struct gendisk *dm_disk(struct mapped_de
29 {
30 return md->disk;
31 }
32 +EXPORT_SYMBOL_GPL(dm_disk);
33
34 int dm_suspended(struct mapped_device *md)
35 {
36 --- /dev/null
37 +++ b/drivers/md/dm-memcache.c
38 @@ -0,0 +1,301 @@
39 +/*
40 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
41 + *
42 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
43 + *
44 + * Device-mapper memory object handling:
45 + *
46 + * o allocate/free total_pages in a per client page pool.
47 + *
48 + * o allocate/free memory objects with chunks (1..n) of
49 + * pages_per_chunk pages hanging off.
50 + *
51 + * This file is released under the GPL.
52 + */
53 +
54 +#define DM_MEM_CACHE_VERSION "0.2"
55 +
56 +#include "dm.h"
57 +#include "dm-memcache.h"
58 +#include <linux/dm-io.h>
59 +
60 +struct dm_mem_cache_client {
61 + spinlock_t lock;
62 + mempool_t *objs_pool;
63 + struct page_list *free_list;
64 + unsigned objects;
65 + unsigned chunks;
66 + unsigned pages_per_chunk;
67 + unsigned free_pages;
68 + unsigned total_pages;
69 +};
70 +
71 +/*
72 + * Free pages and page_list elements of client.
73 + */
74 +static void free_cache_pages(struct page_list *list)
75 +{
76 + while (list) {
77 + struct page_list *pl = list;
78 +
79 + list = pl->next;
80 + BUG_ON(!pl->page);
81 + __free_page(pl->page);
82 + kfree(pl);
83 + }
84 +}
85 +
86 +/*
87 + * Alloc number of pages and page_list elements as required by client.
88 + */
89 +static struct page_list *alloc_cache_pages(unsigned pages)
90 +{
91 + struct page_list *pl, *ret = NULL;
92 + struct page *page;
93 +
94 + while (pages--) {
95 + page = alloc_page(GFP_NOIO);
96 + if (!page)
97 + goto err;
98 +
99 + pl = kmalloc(sizeof(*pl), GFP_NOIO);
100 + if (!pl) {
101 + __free_page(page);
102 + goto err;
103 + }
104 +
105 + pl->page = page;
106 + pl->next = ret;
107 + ret = pl;
108 + }
109 +
110 + return ret;
111 +
112 +err:
113 + free_cache_pages(ret);
114 + return NULL;
115 +}
116 +
117 +/*
118 + * Allocate page_list elements from the pool to chunks of the memory object.
119 + */
120 +static void alloc_chunks(struct dm_mem_cache_client *cl,
121 + struct dm_mem_cache_object *obj)
122 +{
123 + unsigned chunks = cl->chunks;
124 + unsigned long flags;
125 +
126 + local_irq_save(flags);
127 + local_irq_disable();
128 + while (chunks--) {
129 + unsigned p = cl->pages_per_chunk;
130 +
131 + obj[chunks].pl = NULL;
132 +
133 + while (p--) {
134 + struct page_list *pl;
135 +
136 + /* Take next element from free list */
137 + spin_lock(&cl->lock);
138 + pl = cl->free_list;
139 + BUG_ON(!pl);
140 + cl->free_list = pl->next;
141 + spin_unlock(&cl->lock);
142 +
143 + pl->next = obj[chunks].pl;
144 + obj[chunks].pl = pl;
145 + }
146 + }
147 +
148 + local_irq_restore(flags);
149 +}
150 +
151 +/*
152 + * Free page_list elements putting them back onto free list
153 + */
154 +static void free_chunks(struct dm_mem_cache_client *cl,
155 + struct dm_mem_cache_object *obj)
156 +{
157 + unsigned chunks = cl->chunks;
158 + unsigned long flags;
159 + struct page_list *next, *pl;
160 +
161 + local_irq_save(flags);
162 + local_irq_disable();
163 + while (chunks--) {
164 + for (pl = obj[chunks].pl; pl; pl = next) {
165 + next = pl->next;
166 +
167 + spin_lock(&cl->lock);
168 + pl->next = cl->free_list;
169 + cl->free_list = pl;
170 + cl->free_pages++;
171 + spin_unlock(&cl->lock);
172 + }
173 + }
174 +
175 + local_irq_restore(flags);
176 +}
177 +
178 +/*
179 + * Create/destroy dm memory cache client resources.
180 + */
181 +struct dm_mem_cache_client *
182 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
183 + unsigned pages_per_chunk)
184 +{
185 + unsigned total_pages = objects * chunks * pages_per_chunk;
186 + struct dm_mem_cache_client *client;
187 +
188 + BUG_ON(!total_pages);
189 + client = kzalloc(sizeof(*client), GFP_KERNEL);
190 + if (!client)
191 + return ERR_PTR(-ENOMEM);
192 +
193 + client->objs_pool = mempool_create_kmalloc_pool(objects,
194 + chunks * sizeof(struct dm_mem_cache_object));
195 + if (!client->objs_pool)
196 + goto err;
197 +
198 + client->free_list = alloc_cache_pages(total_pages);
199 + if (!client->free_list)
200 + goto err1;
201 +
202 + spin_lock_init(&client->lock);
203 + client->objects = objects;
204 + client->chunks = chunks;
205 + client->pages_per_chunk = pages_per_chunk;
206 + client->free_pages = client->total_pages = total_pages;
207 + return client;
208 +
209 +err1:
210 + mempool_destroy(client->objs_pool);
211 +err:
212 + kfree(client);
213 + return ERR_PTR(-ENOMEM);
214 +}
215 +EXPORT_SYMBOL(dm_mem_cache_client_create);
216 +
217 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
218 +{
219 + BUG_ON(cl->free_pages != cl->total_pages);
220 + free_cache_pages(cl->free_list);
221 + mempool_destroy(cl->objs_pool);
222 + kfree(cl);
223 +}
224 +EXPORT_SYMBOL(dm_mem_cache_client_destroy);
225 +
226 +/*
227 + * Grow a client's cache by a number of pages.
228 + *
229 + * Don't call from interrupt context!
230 + */
231 +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
232 +{
233 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
234 + struct page_list *pl, *last;
235 +
236 + BUG_ON(!pages);
237 + pl = alloc_cache_pages(pages);
238 + if (!pl)
239 + return -ENOMEM;
240 +
241 + last = pl;
242 + while (last->next)
243 + last = last->next;
244 +
245 + spin_lock_irq(&cl->lock);
246 + last->next = cl->free_list;
247 + cl->free_list = pl;
248 + cl->free_pages += pages;
249 + cl->total_pages += pages;
250 + cl->objects++;
251 + spin_unlock_irq(&cl->lock);
252 +
253 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
254 + return 0;
255 +}
256 +EXPORT_SYMBOL(dm_mem_cache_grow);
257 +
258 +/* Shrink a client's cache by a number of pages. */
259 +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
260 +{
261 + int r;
262 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
263 + unsigned long flags;
264 + struct page_list *last = NULL, *pl, *pos;
265 +
266 + BUG_ON(!pages);
267 +
268 + spin_lock_irqsave(&cl->lock, flags);
269 + pl = pos = cl->free_list;
270 + while (p-- && pos->next) {
271 + last = pos;
272 + pos = pos->next;
273 + }
274 +
275 + if (++p)
276 + r = -ENOMEM;
277 + else {
278 + r = 0;
279 + cl->free_list = pos;
280 + cl->free_pages -= pages;
281 + cl->total_pages -= pages;
282 + cl->objects--;
283 + last->next = NULL;
284 + }
285 + spin_unlock_irqrestore(&cl->lock, flags);
286 +
287 + if (!r) {
288 + free_cache_pages(pl);
289 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
290 + }
291 +
292 + return r;
293 +}
294 +EXPORT_SYMBOL(dm_mem_cache_shrink);
295 +
296 +/*
297 + * Allocate/free a memory object
298 + *
299 + * Can be called from interrupt context
300 + */
301 +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
302 +{
303 + int r = 0;
304 + unsigned pages = cl->chunks * cl->pages_per_chunk;
305 + unsigned long flags;
306 + struct dm_mem_cache_object *obj;
307 +
308 + obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
309 + if (!obj)
310 + return ERR_PTR(-ENOMEM);
311 +
312 + spin_lock_irqsave(&cl->lock, flags);
313 + if (pages > cl->free_pages)
314 + r = -ENOMEM;
315 + else
316 + cl->free_pages -= pages;
317 + spin_unlock_irqrestore(&cl->lock, flags);
318 +
319 + if (r) {
320 + mempool_free(obj, cl->objs_pool);
321 + return ERR_PTR(r);
322 + }
323 +
324 + alloc_chunks(cl, obj);
325 + return obj;
326 +}
327 +EXPORT_SYMBOL(dm_mem_cache_alloc);
328 +
329 +void dm_mem_cache_free(struct dm_mem_cache_client *cl,
330 + struct dm_mem_cache_object *obj)
331 +{
332 + free_chunks(cl, obj);
333 + mempool_free(obj, cl->objs_pool);
334 +}
335 +EXPORT_SYMBOL(dm_mem_cache_free);
336 +
337 +MODULE_DESCRIPTION(DM_NAME " dm memory cache");
338 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
339 +MODULE_LICENSE("GPL");
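The page-pool API implemented above is easiest to follow with a short usage sketch. The sketch below is illustrative only: the function name and the 16/3/2 sizing are hypothetical, dm-memcache.h and the usual kernel headers are assumed to be included, and error handling is kept minimal.

	static int memcache_usage_example(void)
	{
		/* Pre-allocate 16 objects, each with 3 chunks of 2 pages. */
		struct dm_mem_cache_client *cl = dm_mem_cache_client_create(16, 3, 2);
		struct dm_mem_cache_object *obj;
		struct page_list *pl;

		if (IS_ERR(cl))
			return PTR_ERR(cl);

		/* Take one object; obj[0..2].pl are the per-chunk page lists. */
		obj = dm_mem_cache_alloc(cl);
		if (IS_ERR(obj)) {
			dm_mem_cache_client_destroy(cl);
			return PTR_ERR(obj);
		}

		/* Touch the pages of the first chunk (lowmem pages, GFP_NOIO). */
		for (pl = obj[0].pl; pl; pl = pl->next)
			memset(page_address(pl->page), 0, PAGE_SIZE);

		dm_mem_cache_free(cl, obj);		/* Pages return to the free list. */
		dm_mem_cache_client_destroy(cl);	/* Expects all pages to be free again. */
		return 0;
	}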
340 --- /dev/null
341 +++ b/drivers/md/dm-memcache.h
342 @@ -0,0 +1,68 @@
343 +/*
344 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
345 + *
346 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
347 + *
348 + * Device-mapper memory object handling:
349 + *
350 + * o allocate/free total_pages in a per client page pool.
351 + *
352 + * o allocate/free memory objects with chunks (1..n) of
353 + * pages_per_chunk pages hanging off.
354 + *
355 + * This file is released under the GPL.
356 + */
357 +
358 +#ifndef _DM_MEM_CACHE_H
359 +#define _DM_MEM_CACHE_H
360 +
361 +#define DM_MEM_CACHE_H_VERSION "0.1"
362 +
363 +#include "dm.h"
364 +#include <linux/dm-io.h>
365 +
366 +static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
367 +{
368 + while (pl && p--)
369 + pl = pl->next;
370 +
371 + return pl;
372 +}
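+/*
+ * Example: pl_elem(pl, 0) is pl itself and pl_elem(pl, 2) the third element;
+ * NULL is returned when the list has fewer than p + 1 entries.
+ */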
373 +
374 +struct dm_mem_cache_object {
375 + struct page_list *pl; /* Dynamically allocated array */
376 + void *private; /* Caller context reference */
377 +};
378 +
379 +struct dm_mem_cache_client;
380 +
381 +/*
382 + * Create/destroy dm memory cache client resources.
383 + *
384 + * On creation, a number of @objects with @chunks of
385 + * @pages_per_chunk pages will be allocated.
386 + */
387 +struct dm_mem_cache_client *
388 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
389 + unsigned pages_per_chunk);
390 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
391 +
392 +/*
393 + * Grow/shrink a dm memory cache client's resources
394 + * by @objects objects.
395 + */
396 +int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
397 +int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
398 +
399 +/*
400 + * Allocate/free a memory object
401 + *
402 + * On allocation one object with an amount of chunks and
403 + * an amount of pages per chunk will be returned on success.
404 + */
405 +struct dm_mem_cache_object *
406 +dm_mem_cache_alloc(struct dm_mem_cache_client *client);
407 +void dm_mem_cache_free(struct dm_mem_cache_client *client,
408 + struct dm_mem_cache_object *object);
409 +
410 +#endif
411 --- /dev/null
412 +++ b/drivers/md/dm-message.c
413 @@ -0,0 +1,182 @@
414 +/*
415 + * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved.
416 + *
417 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
418 + *
419 + * General device-mapper message interface argument parser.
420 + *
421 + * This file is released under the GPL.
422 + *
423 + * device-mapper message parser.
424 + *
425 + */
426 +
427 +#include "dm.h"
428 +#include "dm-message.h"
429 +#include <linux/kernel.h>
430 +
431 +#define DM_MSG_PREFIX "dm_message"
432 +
433 +/* Basename of a path. */
434 +static inline char *
435 +basename(char *s)
436 +{
437 + char *p = strrchr(s, '/');
438 +
439 + return p ? p + 1 : s;
440 +}
441 +
442 +/* Get an argument depending on type. */
443 +static void
444 +message_arguments(struct dm_msg *msg, int argc, char **argv)
445 +{
446 +
447 + if (argc) {
448 + int i;
449 + struct dm_message_argument *args = msg->spec->args;
450 +
451 + for (i = 0; i < args->num_args; i++) {
452 + int r;
453 + unsigned long **ptr = args->ptr;
454 + enum dm_message_argument_type type = args->types[i];
455 +
456 + switch (type) {
457 + case dm_msg_base_t:
458 + ((char **) ptr)[i] = basename(argv[i]);
459 + break;
460 +
461 + case dm_msg_str_t:
462 + ((char **) ptr)[i] = argv[i];
463 + break;
464 +
465 + case dm_msg_int_t:
466 + r = sscanf(argv[i], "%d", ((int **) ptr)[i]);
467 + goto check;
468 +
469 + case dm_msg_uint_t:
470 + r = sscanf(argv[i], "%u",
471 + ((unsigned **) ptr)[i]);
472 + goto check;
473 +
474 + case dm_msg_uint64_t:
475 + r = sscanf(argv[i], "%llu",
476 + ((unsigned long long **) ptr)[i]);
477 +
478 +check:
479 + if (r != 1) {
480 + set_bit(dm_msg_ret_undef, &msg->ret);
481 + set_bit(dm_msg_ret_arg, &msg->ret);
482 + }
483 + }
484 + }
485 + }
486 +}
487 +
488 +/* Parse message options. */
489 +static void
490 +message_options_parse(struct dm_msg *msg, int argc, char **argv)
491 +{
492 + int hit = 0;
493 + unsigned long *action;
494 + size_t l1 = strlen(*argv), l_hit = 0;
495 + struct dm_message_option *o = msg->spec->options;
496 + char **option, **option_end = o->options + o->num_options;
497 +
498 + for (option = o->options, action = o->actions;
499 + option < option_end; option++, action++) {
500 + size_t l2 = strlen(*option);
501 +
502 + if (!strnicmp(*argv, *option, min(l1, l2))) {
503 + hit++;
504 + l_hit = l2;
505 + set_bit(*action, &msg->action);
506 + }
507 + }
508 +
509 + /* Assume error. */
510 + msg->ret = 0;
511 + set_bit(dm_msg_ret_option, &msg->ret);
512 + if (!hit || l1 > l_hit)
513 + set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */
514 + else if (hit > 1)
515 + set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/
516 + else {
517 + clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */
518 + message_arguments(msg, --argc, ++argv);
519 + }
520 +}
521 +
522 +static inline void
523 +print_ret(const char *caller, unsigned long ret)
524 +{
525 + struct {
526 + unsigned long err;
527 + const char *err_str;
528 + } static err_msg[] = {
529 + { dm_msg_ret_ambiguous, "message ambiguous" },
530 + { dm_msg_ret_inval, "message invalid" },
531 + { dm_msg_ret_undef, "message undefined" },
532 + { dm_msg_ret_arg, "message argument" },
533 + { dm_msg_ret_argcount, "message argument count" },
534 + { dm_msg_ret_option, "option" },
535 + }, *e = ARRAY_END(err_msg);
536 +
537 + while (e-- > err_msg) {
538 + if (test_bit(e->err, &ret))
539 + DMERR("%s %s", caller, e->err_str);
540 + }
541 +}
542 +
543 +/* Parse a message action. */
544 +int
545 +dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
546 + int argc, char **argv)
547 +{
548 + int hit = 0;
549 + size_t l1 = strlen(*argv), l_hit = 0;
550 + struct dm_msg_spec *s, *s_hit = NULL,
551 + *s_end = msg->specs + msg->num_specs;
552 +
553 + if (argc < 2)
554 + return -EINVAL;
555 +
556 + for (s = msg->specs; s < s_end; s++) {
557 + size_t l2 = strlen(s->cmd);
558 +
559 + if (!strnicmp(*argv, s->cmd, min(l1, l2))) {
560 + hit++;
561 + l_hit = l2;
562 + s_hit = s;
563 + }
564 + }
565 +
566 + msg->ret = 0;
567 + if (!hit || l1 > l_hit) /* No hit or message string too long. */
568 + set_bit(dm_msg_ret_undef, &msg->ret);
569 + else if (hit > 1) /* Ambiguous message. */
570 + set_bit(dm_msg_ret_ambiguous, &msg->ret);
571 + else if (argc - 2 != s_hit->args->num_args) {
572 + set_bit(dm_msg_ret_undef, &msg->ret);
573 + set_bit(dm_msg_ret_argcount, &msg->ret);
574 + }
575 +
576 + if (msg->ret)
577 + goto bad;
578 +
579 + msg->action = 0;
580 + msg->spec = s_hit;
581 + set_bit(s_hit->action, &msg->action);
582 + message_options_parse(msg, --argc, ++argv);
583 +
584 + if (!msg->ret)
585 + return msg->spec->f(msg, context);
586 +
587 +bad:
588 + print_ret(caller, msg->ret);
589 + return -EINVAL;
590 +}
591 +EXPORT_SYMBOL(dm_message_parse);
592 +
593 +MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser");
594 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
595 +MODULE_LICENSE("GPL");
596 --- /dev/null
597 +++ b/drivers/md/dm-message.h
598 @@ -0,0 +1,91 @@
599 +/*
600 + * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved.
601 + *
602 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de>
603 + *
604 + * General device-mapper message interface argument parser.
605 + *
606 + * This file is released under the GPL.
607 + *
608 + */
609 +
610 +#ifndef DM_MESSAGE_H
611 +#define DM_MESSAGE_H
612 +
613 +/* Factor out to dm.h. */
614 +/* Reference to array end. */
615 +#define ARRAY_END(a) ((a) + ARRAY_SIZE(a))
616 +
617 +/* Message return bits. */
618 +enum dm_message_return {
619 + dm_msg_ret_ambiguous, /* Action ambiguous. */
620 + dm_msg_ret_inval, /* Action invalid. */
621 + dm_msg_ret_undef, /* Action undefined. */
622 +
623 + dm_msg_ret_option, /* Option error. */
624 + dm_msg_ret_arg, /* Argument error. */
625 + dm_msg_ret_argcount, /* Argument count error. */
626 +};
627 +
628 +/* Message argument type conversions. */
629 +enum dm_message_argument_type {
630 + dm_msg_base_t, /* Basename string. */
631 + dm_msg_str_t, /* String. */
632 + dm_msg_int_t, /* Signed int. */
633 + dm_msg_uint_t, /* Unsigned int. */
634 + dm_msg_uint64_t, /* Unsigned int 64. */
635 +};
636 +
637 +/* A message option. */
638 +struct dm_message_option {
639 + unsigned num_options;
640 + char **options;
641 + unsigned long *actions;
642 +};
643 +
644 +/* Message arguments and types. */
645 +struct dm_message_argument {
646 + unsigned num_args;
647 + unsigned long **ptr;
648 + enum dm_message_argument_type types[];
649 +};
650 +
651 +/* Client message. */
652 +struct dm_msg {
653 + unsigned long action; /* Identified action. */
654 + unsigned long ret; /* Return bits. */
655 + unsigned num_specs; /* # of specifications listed. */
656 + struct dm_msg_spec *specs; /* Specification list. */
657 + struct dm_msg_spec *spec; /* Specification selected. */
658 +};
659 +
660 +/* Specification of the message. */
661 +struct dm_msg_spec {
662 + const char *cmd; /* Name of the command (i.e. 'bandwidth'). */
663 + unsigned long action;
664 + struct dm_message_option *options;
665 + struct dm_message_argument *args;
666 + unsigned long parm; /* Parameter to pass through to callback. */
667 + /* Function to process for action. */
668 + int (*f) (struct dm_msg *msg, void *context);
669 +};
670 +
671 +/* Parameter access macros. */
672 +#define DM_MSG_PARM(msg) ((msg)->spec->parm)
673 +
674 +#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx])
675 +#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx])
676 +#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARGS(msg, idx))
677 +#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx])
678 +
679 +#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0)
680 +#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0)
681 +#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0)
682 +#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0)
683 +
684 +
685 +/* Parse a message and its options and optionally call a function back. */
686 +int dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
687 + int argc, char **argv);
688 +
689 +#endif
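To make the parser interface above concrete, here is a hedged sketch of how a target could describe a single "bandwidth" message with one "set" option and one unsigned argument, then hand control to dm_message_parse() from its message handler. All names below (bandwidth_value, bw_*, handle_message) are illustrative and not part of this patch; the static initialization of the flexible types[] array relies on the GCC extension the kernel build permits, and ARRAY_SIZE() comes from <linux/kernel.h>. Commands and options are matched by case-insensitive prefix in dm-message.c, so "band set 50" would be accepted by this spec as well.

	static unsigned bandwidth_value;	/* Filled in by the parser. */
	static unsigned long *bw_arg_ptrs[] = {
		(unsigned long *) &bandwidth_value,
	};

	static struct dm_message_argument bw_args = {
		1, bw_arg_ptrs, { dm_msg_uint_t },
	};

	static char *bw_option_strs[] = { "set" };
	static unsigned long bw_option_actions[] = { 0 };	/* Bit 0 of msg->action. */

	static struct dm_message_option bw_options = {
		ARRAY_SIZE(bw_option_strs), bw_option_strs, bw_option_actions,
	};

	static int bandwidth_message(struct dm_msg *msg, void *context)
	{
		/* bandwidth_value already holds the parsed argument here. */
		return bandwidth_value > 100 ? -EINVAL : 0;
	}

	static struct dm_msg_spec bw_specs[] = {
		{ "bandwidth", 1, &bw_options, &bw_args, 0, bandwidth_message },
	};

	/* Called with e.g. argv = { "bandwidth", "set", "50" }, argc = 3. */
	static int handle_message(void *context, int argc, char **argv)
	{
		struct dm_msg msg = {
			.num_specs = ARRAY_SIZE(bw_specs),
			.specs = bw_specs,
		};

		return dm_message_parse("example", &msg, context, argc, argv);
	}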
690 --- /dev/null
691 +++ b/drivers/md/dm-raid45.c
692 @@ -0,0 +1,4516 @@
693 +/*
694 + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
695 + *
696 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
697 + *
698 + * This file is released under the GPL.
699 + *
700 + *
701 + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
702 + *
703 + * Supports:
704 + * o RAID4 with dedicated and selectable parity device
705 + * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
706 + * o run time optimization of xor algorithm used to calculate parity
707 + *
708 + *
709 + * Thanks to MD for:
710 + * o the raid address calculation algorithm
711 + * o the base of the biovec <-> page list copier.
712 + *
713 + *
714 + * Uses a region hash to keep track of how many writes are in flight per
715 + * region, so that a dirty log can keep the state of regions to recover:
716 + *
717 + * o clean regions (those which are synchronized
718 + * and don't have write io in flight)
719 + * o dirty regions (those with write io in flight)
720 + *
721 + *
722 + * On startup, any dirty regions are migrated to the 'nosync' state
723 + * and are subject to recovery by the daemon.
724 + *
725 + * See raid_ctr() for table definition.
726 + *
727 + *
728 + * FIXME:
729 + * o add virtual interface for locking
730 + * o remove instrumentation (REMOVEME:)
731 + *
732 + */
733 +
734 +static const char *version = "v0.2431";
735 +
736 +#include "dm.h"
737 +#include "dm-bio-list.h"
738 +#include "dm-memcache.h"
739 +#include "dm-message.h"
740 +#include "dm-raid45.h"
741 +
742 +#include <linux/kernel.h>
743 +#include <linux/vmalloc.h>
744 +
745 +#include <linux/dm-io.h>
746 +#include <linux/dm-dirty-log.h>
747 +#include <linux/dm-regions.h>
748 +
749 +/* # of parallel recovered regions */
750 +/* FIXME: cope with multiple recovery stripes in raid_set struct. */
751 +#define MAX_RECOVER 1 /* needs to be 1! */
752 +
753 +/*
754 + * Configurable parameters
755 + */
756 +#define INLINE
757 +
758 +/* Default # of stripes if not set in constructor. */
759 +#define STRIPES 64
760 +
761 +/* Minimum/maximum # of selectable stripes. */
762 +#define STRIPES_MIN 8
763 +#define STRIPES_MAX 16384
764 +
765 +/* Default chunk size in sectors if not set in constructor. */
766 +#define CHUNK_SIZE 64
767 +
768 +/* Default io size in sectors if not set in constructor. */
769 +#define IO_SIZE_MIN SECTORS_PER_PAGE
770 +#define IO_SIZE IO_SIZE_MIN
771 +
772 +/* Maximum settable chunk size in sectors. */
773 +#define CHUNK_SIZE_MAX 16384
774 +
775 +/* Recover io size default in sectors. */
776 +#define RECOVER_IO_SIZE_MIN 64
777 +#define RECOVER_IO_SIZE 256
778 +
779 +/* Default percentage recover io bandwidth. */
780 +#define BANDWIDTH 10
781 +#define BANDWIDTH_MIN 1
782 +#define BANDWIDTH_MAX 100
783 +/*
784 + * END Configurable parameters
785 + */
786 +
787 +#define TARGET "dm-raid45"
788 +#define DAEMON "kraid45d"
789 +#define DM_MSG_PREFIX TARGET
790 +
791 +#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
792 +
793 +/* Amount/size for __xor(). */
794 +#define SECTORS_PER_XOR SECTORS_PER_PAGE
795 +#define XOR_SIZE PAGE_SIZE
796 +
797 +/* Derive raid_set from stripe_cache pointer. */
798 +#define RS(x) container_of(x, struct raid_set, sc)
799 +
800 +/* Check value in range. */
801 +#define range_ok(i, min, max) (i >= min && i <= max)
802 +
803 +/* Page reference. */
804 +#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
805 +
806 +/* Bio list reference. */
807 +#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
808 +
809 +/* Page list reference. */
810 +#define PL(stripe, p) (stripe->obj[p].pl)
811 +
812 +/* Check argument is power of 2. */
813 +#define POWER_OF_2(a) (!(a & (a - 1)))
814 +
815 +/* Factor out to dm-bio-list.h */
816 +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
817 +{
818 + bio->bi_next = bl->head;
819 + bl->head = bio;
820 +
821 + if (!bl->tail)
822 + bl->tail = bio;
823 +}
824 +
825 +/* Factor out to dm.h */
826 +#define TI_ERR_RET(str, ret) \
827 + do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
828 +#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
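+/*
+ * Illustrative use (assumes a struct dm_target *ti in scope, as in the
+ * constructor): TI_ERR("invalid chunk size"); sets ti->error to
+ * "dm-raid45: invalid chunk size" and returns -EINVAL from the caller.
+ */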
829 +
830 +/*-----------------------------------------------------------------
831 + * Stripe cache
832 + *
833 + * Cache for all reads and writes to raid sets (operational or degraded)
834 + *
835 + * We need to run all data to and from a RAID set through this cache,
836 + * because parity chunks need to get calculated from data chunks
837 + * or, in the degraded/resynchronization case, missing chunks need
838 + * to be reconstructed using the other chunks of the stripe.
839 + *---------------------------------------------------------------*/
840 +/* Protect kmem cache # counter. */
841 +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
842 +
843 +/* A stripe set (holds bios hanging off). */
844 +struct stripe_set {
845 + struct stripe *stripe; /* Backpointer to stripe for endio(). */
846 + struct bio_list bl[3]; /* Reads, writes, and writes merged. */
847 +#define WRITE_MERGED 2
848 +};
849 +
850 +#if READ != 0 || WRITE != 1
851 +#error dm-raid45: READ/WRITE != 0/1 used as index!!!
852 +#endif
853 +
854 +/*
855 + * Stripe linked list indexes. Keep order, because the stripe
856 + * and the stripe cache rely on the first 3!
857 + */
858 +enum list_types {
859 + LIST_IO = 0, /* Stripes with io pending. */
860 + LIST_ENDIO, /* Stripes to endio. */
861 + LIST_LRU, /* Least recently used stripes. */
862 + LIST_HASH, /* Hashed stripes. */
863 + LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
864 + NR_LISTS, /* To size array in struct stripe. */
865 +};
866 +
867 +enum lock_types {
868 + LOCK_ENDIO = 0, /* Protect endio list. */
869 + LOCK_LRU, /* Protect lru list. */
870 + NR_LOCKS, /* To size array in struct stripe_cache. */
871 +};
872 +
873 +/* A stripe: the io object to handle all reads and writes to a RAID set. */
874 +struct stripe {
875 + struct stripe_cache *sc; /* Backpointer to stripe cache. */
876 +
877 + sector_t key; /* Hash key. */
878 + sector_t region; /* Region stripe is mapped to. */
879 +
880 + /* Reference count. */
881 + atomic_t cnt;
882 +
883 + struct {
884 + unsigned long flags; /* flags (see below). */
885 +
886 + /*
887 + * Pending ios in flight:
888 + *
889 + * used as a 'lock' to control move of stripe to endio list
890 + */
891 + atomic_t pending; /* Pending ios in flight. */
892 +
893 + /* Sectors to read and write for multi page stripe sets. */
894 + unsigned size;
895 + } io;
896 +
897 + /* Lock on stripe (for clustering). */
898 + void *lock;
899 +
900 + /*
901 + * 4 linked lists:
902 + * o io list to flush io
903 + * o endio list
904 + * o LRU list to put stripes w/o reference count on
905 + * o stripe cache hash
906 + */
907 + struct list_head lists[NR_LISTS];
908 +
909 + struct {
910 + unsigned short parity; /* Parity chunk index. */
911 + short recover; /* Recovery chunk index. */
912 + } idx;
913 +
914 + /* This stripe's memory cache objects (dm-mem-cache). */
915 + struct dm_mem_cache_object *obj;
916 +
917 + /* Array of stripe sets (dynamically allocated). */
918 + struct stripe_set ss[0];
919 +};
920 +
921 +/* States stripes can be in (flags field). */
922 +enum stripe_states {
923 + STRIPE_ACTIVE, /* Active io on stripe. */
924 + STRIPE_ERROR, /* io error on stripe. */
925 + STRIPE_MERGED, /* Writes got merged. */
926 + STRIPE_READ, /* Read. */
927 + STRIPE_RBW, /* Read-before-write. */
928 + STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
929 + STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
930 +};
931 +
932 +/* ... and macros to access them. */
933 +#define BITOPS(name, what, var, flag) \
934 +static inline int TestClear ## name ## what(struct var *v) \
935 +{ return test_and_clear_bit(flag, &v->io.flags); } \
936 +static inline int TestSet ## name ## what(struct var *v) \
937 +{ return test_and_set_bit(flag, &v->io.flags); } \
938 +static inline void Clear ## name ## what(struct var *v) \
939 +{ clear_bit(flag, &v->io.flags); } \
940 +static inline void Set ## name ## what(struct var *v) \
941 +{ set_bit(flag, &v->io.flags); } \
942 +static inline int name ## what(struct var *v) \
943 +{ return test_bit(flag, &v->io.flags); }
944 +
945 +
946 +BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
947 +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
948 +BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
949 +BITOPS(Stripe, Read, stripe, STRIPE_READ)
950 +BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
951 +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
952 +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
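+/*
+ * For reference, each BITOPS() line above expands to five inline helpers.
+ * E.g. BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE) generates, all acting
+ * on bit STRIPE_ACTIVE of v->io.flags:
+ *
+ *	int  TestClearStripeActive(struct stripe *v);	-- test_and_clear_bit()
+ *	int  TestSetStripeActive(struct stripe *v);	-- test_and_set_bit()
+ *	void ClearStripeActive(struct stripe *v);	-- clear_bit()
+ *	void SetStripeActive(struct stripe *v);		-- set_bit()
+ *	int  StripeActive(struct stripe *v);		-- test_bit()
+ */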
953 +
954 +/* A stripe hash. */
955 +struct stripe_hash {
956 + struct list_head *hash;
957 + unsigned buckets;
958 + unsigned mask;
959 + unsigned prime;
960 + unsigned shift;
961 +};
962 +
963 +/* A stripe cache. */
964 +struct stripe_cache {
965 + /* Stripe hash. */
966 + struct stripe_hash hash;
967 +
968 + /* Stripes with io to flush, stripes to endio and LRU lists. */
969 + struct list_head lists[3];
970 +
971 + /* Locks to protect endio and lru lists. */
972 + spinlock_t locks[NR_LOCKS];
973 +
974 + /* Slab cache to allocate stripes from. */
975 + struct {
976 + struct kmem_cache *cache; /* Cache itself. */
977 + char name[32]; /* Unique name. */
978 + } kc;
979 +
980 + struct dm_io_client *dm_io_client; /* dm-io client resource context. */
981 +
982 + /* dm-mem-cache client resource context. */
983 + struct dm_mem_cache_client *mem_cache_client;
984 +
985 + int stripes_parm; /* # stripes parameter from constructor. */
986 + atomic_t stripes; /* actual # of stripes in cache. */
987 + atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
988 + atomic_t stripes_last; /* last # of stripes in cache. */
989 + atomic_t active_stripes; /* actual # of active stripes in cache. */
990 +
991 + /* REMOVEME: */
992 + atomic_t max_active_stripes; /* maximum # of active stripes in cache. */
993 +};
994 +
995 +/* Flag specs for raid_dev. */
996 +enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
997 +
998 +/* The raid device in a set. */
999 +struct raid_dev {
1000 + struct dm_dev *dev;
1001 + unsigned long flags; /* raid_dev_flags. */
1002 + sector_t start; /* offset to map to. */
1003 +};
1004 +
1005 +/* Flags spec for raid_set. */
1006 +enum raid_set_flags {
1007 + RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
1008 + RS_DEAD, /* RAID set not operational. */
1009 + RS_DEVEL_STATS, /* REMOVEME: display status information. */
1010 + RS_IO_ERROR, /* io error on set. */
1011 + RS_RECOVER, /* Do recovery. */
1012 + RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
1013 + RS_REGION_GET, /* get a region to recover. */
1014 + RS_SC_BUSY, /* stripe cache busy -> send an event. */
1015 + RS_SUSPENDED, /* RAID set suspended. */
1016 +};
1017 +
1018 +/* REMOVEME: devel stats counters. */
1019 +enum stats_types {
1020 + S_BIOS_READ,
1021 + S_BIOS_ADDED_READ,
1022 + S_BIOS_ENDIO_READ,
1023 + S_BIOS_WRITE,
1024 + S_BIOS_ADDED_WRITE,
1025 + S_BIOS_ENDIO_WRITE,
1026 + S_CAN_MERGE,
1027 + S_CANT_MERGE,
1028 + S_CONGESTED,
1029 + S_DM_IO_READ,
1030 + S_DM_IO_WRITE,
1031 + S_ACTIVE_READS,
1032 + S_BANDWIDTH,
1033 + S_BARRIER,
1034 + S_BIO_COPY_PL_NEXT,
1035 + S_DEGRADED,
1036 + S_DELAYED_BIOS,
1037 + S_EVICT,
1038 + S_FLUSHS,
1039 + S_HITS_1ST,
1040 + S_IOS_POST,
1041 + S_INSCACHE,
1042 + S_MAX_LOOKUP,
1043 + S_MERGE_PAGE_LOCKED,
1044 + S_NO_BANDWIDTH,
1045 + S_NOT_CONGESTED,
1046 + S_NO_RW,
1047 + S_NOSYNC,
1048 + S_PROHIBITPAGEIO,
1049 + S_RECONSTRUCT_EI,
1050 + S_RECONSTRUCT_DEV,
1051 + S_REDO,
1052 + S_REQUEUE,
1053 + S_STRIPE_ERROR,
1054 + S_SUM_DELAYED_BIOS,
1055 + S_XORS,
1056 + S_NR_STATS, /* # of stats counters. */
1057 +};
1058 +
1059 +/* Status type -> string mappings. */
1060 +struct stats_map {
1061 + const enum stats_types type;
1062 + const char *str;
1063 +};
1064 +
1065 +static struct stats_map stats_map[] = {
1066 + { S_BIOS_READ, "r=" },
1067 + { S_BIOS_ADDED_READ, "/" },
1068 + { S_BIOS_ENDIO_READ, "/" },
1069 + { S_BIOS_WRITE, " w=" },
1070 + { S_BIOS_ADDED_WRITE, "/" },
1071 + { S_BIOS_ENDIO_WRITE, "/" },
1072 + { S_DM_IO_READ, " rc=" },
1073 + { S_DM_IO_WRITE, " wc=" },
1074 + { S_ACTIVE_READS, " active_reads=" },
1075 + { S_BANDWIDTH, " bandwidth=" },
1076 + { S_NO_BANDWIDTH, " no_bandwidth=" },
1077 + { S_BARRIER, " barrier=" },
1078 + { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
1079 + { S_CAN_MERGE, " can_merge=" },
1080 + { S_MERGE_PAGE_LOCKED, "/page_locked=" },
1081 + { S_CANT_MERGE, "/cant_merge=" },
1082 + { S_CONGESTED, " congested=" },
1083 + { S_NOT_CONGESTED, "/not_congested=" },
1084 + { S_DEGRADED, " degraded=" },
1085 + { S_DELAYED_BIOS, " delayed_bios=" },
1086 + { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
1087 + { S_EVICT, " evict=" },
1088 + { S_FLUSHS, " flushs=" },
1089 + { S_HITS_1ST, " hits_1st=" },
1090 + { S_IOS_POST, " ios_post=" },
1091 + { S_INSCACHE, " inscache=" },
1092 + { S_MAX_LOOKUP, " max_lookup=" },
1093 + { S_NO_RW, " no_rw=" },
1094 + { S_NOSYNC, " nosync=" },
1095 + { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
1096 + { S_RECONSTRUCT_EI, " reconstruct_ei=" },
1097 + { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
1098 + { S_REDO, " redo=" },
1099 + { S_REQUEUE, " requeue=" },
1100 + { S_STRIPE_ERROR, " stripe_error=" },
1101 + { S_XORS, " xors=" },
1102 +};
1103 +
1104 +/*
1105 + * A RAID set.
1106 + */
1107 +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
1108 +struct raid_set {
1109 + struct dm_target *ti; /* Target pointer. */
1110 +
1111 + struct {
1112 + unsigned long flags; /* State flags. */
1113 + spinlock_t in_lock; /* Protects central input list below. */
1114 + struct bio_list in; /* Pending ios (central input list). */
1115 + struct bio_list work; /* ios work set. */
1116 + wait_queue_head_t suspendq; /* suspend synchronization. */
1117 + atomic_t in_process; /* counter of queued bios (suspendq). */
1118 + atomic_t in_process_max;/* counter of queued bios max. */
1119 +
1120 + /* io work. */
1121 + struct workqueue_struct *wq;
1122 + struct delayed_work dws;
1123 + } io;
1124 +
1125 + /* External locking. */
1126 + struct dm_raid45_locking_type *locking;
1127 +
1128 + struct stripe_cache sc; /* Stripe cache for this set. */
1129 +
1130 + /* Xor optimization. */
1131 + struct {
1132 + struct xor_func *f;
1133 + unsigned chunks;
1134 + unsigned speed;
1135 + } xor;
1136 +
1137 + /* Recovery parameters. */
1138 + struct recover {
1139 + struct dm_dirty_log *dl; /* Dirty log. */
1140 + struct dm_rh_client *rh; /* Region hash. */
1141 +
1142 + /* dm-mem-cache client resource context for recovery stripes. */
1143 + struct dm_mem_cache_client *mem_cache_client;
1144 +
1145 + struct list_head stripes; /* List of recovery stripes. */
1146 +
1147 + region_t nr_regions;
1148 + region_t nr_regions_to_recover;
1149 + region_t nr_regions_recovered;
1150 + unsigned long start_jiffies;
1151 + unsigned long end_jiffies;
1152 +
1153 + unsigned bandwidth; /* Recovery bandwidth [%]. */
1154 + unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
1155 + unsigned bandwidth_parm; /* " constructor parm. */
1156 + unsigned io_size; /* io size <= chunk size. */
1157 + unsigned io_size_parm; /* io size ctr parameter. */
1158 +
1159 + /* recovery io throttling. */
1160 + atomic_t io_count[2]; /* counter recover/regular io. */
1161 + unsigned long last_jiffies;
1162 +
1163 + struct dm_region *reg; /* Actual region to recover. */
1164 + sector_t pos; /* Position within region to recover. */
1165 + sector_t end; /* End of region to recover. */
1166 + } recover;
1167 +
1168 + /* RAID set parameters. */
1169 + struct {
1170 + struct raid_type *raid_type; /* RAID type (eg, RAID4). */
1171 + unsigned raid_parms; /* # variable raid parameters. */
1172 +
1173 + unsigned chunk_size; /* Sectors per chunk. */
1174 + unsigned chunk_size_parm;
1175 + unsigned chunk_mask; /* Mask for amount. */
1176 + unsigned chunk_shift; /* rsector chunk size shift. */
1177 +
1178 + unsigned io_size; /* Sectors per io. */
1179 + unsigned io_size_parm;
1180 + unsigned io_mask; /* Mask for amount. */
1181 + unsigned io_shift_mask; /* Mask for raid_address(). */
1182 + unsigned io_shift; /* rsector io size shift. */
1183 + unsigned pages_per_io; /* Pages per io. */
1184 +
1185 + sector_t sectors_per_dev; /* Sectors per device. */
1186 +
1187 + atomic_t failed_devs; /* Amount of devices failed. */
1188 +
1189 + /* Index of device to initialize. */
1190 + int dev_to_init;
1191 + int dev_to_init_parm;
1192 +
1193 + /* Raid devices dynamically allocated. */
1194 + unsigned raid_devs; /* # of RAID devices below. */
1195 + unsigned data_devs; /* # of RAID data devices. */
1196 +
1197 + int ei; /* index of failed RAID device. */
1198 +
1199 + /* index of dedicated parity device (i.e. RAID4). */
1200 + int pi;
1201 + int pi_parm; /* constructor parm for status output. */
1202 + } set;
1203 +
1204 + /* REMOVEME: devel stats counters. */
1205 + atomic_t stats[S_NR_STATS];
1206 +
1207 + /* Dynamically allocated temporary pointers for xor(). */
1208 + unsigned long **data;
1209 +
1210 + /* Dynamically allocated RAID devices. Alignment? */
1211 + struct raid_dev dev[0];
1212 +};
1213 +
1214 +
1215 +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
1216 +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
1217 +BITOPS(RS, Dead, raid_set, RS_DEAD)
1218 +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
1219 +BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
1220 +BITOPS(RS, Recover, raid_set, RS_RECOVER)
1221 +BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
1222 +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
1223 +BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
1224 +#undef BITOPS
1225 +
1226 +#define PageIO(page) PageChecked(page)
1227 +#define AllowPageIO(page) SetPageChecked(page)
1228 +#define ProhibitPageIO(page) ClearPageChecked(page)
1229 +
1230 +/*-----------------------------------------------------------------
1231 + * Raid-4/5 set structures.
1232 + *---------------------------------------------------------------*/
1233 +/* RAID level definitions. */
1234 +enum raid_level {
1235 + raid4,
1236 + raid5,
1237 +};
1238 +
1239 +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
1240 +enum raid_algorithm {
1241 + none,
1242 + left_asym,
1243 + right_asym,
1244 + left_sym,
1245 + right_sym,
1246 +};
1247 +
1248 +struct raid_type {
1249 + const char *name; /* RAID algorithm. */
1250 + const char *descr; /* Descriptor text for logging. */
1251 + const unsigned parity_devs; /* # of parity devices. */
1252 + const unsigned minimal_devs; /* minimal # of devices in set. */
1253 + const enum raid_level level; /* RAID level. */
1254 + const enum raid_algorithm algorithm; /* RAID algorithm. */
1255 +};
1256 +
1257 +/* Supported raid types and properties. */
1258 +static struct raid_type raid_types[] = {
1259 + {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
1260 + {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
1261 + {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
1262 + {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
1263 + {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
1264 +};
1265 +
1266 +/* Address as calculated by raid_address(). */
1267 +struct address {
1268 + sector_t key; /* Hash key (start address of stripe). */
1269 + unsigned di, pi; /* Data and parity disks index. */
1270 +};
1271 +
1272 +/* REMOVEME: reset statistics counters. */
1273 +static void stats_reset(struct raid_set *rs)
1274 +{
1275 + unsigned s = S_NR_STATS;
1276 +
1277 + while (s--)
1278 + atomic_set(rs->stats + s, 0);
1279 +}
1280 +
1281 +/*----------------------------------------------------------------
1282 + * RAID set management routines.
1283 + *--------------------------------------------------------------*/
1284 +/*
1285 + * Begin small helper functions.
1286 + */
1287 +/* Queue (optionally delayed) io work. */
1288 +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
1289 +{
1290 + struct delayed_work *dws = &rs->io.dws;
1291 +
1292 + cancel_delayed_work(dws);
1293 + queue_delayed_work(rs->io.wq, dws, delay);
1294 +}
1295 +
1296 +/* Queue io work immediately (called from region hash too). */
1297 +static INLINE void wake_do_raid(void *context)
1298 +{
1299 + wake_do_raid_delayed(context, 0);
1300 +}
1301 +
1302 +/* Wait until all io has been processed. */
1303 +static INLINE void wait_ios(struct raid_set *rs)
1304 +{
1305 + wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
1306 +}
1307 +
1308 +/* Declare io queued to device. */
1309 +static INLINE void io_dev_queued(struct raid_dev *dev)
1310 +{
1311 + set_bit(IO_QUEUED, &dev->flags);
1312 +}
1313 +
1314 +/* Io on device and reset ? */
1315 +static inline int io_dev_clear(struct raid_dev *dev)
1316 +{
1317 + return test_and_clear_bit(IO_QUEUED, &dev->flags);
1318 +}
1319 +
1320 +/* Get an io reference. */
1321 +static INLINE void io_get(struct raid_set *rs)
1322 +{
1323 + int p = atomic_inc_return(&rs->io.in_process);
1324 +
1325 + if (p > atomic_read(&rs->io.in_process_max))
1326 + atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
1327 +}
1328 +
1329 +/* Put the io reference and conditionally wake io waiters. */
1330 +static INLINE void io_put(struct raid_set *rs)
1331 +{
1332 + /* Intel: rebuild data corrupter? */
1333 + if (!atomic_read(&rs->io.in_process)) {
1334 + DMERR("%s would go negative!!!", __func__);
1335 + return;
1336 + }
1337 +
1338 + if (atomic_dec_and_test(&rs->io.in_process))
1339 + wake_up(&rs->io.suspendq);
1340 +}
1341 +
1342 +/* Calculate device sector offset. */
1343 +static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
1344 +{
1345 + sector_t sector = bio->bi_sector;
1346 +
1347 + sector_div(sector, rs->set.data_devs);
1348 + return sector;
1349 +}
1350 +
1351 +/* Test device operational. */
1352 +static INLINE int dev_operational(struct raid_set *rs, unsigned p)
1353 +{
1354 + return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
1355 +}
1356 +
1357 +/* Return # of active stripes in stripe cache. */
1358 +static INLINE int sc_active(struct stripe_cache *sc)
1359 +{
1360 + return atomic_read(&sc->active_stripes);
1361 +}
1362 +
1363 +/* Test io pending on stripe. */
1364 +static INLINE int stripe_io(struct stripe *stripe)
1365 +{
1366 + return atomic_read(&stripe->io.pending);
1367 +}
1368 +
1369 +static INLINE void stripe_io_inc(struct stripe *stripe)
1370 +{
1371 + atomic_inc(&stripe->io.pending);
1372 +}
1373 +
1374 +static INLINE void stripe_io_dec(struct stripe *stripe)
1375 +{
1376 + atomic_dec(&stripe->io.pending);
1377 +}
1378 +
1379 +/* Wrapper needed by for_each_io_dev(). */
1380 +static void _stripe_io_inc(struct stripe *stripe, unsigned p)
1381 +{
1382 + stripe_io_inc(stripe);
1383 +}
1384 +
1385 +/* Error a stripe. */
1386 +static INLINE void stripe_error(struct stripe *stripe, struct page *page)
1387 +{
1388 + SetStripeError(stripe);
1389 + SetPageError(page);
1390 + atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
1391 +}
1392 +
1393 +/* Page IOed ok. */
1394 +enum dirty_type { CLEAN, DIRTY };
1395 +static INLINE void page_set(struct page *page, enum dirty_type type)
1396 +{
1397 + switch (type) {
1398 + case DIRTY:
1399 + SetPageDirty(page);
1400 + AllowPageIO(page);
1401 + break;
1402 +
1403 + case CLEAN:
1404 + ClearPageDirty(page);
1405 + break;
1406 +
1407 + default:
1408 + BUG();
1409 + }
1410 +
1411 + SetPageUptodate(page);
1412 + ClearPageError(page);
1413 +}
1414 +
1415 +/* Return region state for a sector. */
1416 +static INLINE int
1417 +region_state(struct raid_set *rs, sector_t sector, unsigned long state)
1418 +{
1419 + struct dm_rh_client *rh = rs->recover.rh;
1420 +
1421 + return RSRecover(rs) ?
1422 + (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
1423 + state) : 0;
1424 +}
1425 +
1426 +/* Check whether the RAID set is degraded (io error on a member device). */
1427 +static inline int raid_set_degraded(struct raid_set *rs)
1428 +{
1429 + return RSIoError(rs);
1430 +}
1431 +
1432 +/* Check # of devices which may fail in a raid set. */
1433 +static INLINE int raid_set_operational(struct raid_set *rs)
1434 +{
1435 + /* Too many failed devices -> BAD. */
1436 + return atomic_read(&rs->set.failed_devs) <=
1437 + rs->set.raid_type->parity_devs;
1438 +}
1439 +
1440 +/*
1441 + * Return true in case a page_list should be read/written
1442 + *
1443 + * Conditions to read/write:
1444 + * o 1st page in list not uptodate
1445 + * o 1st page in list dirty
1446 + * o if we optimized io away, we flag it using the pages checked bit.
1447 + */
1448 +static INLINE unsigned page_io(struct page *page)
1449 +{
1450 + /* Optimization: page was flagged to need io during first run. */
1451 + if (PagePrivate(page)) {
1452 + ClearPagePrivate(page);
1453 + return 1;
1454 + }
1455 +
1456 + /* Avoid io if prohibited or a locked page. */
1457 + if (!PageIO(page) || PageLocked(page))
1458 + return 0;
1459 +
1460 + if (!PageUptodate(page) || PageDirty(page)) {
1461 + /* Flag page needs io for second run optimization. */
1462 + SetPagePrivate(page);
1463 + return 1;
1464 + }
1465 +
1466 + return 0;
1467 +}
1468 +
1469 +/* Call a function on each page list needing io. */
1470 +static INLINE unsigned
1471 +for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
1472 + void (*f_io)(struct stripe *stripe, unsigned p))
1473 +{
1474 + unsigned p = rs->set.raid_devs, r = 0;
1475 +
1476 + while (p--) {
1477 + if (page_io(PAGE(stripe, p))) {
1478 + f_io(stripe, p);
1479 + r++;
1480 + }
1481 + }
1482 +
1483 + return r;
1484 +}
1485 +
1486 +/* Reconstruct a particular device? */
1487 +static INLINE int dev_to_init(struct raid_set *rs)
1488 +{
1489 + return rs->set.dev_to_init > -1;
1490 +}
1491 +
1492 +/*
1493 + * Index of device to calculate parity on.
1494 + * Either the parity device index *or* the selected device to init
1495 + * after a spare replacement.
1496 + */
1497 +static INLINE unsigned dev_for_parity(struct stripe *stripe)
1498 +{
1499 + struct raid_set *rs = RS(stripe->sc);
1500 +
1501 + return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
1502 +}
1503 +
1504 +/* Return the index of the device to be recovered. */
1505 +static int idx_get(struct raid_set *rs)
1506 +{
1507 + /* Avoid reading in pages that will be reconstructed anyway. */
1508 + if (dev_to_init(rs))
1509 + return rs->set.dev_to_init;
1510 + else if (rs->set.raid_type->level == raid4)
1511 + return rs->set.pi;
1512 +
1513 + return -1;
1514 +}
1515 +
1516 +/* RAID set congested function. */
1517 +static int raid_set_congested(void *congested_data, int bdi_bits)
1518 +{
1519 + struct raid_set *rs = congested_data;
1520 + int r = 0; /* Assume uncongested. */
1521 + unsigned p = rs->set.raid_devs;
1522 +
1523 + /* If any of our component devices are overloaded. */
1524 + while (p--) {
1525 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
1526 +
1527 + r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1528 + }
1529 +
1530 + /* REMOVEME: statistics. */
1531 + atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
1532 + return r;
1533 +}
1534 +
1535 +/* Display RAID set dead message once. */
1536 +static void raid_set_dead(struct raid_set *rs)
1537 +{
1538 + if (!TestSetRSDead(rs)) {
1539 + unsigned p;
1540 + char buf[BDEVNAME_SIZE];
1541 +
1542 + DMERR("FATAL: too many devices failed -> RAID set dead");
1543 +
1544 + for (p = 0; p < rs->set.raid_devs; p++) {
1545 + if (!dev_operational(rs, p))
1546 + DMERR("device /dev/%s failed",
1547 + bdevname(rs->dev[p].dev->bdev, buf));
1548 + }
1549 + }
1550 +}
1551 +
1552 +/* RAID set degrade check. */
1553 +static INLINE int
1554 +raid_set_check_and_degrade(struct raid_set *rs,
1555 + struct stripe *stripe, unsigned p)
1556 +{
1557 + if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
1558 + return -EPERM;
1559 +
1560 + /* Throw an event in case of member device errors. */
1561 + dm_table_event(rs->ti->table);
1562 + atomic_inc(&rs->set.failed_devs);
1563 +
1564 + /* Only log the first member error. */
1565 + if (!TestSetRSIoError(rs)) {
1566 + char buf[BDEVNAME_SIZE];
1567 +
1568 + /* Store index for recovery. */
1569 + mb();
1570 + rs->set.ei = p;
1571 + mb();
1572 +
1573 + DMERR("CRITICAL: %sio error on device /dev/%s "
1574 + "in region=%llu; DEGRADING RAID set",
1575 + stripe ? "" : "FAKED ",
1576 + bdevname(rs->dev[p].dev->bdev, buf),
1577 + (unsigned long long) (stripe ? stripe->key : 0));
1578 + DMERR("further device error messages suppressed");
1579 + }
1580 +
1581 + return 0;
1582 +}
1583 +
1584 +static void
1585 +raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
1586 +{
1587 + unsigned p = rs->set.raid_devs;
1588 +
1589 + while (p--) {
1590 + struct page *page = PAGE(stripe, p);
1591 +
1592 + if (PageError(page)) {
1593 + ClearPageError(page);
1594 + raid_set_check_and_degrade(rs, stripe, p);
1595 + }
1596 + }
1597 +}
1598 +
1599 +/* RAID set upgrade check. */
1600 +static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
1601 +{
1602 + if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
1603 + return -EPERM;
1604 +
1605 + if (atomic_dec_and_test(&rs->set.failed_devs)) {
1606 + ClearRSIoError(rs);
1607 + rs->set.ei = -1;
1608 + }
1609 +
1610 + return 0;
1611 +}
1612 +
1613 +/* Lookup a RAID device by name or by major:minor number. */
1614 +union dev_lookup {
1615 + const char *dev_name;
1616 + struct raid_dev *dev;
1617 +};
1618 +enum lookup_type { byname, bymajmin, bynumber };
1619 +static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
1620 + union dev_lookup *dl)
1621 +{
1622 + unsigned p;
1623 +
1624 + /*
1625 + * Must be an incremental loop, because the device array
1626 + * can have empty slots still on calls from raid_ctr()
1627 + */
1628 + for (p = 0; p < rs->set.raid_devs; p++) {
1629 + char buf[BDEVNAME_SIZE];
1630 + struct raid_dev *dev = rs->dev + p;
1631 +
1632 + if (!dev->dev)
1633 + break;
1634 +
1635 + /* Format dev string appropriately if necessary. */
1636 + if (by == byname)
1637 + bdevname(dev->dev->bdev, buf);
1638 + else if (by == bymajmin)
1639 + format_dev_t(buf, dev->dev->bdev->bd_dev);
1640 +
1641 + /* Do the actual check. */
1642 + if (by == bynumber) {
1643 + if (dl->dev->dev->bdev->bd_dev ==
1644 + dev->dev->bdev->bd_dev)
1645 + return p;
1646 + } else if (!strcmp(dl->dev_name, buf))
1647 + return p;
1648 + }
1649 +
1650 + return -ENODEV;
1651 +}
1652 +
1653 +/* End io wrapper. */
1654 +static INLINE void
1655 +_bio_endio(struct raid_set *rs, struct bio *bio, int error)
1656 +{
1657 + /* REMOVEME: statistics. */
1658 + atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
1659 + S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
1660 + bio_endio(bio, error);
1661 + io_put(rs); /* Wake any suspend waiters. */
1662 +}
1663 +
1664 +/*
1665 + * End small helper functions.
1666 + */
1667 +
1668 +
1669 +/*
1670 + * Stripe hash functions
1671 + */
1672 +/* Initialize/destroy stripe hash. */
1673 +static int hash_init(struct stripe_hash *hash, unsigned stripes)
1674 +{
1675 + unsigned buckets = 2, max_buckets = stripes / 4;
1676 + unsigned hash_primes[] = {
1677 + /* Table of primes for hash_fn/table size optimization. */
1678 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
1679 + 1543, 3079, 6151, 12289, 24593,
1680 + };
1681 +
1682 + /* Calculate number of buckets (2^^n <= stripes / 4). */
1683 + while (buckets < max_buckets)
1684 + buckets <<= 1;
1685 +
1686 + /* Allocate stripe hash. */
1687 + hash->hash = vmalloc(buckets * sizeof(*hash->hash));
1688 + if (!hash->hash)
1689 + return -ENOMEM;
1690 +
1691 + hash->buckets = buckets;
1692 + hash->mask = buckets - 1;
1693 + hash->shift = ffs(buckets);
1694 + if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
1695 + hash->shift = ARRAY_SIZE(hash_primes) + 1;
1696 +
1697 + BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
1698 + hash->prime = hash_primes[hash->shift - 2];
1699 +
1700 + /* Initialize buckets. */
1701 + while (buckets--)
1702 + INIT_LIST_HEAD(hash->hash + buckets);
1703 +
1704 + return 0;
1705 +}
1706 +
1707 +static INLINE void hash_exit(struct stripe_hash *hash)
1708 +{
1709 + if (hash->hash) {
1710 + vfree(hash->hash);
1711 + hash->hash = NULL;
1712 + }
1713 +}
1714 +
1715 +/* List add (head/tail/locked/unlocked) inlines. */
1716 +enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
1717 +#define LIST_DEL(name, list) \
1718 +static void stripe_ ## name ## _del(struct stripe *stripe, \
1719 + enum list_lock_type lock) { \
1720 + struct list_head *lh = stripe->lists + (list); \
1721 + spinlock_t *l = NULL; \
1722 +\
1723 + if (lock == LIST_LOCKED) { \
1724 + l = stripe->sc->locks + LOCK_LRU; \
1725 + spin_lock_irq(l); \
1726 + } \
1727 +\
1728 +\
1729 + if (!list_empty(lh)) \
1730 + list_del_init(lh); \
1731 +\
1732 + if (lock == LIST_LOCKED) \
1733 + spin_unlock_irq(l); \
1734 +}
1735 +
1736 +LIST_DEL(hash, LIST_HASH)
1737 +LIST_DEL(lru, LIST_LRU)
1738 +#undef LIST_DEL
1739 +
1740 +enum list_pos_type { POS_HEAD, POS_TAIL };
1741 +#define LIST_ADD(name, list) \
1742 +static void stripe_ ## name ## _add(struct stripe *stripe, \
1743 + enum list_pos_type pos, \
1744 + enum list_lock_type lock) { \
1745 + struct list_head *lh = stripe->lists + (list); \
1746 + struct stripe_cache *sc = stripe->sc; \
1747 + spinlock_t *l = NULL; \
1748 +\
1749 + if (lock == LIST_LOCKED) { \
1750 + l = sc->locks + LOCK_LRU; \
1751 + spin_lock_irq(l); \
1752 + } \
1753 +\
1754 + if (list_empty(lh)) { \
1755 + if (pos == POS_HEAD) \
1756 + list_add(lh, sc->lists + (list)); \
1757 + else \
1758 + list_add_tail(lh, sc->lists + (list)); \
1759 + } \
1760 +\
1761 + if (lock == LIST_LOCKED) \
1762 + spin_unlock_irq(l); \
1763 +}
1764 +
1765 +LIST_ADD(endio, LIST_ENDIO)
1766 +LIST_ADD(io, LIST_IO)
1767 +LIST_ADD(lru, LIST_LRU)
1768 +#undef LIST_ADD
1769 +
1770 +#define POP(list) \
1771 + do { \
1772 + if (list_empty(sc->lists + list)) \
1773 + stripe = NULL; \
1774 + else { \
1775 + stripe = list_first_entry(&sc->lists[list], \
1776 + struct stripe, \
1777 + lists[list]); \
1778 + list_del_init(&stripe->lists[list]); \
1779 + } \
1780 + } while (0);
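+/*
+ * POP(list) detaches the first stripe from sc->lists[list] into the local
+ * variable 'stripe' (or sets it to NULL if the list is empty); callers are
+ * expected to hold the corresponding lock, as stripe_lru_pop() below does
+ * for LOCK_LRU.
+ */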
1781 +
1782 +/* Pop an available stripe off the lru list. */
1783 +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1784 +{
1785 + struct stripe *stripe;
1786 + spinlock_t *lock = sc->locks + LOCK_LRU;
1787 +
1788 + spin_lock_irq(lock);
1789 + POP(LIST_LRU);
1790 + spin_unlock_irq(lock);
1791 +
1792 + if (stripe)
1793 + /* Remove from hash before reuse. */
1794 + stripe_hash_del(stripe, LIST_UNLOCKED);
1795 +
1796 + return stripe;
1797 +}
1798 +
1799 +static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
1800 +{
1801 + return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
1802 +}
1803 +
1804 +static inline struct list_head *
1805 +hash_bucket(struct stripe_hash *hash, sector_t key)
1806 +{
1807 + return hash->hash + hash_fn(hash, key);
1808 +}
1809 +
1810 +/* Insert an entry into a hash. */
1811 +static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
1812 +{
1813 + list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
1814 +}
1815 +
1816 +/* Insert an entry into the stripe hash. */
1817 +static inline void
1818 +sc_insert(struct stripe_cache *sc, struct stripe *stripe)
1819 +{
1820 + hash_insert(&sc->hash, stripe);
1821 +}
1822 +
1823 +/* Lookup an entry in the stripe hash. */
1824 +static inline struct stripe *
1825 +stripe_lookup(struct stripe_cache *sc, sector_t key)
1826 +{
1827 + unsigned c = 0;
1828 + struct stripe *stripe;
1829 + struct list_head *bucket = hash_bucket(&sc->hash, key);
1830 +
1831 + list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
1832 + /* REMOVEME: statistics. */
1833 + if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1834 + atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
1835 +
1836 + if (stripe->key == key)
1837 + return stripe;
1838 + }
1839 +
1840 + return NULL;
1841 +}
1842 +
1843 +/* Resize the stripe cache hash on size changes. */
1844 +static int hash_resize(struct stripe_cache *sc)
1845 +{
1846 + /* Resize threshold reached? */
1847 + if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
1848 + || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
1849 + int r;
1850 + struct stripe_hash hash, hash_tmp;
1851 + spinlock_t *lock;
1852 +
1853 + r = hash_init(&hash, atomic_read(&sc->stripes));
1854 + if (r)
1855 + return r;
1856 +
1857 + lock = sc->locks + LOCK_LRU;
1858 + spin_lock_irq(lock);
1859 + if (sc->hash.hash) {
1860 + unsigned b = sc->hash.buckets;
1861 + struct list_head *pos, *tmp;
1862 +
1863 + /* Walk old buckets and insert into new. */
1864 + while (b--) {
1865 + list_for_each_safe(pos, tmp, sc->hash.hash + b)
1866 + hash_insert(&hash,
1867 + list_entry(pos, struct stripe,
1868 + lists[LIST_HASH]));
1869 + }
1870 +
1871 + }
1872 +
1873 + memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
1874 + memcpy(&sc->hash, &hash, sizeof(sc->hash));
1875 + atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1876 + spin_unlock_irq(lock);
1877 +
1878 + hash_exit(&hash_tmp);
1879 + }
1880 +
1881 + return 0;
1882 +}
1883 +
1884 +/*
1885 + * Stripe cache locking functions
1886 + */
1887 +/* Dummy lock function for local RAID4+5. */
1888 +static void *no_lock(sector_t key, enum dm_lock_type type)
1889 +{
1890 + return &no_lock;
1891 +}
1892 +
1893 +/* Dummy unlock function for local RAID4+5. */
1894 +static void no_unlock(void *lock_handle)
1895 +{
1896 +}
1897 +
1898 +/* No locking (for local RAID 4+5). */
1899 +static struct dm_raid45_locking_type locking_none = {
1900 + .lock = no_lock,
1901 + .unlock = no_unlock,
1902 +};
1903 +
1904 +/* Clustered RAID 4+5. */
1905 +/* FIXME: code this. */
1906 +static struct dm_raid45_locking_type locking_cluster = {
1907 + .lock = no_lock,
1908 + .unlock = no_unlock,
1909 +};
1910 +
1911 +/* Lock a stripe (for clustering). */
1912 +static int
1913 +stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
1914 +{
1915 + stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
1916 + DM_RAID45_EX);
1917 + return stripe->lock ? 0 : -EPERM;
1918 +}
1919 +
1920 +/* Unlock a stripe (for clustering). */
1921 +static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
1922 +{
1923 + rs->locking->unlock(stripe->lock);
1924 + stripe->lock = NULL;
1925 +}
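
The locking_type indirection above is the hook a future clustered implementation would fill in; locally it degenerates to the dummy no_lock/no_unlock pair. Purely as an illustration of the expected lock/unlock shape (the real enum and struct live in dm-raid45.h), a toy shared/exclusive backend could look roughly like the user-space sketch below; the rwlock choice and all names here are assumptions, not the patch's design:

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins for the dm-raid45.h lock types referenced above. */
enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };

/* One global rwlock as a toy backend; a real cluster lock manager would
 * key locks by the stripe sector passed in instead of ignoring it. */
static pthread_rwlock_t toy_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *toy_lock_fn(unsigned long long key, enum dm_lock_type type)
{
	(void) key;
	if (type == DM_RAID45_SHARED)
		pthread_rwlock_rdlock(&toy_lock);
	else
		pthread_rwlock_wrlock(&toy_lock);
	return &toy_lock;	/* Non-NULL handle, as stripe_lock() expects. */
}

static void toy_unlock_fn(void *handle)
{
	pthread_rwlock_unlock((pthread_rwlock_t *) handle);
}

int main(void)
{
	void *h = toy_lock_fn(2048, DM_RAID45_SHARED);

	toy_unlock_fn(h);
	printf("locked and unlocked stripe key 2048\n");
	return 0;
}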
1926 +
1927 +/*
1928 + * Stripe cache functions.
1929 + */
1930 +/*
1931 + * Invalidate the pages of all page lists of a stripe.
1932 + *
1933 + * I only keep state for the whole list in the first page.
1934 + */
1935 +static INLINE void
1936 +stripe_pages_invalidate(struct stripe *stripe)
1937 +{
1938 + unsigned p = RS(stripe->sc)->set.raid_devs;
1939 +
1940 + while (p--) {
1941 + struct page *page = PAGE(stripe, p);
1942 +
1943 + ProhibitPageIO(page);
1944 + ClearPageChecked(page);
1945 + ClearPageDirty(page);
1946 + ClearPageError(page);
1947 + clear_page_locked(page);
1948 + ClearPagePrivate(page);
1949 + ClearPageUptodate(page);
1950 + }
1951 +}
1952 +
1953 +/* Prepare stripe for (re)use. */
1954 +static INLINE void stripe_invalidate(struct stripe *stripe)
1955 +{
1956 + stripe->io.flags = 0;
1957 + stripe_pages_invalidate(stripe);
1958 +}
1959 +
1960 +/* Allow io on all chunks of a stripe. */
1961 +static INLINE void stripe_allow_io(struct stripe *stripe)
1962 +{
1963 + unsigned p = RS(stripe->sc)->set.raid_devs;
1964 +
1965 + while (p--)
1966 + AllowPageIO(PAGE(stripe, p));
1967 +}
1968 +
1969 +/* Initialize a stripe. */
1970 +static void
1971 +stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1972 +{
1973 + unsigned p = RS(sc)->set.raid_devs;
1974 + unsigned i;
1975 +
1976 + /* Work all io chunks. */
1977 + while (p--) {
1978 + struct stripe_set *ss = stripe->ss + p;
1979 +
1980 + stripe->obj[p].private = ss;
1981 + ss->stripe = stripe;
1982 +
1983 + i = ARRAY_SIZE(ss->bl);
1984 + while (i--)
1985 + bio_list_init(ss->bl + i);
1986 + }
1987 +
1988 + stripe->sc = sc;
1989 +
1990 + i = ARRAY_SIZE(stripe->lists);
1991 + while (i--)
1992 + INIT_LIST_HEAD(stripe->lists + i);
1993 +
1994 + atomic_set(&stripe->cnt, 0);
1995 + atomic_set(&stripe->io.pending, 0);
1996 +
1997 + stripe_invalidate(stripe);
1998 +}
1999 +
2000 +/* Number of pages per chunk. */
2001 +static inline unsigned chunk_pages(unsigned io_size)
2002 +{
2003 + return dm_div_up(io_size, SECTORS_PER_PAGE);
2004 +}
2005 +
2006 +/* Number of pages per stripe. */
2007 +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
2008 +{
2009 + return chunk_pages(io_size) * rs->set.raid_devs;
2010 +}
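
For a sense of scale: with 4 KiB pages (8 sectors per page), an io_size of 64 sectors on a 4-drive set works out to dm_div_up(64, 8) = 8 pages per chunk and 8 * 4 = 32 pages per stripe. A tiny user-space sketch of the same round-up arithmetic (illustrative only):

#include <stdio.h>

#define SECTORS_PER_PAGE 8	/* 4096-byte page / 512-byte sector */

/* Same round-up as dm_div_up() used in chunk_pages() above. */
static unsigned div_up(unsigned n, unsigned d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	unsigned io_size = 64, raid_devs = 4;
	unsigned per_chunk = div_up(io_size, SECTORS_PER_PAGE);

	printf("pages per chunk: %u, per stripe: %u\n",
	       per_chunk, per_chunk * raid_devs);	/* 8 and 32 */
	return 0;
}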
2011 +
2012 +/* Initialize part of page_list (recovery). */
2013 +static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
2014 + unsigned start, unsigned count)
2015 +{
2016 + unsigned pages = chunk_pages(count);
2017 + /* Get offset into the page_list. */
2018 + struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
2019 +
2020 + BUG_ON(!pl);
2021 + while (pl && pages--) {
2022 + BUG_ON(!pl->page);
2023 + memset(page_address(pl->page), 0, PAGE_SIZE);
2024 + pl = pl->next;
2025 + }
2026 +}
2027 +
2028 +/* Initialize parity chunk of stripe. */
2029 +static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
2030 +{
2031 + stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
2032 +}
2033 +
2034 +/* Return dynamic stripe structure size. */
2035 +static INLINE size_t stripe_size(struct raid_set *rs)
2036 +{
2037 + return sizeof(struct stripe) +
2038 + rs->set.raid_devs * sizeof(struct stripe_set);
2039 +}
2040 +
2041 +/* Allocate a stripe and its memory object. */
2042 +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
2043 +enum grow { SC_GROW, SC_KEEP };
2044 +static struct stripe *stripe_alloc(struct stripe_cache *sc,
2045 + struct dm_mem_cache_client *mc,
2046 + enum grow grow)
2047 +{
2048 + int r;
2049 + struct stripe *stripe;
2050 +
2051 + stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
2052 + if (stripe) {
2053 + /* Grow the dm-mem-cache by one object. */
2054 + if (grow == SC_GROW) {
2055 + r = dm_mem_cache_grow(mc, 1);
2056 + if (r)
2057 + goto err_free;
2058 + }
2059 +
2060 + stripe->obj = dm_mem_cache_alloc(mc);
2061 + if (!stripe->obj)
2062 + goto err_shrink;
2063 +
2064 + stripe_init(sc, stripe);
2065 + }
2066 +
2067 + return stripe;
2068 +
2069 +err_shrink:
2070 + if (grow == SC_GROW)
2071 + dm_mem_cache_shrink(mc, 1);
2072 +err_free:
2073 + kmem_cache_free(sc->kc.cache, stripe);
2074 + return NULL;
2075 +}
2076 +
2077 +/*
2078 + * Free a stripe's memory object, shrink the
2079 + * memory cache and free the stripe itself.
2080 + */
2081 +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
2082 +{
2083 + dm_mem_cache_free(mc, stripe->obj);
2084 + dm_mem_cache_shrink(mc, 1);
2085 + kmem_cache_free(stripe->sc->kc.cache, stripe);
2086 +}
2087 +
2088 +/* Free the recovery stripe. */
2089 +static void stripe_recover_free(struct raid_set *rs)
2090 +{
2091 + struct recover *rec = &rs->recover;
2092 + struct list_head *stripes = &rec->stripes;
2093 +
2094 + while (!list_empty(stripes)) {
2095 + struct stripe *stripe = list_first_entry(stripes, struct stripe,
2096 + lists[LIST_RECOVER]);
2097 + list_del(stripe->lists + LIST_RECOVER);
2098 + stripe_free(stripe, rec->mem_cache_client);
2099 + }
2100 +}
2101 +
2102 +/* Push a stripe safely onto the endio list to be handled by do_endios(). */
2103 +static INLINE void stripe_endio_push(struct stripe *stripe)
2104 +{
2105 + int wake;
2106 + unsigned long flags;
2107 + struct stripe_cache *sc = stripe->sc;
2108 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2109 +
2110 + spin_lock_irqsave(lock, flags);
2111 + wake = list_empty(sc->lists + LIST_ENDIO);
2112 + stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
2113 + spin_unlock_irqrestore(lock, flags);
2114 +
2115 + if (wake)
2116 + wake_do_raid(RS(sc));
2117 +}
2118 +
2119 +/* Protected check for stripe cache endio list empty. */
2120 +static INLINE int stripe_endio_empty(struct stripe_cache *sc)
2121 +{
2122 + int r;
2123 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2124 +
2125 + spin_lock_irq(lock);
2126 + r = list_empty(sc->lists + LIST_ENDIO);
2127 + spin_unlock_irq(lock);
2128 +
2129 + return r;
2130 +}
2131 +
2132 +/* Pop a stripe safely off the endio list. */
2133 +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
2134 +{
2135 + struct stripe *stripe;
2136 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2137 +
2138 + /* This runs in parallel with endio(). */
2139 + spin_lock_irq(lock);
2140 +	POP(LIST_ENDIO);
2141 + spin_unlock_irq(lock);
2142 + return stripe;
2143 +}
2144 +
2145 +#undef POP
2146 +
2147 +/* Evict stripe from cache. */
2148 +static void stripe_evict(struct stripe *stripe)
2149 +{
2150 + struct raid_set *rs = RS(stripe->sc);
2151 + stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
2152 +
2153 + if (list_empty(stripe->lists + LIST_LRU)) {
2154 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2155 + atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
2156 + }
2157 +}
2158 +
2159 +/* Grow stripe cache. */
2160 +static int
2161 +sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
2162 +{
2163 + int r = 0;
2164 + struct raid_set *rs = RS(sc);
2165 +
2166 + /* Try to allocate this many (additional) stripes. */
2167 + while (stripes--) {
2168 + struct stripe *stripe =
2169 + stripe_alloc(sc, sc->mem_cache_client, grow);
2170 +
2171 + if (likely(stripe)) {
2172 + stripe->io.size = rs->set.io_size;
2173 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2174 + atomic_inc(&sc->stripes);
2175 + } else {
2176 + r = -ENOMEM;
2177 + break;
2178 + }
2179 + }
2180 +
2181 + ClearRSScBusy(rs);
2182 + return r ? r : hash_resize(sc);
2183 +}
2184 +
2185 +/* Shrink stripe cache. */
2186 +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
2187 +{
2188 + int r = 0;
2189 +
2190 + /* Try to get unused stripe from LRU list. */
2191 + while (stripes--) {
2192 + struct stripe *stripe;
2193 +
2194 + stripe = stripe_lru_pop(sc);
2195 + if (stripe) {
2196 + /* An lru stripe may never have ios pending! */
2197 + BUG_ON(stripe_io(stripe));
2198 + stripe_free(stripe, sc->mem_cache_client);
2199 + atomic_dec(&sc->stripes);
2200 + } else {
2201 + r = -ENOENT;
2202 + break;
2203 + }
2204 + }
2205 +
2206 + /* Check if stats are still sane. */
2207 + if (atomic_read(&sc->max_active_stripes) >
2208 + atomic_read(&sc->stripes))
2209 + atomic_set(&sc->max_active_stripes, 0);
2210 +
2211 + if (r)
2212 + return r;
2213 +
2214 + ClearRSScBusy(RS(sc));
2215 + return hash_resize(sc);
2216 +}
2217 +
2218 +/* Create stripe cache. */
2219 +static int sc_init(struct raid_set *rs, unsigned stripes)
2220 +{
2221 + unsigned i, nr;
2222 + struct stripe_cache *sc = &rs->sc;
2223 + struct stripe *stripe;
2224 + struct recover *rec = &rs->recover;
2225 +
2226 + /* Initialize lists and locks. */
2227 + i = ARRAY_SIZE(sc->lists);
2228 + while (i--)
2229 + INIT_LIST_HEAD(sc->lists + i);
2230 +
2231 + i = NR_LOCKS;
2232 + while (i--)
2233 + spin_lock_init(sc->locks + i);
2234 +
2235 + /* Initialize atomic variables. */
2236 + atomic_set(&sc->stripes, 0);
2237 + atomic_set(&sc->stripes_last, 0);
2238 + atomic_set(&sc->stripes_to_shrink, 0);
2239 + atomic_set(&sc->active_stripes, 0);
2240 + atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
2241 +
2242 + /*
2243 + * We need a runtime unique # to suffix the kmem cache name
2244 + * because we'll have one for each active RAID set.
2245 + */
2246 + nr = atomic_inc_return(&_stripe_sc_nr);
2247 + sprintf(sc->kc.name, "%s_%d", TARGET, nr);
2248 + sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
2249 + 0, 0, NULL);
2250 + if (!sc->kc.cache)
2251 + return -ENOMEM;
2252 +
2253 + /* Create memory cache client context for RAID stripe cache. */
2254 + sc->mem_cache_client =
2255 + dm_mem_cache_client_create(stripes, rs->set.raid_devs,
2256 + chunk_pages(rs->set.io_size));
2257 + if (IS_ERR(sc->mem_cache_client))
2258 + return PTR_ERR(sc->mem_cache_client);
2259 +
2260 + /* Create memory cache client context for RAID recovery stripe(s). */
2261 + rec->mem_cache_client =
2262 + dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
2263 + chunk_pages(rec->io_size));
2264 + if (IS_ERR(rec->mem_cache_client))
2265 + return PTR_ERR(rec->mem_cache_client);
2266 +
2267 + /* Allocate stripe for set recovery. */
2268 +/* XXX: cope with MAX_RECOVER. */
2269 + INIT_LIST_HEAD(&rec->stripes);
2270 + for (i = 0; i < MAX_RECOVER; i++) {
2271 + stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
2272 + if (!stripe)
2273 + return -ENOMEM;
2274 +
2275 + SetStripeRecover(stripe);
2276 + stripe->io.size = rec->io_size;
2277 + list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
2278 + }
2279 +
2280 + /*
2281 +	 * Allocate the stripe objects from the
2282 + * cache and add them to the LRU list.
2283 + */
2284 + return sc_grow(sc, stripes, SC_KEEP);
2285 +}
2286 +
2287 +/* Destroy the stripe cache. */
2288 +static void sc_exit(struct stripe_cache *sc)
2289 +{
2290 + if (sc->kc.cache) {
2291 + BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
2292 + kmem_cache_destroy(sc->kc.cache);
2293 + }
2294 +
2295 + if (sc->mem_cache_client)
2296 + dm_mem_cache_client_destroy(sc->mem_cache_client);
2297 +
2298 + ClearRSRecover(RS(sc));
2299 + stripe_recover_free(RS(sc));
2300 + if (RS(sc)->recover.mem_cache_client)
2301 + dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
2302 +
2303 + hash_exit(&sc->hash);
2304 +}
2305 +
2306 +/*
2307 + * Calculate RAID address
2308 + *
2309 + * Delivers tuple with the index of the data disk holding the chunk
2310 + * in the set, the parity disk's index and the start of the stripe
2311 + * within the address space of the set (used as the stripe cache hash key).
2312 + */
2313 +/* thx MD. */
2314 +static struct address *
2315 +raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
2316 +{
2317 + unsigned data_devs = rs->set.data_devs, di, pi,
2318 + raid_devs = rs->set.raid_devs;
2319 + sector_t stripe, tmp;
2320 +
2321 + /*
2322 + * chunk_number = sector / chunk_size
2323 + * stripe = chunk_number / data_devs
2324 + * di = stripe % data_devs;
2325 + */
2326 + stripe = sector >> rs->set.chunk_shift;
2327 + di = sector_div(stripe, data_devs);
2328 +
2329 + switch (rs->set.raid_type->level) {
2330 + case raid5:
2331 + tmp = stripe;
2332 + pi = sector_div(tmp, raid_devs);
2333 +
2334 + switch (rs->set.raid_type->algorithm) {
2335 + case left_asym: /* Left asymmetric. */
2336 + pi = data_devs - pi;
2337 + case right_asym: /* Right asymmetric. */
2338 + if (di >= pi)
2339 + di++;
2340 + break;
2341 +
2342 + case left_sym: /* Left symmetric. */
2343 + pi = data_devs - pi;
2344 + case right_sym: /* Right symmetric. */
2345 + di = (pi + di + 1) % raid_devs;
2346 + break;
2347 +
2348 + default:
2349 + DMERR("Unknown RAID algorithm %d",
2350 + rs->set.raid_type->algorithm);
2351 + goto out;
2352 + }
2353 +
2354 + break;
2355 +
2356 + case raid4:
2357 + pi = rs->set.pi;
2358 + if (di >= pi)
2359 + di++;
2360 + break;
2361 +
2362 + default:
2363 + DMERR("Unknown RAID level %d", rs->set.raid_type->level);
2364 + goto out;
2365 + }
2366 +
2367 + /*
2368 + * Hash key = start offset on any single device of the RAID set;
2369 + * adjusted in case io size differs from chunk size.
2370 + */
2371 + addr->key = (stripe << rs->set.chunk_shift) +
2372 + (sector & rs->set.io_shift_mask);
2373 + addr->di = di;
2374 + addr->pi = pi;
2375 +
2376 +out:
2377 + return addr;
2378 +}
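
To make the layout arithmetic above concrete, the following user-space sketch reproduces only the left-symmetric RAID5 case (the variant MD uses by default), with plain division in place of sector_div() and without the io-size adjustment of the hash key; it is an illustration, not a drop-in replacement for the function above:

#include <stdio.h>

/* Left-symmetric RAID5 layout only, mirroring raid_address() above. */
static void raid5_ls_address(unsigned long long sector, unsigned chunk_shift,
			     unsigned raid_devs, unsigned *di, unsigned *pi,
			     unsigned long long *key)
{
	unsigned data_devs = raid_devs - 1;	/* one parity chunk per stripe */
	unsigned long long stripe = sector >> chunk_shift;

	*di = stripe % data_devs;		/* data index before shifting */
	stripe /= data_devs;			/* stripe number across the set */
	*pi = data_devs - stripe % raid_devs;	/* rotating parity disk */
	*di = (*pi + *di + 1) % raid_devs;	/* shift data past parity */
	*key = stripe << chunk_shift;		/* per-device stripe start */
}

int main(void)
{
	unsigned di, pi;
	unsigned long long key;

	/* 4 drives, 64-sector (32 KiB) chunks, logical sector 1000. */
	raid5_ls_address(1000, 6, 4, &di, &pi, &key);
	printf("di=%u pi=%u key=%llu\n", di, pi, key);	/* di=3 pi=2 key=320 */
	return 0;
}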
2379 +
2380 +/*
2381 + * Copy data across between stripe pages and bio vectors.
2382 + *
2383 + * Pay attention to data alignment in stripe and bio pages.
2384 + */
2385 +static void
2386 +bio_copy_page_list(int rw, struct stripe *stripe,
2387 + struct page_list *pl, struct bio *bio)
2388 +{
2389 + unsigned i, page_offset;
2390 + void *page_addr;
2391 + struct raid_set *rs = RS(stripe->sc);
2392 + struct bio_vec *bv;
2393 +
2394 + /* Get start page in page list for this sector. */
2395 + i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
2396 + pl = pl_elem(pl, i);
2397 +
2398 + page_addr = page_address(pl->page);
2399 + page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
2400 +
2401 + /* Walk all segments and copy data across between bio_vecs and pages. */
2402 + bio_for_each_segment(bv, bio, i) {
2403 + int len = bv->bv_len, size;
2404 + unsigned bio_offset = 0;
2405 + void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
2406 +redo:
2407 + size = (page_offset + len > PAGE_SIZE) ?
2408 + PAGE_SIZE - page_offset : len;
2409 +
2410 + if (rw == READ)
2411 + memcpy(bio_addr + bio_offset,
2412 + page_addr + page_offset, size);
2413 + else
2414 + memcpy(page_addr + page_offset,
2415 + bio_addr + bio_offset, size);
2416 +
2417 + page_offset += size;
2418 + if (page_offset == PAGE_SIZE) {
2419 + /*
2420 + * We reached the end of the chunk page ->
2421 +			 * need to refer to the next one to copy more data.
2422 + */
2423 + len -= size;
2424 + if (len) {
2425 + /* Get next page. */
2426 + pl = pl->next;
2427 + BUG_ON(!pl);
2428 + page_addr = page_address(pl->page);
2429 + page_offset = 0;
2430 + bio_offset += size;
2431 + /* REMOVEME: statistics. */
2432 + atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
2433 + goto redo;
2434 + }
2435 + }
2436 +
2437 + __bio_kunmap_atomic(bio_addr, KM_USER0);
2438 + }
2439 +}
2440 +
2441 +/*
2442 + * Xor optimization macros.
2443 + */
2444 +/* Xor data pointer declaration and initialization macros. */
2445 +#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
2446 +#define DECLARE_3 DECLARE_2, *d2 = data[2]
2447 +#define DECLARE_4 DECLARE_3, *d3 = data[3]
2448 +#define DECLARE_5 DECLARE_4, *d4 = data[4]
2449 +#define DECLARE_6 DECLARE_5, *d5 = data[5]
2450 +#define DECLARE_7 DECLARE_6, *d6 = data[6]
2451 +#define DECLARE_8 DECLARE_7, *d7 = data[7]
2452 +
2453 +/* Xor unroll macros. */
2454 +#define D2(n) d0[n] = d0[n] ^ d1[n]
2455 +#define D3(n) D2(n) ^ d2[n]
2456 +#define D4(n) D3(n) ^ d3[n]
2457 +#define D5(n) D4(n) ^ d4[n]
2458 +#define D6(n) D5(n) ^ d5[n]
2459 +#define D7(n) D6(n) ^ d6[n]
2460 +#define D8(n) D7(n) ^ d7[n]
2461 +
2462 +#define X_2(macro, offset) macro(offset); macro(offset + 1);
2463 +#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
2464 +#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
2465 +#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
2466 +#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
2467 +#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
2468 +
2469 +/* Define a _xor_#chunks_#xors_per_run() function. */
2470 +#define _XOR(chunks, xors_per_run) \
2471 +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
2472 +{ \
2473 + unsigned end = XOR_SIZE / sizeof(data[0]), i; \
2474 + DECLARE_ ## chunks; \
2475 +\
2476 + for (i = 0; i < end; i += xors_per_run) { \
2477 + X_ ## xors_per_run(D ## chunks, i); \
2478 + } \
2479 +}
2480 +
2481 +/* Define xor functions for 2 - 8 chunks. */
2482 +#define MAKE_XOR_PER_RUN(xors_per_run) \
2483 + _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
2484 + _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
2485 + _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
2486 + _XOR(8, xors_per_run);
2487 +
2488 +MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
2489 +MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
2490 +MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
2491 +MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
2492 +
2493 +#define MAKE_XOR(xors_per_run) \
2494 +struct { \
2495 + void (*f)(unsigned long **); \
2496 +} static xor_funcs ## xors_per_run[] = { \
2497 + { NULL }, \
2498 + { NULL }, \
2499 + { _xor2_ ## xors_per_run }, \
2500 + { _xor3_ ## xors_per_run }, \
2501 + { _xor4_ ## xors_per_run }, \
2502 + { _xor5_ ## xors_per_run }, \
2503 + { _xor6_ ## xors_per_run }, \
2504 + { _xor7_ ## xors_per_run }, \
2505 + { _xor8_ ## xors_per_run }, \
2506 +}; \
2507 +\
2508 +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
2509 +{ \
2510 + /* Call respective function for amount of chunks. */ \
2511 + xor_funcs ## xors_per_run[n].f(data); \
2512 +}
2513 +
2514 +/* Define xor_8() - xor_64() functions. */
2515 +MAKE_XOR(8)
2516 +MAKE_XOR(16)
2517 +MAKE_XOR(32)
2518 +MAKE_XOR(64)
2519 +
2520 +/* Maximum number of chunks which can be xor'ed in one go. */
2521 +#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
2522 +
2523 +struct xor_func {
2524 + xor_function_t f;
2525 + const char *name;
2526 +} static xor_funcs[] = {
2527 + {xor_8, "xor_8"},
2528 + {xor_16, "xor_16"},
2529 + {xor_32, "xor_32"},
2530 + {xor_64, "xor_64"},
2531 +};
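
All of the macro machinery above merely generates unrolled variants of one operation: xor 1..7 source buffers into the first (parity) buffer, XOR_SIZE bytes at a time. A plain, non-unrolled user-space equivalent is shown below; XOR_SIZE is defined elsewhere in the patch and is assumed here to be one 4 KiB page:

#include <stdio.h>

#define XOR_SIZE 4096	/* bytes xor-ed per call; an assumption here */

/* What every generated _xor<chunks>_<unroll>() boils down to:
 * data[0] ^= data[1] ^ ... ^ data[n-1], one word at a time. */
static void xor_generic(unsigned n, unsigned long **data)
{
	unsigned words = XOR_SIZE / sizeof(unsigned long), i, c;

	for (i = 0; i < words; i++)
		for (c = 1; c < n; c++)
			data[0][i] ^= data[c][i];
}

int main(void)
{
	unsigned long a[XOR_SIZE / sizeof(unsigned long)] = { 0 },
		      b[XOR_SIZE / sizeof(unsigned long)] = { 0 };
	unsigned long *data[2] = { a, b };

	a[0] = 0x5a;
	b[0] = 0x0f;
	xor_generic(2, data);
	printf("%#lx\n", a[0]);	/* 0x55 */
	return 0;
}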
2532 +
2533 +/*
2534 + * Calculate parity.
2535 + *
2536 + * This indexes into the page list of the stripe.
2537 + *
2538 + * All chunks will be xor-ed into the parity chunk
2539 + * in groups of at most xor.chunks.
2540 + *
2541 + * FIXME: try mapping the pages on discontiguous memory.
2542 + */
2543 +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
2544 +{
2545 + struct raid_set *rs = RS(stripe->sc);
2546 + unsigned max_chunks = rs->xor.chunks, n, p;
2547 + unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
2548 + unsigned long **d = rs->data;
2549 + xor_function_t xor_f = rs->xor.f->f;
2550 +
2551 + /* Address of parity page to xor into. */
2552 + d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
2553 +
2554 + /* Preset pointers to data pages. */
2555 + for (n = 1, p = rs->set.raid_devs; p--; ) {
2556 + if (p != pi && PageIO(PAGE(stripe, p)))
2557 + d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
2558 +
2559 +		/* If max chunks -> xor. */
2560 + if (n == max_chunks) {
2561 + xor_f(n, d);
2562 + n = 1;
2563 + }
2564 + }
2565 +
2566 + /* If chunks -> xor. */
2567 + if (n > 1)
2568 + xor_f(n, d);
2569 +
2570 + /* Set parity page uptodate and clean. */
2571 + page_set(PAGE(stripe, pi), CLEAN);
2572 +}
2573 +
2574 +/* Common xor loop through all stripe page lists. */
2575 +static void common_xor(struct stripe *stripe, sector_t count,
2576 + unsigned off, unsigned p)
2577 +{
2578 + unsigned sector;
2579 +
2580 + for (sector = off; sector < count; sector += SECTORS_PER_XOR)
2581 + xor(stripe, p, sector);
2582 +
2583 + atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
2584 +}
2585 +
2586 +/*
2587 + * Calculate parity sectors on intact stripes.
2588 + *
2589 + * Need to calculate raid address for recover stripe, because its
2590 + * chunk size differs and is typically larger than the io chunk size.
2591 + */
2592 +static void parity_xor(struct stripe *stripe)
2593 +{
2594 + struct raid_set *rs = RS(stripe->sc);
2595 + unsigned chunk_size = rs->set.chunk_size,
2596 + io_size = stripe->io.size,
2597 + xor_size = chunk_size > io_size ? io_size : chunk_size;
2598 + sector_t off;
2599 +
2600 + /* This can be the recover stripe with a larger io size. */
2601 + for (off = 0; off < io_size; off += xor_size) {
2602 + unsigned pi;
2603 +
2604 + /*
2605 +		 * The recovery stripe is likely bigger than regular io
2606 +		 * stripes and has no precalculated parity disk index ->
2607 +		 * need to calculate the RAID address.
2608 + */
2609 + if (unlikely(StripeRecover(stripe))) {
2610 + struct address addr;
2611 +
2612 + raid_address(rs,
2613 + (stripe->key + off) * rs->set.data_devs,
2614 + &addr);
2615 + pi = addr.pi;
2616 + stripe_zero_pl_part(stripe, pi, off,
2617 + rs->set.chunk_size);
2618 + } else
2619 + pi = stripe->idx.parity;
2620 +
2621 + common_xor(stripe, xor_size, off, pi);
2622 + page_set(PAGE(stripe, pi), DIRTY);
2623 + }
2624 +}
2625 +
2626 +/* Reconstruct missing chunk. */
2627 +static void reconstruct_xor(struct stripe *stripe)
2628 +{
2629 + struct raid_set *rs = RS(stripe->sc);
2630 + int p = stripe->idx.recover;
2631 +
2632 + BUG_ON(p < 0);
2633 +
2634 + /* REMOVEME: statistics. */
2635 + atomic_inc(rs->stats + (raid_set_degraded(rs) ?
2636 + S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
2637 +
2638 + /* Zero chunk to be reconstructed. */
2639 + stripe_zero_chunk(stripe, p);
2640 + common_xor(stripe, stripe->io.size, 0, p);
2641 +}
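
reconstruct_xor() relies on the standard RAID4/5 identity: because parity is the xor of all data chunks, any single missing chunk equals the xor of all surviving chunks, data and parity alike, which is exactly what zeroing the chunk and xor-ing everything else into it computes. A one-byte worked example (illustrative only):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Three data "chunks" of one byte each and their parity. */
	unsigned char d0 = 0xa5, d1 = 0x3c, d2 = 0x0f;
	unsigned char parity = d0 ^ d1 ^ d2;
	unsigned char rebuilt = 0;	/* the zeroed chunk to reconstruct */

	/* Lose d1, rebuild it from the survivors. */
	rebuilt ^= d0;
	rebuilt ^= d2;
	rebuilt ^= parity;

	assert(rebuilt == d1);
	printf("rebuilt %#x == d1 %#x\n", rebuilt, d1);
	return 0;
}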
2642 +
2643 +/*
2644 + * Try getting a stripe either from the hash or from the lru list
2645 + */
2646 +static inline void _stripe_get(struct stripe *stripe)
2647 +{
2648 + atomic_inc(&stripe->cnt);
2649 +}
2650 +
2651 +static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
2652 +{
2653 + struct stripe_cache *sc = &rs->sc;
2654 + struct stripe *stripe;
2655 +
2656 + stripe = stripe_lookup(sc, addr->key);
2657 + if (stripe) {
2658 + _stripe_get(stripe);
2659 + /* Remove from the lru list if on. */
2660 + stripe_lru_del(stripe, LIST_LOCKED);
2661 + atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2662 + } else {
2663 + /* Second try to get an LRU stripe. */
2664 + stripe = stripe_lru_pop(sc);
2665 + if (stripe) {
2666 + _stripe_get(stripe);
2667 + /* Invalidate before reinserting with changed key. */
2668 + stripe_invalidate(stripe);
2669 + stripe->key = addr->key;
2670 + stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2671 + addr->key);
2672 + stripe->idx.parity = addr->pi;
2673 + sc_insert(sc, stripe);
2674 + /* REMOVEME: statistics. */
2675 + atomic_inc(rs->stats + S_INSCACHE);
2676 + }
2677 + }
2678 +
2679 + return stripe;
2680 +}
2681 +
2682 +/*
2683 + * Decrement reference count on a stripe.
2684 + *
2685 + * Move it to list of LRU stripes if zero.
2686 + */
2687 +static void stripe_put(struct stripe *stripe)
2688 +{
2689 + if (atomic_dec_and_test(&stripe->cnt)) {
2690 + if (TestClearStripeActive(stripe))
2691 + atomic_dec(&stripe->sc->active_stripes);
2692 +
2693 + /* Put stripe onto the LRU list. */
2694 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2695 + }
2696 +
2697 + BUG_ON(atomic_read(&stripe->cnt) < 0);
2698 +}
2699 +
2700 +/*
2701 + * Process end io
2702 + *
2703 + * I need to do it here because I can't do it in interrupt context.
2704 + *
2705 + * Read and write functions are split in order to avoid
2706 + * conditionals in the main loop for performance reasons.
2707 + */
2708 +
2709 +/* Helper read bios on a page list. */
2710 +static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
2711 + struct bio *bio)
2712 +{
2713 + bio_copy_page_list(READ, stripe, pl, bio);
2714 +}
2715 +
2716 +/* Helper write bios on a page list. */
2717 +static void _rh_dec(struct stripe *stripe, struct page_list *pl,
2718 + struct bio *bio)
2719 +{
2720 + dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
2721 +}
2722 +
2723 +/* End io all bios on a page list. */
2724 +static inline int
2725 +page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
2726 +{
2727 + int r = 0;
2728 + struct bio_list *bl = BL(stripe, p, rw);
2729 +
2730 + if (!bio_list_empty(bl)) {
2731 + struct page_list *pl = PL(stripe, p);
2732 + struct page *page = pl->page;
2733 +
2734 + if (PageLocked(page))
2735 + r = -EBUSY;
2736 + /*
2737 + * FIXME: PageUptodate() not cleared
2738 + * properly for missing chunks ?
2739 + */
2740 + else if (PageUptodate(page)) {
2741 + struct bio *bio;
2742 + struct raid_set *rs = RS(stripe->sc);
2743 + void (*h_f)(struct stripe *, struct page_list *,
2744 + struct bio *) =
2745 + (rw == READ) ? _bio_copy_page_list : _rh_dec;
2746 +
2747 + while ((bio = bio_list_pop(bl))) {
2748 + h_f(stripe, pl, bio);
2749 + _bio_endio(rs, bio, 0);
2750 + stripe_put(stripe);
2751 + if (count)
2752 + (*count)++;
2753 + }
2754 + } else
2755 + r = -EAGAIN;
2756 + }
2757 +
2758 + return r;
2759 +}
2760 +
2761 +/*
2762 + * End io all reads/writes on a stripe copying
2763 + * read data across from stripe to bios.
2764 + */
2765 +static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
2766 +{
2767 + int r = 0;
2768 + unsigned p = RS(stripe->sc)->set.raid_devs;
2769 +
2770 + while (p--) {
2771 + int rr = page_list_endio(rw, stripe, p, count);
2772 +
2773 + if (rr && r != -EIO)
2774 + r = rr;
2775 + }
2776 +
2777 + return r;
2778 +}
2779 +
2780 +/* Fail all ios on a bio list and return # of bios. */
2781 +static unsigned
2782 +bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
2783 +{
2784 + unsigned r;
2785 + struct bio *bio;
2786 +
2787 + raid_set_dead(rs);
2788 +
2789 + /* Update region counters. */
2790 + if (stripe) {
2791 + struct dm_rh_client *rh = rs->recover.rh;
2792 +
2793 + bio_list_for_each(bio, bl) {
2794 + if (bio_data_dir(bio) == WRITE)
2795 + dm_rh_dec(rh, stripe->region);
2796 + }
2797 + }
2798 +
2799 + /* Error end io all bios. */
2800 + for (r = 0; (bio = bio_list_pop(bl)); r++)
2801 + _bio_endio(rs, bio, -EIO);
2802 +
2803 + return r;
2804 +}
2805 +
2806 +/* Fail all ios of a bio list of a stripe and drop io pending count. */
2807 +static void
2808 +stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
2809 + struct bio_list *bl)
2810 +{
2811 + unsigned put = bio_list_fail(rs, stripe, bl);
2812 +
2813 + while (put--)
2814 + stripe_put(stripe);
2815 +}
2816 +
2817 +/* Fail all ios hanging off all bio lists of a stripe. */
2818 +static void stripe_fail_io(struct stripe *stripe)
2819 +{
2820 + struct raid_set *rs = RS(stripe->sc);
2821 + unsigned p = rs->set.raid_devs;
2822 +
2823 + stripe_evict(stripe);
2824 +
2825 + while (p--) {
2826 + struct stripe_set *ss = stripe->ss + p;
2827 + int i = ARRAY_SIZE(ss->bl);
2828 +
2829 + while (i--)
2830 + stripe_bio_list_fail(rs, stripe, ss->bl + i);
2831 + }
2832 +}
2833 +
2834 +/*
2835 + * Handle all stripes by handing them to the daemon, because we can't
2836 + * map their pages to copy the data in interrupt context.
2837 + *
2838 + * We don't want to handle them here either, while interrupts are disabled.
2839 + */
2840 +
2841 +/* Read/write endio function for dm-io (interrupt context). */
2842 +static void endio(unsigned long error, void *context)
2843 +{
2844 + struct dm_mem_cache_object *obj = context;
2845 + struct stripe_set *ss = obj->private;
2846 + struct stripe *stripe = ss->stripe;
2847 + struct page *page = obj->pl->page;
2848 +
2849 + if (unlikely(error))
2850 + stripe_error(stripe, page);
2851 + else
2852 + page_set(page, CLEAN);
2853 +
2854 + clear_page_locked(page);
2855 + stripe_io_dec(stripe);
2856 +
2857 + /* Add stripe to endio list and wake daemon. */
2858 + stripe_endio_push(stripe);
2859 +}
2860 +
2861 +/*
2862 + * Recovery io throttling
2863 + */
2864 +/* Conditionally reset io counters. */
2865 +enum count_type { IO_WORK = 0, IO_RECOVER };
2866 +static int recover_io_reset(struct raid_set *rs)
2867 +{
2868 + unsigned long j = jiffies;
2869 +
2870 + /* Pay attention to jiffies overflows. */
2871 + if (j > rs->recover.last_jiffies + HZ
2872 + || j < rs->recover.last_jiffies) {
2873 + rs->recover.last_jiffies = j;
2874 + atomic_set(rs->recover.io_count + IO_WORK, 0);
2875 + atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2876 + return 1;
2877 + }
2878 +
2879 + return 0;
2880 +}
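
recover_io_reset() above keeps a roughly one-second accounting window: once more than HZ jiffies have passed, or jiffies wrapped, both counters restart. The following user-space sketch mimics that windowing with a coarse wall-clock second; it is an analogy, not the kernel code:

#include <stdio.h>
#include <time.h>

/* One-second accounting window, analogous to recover_io_reset(). */
struct io_window {
	time_t last;
	unsigned work_ios;
	unsigned recover_ios;
};

static int window_reset(struct io_window *w)
{
	time_t now = time(NULL);

	if (now != w->last) {	/* window elapsed (or the clock jumped) */
		w->last = now;
		w->work_ios = 0;
		w->recover_ios = 0;
		return 1;
	}

	return 0;
}

int main(void)
{
	struct io_window w = { 0, 0, 0 };

	printf("%d\n", window_reset(&w));	/* 1: first call resets */
	printf("%d\n", window_reset(&w));	/* 0: typically same second */
	return 0;
}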
2881 +
2882 +/* Count ios. */
2883 +static INLINE void
2884 +recover_io_count(struct raid_set *rs, struct stripe *stripe)
2885 +{
2886 + if (RSRecover(rs)) {
2887 + recover_io_reset(rs);
2888 + atomic_inc(rs->recover.io_count +
2889 + (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2890 + }
2891 +}
2892 +
2893 +/* Read/Write a page_list asynchronously. */
2894 +static void page_list_rw(struct stripe *stripe, unsigned p)
2895 +{
2896 + struct stripe_cache *sc = stripe->sc;
2897 + struct raid_set *rs = RS(sc);
2898 + struct dm_mem_cache_object *obj = stripe->obj + p;
2899 + struct page_list *pl = obj->pl;
2900 + struct page *page = pl->page;
2901 + struct raid_dev *dev = rs->dev + p;
2902 + struct dm_io_region io = {
2903 + .bdev = dev->dev->bdev,
2904 + .sector = stripe->key,
2905 + .count = stripe->io.size,
2906 + };
2907 + struct dm_io_request control = {
2908 + .bi_rw = PageDirty(page) ? WRITE : READ,
2909 + .mem.type = DM_IO_PAGE_LIST,
2910 + .mem.ptr.pl = pl,
2911 + .mem.offset = 0,
2912 + .notify.fn = endio,
2913 + .notify.context = obj,
2914 + .client = sc->dm_io_client,
2915 + };
2916 +
2917 + BUG_ON(PageLocked(page));
2918 +
2919 + /*
2920 +	 * Don't rw past the end of the device, which can happen because
2921 +	 * sectors_per_dev typically isn't divisible by io_size.
2922 + */
2923 + if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2924 + io.count = rs->set.sectors_per_dev - io.sector;
2925 +
2926 + io.sector += dev->start; /* Add <offset>. */
2927 + recover_io_count(rs, stripe); /* Recovery io accounting. */
2928 +
2929 + /* REMOVEME: statistics. */
2930 + atomic_inc(rs->stats +
2931 + (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
2932 +
2933 + ClearPageError(page);
2934 + set_page_locked(page);
2935 + io_dev_queued(dev);
2936 + BUG_ON(dm_io(&control, 1, &io, NULL));
2937 +}
2938 +
2939 +/*
2940 + * Write dirty / read not uptodate page lists of a stripe.
2941 + */
2942 +static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
2943 +{
2944 + unsigned r;
2945 +
2946 + /*
2947 + * Increment the pending count on the stripe
2948 + * first, so that we don't race in endio().
2949 + *
2950 + * An inc (IO) is needed for any page:
2951 + *
2952 + * o not uptodate
2953 + * o dirtied by writes merged
2954 + * o dirtied by parity calculations
2955 + */
2956 + r = for_each_io_dev(rs, stripe, _stripe_io_inc);
2957 + if (r) {
2958 + /* io needed: chunks are not uptodate/dirty. */
2959 + int max; /* REMOVEME: */
2960 + struct stripe_cache *sc = &rs->sc;
2961 +
2962 + if (!TestSetStripeActive(stripe))
2963 + atomic_inc(&sc->active_stripes);
2964 +
2965 + /* Take off the lru list in case it got added there. */
2966 + stripe_lru_del(stripe, LIST_LOCKED);
2967 +
2968 + /* Submit actual io. */
2969 + for_each_io_dev(rs, stripe, page_list_rw);
2970 +
2971 + /* REMOVEME: statistics */
2972 + max = sc_active(sc);
2973 + if (atomic_read(&sc->max_active_stripes) < max)
2974 + atomic_set(&sc->max_active_stripes, max);
2975 +
2976 + atomic_inc(rs->stats + S_FLUSHS);
2977 + /* END REMOVEME: statistics */
2978 + }
2979 +
2980 + return r;
2981 +}
2982 +
2983 +/* Work in all pending writes. */
2984 +static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
2985 +{
2986 + struct bio_list *write = BL(stripe, p, WRITE);
2987 +
2988 + if (!bio_list_empty(write)) {
2989 + struct page_list *pl = stripe->obj[p].pl;
2990 + struct bio *bio;
2991 + struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
2992 +
2993 + /*
2994 + * We can play with the lists without holding a lock,
2995 + * because it is just us accessing them anyway.
2996 + */
2997 + bio_list_for_each(bio, write)
2998 + bio_copy_page_list(WRITE, stripe, pl, bio);
2999 +
3000 + bio_list_merge(write_merged, write);
3001 + bio_list_init(write);
3002 + page_set(pl->page, DIRTY);
3003 + }
3004 +}
3005 +
3006 +/* Merge in all writes hence dirtying respective pages. */
3007 +static INLINE void writes_merge(struct stripe *stripe)
3008 +{
3009 + unsigned p = RS(stripe->sc)->set.raid_devs;
3010 +
3011 + while (p--)
3012 + _writes_merge(stripe, p);
3013 +}
3014 +
3015 +/* Check if a chunk gets completely overwritten. */
3016 +static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
3017 +{
3018 + unsigned sectors = 0;
3019 + struct bio *bio;
3020 + struct bio_list *bl = BL(stripe, p, WRITE);
3021 +
3022 + bio_list_for_each(bio, bl)
3023 + sectors += bio_sectors(bio);
3024 +
3025 + return sectors == RS(stripe->sc)->set.io_size;
3026 +}
3027 +
3028 +/*
3029 + * Prepare stripe to avoid io on broken/reconstructed
3030 + * drive in order to reconstruct data on endio.
3031 + */
3032 +enum prepare_type { IO_ALLOW, IO_PROHIBIT };
3033 +static void stripe_prepare(struct stripe *stripe, unsigned p,
3034 + enum prepare_type type)
3035 +{
3036 + struct page *page = PAGE(stripe, p);
3037 +
3038 + switch (type) {
3039 + case IO_PROHIBIT:
3040 + /*
3041 +		 * In case we prohibit, we have to make sure that io
3042 +		 * on all chunks other than the one which failed or is
3043 +		 * being reconstructed is allowed and that the latter
3044 +		 * isn't flagged uptodate.
3045 + */
3046 + stripe_allow_io(stripe);
3047 + ClearPageUptodate(page);
3048 + ProhibitPageIO(page);
3049 +
3050 + /* REMOVEME: statistics. */
3051 + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
3052 + stripe->idx.recover = p;
3053 + SetStripeReconstruct(stripe);
3054 + break;
3055 +
3056 + case IO_ALLOW:
3057 + AllowPageIO(page);
3058 + stripe->idx.recover = -1;
3059 + ClearStripeReconstruct(stripe);
3060 + break;
3061 +
3062 + default:
3063 + BUG();
3064 + }
3065 +}
3066 +
3067 +/*
3068 + * Degraded/reconstruction mode.
3069 + *
3070 + * Check stripe state to figure which chunks don't need IO.
3071 + */
3072 +static INLINE void stripe_check_reconstruct(struct stripe *stripe,
3073 + int prohibited)
3074 +{
3075 + struct raid_set *rs = RS(stripe->sc);
3076 +
3077 + /*
3078 + * Degraded mode (device(s) failed) ->
3079 + * avoid io on the failed device.
3080 + */
3081 + if (unlikely(raid_set_degraded(rs))) {
3082 + /* REMOVEME: statistics. */
3083 + atomic_inc(rs->stats + S_DEGRADED);
3084 + stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
3085 + return;
3086 + } else {
3087 + /*
3088 + * Reconstruction mode (ie. a particular device or
3089 + * some (rotating) parity chunk is being resynchronized) ->
3090 + * o make sure all needed pages are read in
3091 + * o writes are allowed to go through
3092 + */
3093 + int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
3094 +
3095 + if (r) {
3096 + /* REMOVEME: statistics. */
3097 + atomic_inc(rs->stats + S_NOSYNC);
3098 + stripe_prepare(stripe, dev_for_parity(stripe),
3099 + IO_PROHIBIT);
3100 + return;
3101 + }
3102 + }
3103 +
3104 + /*
3105 + * All disks good. Avoid reading parity chunk and reconstruct it
3106 + * unless we have prohibited io to chunk(s).
3107 + */
3108 + if (!prohibited) {
3109 + if (StripeMerged(stripe))
3110 + stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
3111 + else {
3112 + stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
3113 +
3114 + /*
3115 + * Overrule stripe_prepare to reconstruct the
3116 + * parity chunk, because it'll be created new anyway.
3117 + */
3118 + ClearStripeReconstruct(stripe);
3119 + }
3120 + }
3121 +}
3122 +
3123 +/* Check if the stripe is ready to merge writes. */
3124 +static INLINE int stripe_check_merge(struct stripe *stripe)
3125 +{
3126 + struct raid_set *rs = RS(stripe->sc);
3127 + int prohibited = 0;
3128 + unsigned chunks = 0, p = rs->set.raid_devs;
3129 +
3130 + /* Walk all chunks. */
3131 + while (p--) {
3132 + struct page *page = PAGE(stripe, p);
3133 +
3134 + /* Can't merge active chunks. */
3135 + if (PageLocked(page)) {
3136 + /* REMOVEME: statistics. */
3137 + atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
3138 + break;
3139 + }
3140 +
3141 + /* Can merge uptodate chunks and have to count parity chunk. */
3142 + if (PageUptodate(page) || p == stripe->idx.parity) {
3143 + chunks++;
3144 + continue;
3145 + }
3146 +
3147 + /* Read before write ordering. */
3148 + if (RSCheckOverwrite(rs) &&
3149 + bio_list_empty(BL(stripe, p, READ))) {
3150 + int r = stripe_check_overwrite(stripe, p);
3151 +
3152 + if (r) {
3153 + chunks++;
3154 + /* REMOVEME: statistics. */
3155 + atomic_inc(RS(stripe->sc)->stats +
3156 + S_PROHIBITPAGEIO);
3157 + ProhibitPageIO(page);
3158 + prohibited = 1;
3159 + }
3160 + }
3161 + }
3162 +
3163 + if (chunks == rs->set.raid_devs) {
3164 +		/* All pages are uptodate, get overwritten, or a mixture of both. */
3165 + /* REMOVEME: statistics. */
3166 + atomic_inc(rs->stats + S_CAN_MERGE);
3167 + return 0;
3168 + } else
3169 + /* REMOVEME: statistics.*/
3170 + atomic_inc(rs->stats + S_CANT_MERGE);
3171 +
3172 + return prohibited ? 1 : -EPERM;
3173 +}
3174 +
3175 +/* Check for chunks without queued reads and prohibit io on them (read avoidance). */
3176 +static INLINE int stripe_check_read(struct stripe *stripe)
3177 +{
3178 + int r = 0;
3179 + unsigned p = RS(stripe->sc)->set.raid_devs;
3180 +
3181 + /* Walk all chunks. */
3182 + while (p--) {
3183 + struct page *page = PAGE(stripe, p);
3184 +
3185 + if (!PageLocked(page) &&
3186 + bio_list_empty(BL(stripe, p, READ))) {
3187 + ProhibitPageIO(page);
3188 + r = 1;
3189 + }
3190 + }
3191 +
3192 + return r;
3193 +}
3194 +
3195 +/*
3196 + * Read/write a stripe.
3197 + *
3198 + * All stripe read/write activity goes through this function.
3199 + *
3200 + * States to cover:
3201 + * o stripe to read and/or write
3202 + * o stripe with error to reconstruct
3203 + */
3204 +static int stripe_rw(struct stripe *stripe)
3205 +{
3206 + struct raid_set *rs = RS(stripe->sc);
3207 + int prohibited = 0, r;
3208 +
3209 + /*
3210 + * Check the state of the RAID set and if degraded (or
3211 + * resynchronizing for reads), read in all other chunks but
3212 + * the one on the dead/resynchronizing device in order to be
3213 + * able to reconstruct the missing one.
3214 + *
3215 + * Merge all writes hanging off uptodate pages of the stripe.
3216 + */
3217 +
3218 + /* Initially allow io on all chunks and prohibit below, if necessary. */
3219 + stripe_allow_io(stripe);
3220 +
3221 + if (StripeRBW(stripe)) {
3222 + r = stripe_check_merge(stripe);
3223 + if (!r) {
3224 + /*
3225 + * If I could rely on valid parity (which would only
3226 + * be sure in case of a full synchronization),
3227 + * I could xor a fraction of chunks out of
3228 + * parity and back in.
3229 + *
3230 + * For the time being, I got to redo parity...
3231 + */
3232 + /* parity_xor(stripe); */ /* Xor chunks out. */
3233 + stripe_zero_chunk(stripe, stripe->idx.parity);
3234 + writes_merge(stripe); /* Merge writes in. */
3235 + parity_xor(stripe); /* Update parity. */
3236 + ClearStripeRBW(stripe); /* Disable RBW. */
3237 + SetStripeMerged(stripe); /* Writes merged. */
3238 + }
3239 +
3240 + if (r > 0)
3241 + prohibited = 1;
3242 + } else if (!raid_set_degraded(rs))
3243 + /* Only allow for read avoidance if not degraded. */
3244 + prohibited = stripe_check_read(stripe);
3245 +
3246 + /*
3247 +	 * Check if io needs to be allowed/prohibited on certain chunks
3248 + * because of a degraded set or reconstruction on a region.
3249 + */
3250 + stripe_check_reconstruct(stripe, prohibited);
3251 +
3252 + /* Now submit any reads/writes. */
3253 + r = stripe_page_lists_rw(rs, stripe);
3254 + if (!r) {
3255 + /*
3256 + * No io submitted because of chunk io prohibited or
3257 + * locked pages -> push to end io list for processing.
3258 + */
3259 + atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
3260 + stripe_endio_push(stripe);
3261 + wake_do_raid(rs); /* Wake myself. */
3262 + }
3263 +
3264 + return 0;
3265 +}
3266 +
3267 +/* Flush stripe either via flush list or immediately. */
3268 +enum flush_type { FLUSH_DELAY, FLUSH_NOW };
3269 +static int stripe_flush(struct stripe *stripe, enum flush_type type)
3270 +{
3271 + int r = 0;
3272 +
3273 + stripe_lru_del(stripe, LIST_LOCKED);
3274 +
3275 + /* Immediately flush. */
3276 + if (type == FLUSH_NOW) {
3277 + if (likely(raid_set_operational(RS(stripe->sc))))
3278 + r = stripe_rw(stripe); /* Read/write stripe. */
3279 + else
3280 + /* Optimization: Fail early on failed sets. */
3281 + stripe_fail_io(stripe);
3282 + /* Delay flush by putting it on io list for later processing. */
3283 + } else if (type == FLUSH_DELAY)
3284 + stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
3285 + else
3286 + BUG();
3287 +
3288 + return r;
3289 +}
3290 +
3291 +/*
3292 + * Queue reads and writes to a stripe by hanging
3293 + * their bios off the stripe sets' read/write lists.
3294 + *
3295 + * Endio reads on uptodate chunks.
3296 + */
3297 +static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
3298 + struct bio_list *reject)
3299 +{
3300 + int r = 0;
3301 + struct address addr;
3302 + struct stripe *stripe =
3303 + stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
3304 +
3305 + if (stripe) {
3306 + int rr, rw = bio_data_dir(bio);
3307 +
3308 + rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
3309 + if (rr) {
3310 + stripe_put(stripe);
3311 + goto out;
3312 + }
3313 +
3314 + /* Distinguish read and write cases. */
3315 + bio_list_add(BL(stripe, addr.di, rw), bio);
3316 +
3317 + /* REMOVEME: statistics */
3318 + atomic_inc(rs->stats + (rw == WRITE ?
3319 + S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
3320 +
3321 + if (rw == READ)
3322 + SetStripeRead(stripe);
3323 + else {
3324 + SetStripeRBW(stripe);
3325 +
3326 +			/* Increment pending write count on region. */
3327 + dm_rh_inc(rs->recover.rh, stripe->region);
3328 + r = 1; /* Region hash needs a flush. */
3329 + }
3330 +
3331 + /*
3332 + * Optimize stripe flushing:
3333 + *
3334 + * o directly start io for read stripes.
3335 + *
3336 +		 * o put the stripe onto the stripe cache's io list for RBW,
3337 + * so that do_flush() can belabour it after we put
3338 + * more bios to the stripe for overwrite optimization.
3339 + */
3340 + stripe_flush(stripe,
3341 + StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
3342 +
3343 + /* Got no stripe from cache -> reject bio. */
3344 + } else {
3345 +out:
3346 + bio_list_add(reject, bio);
3347 + /* REMOVEME: statistics. */
3348 + atomic_inc(rs->stats + S_IOS_POST);
3349 + }
3350 +
3351 + return r;
3352 +}
3353 +
3354 +/*
3355 + * Recovery functions
3356 + */
3357 +/* Read a stripe off a raid set for recovery. */
3358 +static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
3359 +{
3360 + /* Invalidate all pages so that they get read in. */
3361 + stripe_pages_invalidate(stripe);
3362 +
3363 + /* Allow io on all recovery chunks. */
3364 + stripe_allow_io(stripe);
3365 +
3366 + if (idx > -1)
3367 + ProhibitPageIO(PAGE(stripe, idx));
3368 +
3369 + stripe->key = rs->recover.pos;
3370 + return stripe_page_lists_rw(rs, stripe);
3371 +}
3372 +
3373 +/* Write a stripe to a raid set for recovery. */
3374 +static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
3375 +{
3376 + /*
3377 + * If this is a reconstruct of a particular device, then
3378 + * reconstruct the respective page(s), else create parity page(s).
3379 + */
3380 + if (idx > -1) {
3381 + struct page *page = PAGE(stripe, idx);
3382 +
3383 + AllowPageIO(page);
3384 + stripe_zero_chunk(stripe, idx);
3385 + common_xor(stripe, stripe->io.size, 0, idx);
3386 + page_set(page, DIRTY);
3387 + } else
3388 + parity_xor(stripe);
3389 +
3390 + return stripe_page_lists_rw(rs, stripe);
3391 +}
3392 +
3393 +/* Recovery bandwidth available? */
3394 +static int recover_bandwidth(struct raid_set *rs)
3395 +{
3396 + int r, work;
3397 +
3398 + /* On reset -> allow recovery. */
3399 + r = recover_io_reset(rs);
3400 + if (r || RSBandwidth(rs))
3401 + goto out;
3402 +
3403 + work = atomic_read(rs->recover.io_count + IO_WORK);
3404 + if (work) {
3405 + /* Pay attention to larger recover stripe size. */
3406 + int recover =
3407 + atomic_read(rs->recover.io_count + IO_RECOVER) *
3408 + rs->recover.io_size /
3409 + rs->set.io_size;
3410 +
3411 + /*
3412 +		 * Don't use more than the given fraction of
3413 +		 * the work io bandwidth for recovery.
3414 + */
3415 + if (recover > work / rs->recover.bandwidth_work) {
3416 + /* REMOVEME: statistics. */
3417 + atomic_inc(rs->stats + S_NO_BANDWIDTH);
3418 + return 0;
3419 + }
3420 + }
3421 +
3422 +out:
3423 + atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
3424 + return 1;
3425 +}
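
Put differently, recovery io, scaled up by the larger recovery stripe size, may use at most 1/bandwidth_work of the work io seen in the current window. A small user-space sketch with made-up numbers: with 200 work ios, bandwidth_work = 10, 8-sector work ios and 64-sector recovery stripes, two recovery stripes (16 work-sized units) still fit under the 20-unit budget, while a third (24 units) is throttled:

#include <stdio.h>

/* Mirror of the recover_bandwidth() throttle check (illustrative values). */
static int recovery_allowed(unsigned work_ios, unsigned recover_ios,
			    unsigned recover_io_size, unsigned work_io_size,
			    unsigned bandwidth_work)
{
	/* Scale recovery ios by their larger stripe size. */
	unsigned recover = recover_ios * recover_io_size / work_io_size;

	return recover <= work_ios / bandwidth_work;
}

int main(void)
{
	printf("%d\n", recovery_allowed(200, 2, 64, 8, 10));	/* 1 */
	printf("%d\n", recovery_allowed(200, 3, 64, 8, 10));	/* 0 */
	return 0;
}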
3426 +
3427 +/* Try to get a region to recover. */
3428 +static int recover_get_region(struct raid_set *rs)
3429 +{
3430 + struct recover *rec = &rs->recover;
3431 + struct dm_rh_client *rh = rec->rh;
3432 +
3433 + /* Start quiescing some regions. */
3434 + if (!RSRegionGet(rs)) {
3435 +		int r = recover_bandwidth(rs); /* Enough bandwidth? */
3436 +
3437 + if (r) {
3438 + r = dm_rh_recovery_prepare(rh);
3439 + if (r < 0) {
3440 + DMINFO("No %sregions to recover",
3441 + rec->nr_regions_to_recover ?
3442 + "more " : "");
3443 + return -ENOENT;
3444 + }
3445 + } else
3446 + return -EAGAIN;
3447 +
3448 + SetRSRegionGet(rs);
3449 + }
3450 +
3451 + if (!rec->reg) {
3452 + rec->reg = dm_rh_recovery_start(rh);
3453 + if (rec->reg) {
3454 + /*
3455 +			 * A reference for the region I'll
3456 + * keep till I've completely synced it.
3457 + */
3458 + io_get(rs);
3459 + rec->pos = dm_rh_region_to_sector(rh,
3460 + dm_rh_get_region_key(rec->reg));
3461 + rec->end = rec->pos + dm_rh_get_region_size(rh);
3462 + return 1;
3463 + } else
3464 + return -EAGAIN;
3465 + }
3466 +
3467 + return 0;
3468 +}
3469 +
3470 +/* Read/write a recovery stripe. */
3471 +static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
3472 +{
3473 + /* Read/write flip-flop. */
3474 + if (TestClearStripeRBW(stripe)) {
3475 + SetStripeRead(stripe);
3476 + return recover_read(rs, stripe, idx_get(rs));
3477 + } else if (TestClearStripeRead(stripe))
3478 + return recover_write(rs, stripe, idx_get(rs));
3479 +
3480 + return 0;
3481 +}
3482 +
3483 +/* Reset recovery variables. */
3484 +static void recovery_region_reset(struct raid_set *rs)
3485 +{
3486 + rs->recover.reg = NULL;
3487 + ClearRSRegionGet(rs);
3488 +}
3489 +
3490 +/* Update region hash state. */
3491 +static void recover_rh_update(struct raid_set *rs, int error)
3492 +{
3493 + struct recover *rec = &rs->recover;
3494 + struct dm_rh_client *rh = rec->rh;
3495 + struct dm_region *reg = rec->reg;
3496 +
3497 + if (reg) {
3498 + dm_rh_recovery_end(rh, reg, error);
3499 + if (!error)
3500 + rec->nr_regions_recovered++;
3501 +
3502 + recovery_region_reset(rs);
3503 + }
3504 +
3505 + dm_rh_update_states(rh, 1);
3506 + dm_rh_flush(rh);
3507 + io_put(rs); /* Release the io reference for the region. */
3508 +}
3509 +
3510 +/* Called by main io daemon to recover regions. */
3511 +/* FIXME: cope with MAX_RECOVER > 1. */
3512 +static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
3513 +{
3514 + int r;
3515 + struct recover *rec = &rs->recover;
3516 +
3517 + /* If recovery is active -> return. */
3518 + if (StripeActive(stripe))
3519 + return;
3520 +
3521 + /* io error is fatal for recovery -> stop it. */
3522 + if (unlikely(StripeError(stripe)))
3523 + goto err;
3524 +
3525 + /* Get a region to recover. */
3526 + r = recover_get_region(rs);
3527 + switch (r) {
3528 + case 1: /* Got a new region. */
3529 + /* Flag read before write. */
3530 + ClearStripeRead(stripe);
3531 + SetStripeRBW(stripe);
3532 + break;
3533 +
3534 + case 0:
3535 + /* Got a region in the works. */
3536 + r = recover_bandwidth(rs);
3537 + if (r) /* Got enough bandwidth. */
3538 + break;
3539 +
3540 + case -EAGAIN:
3541 + /* No bandwidth/quiesced region yet, try later. */
3542 + wake_do_raid_delayed(rs, HZ / 10);
3543 + return;
3544 +
3545 + case -ENOENT: /* No more regions. */
3546 + dm_table_event(rs->ti->table);
3547 + goto free;
3548 + }
3549 +
3550 + /* Read/write a recover stripe. */
3551 + r = recover_stripe_rw(rs, stripe);
3552 + if (r) {
3553 + /* IO initiated, get another reference for the IO. */
3554 + io_get(rs);
3555 + return;
3556 + }
3557 +
3558 + /* Update recovery position within region. */
3559 + rec->pos += stripe->io.size;
3560 +
3561 + /* If we're at end of region, update region hash. */
3562 + if (rec->pos >= rec->end ||
3563 + rec->pos >= rs->set.sectors_per_dev)
3564 + recover_rh_update(rs, 0);
3565 + else
3566 + SetStripeRBW(stripe);
3567 +
3568 + /* Schedule myself for another round... */
3569 + wake_do_raid(rs);
3570 + return;
3571 +
3572 +err:
3573 + raid_set_check_degrade(rs, stripe);
3574 +
3575 + {
3576 + char buf[BDEVNAME_SIZE];
3577 +
3578 + DMERR("stopping recovery due to "
3579 + "ERROR on /dev/%s, stripe at offset %llu",
3580 + bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3581 + (unsigned long long) stripe->key);
3582 +
3583 + }
3584 +
3585 +	/* Make sure that all quiesced regions get released. */
3586 + do {
3587 + if (rec->reg)
3588 + dm_rh_recovery_end(rec->rh, rec->reg, -EIO);
3589 +
3590 + rec->reg = dm_rh_recovery_start(rec->rh);
3591 + } while (rec->reg);
3592 +
3593 + recover_rh_update(rs, -EIO);
3594 +free:
3595 + rs->set.dev_to_init = -1;
3596 +
3597 + /* Check for jiffies overrun. */
3598 + rs->recover.end_jiffies = jiffies;
3599 + if (rs->recover.end_jiffies < rs->recover.start_jiffies)
3600 + rs->recover.end_jiffies = ~0;
3601 +
3602 + ClearRSRecover(rs);
3603 +}
3604 +
3605 +static INLINE void do_recovery(struct raid_set *rs)
3606 +{
3607 + struct stripe *stripe;
3608 +
3609 + list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
3610 + _do_recovery(rs, stripe);
3611 +
3612 + if (!RSRecover(rs))
3613 + stripe_recover_free(rs);
3614 +}
3615 +
3616 +/*
3617 + * END recovery functions
3618 + */
3619 +
3620 +/* End io process all stripes handed in by endio() callback. */
3621 +static void do_endios(struct raid_set *rs)
3622 +{
3623 + struct stripe_cache *sc = &rs->sc;
3624 + struct stripe *stripe;
3625 +
3626 + while ((stripe = stripe_endio_pop(sc))) {
3627 + unsigned count;
3628 +
3629 + /* Recovery stripe special case. */
3630 + if (unlikely(StripeRecover(stripe))) {
3631 + if (stripe_io(stripe))
3632 + continue;
3633 +
3634 + io_put(rs); /* Release region io reference. */
3635 + ClearStripeActive(stripe);
3636 +
3637 + /* REMOVEME: statistics*/
3638 + atomic_dec(&sc->active_stripes);
3639 + continue;
3640 + }
3641 +
3642 + /* Early end io all reads on any uptodate chunks. */
3643 + stripe_endio(READ, stripe, (count = 0, &count));
3644 + if (stripe_io(stripe)) {
3645 + if (count) /* REMOVEME: statistics. */
3646 + atomic_inc(rs->stats + S_ACTIVE_READS);
3647 +
3648 + continue;
3649 + }
3650 +
3651 + /* Set stripe inactive after all io got processed. */
3652 + if (TestClearStripeActive(stripe))
3653 + atomic_dec(&sc->active_stripes);
3654 +
3655 + /* Unlock stripe (for clustering). */
3656 + stripe_unlock(rs, stripe);
3657 +
3658 + /*
3659 +		 * If an io error on a stripe occurred and the RAID set
3660 + * is still operational, requeue the stripe for io.
3661 + */
3662 + if (TestClearStripeError(stripe)) {
3663 + raid_set_check_degrade(rs, stripe);
3664 + ClearStripeReconstruct(stripe);
3665 +
3666 + if (!StripeMerged(stripe) &&
3667 + raid_set_operational(rs)) {
3668 + stripe_pages_invalidate(stripe);
3669 + stripe_flush(stripe, FLUSH_DELAY);
3670 + /* REMOVEME: statistics. */
3671 + atomic_inc(rs->stats + S_REQUEUE);
3672 + continue;
3673 + }
3674 + }
3675 +
3676 +		/* If the RAID set is no longer operational, error the ios. */
3677 + if (!raid_set_operational(rs)) {
3678 + ClearStripeReconstruct(stripe);
3679 + stripe_fail_io(stripe);
3680 + BUG_ON(atomic_read(&stripe->cnt));
3681 + continue;
3682 + }
3683 +
3684 + /* Got to reconstruct a missing chunk. */
3685 + if (TestClearStripeReconstruct(stripe))
3686 + reconstruct_xor(stripe);
3687 +
3688 + /*
3689 + * Now that we've got a complete stripe, we can
3690 + * process the rest of the end ios on reads.
3691 + */
3692 + BUG_ON(stripe_endio(READ, stripe, NULL));
3693 + ClearStripeRead(stripe);
3694 +
3695 + /*
3696 + * Read-before-write stripes need to be flushed again in
3697 + * order to work the write data into the pages *after*
3698 + * they were read in.
3699 + */
3700 + if (TestClearStripeMerged(stripe))
3701 + /* End io all bios which got merged already. */
3702 + BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
3703 +
3704 + /* Got to put on flush list because of new writes. */
3705 + if (StripeRBW(stripe))
3706 + stripe_flush(stripe, FLUSH_DELAY);
3707 + }
3708 +}
3709 +
3710 +/*
3711 + * Stripe cache shrinking.
3712 + */
3713 +static INLINE void do_sc_shrink(struct raid_set *rs)
3714 +{
3715 + unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
3716 +
3717 + if (shrink) {
3718 + unsigned cur = atomic_read(&rs->sc.stripes);
3719 +
3720 + sc_shrink(&rs->sc, shrink);
3721 + shrink -= cur - atomic_read(&rs->sc.stripes);
3722 + atomic_set(&rs->sc.stripes_to_shrink, shrink);
3723 +
3724 + /*
3725 + * Wake myself up in case we failed to shrink the
3726 + * requested amount in order to try again later.
3727 + */
3728 + if (shrink)
3729 + wake_do_raid(rs);
3730 + }
3731 +}
3732 +
3733 +
3734 +/*
3735 + * Process all ios
3736 + *
3737 + * We do different things with the io depending on the
3738 + * state of the region that it's in:
3739 + *
3740 + * o reads: hang off stripe cache or postpone if full
3741 + *
3742 + * o writes:
3743 + *
3744 + * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3745 + * In case stripe cache is full or busy, postpone the io.
3746 + *
3747 + * RECOVERING: delay the io until recovery of the region completes.
3748 + *
3749 + */
3750 +static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
3751 +{
3752 + int r;
3753 + unsigned flush = 0;
3754 + struct dm_rh_client *rh = rs->recover.rh;
3755 + struct bio *bio;
3756 + struct bio_list delay, reject;
3757 +
3758 + bio_list_init(&delay);
3759 + bio_list_init(&reject);
3760 +
3761 + /*
3762 + * Classify each io:
3763 + * o delay to recovering regions
3764 + * o queue to all other regions
3765 + */
3766 + while ((bio = bio_list_pop(ios))) {
3767 + /*
3768 + * In case we get a barrier bio, push it back onto
3769 + * the input queue unless all work queues are empty
3770 + * and the stripe cache is inactive.
3771 + */
3772 + if (unlikely(bio_barrier(bio))) {
3773 + /* REMOVEME: statistics. */
3774 + atomic_inc(rs->stats + S_BARRIER);
3775 + if (!list_empty(rs->sc.lists + LIST_IO) ||
3776 + !bio_list_empty(&delay) ||
3777 + !bio_list_empty(&reject) ||
3778 + sc_active(&rs->sc)) {
3779 + bio_list_push(ios, bio);
3780 + break;
3781 + }
3782 + }
3783 +
3784 + r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
3785 + if (unlikely(r)) {
3786 + /* Got to wait for recovering regions. */
3787 + bio_list_add(&delay, bio);
3788 + SetRSBandwidth(rs);
3789 + } else {
3790 + /*
3791 + * Process ios to non-recovering regions by queueing
3792 +			 * them to stripes (does rh_inc() for writes).
3793 + */
3794 + flush += stripe_queue_bio(rs, bio, &reject);
3795 + }
3796 + }
3797 +
3798 + if (flush) {
3799 + r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3800 + if (r)
3801 + DMERR("dirty log flush");
3802 + }
3803 +
3804 + /* Delay ios to regions which are recovering. */
3805 + while ((bio = bio_list_pop(&delay))) {
3806 + /* REMOVEME: statistics.*/
3807 + atomic_inc(rs->stats + S_DELAYED_BIOS);
3808 + atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3809 + dm_rh_delay_by_region(rh, bio,
3810 + dm_rh_sector_to_region(rh, _sector(rs, bio)));
3811 +
3812 + }
3813 +
3814 + /* Merge any rejected bios back to the head of the input list. */
3815 + bio_list_merge_head(ios, &reject);
3816 +}
3817 +
3818 +/* Flush any stripes on the io list. */
3819 +static INLINE void do_flush(struct raid_set *rs)
3820 +{
3821 + struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
3822 +
3823 + list_for_each_safe(pos, tmp, list) {
3824 + int r = stripe_flush(list_entry(pos, struct stripe,
3825 + lists[LIST_IO]), FLUSH_NOW);
3826 +
3827 + /* Remove from the list only if the stripe got processed. */
3828 + if (!r)
3829 + list_del_init(pos);
3830 + }
3831 +}
3832 +
3833 +/* Send an event in case we're getting too busy. */
3834 +static INLINE void do_busy_event(struct raid_set *rs)
3835 +{
3836 + if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
3837 + if (!TestSetRSScBusy(rs))
3838 + dm_table_event(rs->ti->table);
3839 + } else
3840 + ClearRSScBusy(rs);
3841 +}
3842 +
3843 +/* Unplug: let the io roll on the set's devices. */
3844 +static INLINE void do_unplug(struct raid_set *rs)
3845 +{
3846 + struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3847 +
3848 + while (dev-- > rs->dev) {
3849 + /* Only call the device unplug function if io got queued. */
3850 + if (io_dev_clear(dev))
3851 + blk_unplug(bdev_get_queue(dev->dev->bdev));
3852 + }
3853 +}
3854 +
3855 +/*-----------------------------------------------------------------
3856 + * RAID daemon
3857 + *---------------------------------------------------------------*/
3858 +/*
3859 + * o belabour all end ios
3860 + * o optionally shrink the stripe cache
3861 + * o update the region hash states
3862 + * o optionally do recovery
3863 + * o grab the input queue
3864 + * o work on all requeued or new ios and perform stripe cache flushes
3865 + * unless the RAID set is inoperative (in which case we error the ios)
3866 + * o check if the stripe cache is getting too busy and throw an event if so
3867 + * o unplug any component raid devices with queued bios
3868 + */
3869 +static void do_raid(struct work_struct *ws)
3870 +{
3871 + struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
3872 + struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3873 + spinlock_t *lock = &rs->io.in_lock;
3874 +
3875 + /*
3876 + * We always need to end io, so that ios
3877 + * can get errored in case the set failed
3878 + * and the region counters get decremented
3879 + * before we update the region hash states.
3880 + */
3881 +redo:
3882 + do_endios(rs);
3883 +
3884 + /*
3885 + * Now that we've end io'd, which may have put stripes on
3886 + * the LRU list, we shrink the stripe cache if requested.
3887 + */
3888 + do_sc_shrink(rs);
3889 +
3890 + /* Update region hash states before we go any further. */
3891 + dm_rh_update_states(rs->recover.rh, 1);
3892 +
3893 + /* Try to recover regions. */
3894 + if (RSRecover(rs))
3895 + do_recovery(rs);
3896 +
3897 + /* More endios -> process. */
3898 + if (!stripe_endio_empty(&rs->sc)) {
3899 + atomic_inc(rs->stats + S_REDO);
3900 + goto redo;
3901 + }
3902 +
3903 + /* Quickly grab all new ios queued and add them to the work list. */
3904 + spin_lock_irq(lock);
3905 + bio_list_merge(ios, ios_in);
3906 + bio_list_init(ios_in);
3907 + spin_unlock_irq(lock);
3908 +
3909 + /* Let's assume we're operational most of the time ;-). */
3910 + if (likely(raid_set_operational(rs))) {
3911 + /* If we got ios, work them into the cache. */
3912 + if (!bio_list_empty(ios)) {
3913 + do_ios(rs, ios);
3914 + do_unplug(rs); /* Unplug the set's device queues. */
3915 + }
3916 +
3917 + do_flush(rs); /* Flush any stripes on io list. */
3918 + do_unplug(rs); /* Unplug the set's device queues. */
3919 + do_busy_event(rs); /* Check if we got too busy. */
3920 +
3921 + /* More endios -> process. */
3922 + if (!stripe_endio_empty(&rs->sc)) {
3923 + atomic_inc(rs->stats + S_REDO);
3924 + goto redo;
3925 + }
3926 + } else
3927 + /* No way to reconstruct data with too many devices failed. */
3928 + bio_list_fail(rs, NULL, ios);
3929 +}
3930 +
3931 +/*
3932 + * Callback for region hash to dispatch
3933 + * delayed bios queued to recovered regions
3934 + * (Gets called via rh_update_states()).
3935 + */
3936 +static void dispatch_delayed_bios(void *context, struct bio_list *bl, int dummy)
3937 +{
3938 + struct raid_set *rs = context;
3939 + struct bio *bio;
3940 +
3941 + /* REMOVEME: decrement pending delayed bios counter. */
3942 + bio_list_for_each(bio, bl)
3943 + atomic_dec(rs->stats + S_DELAYED_BIOS);
3944 +
3945 + /* Merge region hash private list to work list. */
3946 + bio_list_merge_head(&rs->io.work, bl);
3947 + bio_list_init(bl);
3948 + ClearRSBandwidth(rs);
3949 +}
3950 +
3951 +/*************************************************************
3952 + * Constructor helpers
3953 + *************************************************************/
3954 +/* Calculate MB/sec. */
3955 +static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
3956 +{
3957 + return to_bytes(speed * rs->set.data_devs *
3958 + rs->recover.io_size * HZ >> 10) >> 10;
3959 +}
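For orientation, mbpers() just converts the measured xor rate into MB/s: speed is the number of whole-stripe xors per jiffy (as measured by xor_speed() below), so speed * data_devs * recover.io_size * HZ is sectors per second, and to_bytes() (sectors to bytes, i.e. << 9) together with the two >> 10 shifts scales that to MB/s. A purely illustrative calculation with assumed values (HZ = 250, 4 data devices, recovery io size of 64 sectors, measured speed = 100 xors per tick): 100 * 4 * 64 * 250 = 6,400,000 sectors/s, i.e. 6,400,000 * 512 bytes / 2^20 = 3125 MB/s, which is exactly what the expression above yields for these numbers.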
3960 +
3961 +/*
3962 + * Discover fastest xor algorithm and # of chunks combination.
3963 + */
3964 +/* Calculate speed for algorithm and # of chunks. */
3965 +static INLINE unsigned xor_speed(struct stripe *stripe)
3966 +{
3967 + unsigned r = 0;
3968 + unsigned long j;
3969 +
3970 + /* Wait for next tick. */
3971 + for (j = jiffies; j == jiffies;)
3972 + ;
3973 +
3974 + /* Do xors for a full tick. */
3975 + for (j = jiffies; j == jiffies;) {
3976 + mb();
3977 + common_xor(stripe, stripe->io.size, 0, 0);
3978 + mb();
3979 + r++;
3980 + mb();
3981 + }
3982 +
3983 + return r;
3984 +}
3985 +
3986 +/* Optimize xor algorithm for this RAID set. */
3987 +static unsigned xor_optimize(struct raid_set *rs)
3988 +{
3989 + unsigned chunks_max = 2, speed_max = 0;
3990 + struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3991 + struct stripe *stripe;
3992 +
3993 + BUG_ON(list_empty(&rs->recover.stripes));
3994 + stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3995 + lists[LIST_RECOVER]);
3996 +
3997 + /*
3998 + * Got to allow io on all chunks, so that
3999 + * xor() will actually work on them.
4000 + */
4001 + stripe_allow_io(stripe);
4002 +
4003 + /* Try all xor functions. */
4004 + while (f-- > xor_funcs) {
4005 + unsigned speed;
4006 +
4007 + /* Set actual xor function for common_xor(). */
4008 + rs->xor.f = f;
4009 + rs->xor.chunks = XOR_CHUNKS_MAX + 1;
4010 +
4011 + while (rs->xor.chunks-- > 2) {
4012 + speed = xor_speed(stripe);
4013 + if (speed > speed_max) {
4014 + speed_max = speed;
4015 + chunks_max = rs->xor.chunks;
4016 + f_max = f;
4017 + }
4018 + }
4019 + }
4020 +
4021 + /* Memorize optimum parameters. */
4022 + rs->xor.f = f_max;
4023 + rs->xor.chunks = chunks_max;
4024 + return speed_max;
4025 +}
4026 +
4027 +/*
4028 + * Allocate a RAID context (a RAID set)
4029 + */
4030 +static int
4031 +context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
4032 + unsigned stripes, unsigned chunk_size, unsigned io_size,
4033 + unsigned recover_io_size, unsigned raid_devs,
4034 + sector_t sectors_per_dev,
4035 + struct dm_target *ti, unsigned dl_parms, char **argv)
4036 +{
4037 + int r;
4038 + unsigned p;
4039 + size_t len;
4040 + sector_t region_size, ti_len;
4041 + struct raid_set *rs = NULL;
4042 + struct dm_dirty_log *dl;
4043 + struct recover *rec;
4044 +
4045 + /*
4046 + * Create the dirty log
4047 + *
4048 + * We need to change length for the dirty log constructor,
4049 + * because we want the number of regions derived from the
4050 + * single device size, so that we can keep the region
4051 + * size = 2^^n independent of the number of devices
4052 + */
4053 + ti_len = ti->len;
4054 + ti->len = sectors_per_dev;
4055 + dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
4056 + ti->len = ti_len;
4057 + if (!dl)
4058 + goto bad_dirty_log;
4059 +
4060 + /* Chunk size *must* not be larger than region size. */
4061 + region_size = dl->type->get_region_size(dl);
4062 + if (chunk_size > region_size)
4063 + goto bad_chunk_size;
4064 +
4065 + /* Recover io size *must* not be larger than region size either. */
4066 + if (recover_io_size > region_size)
4067 + goto bad_recover_io_size;
4068 +
4069 + /* Size and allocate the RAID set structure. */
4070 + len = sizeof(*rs->data) + sizeof(*rs->dev);
4071 + if (array_too_big(sizeof(*rs), len, raid_devs))
4072 + goto bad_array;
4073 +
4074 + len = sizeof(*rs) + raid_devs * len;
4075 + rs = kzalloc(len, GFP_KERNEL);
4076 + if (!rs)
4077 + goto bad_alloc;
4078 +
4079 + rec = &rs->recover;
4080 + atomic_set(&rs->io.in_process, 0);
4081 + atomic_set(&rs->io.in_process_max, 0);
4082 + rec->io_size = recover_io_size;
4083 +
4084 + /* Pointer to data array. */
4085 + rs->data = (unsigned long **)
4086 + ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
4087 + rec->dl = dl;
4088 + rs->set.raid_devs = p = raid_devs;
4089 + rs->set.data_devs = raid_devs - raid_type->parity_devs;
4090 + rs->set.raid_type = raid_type;
4091 +
4092 + /*
4093 + * Set chunk and io size and respective shifts
4094 + * (used to avoid divisions)
4095 + */
4096 + rs->set.chunk_size = chunk_size;
4097 + rs->set.chunk_mask = chunk_size - 1;
4098 + rs->set.chunk_shift = ffs(chunk_size) - 1;
4099 +
4100 + rs->set.io_size = io_size;
4101 + rs->set.io_mask = io_size - 1;
4102 + rs->set.io_shift = ffs(io_size) - 1;
4103 + rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
4104 +
4105 + rs->set.pages_per_io = chunk_pages(io_size);
4106 + rs->set.sectors_per_dev = sectors_per_dev;
4107 +
4108 + rs->set.ei = -1; /* Indicate no failed device. */
4109 + atomic_set(&rs->set.failed_devs, 0);
4110 +
4111 + rs->ti = ti;
4112 +
4113 + atomic_set(rec->io_count + IO_WORK, 0);
4114 + atomic_set(rec->io_count + IO_RECOVER, 0);
4115 +
4116 + /* Initialize io lock and queues. */
4117 + spin_lock_init(&rs->io.in_lock);
4118 + bio_list_init(&rs->io.in);
4119 + bio_list_init(&rs->io.work);
4120 +
4121 + init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
4122 +
4123 + rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
4124 + rec->rh = dm_rh_client_create(MAX_RECOVER, dispatch_delayed_bios, rs,
4125 + wake_do_raid, rs, dl, region_size,
4126 + rs->recover.nr_regions);
4127 + if (IS_ERR(rec->rh))
4128 + goto bad_rh;
4129 +
4130 + /* Initialize stripe cache. */
4131 + r = sc_init(rs, stripes);
4132 + if (r)
4133 + goto bad_sc;
4134 +
4135 + /* Create dm-io client context. */
4136 + rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
4137 + rs->set.pages_per_io);
4138 + if (IS_ERR(rs->sc.dm_io_client))
4139 + goto bad_dm_io_client;
4140 +
4141 + /* REMOVEME: statistics. */
4142 + stats_reset(rs);
4143 + ClearRSDevelStats(rs); /* Disable development statistics. */
4144 +
4145 + *raid_set = rs;
4146 + return 0;
4147 +
4148 +bad_dirty_log:
4149 + TI_ERR_RET("Error creating dirty log", -ENOMEM);
4150 +
4151 +
4152 +bad_chunk_size:
4153 + dm_dirty_log_destroy(dl);
4154 + TI_ERR("Chunk size larger than region size");
4155 +
4156 +bad_recover_io_size:
4157 + dm_dirty_log_destroy(dl);
4158 + TI_ERR("Recover stripe io size larger than region size");
4159 +
4160 +bad_array:
4161 + dm_dirty_log_destroy(dl);
4162 + TI_ERR("Array too big");
4163 +
4164 +bad_alloc:
4165 + dm_dirty_log_destroy(dl);
4166 + TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
4167 +
4168 +bad_rh:
4169 + dm_dirty_log_destroy(dl);
4170 + ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
4171 + goto free_rs;
4172 +
4173 +bad_sc:
4174 + ti->error = DM_MSG_PREFIX "Error creating stripe cache";
4175 + goto free;
4176 +
4177 +bad_dm_io_client:
4178 + ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
4179 +free:
4180 + sc_exit(&rs->sc);
4181 + /* Destroying the region hash destroys the dirty log as well. */
4182 + dm_rh_client_destroy(rec->rh);
4183 +free_rs:
4184 + kfree(rs);
4185 + return -ENOMEM;
4186 +}
4187 +
4188 +/* Free a RAID context (a RAID set). */
4189 +static void
4190 +context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
4191 +{
4192 + while (r--)
4193 + dm_put_device(ti, rs->dev[r].dev);
4194 +
4195 + dm_io_client_destroy(rs->sc.dm_io_client);
4196 + sc_exit(&rs->sc);
4197 + dm_rh_client_destroy(rs->recover.rh);
4198 + dm_dirty_log_destroy(rs->recover.dl);
4199 + kfree(rs);
4200 +}
4201 +
4202 +/* Create work queue and initialize work. */
4203 +static int rs_workqueue_init(struct raid_set *rs)
4204 +{
4205 + struct dm_target *ti = rs->ti;
4206 +
4207 + rs->io.wq = create_singlethread_workqueue(DAEMON);
4208 + if (!rs->io.wq)
4209 + TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
4210 +
4211 + INIT_DELAYED_WORK(&rs->io.dws, do_raid);
4212 + return 0;
4213 +}
4214 +
4215 +/* Return pointer to raid_type structure for raid name. */
4216 +static struct raid_type *get_raid_type(char *name)
4217 +{
4218 + struct raid_type *r = ARRAY_END(raid_types);
4219 +
4220 + while (r-- > raid_types) {
4221 + if (!strnicmp(STR_LEN(r->name, name)))
4222 + return r;
4223 + }
4224 +
4225 + return NULL;
4226 +}
4227 +
4228 +/* FIXME: factor out to dm core. */
4229 +static int multiple(sector_t a, sector_t b, sector_t *n)
4230 +{
4231 + sector_t r = a;
4232 +
4233 + sector_div(r, b);
4234 + *n = r;
4235 + return a == r * b;
4236 +}
4237 +
4238 +/* Log RAID set information to kernel log. */
4239 +static void raid_set_log(struct raid_set *rs, unsigned speed)
4240 +{
4241 + unsigned p;
4242 + char buf[BDEVNAME_SIZE];
4243 +
4244 + for (p = 0; p < rs->set.raid_devs; p++)
4245 + DMINFO("/dev/%s is raid disk %u",
4246 + bdevname(rs->dev[p].dev->bdev, buf), p);
4247 +
4248 + DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
4249 + rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
4250 + atomic_read(&rs->sc.stripes));
4251 + DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
4252 + rs->xor.chunks, mbpers(rs, speed));
4253 + DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
4254 + rs->set.data_devs, rs->set.raid_devs);
4255 +}
4256 +
4257 +/* Get all devices and offsets. */
4258 +static int
4259 +dev_parms(struct dm_target *ti, struct raid_set *rs,
4260 + char **argv, int *p)
4261 +{
4262 + for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
4263 + int r;
4264 + unsigned long long tmp;
4265 + struct raid_dev *dev = rs->dev + *p;
4266 + union dev_lookup dl = {.dev = dev };
4267 +
4268 + /* Get offset and device. */
4269 + r = sscanf(argv[1], "%llu", &tmp);
4270 + if (r != 1)
4271 + TI_ERR("Invalid RAID device offset parameter");
4272 +
4273 + dev->start = tmp;
4274 + r = dm_get_device(ti, argv[0], dev->start,
4275 + rs->set.sectors_per_dev,
4276 + dm_table_get_mode(ti->table), &dev->dev);
4277 + if (r)
4278 + TI_ERR_RET("RAID device lookup failure", r);
4279 +
4280 + r = raid_dev_lookup(rs, bynumber, &dl);
4281 + if (r != -ENODEV && r < *p) {
4282 + (*p)++; /* Ensure dm_put_device() on actual device. */
4283 + TI_ERR_RET("Duplicate RAID device", -ENXIO);
4284 + }
4285 + }
4286 +
4287 + return 0;
4288 +}
4289 +
4290 +/* Set recovery bandwidth. */
4291 +static INLINE void
4292 +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
4293 +{
4294 + rs->recover.bandwidth = bandwidth;
4295 + rs->recover.bandwidth_work = 100 / bandwidth;
4296 +}
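Derived directly from the assignment above (note the integer division): a recovery bandwidth of 20% gives bandwidth_work = 100 / 20 = 5, 50% gives 2, and any value from 51% to 100% truncates to 1. The bandwidth argument itself is a percentage that the callers shown here (raid_variable_parms() and bandwidth_change()) validate against BANDWIDTH_MIN/BANDWIDTH_MAX before calling in.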
4297 +
4298 +/* Handle variable number of RAID parameters. */
4299 +static int
4300 +raid_variable_parms(struct dm_target *ti, char **argv,
4301 + unsigned i, int *raid_parms,
4302 + int *chunk_size, int *chunk_size_parm,
4303 + int *stripes, int *stripes_parm,
4304 + int *io_size, int *io_size_parm,
4305 + int *recover_io_size, int *recover_io_size_parm,
4306 + int *bandwidth, int *bandwidth_parm)
4307 +{
4308 + /* Fetch # of variable raid parameters. */
4309 + if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
4310 + !range_ok(*raid_parms, 0, 5))
4311 + TI_ERR("Bad variable raid parameters number");
4312 +
4313 + if (*raid_parms) {
4314 + /*
4315 + * If we've got variable RAID parameters,
4316 + * chunk size is the first one
4317 + */
4318 + if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
4319 + (*chunk_size != -1 &&
4320 + (!POWER_OF_2(*chunk_size) ||
4321 + !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
4322 + TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
4323 +
4324 + *chunk_size_parm = *chunk_size;
4325 + if (*chunk_size == -1)
4326 + *chunk_size = CHUNK_SIZE;
4327 +
4328 + /*
4329 + * In case we've got 2 or more variable raid
4330 + * parameters, the number of stripes is the second one
4331 + */
4332 + if (*raid_parms > 1) {
4333 + if (sscanf(argv[i++], "%d", stripes) != 1 ||
4334 + (*stripes != -1 &&
4335 + !range_ok(*stripes, STRIPES_MIN,
4336 + STRIPES_MAX)))
4337 + TI_ERR("Invalid number of stripes: must "
4338 + "be >= 8 and <= 8192");
4339 + }
4340 +
4341 + *stripes_parm = *stripes;
4342 + if (*stripes == -1)
4343 + *stripes = STRIPES;
4344 +
4345 + /*
4346 + * In case we've got 3 or more variable raid
4347 + * parameters, the io size is the third one.
4348 + */
4349 + if (*raid_parms > 2) {
4350 + if (sscanf(argv[i++], "%d", io_size) != 1 ||
4351 + (*io_size != -1 &&
4352 + (!POWER_OF_2(*io_size) ||
4353 + !range_ok(*io_size, IO_SIZE_MIN,
4354 + min(BIO_MAX_SECTORS / 2,
4355 + *chunk_size)))))
4356 + TI_ERR("Invalid io size; must "
4357 + "be 2^^n and less equal "
4358 + "min(BIO_MAX_SECTORS/2, chunk size)");
4359 + } else
4360 + *io_size = *chunk_size;
4361 +
4362 + *io_size_parm = *io_size;
4363 + if (*io_size == -1)
4364 + *io_size = *chunk_size;
4365 +
4366 + /*
4367 + * In case we've got 4 variable raid parameters,
4368 + * the recovery stripe io_size is the fourth one
4369 + */
4370 + if (*raid_parms > 3) {
4371 + if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
4372 + (*recover_io_size != -1 &&
4373 + (!POWER_OF_2(*recover_io_size) ||
4374 + !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
4375 + BIO_MAX_SECTORS / 2))))
4376 + TI_ERR("Invalid recovery io size; must be "
4377 + "2^^n and less equal BIO_MAX_SECTORS/2");
4378 + }
4379 +
4380 + *recover_io_size_parm = *recover_io_size;
4381 + if (*recover_io_size == -1)
4382 + *recover_io_size = RECOVER_IO_SIZE;
4383 +
4384 + /*
4385 + * In case we've got 5 variable raid parameters,
4386 + * the recovery io bandwidth is the fifth one
4387 + */
4388 + if (*raid_parms > 4) {
4389 + if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
4390 + (*bandwidth != -1 &&
4391 + !range_ok(*bandwidth, BANDWIDTH_MIN,
4392 + BANDWIDTH_MAX)))
4393 + TI_ERR("Invalid recovery bandwidth "
4394 + "percentage; must be > 0 and <= 100");
4395 + }
4396 +
4397 + *bandwidth_parm = *bandwidth;
4398 + if (*bandwidth == -1)
4399 + *bandwidth = BANDWIDTH;
4400 + }
4401 +
4402 + return 0;
4403 +}
4404 +
4405 +/* Parse optional locking parameters. */
4406 +static int
4407 +raid_locking_parms(struct dm_target *ti, char **argv,
4408 + unsigned i, int *locking_parms,
4409 + struct dm_raid45_locking_type **locking_type)
4410 +{
4411 + *locking_parms = 0;
4412 + *locking_type = &locking_none;
4413 +
4414 + if (!strnicmp(argv[i], "none", strlen(argv[i])))
4415 + *locking_parms = 1;
4416 + else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
4417 + *locking_type = &locking_none;
4418 + *locking_parms = 2;
4419 + } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
4420 + *locking_type = &locking_cluster;
4421 + /* FIXME: namespace. */
4422 + *locking_parms = 3;
4423 + }
4424 +
4425 + return *locking_parms == 1 ? -EINVAL : 0;
4426 +}
4427 +
4428 +/* Set backing device information properties of RAID set. */
4429 +static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
4430 +{
4431 + unsigned p, ra_pages;
4432 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4433 + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
4434 +
4435 + /* Set read-ahead for the RAID set and the component devices. */
4436 + bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
4437 + ra_pages = chunks * chunk_pages(rs->set.io_size);
4438 + for (p = rs->set.raid_devs; p--; ) {
4439 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
4440 +
4441 + q->backing_dev_info.ra_pages = ra_pages;
4442 + }
4443 +
4444 + /* Set congested function and data. */
4445 + bdi->congested_fn = raid_set_congested;
4446 + bdi->congested_data = rs;
4447 +
4448 + dm_put(md);
4449 +}
4450 +
4451 +/* Get backing device information properties of RAID set. */
4452 +static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
4453 +{
4454 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4455 +
4456 + *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
4457 + / stripe_pages(rs, rs->set.io_size);
4458 + *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
4459 + / chunk_pages(rs->set.io_size);
4460 +
4461 + dm_put(md);
4462 +}
4463 +
4464 +/*
4465 + * Construct a RAID4/5 mapping:
4466 + *
4467 + * log_type #log_params <log_params> \
4468 + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
4469 + * [locking "none"/"cluster"]
4470 + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
4471 + *
4472 + * log_type = "core"/"disk",
4473 + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
4474 + * log_params = [dirty_log_path] region_size [[no]sync]
4475 + *
4476 + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
4477 + *
4478 + * #parity_dev = N if raid_type = "raid4"
4479 + * o N = -1: pick default = last device
4480 + * o N >= 0 and < #raid_devs: parity device index
4481 + *
4482 + * #raid_variable_params = 0-5; raid_params (-1 = default):
4483 + * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
4484 + * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
4485 + * and <= CHUNK_SIZE_MAX)
4486 + * o #stripes is number of stripes allocated to stripe cache
4487 + * (must be > 1 and < STRIPES_MAX)
4488 + * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
4489 + * o recover_io_size (io unit size per device for recovery in sectors;
4490 + *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4491 + * o %recovery_bandwidth is the maximum percentage of io bandwidth
4492 + *   spent on recovery during application io (1-100%)
4493 + * If raid_variable_params = 0, defaults will be used.
4494 + * Any raid_variable_param can be set to -1 to apply a default
4495 + *
4496 + * #raid_devs = N (N >= 3)
4497 + *
4498 + * #dev_to_initialize = N
4499 + * -1: initialize parity on all devices
4500 + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
4501 + * of a failed device's content after replacement
4502 + *
4503 + * <dev_path> = device_path (eg, /dev/sdd1)
4504 + * <offset> = begin at offset on <dev_path>
4505 + *
4506 + */
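To make the grammar above concrete, here is a purely illustrative mapping; the device names, offsets, target length and the mapped-device name "r5" are hypothetical, and all variable RAID parameters are left at their defaults. A three-disk raid5_ls set over 2097152 sectors, with a core dirty log using an 8192-sector region size and nosync, could be loaded with a table of the form

    echo "0 2097152 raid45 core 2 8192 nosync raid5_ls 0 3 -1 /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0" | dmsetup create r5

i.e. log_type "core" with 2 log parameters (region size, nosync), raid_type "raid5_ls" (no parity index, which only applies to raid4), 0 variable RAID parameters, no locking keyword, 3 raid devices, -1 for "no device to initialize", followed by the three <dev_path> <offset> pairs.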
4507 +#define MIN_PARMS 13
4508 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4509 +{
4510 + int bandwidth = BANDWIDTH, bandwidth_parm = -1,
4511 + chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
4512 + dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
4513 + i, io_size = IO_SIZE, io_size_parm = -1,
4514 + r, raid_devs, raid_parms,
4515 + recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
4516 + stripes = STRIPES, stripes_parm = -1;
4517 + unsigned speed;
4518 + sector_t tmp, sectors_per_dev;
4519 + struct dm_raid45_locking_type *locking;
4520 + struct raid_set *rs;
4521 + struct raid_type *raid_type;
4522 +
4523 + /* Ensure minimum number of parameters. */
4524 + if (argc < MIN_PARMS)
4525 + TI_ERR("Not enough parameters");
4526 +
4527 + /* Fetch # of dirty log parameters. */
4528 + if (sscanf(argv[1], "%d", &dl_parms) != 1
4529 + || !range_ok(dl_parms, 1, 4711))
4530 + TI_ERR("Bad dirty log parameters number");
4531 +
4532 + /* Check raid_type. */
4533 + raid_type = get_raid_type(argv[dl_parms + 2]);
4534 + if (!raid_type)
4535 + TI_ERR("Bad raid type");
4536 +
4537 + /* In case of RAID4, parity drive is selectable. */
4538 + parity_parm = !!(raid_type->level == raid4);
4539 +
4540 + /* Handle variable number of RAID parameters. */
4541 + r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
4542 + &raid_parms,
4543 + &chunk_size, &chunk_size_parm,
4544 + &stripes, &stripes_parm,
4545 + &io_size, &io_size_parm,
4546 + &recover_io_size, &recover_io_size_parm,
4547 + &bandwidth, &bandwidth_parm);
4548 + if (r)
4549 + return r;
4550 +
4551 + r = raid_locking_parms(ti, argv,
4552 + dl_parms + parity_parm + raid_parms + 4,
4553 + &locking_parms, &locking);
4554 + if (r)
4555 + return r;
4556 +
4557 + /* # of raid devices. */
4558 + i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
4559 + if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4560 + raid_devs < raid_type->minimal_devs)
4561 + TI_ERR("Invalid number of raid devices");
4562 +
4563 + /* In case of RAID4, check parity drive index is in limits. */
4564 + if (raid_type->level == raid4) {
4565 + /* Fetch index of parity device. */
4566 + if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4567 + !range_ok(pi, 0, raid_devs - 1))
4568 + TI_ERR("Invalid RAID4 parity device index");
4569 + }
4570 +
4571 + /*
4572 + * Index of device to initialize starts at 0
4573 + *
4574 + * o -1 -> don't initialize a particular device,
4575 + * o 0..raid_devs-1 -> initialize respective device
4576 + * (used for reconstruction of a replaced device)
4577 + */
4578 + if (sscanf
4579 + (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
4580 + "%d", &dev_to_init) != 1
4581 + || !range_ok(dev_to_init, -1, raid_devs - 1))
4582 + TI_ERR("Invalid number for raid device to initialize");
4583 +
4584 + /* Check # of raid device arguments. */
4585 + if (argc - dl_parms - parity_parm - raid_parms - 6 !=
4586 + 2 * raid_devs)
4587 + TI_ERR("Wrong number of raid device/offset arguments");
4588 +
4589 + /*
4590 + * Check that the table length is divisible
4591 + * without remainder by (raid_devs - parity_devs)
4592 + */
4593 + if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4594 + &sectors_per_dev))
4595 + TI_ERR
4596 + ("Target length not divisible by number of data devices");
4597 +
4598 + /*
4599 + * Check that the device size is
4600 + * divisible without remainder by the chunk size
4601 + */
4602 + if (!multiple(sectors_per_dev, chunk_size, &tmp))
4603 + TI_ERR("Device length not divisible by chunk_size");
4604 +
4605 + /****************************************************************
4606 + * Now that we checked the constructor arguments ->
4607 + * let's allocate the RAID set
4608 + ****************************************************************/
4609 + r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
4610 + recover_io_size, raid_devs, sectors_per_dev,
4611 + ti, dl_parms, argv);
4612 + if (r)
4613 + return r;
4614 +
4615 + /*
4616 + * Set these here in order to avoid passing
4617 + * too many arguments to context_alloc()
4618 + */
4619 + rs->set.dev_to_init_parm = dev_to_init;
4620 + rs->set.dev_to_init = dev_to_init;
4621 + rs->set.pi_parm = pi;
4622 + rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4623 + rs->set.raid_parms = raid_parms;
4624 + rs->set.chunk_size_parm = chunk_size_parm;
4625 + rs->set.io_size_parm = io_size_parm;
4626 + rs->sc.stripes_parm = stripes_parm;
4627 + rs->recover.io_size_parm = recover_io_size_parm;
4628 + rs->recover.bandwidth_parm = bandwidth_parm;
4629 + recover_set_bandwidth(rs, bandwidth);
4630 +
4631 + /* Use locking type to lock stripe access. */
4632 + rs->locking = locking;
4633 +
4634 + /* Get the device/offset tuples. */
4635 + argv += dl_parms + 6 + parity_parm + raid_parms;
4636 + r = dev_parms(ti, rs, argv, &i);
4637 + if (r)
4638 + goto err;
4639 +
4640 + /* Initialize recovery. */
4641 + rs->recover.start_jiffies = jiffies;
4642 + rs->recover.end_jiffies = 0;
4643 + recovery_region_reset(rs);
4644 +
4645 + /* Allow for recovery of any nosync regions. */
4646 + SetRSRecover(rs);
4647 +
4648 + /* Set backing device information (eg. read ahead). */
4649 + rs_set_bdi(rs, chunk_size * 2, io_size * 4);
4650 + SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4651 +
4652 + speed = xor_optimize(rs); /* Select best xor algorithm. */
4653 +
4654 + /* Initialize work queue to handle this RAID set's io. */
4655 + r = rs_workqueue_init(rs);
4656 + if (r)
4657 + goto err;
4658 +
4659 + raid_set_log(rs, speed); /* Log information about RAID set. */
4660 +
4661 + /*
4662 + * Make sure that dm core only hands maximum io size
4663 + * length down and pays attention to io boundaries.
4664 + */
4665 + ti->split_io = rs->set.io_size;
4666 + ti->private = rs;
4667 + return 0;
4668 +
4669 +err:
4670 + context_free(rs, ti, i);
4671 + return r;
4672 +}
4673 +
4674 +/*
4675 + * Destruct a raid mapping
4676 + */
4677 +static void raid_dtr(struct dm_target *ti)
4678 +{
4679 + struct raid_set *rs = ti->private;
4680 +
4681 + /* Indicate recovery end so that ios in flight drain. */
4682 + ClearRSRecover(rs);
4683 +
4684 + wake_do_raid(rs); /* Wake daemon. */
4685 + wait_ios(rs); /* Wait for any io still being processed. */
4686 + destroy_workqueue(rs->io.wq);
4687 + context_free(rs, ti, rs->set.raid_devs);
4688 +}
4689 +
4690 +/* Queues ios to RAID sets. */
4691 +static inline void queue_bio(struct raid_set *rs, struct bio *bio)
4692 +{
4693 + int wake;
4694 + struct bio_list *in = &rs->io.in;
4695 + spinlock_t *in_lock = &rs->io.in_lock;
4696 +
4697 + spin_lock_irq(in_lock);
4698 + wake = bio_list_empty(in);
4699 + bio_list_add(in, bio);
4700 + spin_unlock_irq(in_lock);
4701 +
4702 + /* Wake daemon if input list was empty. */
4703 + if (wake)
4704 + wake_do_raid(rs);
4705 +}
4706 +
4707 +/* Raid mapping function. */
4708 +static int raid_map(struct dm_target *ti, struct bio *bio,
4709 + union map_info *map_context)
4710 +{
4711 + /* I don't want to waste stripe cache capacity. */
4712 + if (bio_rw(bio) == READA)
4713 + return -EIO;
4714 + else {
4715 + struct raid_set *rs = ti->private;
4716 +
4717 + /* REMOVEME: statistics. */
4718 + atomic_inc(rs->stats +
4719 + (bio_data_dir(bio) == WRITE ?
4720 + S_BIOS_WRITE : S_BIOS_READ));
4721 +
4722 + /*
4723 + * Get io reference to be waiting for to drop
4724 + * to zero on device suspension/destruction.
4725 + */
4726 + io_get(rs);
4727 + bio->bi_sector -= ti->begin; /* Remap sector. */
4728 + queue_bio(rs, bio); /* Queue to the daemon. */
4729 + return DM_MAPIO_SUBMITTED; /* Handle later. */
4730 + }
4731 +}
4732 +
4733 +/* Device suspend. */
4734 +static void raid_postsuspend(struct dm_target *ti)
4735 +{
4736 + struct raid_set *rs = ti->private;
4737 + struct dm_dirty_log *dl = rs->recover.dl;
4738 +
4739 + SetRSSuspended(rs);
4740 +
4741 + if (RSRecover(rs))
4742 + dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
4743 + else
4744 + wake_do_raid(rs);
4745 +
4746 + wait_ios(rs); /* Wait for completion of all ios being processed. */
4747 + if (dl->type->postsuspend && dl->type->postsuspend(dl))
4748 + /* Suspend dirty log. */
4749 + /* FIXME: need better error handling. */
4750 + DMWARN("log suspend failed");
4751 +}
4752 +
4753 +/* Device resume. */
4754 +static void raid_resume(struct dm_target *ti)
4755 +{
4756 + struct raid_set *rs = ti->private;
4757 + struct recover *rec = &rs->recover;
4758 + struct dm_dirty_log *dl = rec->dl;
4759 +
4760 + if (dl->type->resume && dl->type->resume(dl))
4761 + /* Resume dirty log. */
4762 + /* FIXME: need better error handling. */
4763 + DMWARN("log resume failed");
4764 +
4765 + rec->nr_regions_to_recover =
4766 + rec->nr_regions - dl->type->get_sync_count(dl);
4767 +
4768 + ClearRSSuspended(rs);
4769 +
4770 + /* Reset any unfinished recovery. */
4771 + if (RSRecover(rs)) {
4772 + recovery_region_reset(rs);
4773 + dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
4774 + } else
4775 + wake_do_raid(rs);
4776 +}
4777 +
4778 +static INLINE unsigned sc_size(struct raid_set *rs)
4779 +{
4780 + return to_sector(atomic_read(&rs->sc.stripes) *
4781 + (sizeof(struct stripe) +
4782 + (sizeof(struct stripe_set) +
4783 + (sizeof(struct page_list) +
4784 + to_bytes(rs->set.io_size) *
4785 + rs->set.raid_devs)) +
4786 + (rs->recover.
4787 + end_jiffies ? 0 : to_bytes(rs->set.raid_devs *
4788 + rs->recover.
4789 + io_size))));
4790 +}
4791 +
4792 +/* REMOVEME: status output for development. */
4793 +static void
4794 +raid_devel_stats(struct dm_target *ti, char *result,
4795 + unsigned *size, unsigned maxlen)
4796 +{
4797 + unsigned chunks, stripes, sz = *size;
4798 + unsigned long j;
4799 + char buf[BDEVNAME_SIZE], *p;
4800 + struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
4801 + struct raid_set *rs = ti->private;
4802 + struct recover *rec = &rs->recover;
4803 + struct timespec ts;
4804 +
4805 + DMEMIT("%s ", version);
4806 + DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
4807 + DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
4808 +
4809 + for (sm = stats_map; sm < sm_end; sm++)
4810 + DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4811 +
4812 + DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
4813 + DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
4814 + atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
4815 + sc_size(rs));
4816 +
4817 + j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4818 + rec->start_jiffies;
4819 + jiffies_to_timespec(j, &ts);
4820 + sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4821 + p = strchr(buf, '.');
4822 + p[3] = 0;
4823 +
4824 + DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
4825 + (unsigned long long) rec->nr_regions_recovered,
4826 + RSRegionGet(rs) ? "+" : "",
4827 + (unsigned long long) rec->nr_regions_to_recover,
4828 + (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4829 +
4830 + rs_get_ra(rs, &stripes, &chunks);
4831 + DMEMIT("ra=%u/%u ", stripes, chunks);
4832 +
4833 + *size = sz;
4834 +}
4835 +
4836 +static int
4837 +raid_status(struct dm_target *ti, status_type_t type,
4838 + char *result, unsigned maxlen)
4839 +{
4840 + unsigned i, sz = 0;
4841 + char buf[BDEVNAME_SIZE];
4842 + struct raid_set *rs = ti->private;
4843 +
4844 + switch (type) {
4845 + case STATUSTYPE_INFO:
4846 + /* REMOVEME: statistics. */
4847 + if (RSDevelStats(rs))
4848 + raid_devel_stats(ti, result, &sz, maxlen);
4849 +
4850 + DMEMIT("%u ", rs->set.raid_devs);
4851 +
4852 + for (i = 0; i < rs->set.raid_devs; i++)
4853 + DMEMIT("%s ",
4854 + format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
4855 +
4856 + DMEMIT("1 ");
4857 + for (i = 0; i < rs->set.raid_devs; i++) {
4858 + DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
4859 +
4860 + if (rs->set.raid_type->level == raid4 &&
4861 + i == rs->set.pi)
4862 + DMEMIT("p");
4863 +
4864 + if (rs->set.dev_to_init == i)
4865 + DMEMIT("i");
4866 + }
4867 +
4868 + break;
4869 +
4870 + case STATUSTYPE_TABLE:
4871 + sz = rs->recover.dl->type->status(rs->recover.dl, type,
4872 + result, maxlen);
4873 + DMEMIT("%s %u ", rs->set.raid_type->name,
4874 + rs->set.raid_parms);
4875 +
4876 + if (rs->set.raid_type->level == raid4)
4877 + DMEMIT("%d ", rs->set.pi_parm);
4878 +
4879 + if (rs->set.raid_parms)
4880 + DMEMIT("%d ", rs->set.chunk_size_parm);
4881 +
4882 + if (rs->set.raid_parms > 1)
4883 + DMEMIT("%d ", rs->sc.stripes_parm);
4884 +
4885 + if (rs->set.raid_parms > 2)
4886 + DMEMIT("%d ", rs->set.io_size_parm);
4887 +
4888 + if (rs->set.raid_parms > 3)
4889 + DMEMIT("%d ", rs->recover.io_size_parm);
4890 +
4891 + if (rs->set.raid_parms > 4)
4892 + DMEMIT("%d ", rs->recover.bandwidth_parm);
4893 +
4894 + DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4895 +
4896 + for (i = 0; i < rs->set.raid_devs; i++)
4897 + DMEMIT("%s %llu ",
4898 + format_dev_t(buf,
4899 + rs->dev[i].dev->bdev->bd_dev),
4900 + (unsigned long long) rs->dev[i].start);
4901 + }
4902 +
4903 + return 0;
4904 +}
4905 +
4906 +/*
4907 + * Message interface
4908 + */
4909 +enum raid_msg_actions {
4910 + act_bw, /* Recovery bandwidth switch. */
4911 + act_dev, /* Device failure switch. */
4912 + act_overwrite, /* Stripe overwrite check. */
4913 + act_read_ahead, /* Set read ahead. */
4914 + act_stats, /* Development statistics switch. */
4915 + act_sc, /* Stripe cache switch. */
4916 +
4917 + act_on, /* Set entity on. */
4918 + act_off, /* Set entity off. */
4919 + act_reset, /* Reset entity. */
4920 +
4921 + act_set = act_on, /* Set # absolute. */
4922 + act_grow = act_off, /* Grow # by an amount. */
4923 + act_shrink = act_reset, /* Shrink # by an amount. */
4924 +};
4925 +
4926 +/* Turn a delta to absolute. */
4927 +static int _absolute(unsigned long action, int act, int r)
4928 +{
4929 + /* Make delta absolute. */
4930 + if (test_bit(act_set, &action))
4931 + ;
4932 + else if (test_bit(act_grow, &action))
4933 + r += act;
4934 + else if (test_bit(act_shrink, &action))
4935 + r = act - r;
4936 + else
4937 + r = -EINVAL;
4938 +
4939 + return r;
4940 +}
4941 +
4942 + /* Change recovery io bandwidth. */
4943 +static int bandwidth_change(struct dm_msg *msg, void *context)
4944 +{
4945 + struct raid_set *rs = context;
4946 + int act = rs->recover.bandwidth;
4947 + int bandwidth = DM_MSG_INT_ARG(msg);
4948 +
4949 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4950 + /* Make delta bandwidth absolute. */
4951 + bandwidth = _absolute(msg->action, act, bandwidth);
4952 +
4953 + /* Check range. */
4954 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4955 + recover_set_bandwidth(rs, bandwidth);
4956 + return 0;
4957 + }
4958 + }
4959 +
4960 + set_bit(dm_msg_ret_arg, &msg->ret);
4961 + set_bit(dm_msg_ret_inval, &msg->ret);
4962 + return -EINVAL;
4963 +}
4964 +
4965 +/* Change state of a device (running/offline). */
4966 +/* FIXME: this only works while recovering! */
4967 +static int device_state(struct dm_msg *msg, void *context)
4968 +{
4969 + int r;
4970 + const char *str = "is already ";
4971 + union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
4972 + struct raid_set *rs = context;
4973 +
4974 + r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
4975 + bymajmin : byname, &dl);
4976 + if (r == -ENODEV) {
4977 + DMERR("device %s is no member of this set", dl.dev_name);
4978 + return r;
4979 + }
4980 +
4981 + if (test_bit(act_off, &msg->action)) {
4982 + if (dev_operational(rs, r))
4983 + str = "";
4984 + } else if (!dev_operational(rs, r))
4985 + str = "";
4986 +
4987 + DMINFO("/dev/%s %s%s", dl.dev_name, str,
4988 + test_bit(act_off, &msg->action) ? "offline" : "running");
4989 +
4990 + return test_bit(act_off, &msg->action) ?
4991 + raid_set_check_and_degrade(rs, NULL, r) :
4992 + raid_set_check_and_upgrade(rs, r);
4993 +}
4994 +
4995 +/* Set/reset development feature flags. */
4996 +static int devel_flags(struct dm_msg *msg, void *context)
4997 +{
4998 + struct raid_set *rs = context;
4999 +
5000 + if (test_bit(act_on, &msg->action))
5001 + return test_and_set_bit(msg->spec->parm,
5002 + &rs->io.flags) ? -EPERM : 0;
5003 + else if (test_bit(act_off, &msg->action))
5004 + return test_and_clear_bit(msg->spec->parm,
5005 + &rs->io.flags) ? 0 : -EPERM;
5006 + else if (test_bit(act_reset, &msg->action)) {
5007 + if (test_bit(act_stats, &msg->action)) {
5008 + stats_reset(rs);
5009 + goto on;
5010 + } else if (test_bit(act_overwrite, &msg->action)) {
5011 +on:
5012 + set_bit(msg->spec->parm, &rs->io.flags);
5013 + return 0;
5014 + }
5015 + }
5016 +
5017 + return -EINVAL;
5018 +}
5019 +
5020 + /* Set stripe and chunk read ahead pages. */
5021 +static int read_ahead_set(struct dm_msg *msg, void *context)
5022 +{
5023 + int stripes = DM_MSG_INT_ARGS(msg, 0);
5024 + int chunks = DM_MSG_INT_ARGS(msg, 1);
5025 +
5026 + if (range_ok(stripes, 1, 512) &&
5027 + range_ok(chunks, 1, 512)) {
5028 + rs_set_bdi(context, stripes, chunks);
5029 + return 0;
5030 + }
5031 +
5032 + set_bit(dm_msg_ret_arg, &msg->ret);
5033 + set_bit(dm_msg_ret_inval, &msg->ret);
5034 + return -EINVAL;
5035 +}
5036 +
5037 +/* Resize the stripe cache. */
5038 +static int stripecache_resize(struct dm_msg *msg, void *context)
5039 +{
5040 + int act, stripes;
5041 + struct raid_set *rs = context;
5042 +
5043 + /* Deny permission while the daemon is still shrinking. */
5044 + if (atomic_read(&rs->sc.stripes_to_shrink))
5045 + return -EPERM;
5046 +
5047 + stripes = DM_MSG_INT_ARG(msg);
5048 + if (stripes > 0) {
5049 + act = atomic_read(&rs->sc.stripes);
5050 +
5051 + /* Make delta stripes absolute. */
5052 + stripes = _absolute(msg->action, act, stripes);
5053 +
5054 + /*
5055 + * Check range and that the # of stripes changes.
5056 + * We can grow from here but need to leave any
5057 + * shrinking to the worker for synchronization.
5058 + */
5059 + if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
5060 + if (stripes > act)
5061 + return sc_grow(&rs->sc, stripes - act, SC_GROW);
5062 + else if (stripes < act) {
5063 + atomic_set(&rs->sc.stripes_to_shrink,
5064 + act - stripes);
5065 + wake_do_raid(rs);
5066 + }
5067 +
5068 + return 0;
5069 + }
5070 + }
5071 +
5072 + set_bit(dm_msg_ret_arg, &msg->ret);
5073 + set_bit(dm_msg_ret_inval, &msg->ret);
5074 + return -EINVAL;
5075 +}
5076 +
5077 +/* Parse the RAID message action. */
5078 +/*
5079 + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'         # e.g. 'ba se 50'
5080 + * 'de[vice] o[ffline]/r[unning] DevName/maj:min'  # e.g. 'device o /dev/sda'
5081 + * 'o[verwrite] {on,of[f],r[eset]}'                # e.g. 'o of'
5082 + * 'r[ead_ahead] set #stripes #chunks'             # e.g. 'r se 3 2'
5083 + * 'sta[tistics] {on,of[f],r[eset]}'               # e.g. 'stat of'
5084 + * 'str[ipecache] {se[t],g[row],sh[rink]} #'       # e.g. 'stripe set 1024'
5085 + *
5086 + */
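For illustration, with a hypothetical mapped device named "r5" (the grammar and the prefix abbreviations are exactly those documented above), typical invocations would look like

    dmsetup message r5 0 bandwidth set 40
    dmsetup message r5 0 stripecache grow 1024
    dmsetup message r5 0 statistics reset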
5087 +static int
5088 +raid_message(struct dm_target *ti, unsigned argc, char **argv)
5089 +{
5090 + /* Variables to store the parsed parameters in. */
5091 + static int i[2];
5092 + static unsigned long *i_arg[] = {
5093 + (unsigned long *) i + 0,
5094 + (unsigned long *) i + 1,
5095 + };
5096 + static char *p;
5097 + static unsigned long *p_arg[] = { (unsigned long *) &p };
5098 +
5099 + /* Declare all message option strings. */
5100 + static char *str_sgs[] = { "set", "grow", "shrink" };
5101 + static char *str_dev[] = { "running", "offline" };
5102 + static char *str_oor[] = { "on", "off", "reset" };
5103 +
5104 + /* Declare all actions. */
5105 + static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
5106 + static unsigned long act_oor[] = { act_on, act_off, act_reset };
5107 +
5108 + /* Bandwidth option. */
5109 + static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
5110 + static struct dm_message_argument bw_args = {
5111 + 1, i_arg, { dm_msg_int_t }
5112 + };
5113 +
5114 + /* Device option. */
5115 + static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
5116 + static struct dm_message_argument dev_args = {
5117 + 1, p_arg, { dm_msg_base_t }
5118 + };
5119 +
5120 + /* Read ahead option. */
5121 + static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
5122 + static struct dm_message_argument ra_args = {
5123 + 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
5124 + };
5125 +
5126 + static struct dm_message_argument null_args = {
5127 + 0, NULL, { dm_msg_int_t }
5128 + };
5129 +
5130 + /* Overwrite and statistics option. */
5131 + static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
5132 +
5133 + /* Stripecache option. */
5134 + static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
5135 +
5136 + /* Declare messages. */
5137 + static struct dm_msg_spec specs[] = {
5138 + { "bandwidth", act_bw, &bw_opt, &bw_args,
5139 + 0, bandwidth_change },
5140 + { "device", act_dev, &dev_opt, &dev_args,
5141 + 0, device_state },
5142 + { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
5143 + RS_CHECK_OVERWRITE, devel_flags },
5144 + { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
5145 + 0, read_ahead_set },
5146 + { "statistics", act_stats, &ovr_stats_opt, &null_args,
5147 + RS_DEVEL_STATS, devel_flags },
5148 + { "stripecache", act_sc, &stripe_opt, &bw_args,
5149 + 0, stripecache_resize },
5150 + };
5151 +
5152 + /* The message for the parser. */
5153 + struct dm_msg msg = {
5154 + .num_specs = ARRAY_SIZE(specs),
5155 + .specs = specs,
5156 + };
5157 +
5158 + return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
5159 +}
5160 +/*
5161 + * END message interface
5162 + */
5163 +
5164 +static struct target_type raid_target = {
5165 + .name = "raid45",
5166 + .version = {1, 0, 0},
5167 + .module = THIS_MODULE,
5168 + .ctr = raid_ctr,
5169 + .dtr = raid_dtr,
5170 + .map = raid_map,
5171 + .postsuspend = raid_postsuspend,
5172 + .resume = raid_resume,
5173 + .status = raid_status,
5174 + .message = raid_message,
5175 +};
5176 +
5177 +static void init_exit(const char *bad_msg, const char *good_msg, int r)
5178 +{
5179 + if (r)
5180 + DMERR("Failed to %sregister target [%d]", bad_msg, r);
5181 + else
5182 + DMINFO("%s %s", good_msg, version);
5183 +}
5184 +
5185 +static int __init dm_raid_init(void)
5186 +{
5187 + int r;
5188 +
5189 + r = dm_register_target(&raid_target);
5190 + init_exit("", "initialized", r);
5191 + return r;
5192 +}
5193 +
5194 +static void __exit dm_raid_exit(void)
5195 +{
5196 + int r;
5197 +
5198 + r = dm_unregister_target(&raid_target);
5199 + init_exit("un", "exit", r);
5200 +}
5201 +
5202 +/* Module hooks. */
5203 +module_init(dm_raid_init);
5204 +module_exit(dm_raid_exit);
5205 +
5206 +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
5207 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
5208 +MODULE_LICENSE("GPL");
5209 --- /dev/null
5210 +++ b/drivers/md/dm-raid45.h
5211 @@ -0,0 +1,28 @@
5212 +/*
5213 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
5214 + *
5215 + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
5216 + *
5217 + * Locking definitions for the device-mapper RAID45 target.
5218 + *
5219 + * This file is released under the GPL.
5220 + *
5221 + */
5222 +
5223 +#ifndef _DM_RAID45_H
5224 +#define _DM_RAID45_H
5225 +
5226 +/* Factor out to dm.h! */
5227 +#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr))
5228 +
5229 +enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
5230 +
5231 +struct dm_raid45_locking_type {
5232 + /* Request a lock on a stripe. */
5233 + void* (*lock)(sector_t key, enum dm_lock_type type);
5234 +
5235 + /* Release a lock on a stripe. */
5236 + void (*unlock)(void *lock_handle);
5237 +};
5238 +
5239 +#endif
5240 --- /dev/null
5241 +++ b/drivers/md/dm-regions.c
5242 @@ -0,0 +1,723 @@
5243 +/*
5244 + * Copyright (C) 2003 Sistina Software Limited.
5245 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5246 + *
5247 + * This file is released under the GPL.
5248 + */
5249 +
5250 +#include <linux/dm-dirty-log.h>
5251 +#include <linux/dm-regions.h>
5252 +
5253 +#include <linux/ctype.h>
5254 +#include <linux/init.h>
5255 +#include <linux/module.h>
5256 +#include <linux/vmalloc.h>
5257 +
5258 +#include "dm.h"
5259 +#include "dm-bio-list.h"
5260 +
5261 +#define DM_MSG_PREFIX "region hash"
5262 +
5263 +/*-----------------------------------------------------------------
5264 + * Region hash
5265 + *
5266 + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions.
5267 + * Each region can be in one of three states:
5268 + *
5269 + * o clean
5270 + * o dirty,
5271 + * o nosync.
5272 + *
5273 + * There is no need to put clean regions in the hash.
5274 + *
5275 + *
5276 + * In addition to being present in the hash table a region _may_
5277 + * be present on one of three lists.
5278 + *
5279 + * clean_regions: Regions on this list have no io pending to
5280 + * them, they are in sync, we are no longer interested in them,
5281 + * they are dull. dm_rh_update_states() will remove them from the
5282 + * hash table.
5283 + *
5284 + * quiesced_regions: These regions have been spun down, ready
5285 + * for recovery. dm_rh_recovery_start() will remove regions from
5286 + * this list and hand them to the caller, which will schedule the
5287 + * recovery io.
5288 + *
5289 + * recovered_regions: Regions that the caller has successfully
5290 + * recovered. dm_rh_update_states() will now schedule any delayed
5291 + * io, up the recovery_count, and remove the region from the hash.
5292 + *
5293 + * There are 2 locks:
5294 + * A rw spin lock 'hash_lock' protects just the hash table,
5295 + * this is never held in write mode from interrupt context,
5296 + * which I believe means that we only have to disable irqs when
5297 + * doing a write lock.
5298 + *
5299 + * An ordinary spin lock 'region_lock' that protects the three
5300 + * lists in the region_hash, with the 'state', 'list' and
5301 + * 'delayed_bios' fields of the regions. This is used from irq
5302 + * context, so all other uses will have to suspend local irqs.
5303 + *---------------------------------------------------------------*/
5304 +struct region_hash {
5305 + unsigned max_recovery; /* Max # of regions to recover in parallel */
5306 +
5307 + /* Callback function to dispatch queued writes on recovered regions. */
5308 + void (*dispatch)(void *context, struct bio_list *bios, int error);
5309 + void *dispatch_context;
5310 +
5311 + /* Callback function to wakeup callers worker thread. */
5312 + void (*wake)(void *context);
5313 + void *wake_context;
5314 +
5315 + uint32_t region_size;
5316 + unsigned region_shift;
5317 +
5318 + /* holds persistent region state */
5319 + struct dm_dirty_log *log;
5320 +
5321 + /* hash table */
5322 + rwlock_t hash_lock;
5323 + mempool_t *region_pool;
5324 + unsigned mask;
5325 + unsigned nr_buckets;
5326 + unsigned prime;
5327 + unsigned shift;
5328 + struct list_head *buckets;
5329 +
5330 + spinlock_t region_lock;
5331 + atomic_t recovery_in_flight;
5332 + struct semaphore recovery_count;
5333 + struct list_head clean_regions;
5334 + struct list_head quiesced_regions;
5335 + struct list_head recovered_regions;
5336 + struct list_head failed_recovered_regions;
5337 +};
5338 +
5339 +struct region {
5340 + region_t key;
5341 + enum dm_rh_region_states state;
5342 + void *context; /* Caller context. */
5343 +
5344 + struct list_head hash_list;
5345 + struct list_head list;
5346 +
5347 + atomic_t pending;
5348 + struct bio_list delayed_bios;
5349 +};
5350 +
5351 +/*
5352 + * Conversion fns
5353 + */
5354 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector)
5355 +{
5356 + return sector >> ((struct region_hash *) rh)->region_shift;
5357 +}
5358 +EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
5359 +
5360 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio)
5361 +{
5362 + return dm_rh_sector_to_region(rh, bio->bi_sector);
5363 +}
5364 +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
5365 +
5366 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region)
5367 +{
5368 + return region << ((struct region_hash *) rh)->region_shift;
5369 +}
5370 +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
5371 +
5372 +/*
5373 + * Retrieval fns.
5374 + */
5375 +region_t dm_rh_get_region_key(struct dm_region *reg)
5376 +{
5377 + return ((struct region *) reg)->key;
5378 +}
5379 +EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
5380 +
5381 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh)
5382 +{
5383 + return ((struct region_hash *) rh)->region_size;
5384 +}
5385 +EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
5386 +
5387 +/* Squirrel a context with a region. */
5388 +void *dm_rh_reg_get_context(struct dm_region *reg)
5389 +{
5390 + return ((struct region *) reg)->context;
5391 +}
5392 +EXPORT_SYMBOL_GPL(dm_rh_reg_get_context);
5393 +
5394 +void dm_rh_reg_set_context(struct dm_region *reg, void *context)
5395 +{
5396 + ((struct region *) reg)->context = context;
5397 +}
5398 +EXPORT_SYMBOL_GPL(dm_rh_reg_set_context);
5399 +
5400 +/*
5401 + * Create region hash client.
5402 + */
5403 +#define MIN_REGIONS 64
5404 +struct dm_rh_client *dm_rh_client_create(
5405 + unsigned max_recovery,
5406 + void (*dispatch)(void *dispatch_context,
5407 + struct bio_list *bios, int error),
5408 + void *dispatch_context,
5409 + void (*wake)(void *wake_context), void *wake_context,
5410 + struct dm_dirty_log *log, uint32_t region_size,
5411 + region_t nr_regions)
5412 +{
5413 + unsigned i;
5414 + unsigned nr_buckets, max_buckets;
5415 + unsigned hash_primes[] = {
5416 + /* Table of primes for rh_hash/table size optimization. */
5417 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
5418 + 1543, 3079, 6151, 12289, 24593,
5419 + };
5420 + struct region_hash *rh;
5421 +
5422 + if (region_size & (region_size - 1)) {
5423 + DMERR("region size must be 2^^n");
5424 + return ERR_PTR(-EINVAL);
5425 + }
5426 +
5427 + /* Calculate a suitable number of buckets for our hash table. */
5428 + max_buckets = nr_regions >> 6;
5429 + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
5430 + ;
5431 + nr_buckets >>= 1;
5432 +
5433 + rh = kmalloc(sizeof(*rh), GFP_KERNEL);
5434 + if (!rh) {
5435 + DMERR("unable to allocate region hash memory");
5436 + return ERR_PTR(-ENOMEM);
5437 + }
5438 +
5439 + rh->max_recovery = max_recovery;
5440 + rh->dispatch = dispatch;
5441 + rh->dispatch_context = dispatch_context;
5442 + rh->wake = wake;
5443 + rh->wake_context = wake_context;
5444 + rh->log = log;
5445 + rh->region_size = region_size;
5446 + rh->region_shift = ffs(region_size) - 1;
5447 + rwlock_init(&rh->hash_lock);
5448 + rh->mask = nr_buckets - 1;
5449 + rh->nr_buckets = nr_buckets;
5450 + rh->shift = ffs(nr_buckets);
5451 +
5452 + /* Check prime array limits. */
5453 + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ?
5454 + ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2;
5455 + rh->prime = hash_primes[i];
5456 +
5457 + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
5458 + if (!rh->buckets) {
5459 + DMERR("unable to allocate region hash bucket memory");
5460 + kfree(rh);
5461 + return ERR_PTR(-ENOMEM);
5462 + }
5463 +
5464 + for (i = 0; i < nr_buckets; i++)
5465 + INIT_LIST_HEAD(rh->buckets + i);
5466 +
5467 + spin_lock_init(&rh->region_lock);
5468 + sema_init(&rh->recovery_count, 0);
5469 + atomic_set(&rh->recovery_in_flight, 0);
5470 + INIT_LIST_HEAD(&rh->clean_regions);
5471 + INIT_LIST_HEAD(&rh->quiesced_regions);
5472 + INIT_LIST_HEAD(&rh->recovered_regions);
5473 + INIT_LIST_HEAD(&rh->failed_recovered_regions);
5474 +
5475 + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
5476 + sizeof(struct region));
5477 + if (!rh->region_pool) {
5478 + vfree(rh->buckets);
5479 + kfree(rh);
5480 + rh = ERR_PTR(-ENOMEM);
5481 + }
5482 +
5483 + return (struct dm_rh_client *) rh;
5484 +}
5485 +EXPORT_SYMBOL_GPL(dm_rh_client_create);
5486 +
5487 +void dm_rh_client_destroy(struct dm_rh_client *rh_in)
5488 +{
5489 + unsigned h;
5490 + struct region_hash *rh = (struct region_hash *) rh_in;
5491 + struct region *reg, *tmp;
5492 +
5493 + BUG_ON(!list_empty(&rh->quiesced_regions));
5494 +
5495 + for (h = 0; h < rh->nr_buckets; h++) {
5496 + list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
5497 + BUG_ON(atomic_read(&reg->pending));
5498 + mempool_free(reg, rh->region_pool);
5499 + }
5500 + }
5501 +
5502 + if (rh->region_pool)
5503 + mempool_destroy(rh->region_pool);
5504 +
5505 + vfree(rh->buckets);
5506 + kfree(rh);
5507 +}
5508 +EXPORT_SYMBOL_GPL(dm_rh_client_destroy);
5509 +
5510 +static inline unsigned rh_hash(struct region_hash *rh, region_t region)
5511 +{
5512 + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
5513 +}
5514 +
5515 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
5516 +{
5517 + struct region *reg;
5518 + struct list_head *bucket = rh->buckets + rh_hash(rh, region);
5519 +
5520 + list_for_each_entry(reg, bucket, hash_list) {
5521 + if (reg->key == region)
5522 + return reg;
5523 + }
5524 +
5525 + return NULL;
5526 +}
5527 +
5528 +static void __rh_insert(struct region_hash *rh, struct region *reg)
5529 +{
5530 + list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
5531 +}
5532 +
5533 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
5534 +{
5535 + struct region *reg, *nreg;
5536 +
5537 + read_unlock(&rh->hash_lock);
5538 + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
5539 + if (unlikely(!nreg))
5540 + nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
5541 +
5542 + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
5543 + DM_RH_CLEAN : DM_RH_NOSYNC;
5544 + nreg->key = region;
5545 + INIT_LIST_HEAD(&nreg->list);
5546 + atomic_set(&nreg->pending, 0);
5547 + bio_list_init(&nreg->delayed_bios);
5548 +
5549 + write_lock_irq(&rh->hash_lock);
5550 + reg = __rh_lookup(rh, region);
5551 + if (reg)
5552 + /* We lost the race. */
5553 + mempool_free(nreg, rh->region_pool);
5554 + else {
5555 + __rh_insert(rh, nreg);
5556 + if (nreg->state == DM_RH_CLEAN) {
5557 + spin_lock(&rh->region_lock);
5558 + list_add(&nreg->list, &rh->clean_regions);
5559 + spin_unlock(&rh->region_lock);
5560 + }
5561 +
5562 + reg = nreg;
5563 + }
5564 +
5565 + write_unlock_irq(&rh->hash_lock);
5566 + read_lock(&rh->hash_lock);
5567 + return reg;
5568 +}
5569 +
5570 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
5571 +{
5572 + struct region *reg;
5573 +
5574 + reg = __rh_lookup(rh, region);
5575 + return reg ? reg : __rh_alloc(rh, region);
5576 +}
5577 +
5578 +int dm_rh_get_state(struct dm_rh_client *rh_in, region_t region, int may_block)
5579 +{
5580 + int r;
5581 + struct region_hash *rh = (struct region_hash *) rh_in;
5582 + struct region *reg;
5583 +
5584 + read_lock(&rh->hash_lock);
5585 + reg = __rh_lookup(rh, region);
5586 + read_unlock(&rh->hash_lock);
5587 +
5588 + if (reg)
5589 + return reg->state;
5590 +
5591 + /*
5592 + * The region wasn't in the hash, so we fall back to the dirty log.
5593 + */
5594 + r = rh->log->type->in_sync(rh->log, region, may_block);
5595 +
5596 + /*
5597 + * Any error from the dirty log (e.g. -EWOULDBLOCK)
5598 + * is treated as DM_RH_NOSYNC.
5599 + */
5600 + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
5601 +}
5602 +EXPORT_SYMBOL_GPL(dm_rh_get_state);
5603 +
5604 +void dm_rh_set_state(struct dm_rh_client *rh_in, region_t region,
5605 + enum dm_rh_region_states state, int may_block)
5606 +{
5607 + struct region_hash *rh = (struct region_hash *) rh_in;
5608 + struct region *reg;
5609 + struct dm_dirty_log *log = rh->log;
5610 +
5611 + if (state == DM_RH_NOSYNC)
5612 + log->type->set_region_sync(log, region, 0);
5613 + else if (state == DM_RH_CLEAN)
5614 + log->type->clear_region(log, region);
5615 + else if (state == DM_RH_DIRTY)
5616 + log->type->mark_region(log, region);
5617 +
5618 + read_lock(&rh->hash_lock);
5619 + reg = __rh_find(rh, region);
5620 + reg->state = state;
5621 + read_unlock(&rh->hash_lock);
5622 +}
5623 +EXPORT_SYMBOL_GPL(dm_rh_set_state);
5624 +
5625 +void dm_rh_update_states(struct dm_rh_client *rh_in, int errors_handled)
5626 +{
5627 + struct region_hash *rh = (struct region_hash *) rh_in;
5628 + struct region *reg, *next;
5629 + LIST_HEAD(clean);
5630 + LIST_HEAD(recovered);
5631 + LIST_HEAD(failed_recovered);
5632 +
5633 + /*
5634 + * Quickly grab the lists and remove any regions from hash.
5635 + */
5636 + write_lock_irq(&rh->hash_lock);
5637 + spin_lock(&rh->region_lock);
5638 + if (!list_empty(&rh->clean_regions)) {
5639 + list_splice_init(&rh->clean_regions, &clean);
5640 +
5641 + list_for_each_entry(reg, &clean, list)
5642 + list_del(&reg->hash_list);
5643 + }
5644 +
5645 + if (!list_empty(&rh->recovered_regions)) {
5646 + list_splice_init(&rh->recovered_regions, &recovered);
5647 +
5648 + list_for_each_entry(reg, &recovered, list)
5649 + list_del(&reg->hash_list);
5650 + }
5651 +
5652 + if (!list_empty(&rh->failed_recovered_regions)) {
5653 + list_splice_init(&rh->failed_recovered_regions,
5654 + &failed_recovered);
5655 +
5656 + list_for_each_entry(reg, &failed_recovered, list)
5657 + list_del(&reg->hash_list);
5658 + }
5659 +
5660 + spin_unlock(&rh->region_lock);
5661 + write_unlock_irq(&rh->hash_lock);
5662 +
5663 + /*
5664 + * All the regions on the recovered and clean lists have
5665 + * now been pulled out of the system, so no need to do
5666 + * any more locking.
5667 + */
5668 + list_for_each_entry_safe(reg, next, &recovered, list) {
5669 + rh->log->type->clear_region(rh->log, reg->key);
5670 + rh->log->type->set_region_sync(rh->log, reg->key, 1);
5671 +
5672 + if (reg->delayed_bios.head)
5673 + rh->dispatch(rh->dispatch_context,
5674 + &reg->delayed_bios, 0);
5675 +
5676 + up(&rh->recovery_count);
5677 + mempool_free(reg, rh->region_pool);
5678 + }
5679 +
5680 + list_for_each_entry_safe(reg, next, &failed_recovered, list) {
5681 + rh->log->type->set_region_sync(rh->log, reg->key,
5682 + errors_handled ? 0 : 1);
5683 + if (reg->delayed_bios.head)
5684 + rh->dispatch(rh->dispatch_context,
5685 + &reg->delayed_bios, -EIO);
5686 +
5687 + up(&rh->recovery_count);
5688 + mempool_free(reg, rh->region_pool);
5689 + }
5690 +
5691 + list_for_each_entry_safe(reg, next, &clean, list) {
5692 + rh->log->type->clear_region(rh->log, reg->key);
5693 + mempool_free(reg, rh->region_pool);
5694 + }
5695 +
5696 + dm_rh_flush(rh_in);
5697 +}
5698 +EXPORT_SYMBOL_GPL(dm_rh_update_states);
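A sketch of where this call typically sits, continuing the illustration above (do_work(), the ws member and the surrounding worker setup are hypothetical): the owning target flushes region state at the top of its worker so the dirty log reflects all clean/recovered transitions before new I/O or recovery is started.

static void do_work(struct work_struct *ws)
{
	struct my_raid_set *rs = container_of(ws, struct my_raid_set, ws);

	/* Push clean/recovered/failed-recovered regions out to the dirty log. */
	dm_rh_update_states(rs->rh, 1 /* errors_handled */);

	/* ... then drive recovery and submit delayed/new bios ... */
}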
5699 +
5700 +void dm_rh_inc(struct dm_rh_client *rh_in, region_t region)
5701 +{
5702 + struct region_hash *rh = (struct region_hash *) rh_in;
5703 + struct region *reg;
5704 +
5705 + read_lock(&rh->hash_lock);
5706 + reg = __rh_find(rh, region);
5707 + if (reg->state == DM_RH_CLEAN) {
5708 + rh->log->type->mark_region(rh->log, reg->key);
5709 +
5710 + spin_lock_irq(&rh->region_lock);
5711 + reg->state = DM_RH_DIRTY;
5712 + list_del_init(&reg->list); /* Take off the clean list. */
5713 + spin_unlock_irq(&rh->region_lock);
5714 + }
5715 +
5716 + atomic_inc(&reg->pending);
5717 + read_unlock(&rh->hash_lock);
5718 +}
5719 +EXPORT_SYMBOL_GPL(dm_rh_inc);
5720 +
5721 +void dm_rh_inc_pending(struct dm_rh_client *rh_in, struct bio_list *bios)
5722 +{
5723 + struct bio *bio;
5724 +
5725 + for (bio = bios->head; bio; bio = bio->bi_next)
5726 + dm_rh_inc(rh_in, dm_rh_bio_to_region(rh_in, bio));
5727 +}
5728 +EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
5729 +
5730 +int dm_rh_dec(struct dm_rh_client *rh_in, region_t region)
5731 +{
5732 + int r = 0;
5733 + struct region_hash *rh = (struct region_hash *) rh_in;
5734 + struct region *reg;
5735 +
5736 + read_lock(&rh->hash_lock);
5737 + reg = __rh_lookup(rh, region);
5738 + read_unlock(&rh->hash_lock);
5739 +
5740 + BUG_ON(!reg);
5741 +
5742 + if (atomic_dec_and_test(&reg->pending)) {
5743 + unsigned long flags;
5744 +
5745 + /*
5746 + * There is no pending I/O for this region.
5747 + * We can move the region to the corresponding list for the next action.
5748 + * At this point, the region is not yet connected to any list.
5749 + *
5750 + * If the state is DM_RH_NOSYNC, the region should be kept off
5751 + * the clean list.
5752 + * The hash entry for DM_RH_NOSYNC will remain in memory
5753 + * until the region is recovered or the map is reloaded.
5754 + */
5755 +
5756 + spin_lock_irqsave(&rh->region_lock, flags);
5757 + if (reg->state == DM_RH_RECOVERING)
5758 + list_add_tail(&reg->list, &rh->quiesced_regions);
5759 + else if (reg->state == DM_RH_DIRTY) {
5760 + reg->state = DM_RH_CLEAN;
5761 + list_add(&reg->list, &rh->clean_regions);
5762 + }
5763 + spin_unlock_irqrestore(&rh->region_lock, flags);
5764 +
5765 + r = 1;
5766 + }
5767 +
5768 + return r;
5769 +}
5770 +EXPORT_SYMBOL_GPL(dm_rh_dec);
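Continuing the sketch, the pending counters usually bracket the write path: every queued write bumps its region, and the endio path drops it again (editor's illustration; issue_write() and wake_worker() are hypothetical helpers).

static void queue_writes(struct my_raid_set *rs, struct bio_list *writes)
{
	struct bio *bio;

	/* Marks each touched region dirty and raises its pending count. */
	dm_rh_inc_pending(rs->rh, writes);

	while ((bio = bio_list_pop(writes)))
		issue_write(rs, bio);
}

static void my_write_endio(struct my_raid_set *rs, struct bio *bio)
{
	/* Last pending write: the region moves to the clean or quiesced list. */
	if (dm_rh_dec(rs->rh, dm_rh_bio_to_region(rs->rh, bio)))
		wake_worker(rs);
}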
5771 +
5772 +/*
5773 + * Starts quiescing a region in preparation for recovery.
5774 + */
5775 +static int __rh_recovery_prepare(struct region_hash *rh)
5776 +{
5777 + int r;
5778 + region_t region;
5779 + struct region *reg;
5780 +
5781 + /*
5782 + * Ask the dirty log what's next.
5783 + */
5784 + r = rh->log->type->get_resync_work(rh->log, &region);
5785 + if (r <= 0)
5786 + return r;
5787 +
5788 + /*
5789 + * Get this region, and start it quiescing
5790 + * by setting the recovering flag.
5791 + */
5792 + read_lock(&rh->hash_lock);
5793 + reg = __rh_find(rh, region);
5794 + read_unlock(&rh->hash_lock);
5795 +
5796 + spin_lock_irq(&rh->region_lock);
5797 +
5798 + reg->state = DM_RH_RECOVERING;
5799 +
5800 + /* Already quiesced ? */
5801 + list_del_init(&reg->list);
5802 + if (!atomic_read(&reg->pending))
5803 + list_add(&reg->list, &rh->quiesced_regions);
5804 +
5805 + spin_unlock_irq(&rh->region_lock);
5806 + return 1;
5807 +}
5808 +
5809 +int dm_rh_recovery_prepare(struct dm_rh_client *rh_in)
5810 +{
5811 + int r = 0;
5812 + struct region_hash *rh = (struct region_hash *) rh_in;
5813 +
5814 + /* Extra reference to avoid race with rh_stop_recovery */
5815 + atomic_inc(&rh->recovery_in_flight);
5816 +
5817 + while (!down_trylock(&rh->recovery_count)) {
5818 + atomic_inc(&rh->recovery_in_flight);
5819 +
5820 + if (__rh_recovery_prepare(rh) <= 0) {
5821 + atomic_dec(&rh->recovery_in_flight);
5822 + up(&rh->recovery_count);
5823 + r = -ENOENT;
5824 + break;
5825 + }
5826 + }
5827 +
5828 + /* Drop the extra reference. */
5829 + if (atomic_dec_and_test(&rh->recovery_in_flight))
5830 + r = -ESRCH;
5831 +
5832 + return r;
5833 +}
5834 +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
5835 +
5836 +/*
5837 + * Returns any quiesced regions.
5838 + */
5839 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh_in)
5840 +{
5841 + struct region_hash *rh = (struct region_hash *) rh_in;
5842 + struct region *reg = NULL;
5843 +
5844 + spin_lock_irq(&rh->region_lock);
5845 + if (!list_empty(&rh->quiesced_regions)) {
5846 + reg = list_entry(rh->quiesced_regions.next,
5847 + struct region, list);
5848 + list_del_init(&reg->list); /* Remove from the quiesced list. */
5849 + }
5850 +
5851 + spin_unlock_irq(&rh->region_lock);
5852 + return (struct dm_region *) reg;
5853 +}
5854 +EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
5855 +
5856 +/*
5857 + * Put region on list of recovered ones.
5858 + */
5859 +void dm_rh_recovery_end(struct dm_rh_client *rh_in, struct dm_region *reg_in,
5860 + int error)
5861 +{
5862 + struct region_hash *rh = (struct region_hash *) rh_in;
5863 + struct region *reg = (struct region *) reg_in;
5864 +
5865 + spin_lock_irq(&rh->region_lock);
5866 + if (error) {
5867 + reg->state = DM_RH_NOSYNC;
5868 + list_add(&reg->list, &rh->failed_recovered_regions);
5869 + } else
5870 + list_add(&reg->list, &rh->recovered_regions);
5871 +
5872 + atomic_dec(&rh->recovery_in_flight);
5873 + spin_unlock_irq(&rh->region_lock);
5874 +
5875 + rh->wake(rh->wake_context);
5876 + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0);
5877 +}
5878 +EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
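Taken together, the three recovery calls form a simple loop in the owning target's worker. The sketch below shows recovery synchronously for brevity; a real target would usually issue the resync I/O asynchronously and call dm_rh_recovery_end() from its completion callback (recover() is a hypothetical helper).

static void do_recovery(struct my_raid_set *rs)
{
	struct dm_region *reg;

	/* Ask the dirty log for work and start quiescing those regions. */
	dm_rh_recovery_prepare(rs->rh);

	/* Recover whatever has become fully quiesced in the meantime. */
	while ((reg = dm_rh_recovery_start(rs->rh))) {
		int error = recover(rs, reg);

		/* Queues the region on the recovered/failed list and wakes us. */
		dm_rh_recovery_end(rs->rh, reg, error);
	}
}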
5879 +
5880 +/* Return recovery in flight count. */
5881 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh_in)
5882 +{
5883 + return atomic_read(&((struct region_hash *) rh_in)->recovery_in_flight);
5884 +}
5885 +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
5886 +
5887 +int dm_rh_flush(struct dm_rh_client *rh_in)
5888 +{
5889 + struct region_hash *rh = (struct region_hash *) rh_in;
5890 +
5891 + return rh->log->type->flush(rh->log);
5892 +}
5893 +EXPORT_SYMBOL_GPL(dm_rh_flush);
5894 +
5895 +void dm_rh_delay_by_region(struct dm_rh_client *rh_in,
5896 + struct bio *bio, region_t region)
5897 +{
5898 + struct region_hash *rh = (struct region_hash *) rh_in;
5899 + struct region *reg;
5900 +
5901 + /* FIXME: locking. */
5902 + read_lock(&rh->hash_lock);
5903 + reg = __rh_find(rh, region);
5904 + bio_list_add(&reg->delayed_bios, bio);
5905 + read_unlock(&rh->hash_lock);
5906 +}
5907 +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
5908 +
5909 +void dm_rh_delay(struct dm_rh_client *rh_in, struct bio *bio)
5910 +{
5911 + return dm_rh_delay_by_region(rh_in, bio,
5912 + dm_rh_bio_to_region(rh_in, bio));
5913 +}
5914 +EXPORT_SYMBOL_GPL(dm_rh_delay);
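These delay helpers are what a target can use to park I/O that collides with an ongoing resync; the bios are handed back through the dispatch callback once the region leaves the recovering state (editor's illustration; issue_write() is hypothetical).

static void queue_or_delay_write(struct my_raid_set *rs, struct bio *bio)
{
	region_t region = dm_rh_bio_to_region(rs->rh, bio);

	if (dm_rh_get_state(rs->rh, region, 1) == DM_RH_RECOVERING)
		dm_rh_delay(rs->rh, bio);	/* dispatched after recovery ends */
	else
		issue_write(rs, bio);
}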
5915 +
5916 +void dm_rh_dispatch_bios(struct dm_rh_client *rh_in,
5917 + region_t region, int error)
5918 +{
5919 + struct region_hash *rh = (struct region_hash *) rh_in;
5920 + struct region *reg;
5921 + struct bio_list delayed_bios;
5922 +
5923 + /* FIXME: locking. */
5924 + read_lock(&rh->hash_lock);
5925 + reg = __rh_find(rh, region);
5926 + BUG_ON(!reg);
5927 + delayed_bios = reg->delayed_bios;
5928 + bio_list_init(&reg->delayed_bios);
5929 + read_unlock(&rh->hash_lock);
5930 +
5931 + if (delayed_bios.head)
5932 + rh->dispatch(rh->dispatch_context, &delayed_bios, error);
5933 +
5934 + up(&rh->recovery_count);
5935 +}
5936 +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios);
5937 +
5938 +void dm_rh_stop_recovery(struct dm_rh_client *rh_in)
5939 +{
5940 + int i;
5941 + struct region_hash *rh = (struct region_hash *) rh_in;
5942 +
5943 + rh->wake(rh->wake_context);
5944 +
5945 + /* wait for any recovering regions */
5946 + for (i = 0; i < rh->max_recovery; i++)
5947 + down(&rh->recovery_count);
5948 +}
5949 +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
5950 +
5951 +void dm_rh_start_recovery(struct dm_rh_client *rh_in)
5952 +{
5953 + int i;
5954 + struct region_hash *rh = (struct region_hash *) rh_in;
5955 +
5956 + for (i = 0; i < rh->max_recovery; i++)
5957 + up(&rh->recovery_count);
5958 +
5959 + rh->wake(rh->wake_context);
5960 +}
5961 +EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
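Stop/start are normally paired with the target's suspend/resume methods, so that no region is being resynchronized while a table is swapped. A sketch using the standard device-mapper target hooks (the my_* names are hypothetical):

static void my_presuspend(struct dm_target *ti)
{
	struct my_raid_set *rs = ti->private;

	/* Blocks new recovery and waits for in-flight recoveries to drain. */
	dm_rh_stop_recovery(rs->rh);
}

static void my_resume(struct dm_target *ti)
{
	struct my_raid_set *rs = ti->private;

	dm_rh_start_recovery(rs->rh);
}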
5962 +
5963 +MODULE_DESCRIPTION(DM_NAME " region hash");
5964 +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <hjm@redhat.com>");
5965 +MODULE_LICENSE("GPL");
5966 --- a/drivers/md/Kconfig
5967 +++ b/drivers/md/Kconfig
5968 @@ -269,6 +269,14 @@ config DM_DELAY
5969
5970 If unsure, say N.
5971
5972 +config DM_RAID45
5973 + tristate "RAID 4/5 target (EXPERIMENTAL)"
5974 + depends on BLK_DEV_DM && EXPERIMENTAL
5975 + ---help---
5976 + A target that supports RAID4 and RAID5 mappings.
5977 +
5978 + If unsure, say N.
5979 +
5980 config DM_UEVENT
5981 bool "DM uevents (EXPERIMENTAL)"
5982 depends on BLK_DEV_DM && EXPERIMENTAL
5983 --- a/drivers/md/Makefile
5984 +++ b/drivers/md/Makefile
5985 @@ -34,7 +34,9 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
5986 obj-$(CONFIG_DM_DELAY) += dm-delay.o
5987 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
5988 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
5989 -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
5990 +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o
5991 +obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
5992 + dm-regions.o dm-message.o
5993 obj-$(CONFIG_DM_ZERO) += dm-zero.o
5994
5995 quiet_cmd_unroll = UNROLL $@
5996 --- /dev/null
5997 +++ b/include/linux/dm-regions.h
5998 @@ -0,0 +1,115 @@
5999 +/*
6000 + * Copyright (C) 2003 Sistina Software Limited.
6001 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6002 + *
6003 + * Device-Mapper dirty region hash interface.
6004 + *
6005 + * This file is released under the GPL.
6006 + */
6007 +
6008 +#ifndef DM_REGION_HASH_H
6009 +#define DM_REGION_HASH_H
6010 +
6011 +#include <linux/dm-dirty-log.h>
6012 +
6013 +/*-----------------------------------------------------------------
6014 + * Region hash
6015 + *----------------------------------------------------------------*/
6016 +struct dm_rh_client;
6017 +struct dm_region;
6018 +
6019 +/*
6020 + * States a region can have.
6021 + */
6022 +enum dm_rh_region_states {
6023 + DM_RH_CLEAN = 0x01, /* No writes in flight. */
6024 + DM_RH_DIRTY = 0x02, /* Writes in flight. */
6025 + DM_RH_NOSYNC = 0x04, /* Out of sync. */
6026 + DM_RH_RECOVERING = 0x08, /* Under resynchronization. */
6027 +};
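+/*
+ * Typical transitions (see dm-regions.c):
+ *
+ *   DM_RH_CLEAN      --first write (dm_rh_inc)---------> DM_RH_DIRTY
+ *   DM_RH_DIRTY      --last write done (dm_rh_dec)-----> DM_RH_CLEAN
+ *   any state        --selected for resync
+ *                      (dm_rh_recovery_prepare)--------> DM_RH_RECOVERING
+ *   DM_RH_RECOVERING --dm_rh_recovery_end(error != 0)--> DM_RH_NOSYNC
+ *   DM_RH_RECOVERING --dm_rh_recovery_end(error == 0)--> region marked
+ *                      in-sync and dropped from the hash by
+ *                      dm_rh_update_states().
+ */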
6028 +
6029 +/*
6030 + * Region hash create/destroy.
6031 + */
6032 +struct bio_list;
6033 +struct dm_rh_client *dm_rh_client_create(
6034 + unsigned max_recovery,
6035 + void (*dispatch)(void *dispatch_context,
6036 + struct bio_list *bios, int error),
6037 + void *dispatch_context,
6038 + void (*wake)(void *wake_context), void *wake_context,
6039 + struct dm_dirty_log *log, uint32_t region_size,
6040 + region_t nr_regions);
6041 +void dm_rh_client_destroy(struct dm_rh_client *rh);
6042 +
6043 +/*
6044 + * Conversion fns:
6045 + *
6046 + * bio -> region
6047 + * sector -> region
6048 + * region -> sector
6049 + */
6050 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio);
6051 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector);
6052 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region);
6053 +
6054 +/*
6055 + * Functions to set a caller context in a region.
6056 + */
6057 +void *dm_rh_reg_get_context(struct dm_region *reg);
6058 +void dm_rh_reg_set_context(struct dm_region *reg, void *context);
6059 +
6060 +/*
6061 + * Get region size and key (i.e. the number of the region).
6062 + */
6063 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh);
6064 +sector_t dm_rh_get_region_key(struct dm_region *reg);
6065 +
6066 +/*
6067 + * Get/set/update region state (and dirty log).
6068 + *
6069 + * dm_rh_update_states:
6070 + * if @errors_handled is non-zero, regions whose recovery failed
6071 + * are left out of sync (NOSYNC) in the dirty log.
6072 + */
6073 +int dm_rh_get_state(struct dm_rh_client *rh, region_t region, int may_block);
6074 +void dm_rh_set_state(struct dm_rh_client *rh, region_t region,
6075 + enum dm_rh_region_states state, int may_block);
6076 +void dm_rh_update_states(struct dm_rh_client *rh, int errors_handled);
6077 +
6078 +/* Flush the region hash and dirty log. */
6079 +int dm_rh_flush(struct dm_rh_client *rh);
6080 +
6081 +/* Inc/dec pending count on regions. */
6082 +void dm_rh_inc(struct dm_rh_client *rh, region_t region);
6083 +void dm_rh_inc_pending(struct dm_rh_client *rh, struct bio_list *bios);
6084 +int dm_rh_dec(struct dm_rh_client *rh, region_t region);
6085 +
6086 +/* Delay bios on regions. */
6087 +void dm_rh_delay(struct dm_rh_client *rh, struct bio *bio);
6088 +void dm_rh_delay_by_region(struct dm_rh_client *rh,
6089 + struct bio *bio, region_t region);
6090 +
6091 +/*
6092 + * Normally, the region hash will automatically call the dispatch function.
6093 + * dm_rh_dispatch_bios() is for intentional dispatching of bios.
6094 + */
6095 +void dm_rh_dispatch_bios(struct dm_rh_client *rh, region_t region, int error);
6096 +
6097 +/*
6098 + * Region recovery control.
6099 + */
6100 +/* Prepare some regions for recovery by starting to quiesce them. */
6101 +int dm_rh_recovery_prepare(struct dm_rh_client *rh);
6102 +/* Try fetching a quiesced region for recovery. */
6103 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh);
6104 +/* Report recovery end on a region. */
6105 +void dm_rh_recovery_end(struct dm_rh_client *rh, struct dm_region *reg,
6106 + int error);
6107 +/* Return the number of recoveries in flight. */
6108 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh);
6109 +/* Start/stop recovery. */
6110 +void dm_rh_stop_recovery(struct dm_rh_client *rh);
6111 +void dm_rh_start_recovery(struct dm_rh_client *rh);
6112 +
6113 +#endif /* #ifndef DM_REGION_HASH_H */