1 From: "Heinz Mauelshagen" <hjm@redhat.de>
2 Subject: DMRAID45 module
3 X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/
7 This driver is used for "Fake RAID" devices.
9 Acked-by: Jeff Mahoney <jeffm@suse.com>
13 drivers/md/Kconfig | 8
14 drivers/md/Makefile | 4
15 drivers/md/dm-memcache.c | 301 ++
16 drivers/md/dm-memcache.h | 68
17 drivers/md/dm-message.c | 182 +
18 drivers/md/dm-message.h | 91
19 drivers/md/dm-raid45.c | 4516 +++++++++++++++++++++++++++++++++++++++++++++
20 drivers/md/dm-raid45.h | 28
21 drivers/md/dm-regions.c | 723 +++++++
23 include/linux/dm-regions.h | 115 +
24 11 files changed, 6036 insertions(+), 1 deletion(-)
28 @@ -1680,6 +1680,7 @@ struct gendisk *dm_disk(struct mapped_de
32 +EXPORT_SYMBOL_GPL(dm_disk);
34 int dm_suspended(struct mapped_device *md)
37 +++ b/drivers/md/dm-memcache.c
40 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
42 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
44 + * Device-mapper memory object handling:
46 + * o allocate/free total_pages in a per client page pool.
48 + * o allocate/free memory objects with chunks (1..n) of
49 + * pages_per_chunk pages hanging off.
51 + * This file is released under the GPL.
54 +#define DM_MEM_CACHE_VERSION "0.2"
57 +#include "dm-memcache.h"
58 +#include <linux/dm-io.h>
60 +struct dm_mem_cache_client {
62 + mempool_t *objs_pool;
63 + struct page_list *free_list;
66 + unsigned pages_per_chunk;
67 + unsigned free_pages;
68 + unsigned total_pages;
72 + * Free pages and page_list elements of client.
74 +static void free_cache_pages(struct page_list *list)
77 + struct page_list *pl = list;
81 + __free_page(pl->page);
87 + * Alloc number of pages and page_list elements as required by client.
89 +static struct page_list *alloc_cache_pages(unsigned pages)
91 + struct page_list *pl, *ret = NULL;
95 + page = alloc_page(GFP_NOIO);
99 + pl = kmalloc(sizeof(*pl), GFP_NOIO);
113 + free_cache_pages(ret);
118 + * Allocate page_list elements from the pool to chunks of the memory object.
120 +static void alloc_chunks(struct dm_mem_cache_client *cl,
121 + struct dm_mem_cache_object *obj)
123 + unsigned chunks = cl->chunks;
124 + unsigned long flags;
126 + local_irq_save(flags);
127 + local_irq_disable();
129 + unsigned p = cl->pages_per_chunk;
131 + obj[chunks].pl = NULL;
134 + struct page_list *pl;
136 + /* Take next element from free list */
137 + spin_lock(&cl->lock);
138 + pl = cl->free_list;
140 + cl->free_list = pl->next;
141 + spin_unlock(&cl->lock);
143 + pl->next = obj[chunks].pl;
144 + obj[chunks].pl = pl;
148 + local_irq_restore(flags);
152 + * Free page_list elements putting them back onto free list
154 +static void free_chunks(struct dm_mem_cache_client *cl,
155 + struct dm_mem_cache_object *obj)
157 + unsigned chunks = cl->chunks;
158 + unsigned long flags;
159 + struct page_list *next, *pl;
161 + local_irq_save(flags);
162 + local_irq_disable();
164 + for (pl = obj[chunks].pl; pl; pl = next) {
167 + spin_lock(&cl->lock);
168 + pl->next = cl->free_list;
169 + cl->free_list = pl;
171 + spin_unlock(&cl->lock);
175 + local_irq_restore(flags);
179 + * Create/destroy dm memory cache client resources.
181 +struct dm_mem_cache_client *
182 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
183 + unsigned pages_per_chunk)
185 + unsigned total_pages = objects * chunks * pages_per_chunk;
186 + struct dm_mem_cache_client *client;
188 + BUG_ON(!total_pages);
189 + client = kzalloc(sizeof(*client), GFP_KERNEL);
191 + return ERR_PTR(-ENOMEM);
193 + client->objs_pool = mempool_create_kmalloc_pool(objects,
194 + chunks * sizeof(struct dm_mem_cache_object));
195 + if (!client->objs_pool)
198 + client->free_list = alloc_cache_pages(total_pages);
199 + if (!client->free_list)
202 + spin_lock_init(&client->lock);
203 + client->objects = objects;
204 + client->chunks = chunks;
205 + client->pages_per_chunk = pages_per_chunk;
206 + client->free_pages = client->total_pages = total_pages;
210 + mempool_destroy(client->objs_pool);
213 + return ERR_PTR(-ENOMEM);
215 +EXPORT_SYMBOL(dm_mem_cache_client_create);
217 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
219 + BUG_ON(cl->free_pages != cl->total_pages);
220 + free_cache_pages(cl->free_list);
221 + mempool_destroy(cl->objs_pool);
224 +EXPORT_SYMBOL(dm_mem_cache_client_destroy);
227 + * Grow a client's cache by an amount of pages.
229 + * Don't call from interrupt context!
231 +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
233 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
234 + struct page_list *pl, *last;
237 + pl = alloc_cache_pages(pages);
245 + spin_lock_irq(&cl->lock);
246 + last->next = cl->free_list;
247 + cl->free_list = pl;
248 + cl->free_pages += pages;
249 + cl->total_pages += pages;
251 + spin_unlock_irq(&cl->lock);
253 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
256 +EXPORT_SYMBOL(dm_mem_cache_grow);
258 +/* Shrink a client's cache by an amount of pages */
259 +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
262 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
263 + unsigned long flags;
264 + struct page_list *last = NULL, *pl, *pos;
268 + spin_lock_irqsave(&cl->lock, flags);
269 + pl = pos = cl->free_list;
270 + while (p-- && pos->next) {
279 + cl->free_list = pos;
280 + cl->free_pages -= pages;
281 + cl->total_pages -= pages;
285 + spin_unlock_irqrestore(&cl->lock, flags);
288 + free_cache_pages(pl);
289 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
294 +EXPORT_SYMBOL(dm_mem_cache_shrink);
297 + * Allocate/free a memory object
299 + * Can be called from interrupt context
301 +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
304 + unsigned pages = cl->chunks * cl->pages_per_chunk;
305 + unsigned long flags;
306 + struct dm_mem_cache_object *obj;
308 + obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
310 + return ERR_PTR(-ENOMEM);
312 + spin_lock_irqsave(&cl->lock, flags);
313 + if (pages > cl->free_pages)
316 + cl->free_pages -= pages;
317 + spin_unlock_irqrestore(&cl->lock, flags);
320 + mempool_free(obj, cl->objs_pool);
324 + alloc_chunks(cl, obj);
327 +EXPORT_SYMBOL(dm_mem_cache_alloc);
329 +void dm_mem_cache_free(struct dm_mem_cache_client *cl,
330 + struct dm_mem_cache_object *obj)
332 + free_chunks(cl, obj);
333 + mempool_free(obj, cl->objs_pool);
335 +EXPORT_SYMBOL(dm_mem_cache_free);
337 +MODULE_DESCRIPTION(DM_NAME " dm memory cache");
338 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
339 +MODULE_LICENSE("GPL");
341 +++ b/drivers/md/dm-memcache.h
344 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
346 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
348 + * Device-mapper memory object handling:
350 + * o allocate/free total_pages in a per client page pool.
352 + * o allocate/free memory objects with chunks (1..n) of
353 + * pages_per_chunk pages hanging off.
355 + * This file is released under the GPL.
358 +#ifndef _DM_MEM_CACHE_H
359 +#define _DM_MEM_CACHE_H
361 +#define DM_MEM_CACHE_H_VERSION "0.1"
364 +#include <linux/dm-io.h>
366 +static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
374 +struct dm_mem_cache_object {
375 + struct page_list *pl; /* Dynamically allocated array */
376 + void *private; /* Caller context reference */
379 +struct dm_mem_cache_client;
382 + * Create/destroy dm memory cache client resources.
384 + * On creation, a number of @objects with @chunks of
385 + * @pages_per_chunk pages will be allocated.
387 +struct dm_mem_cache_client *
388 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
389 + unsigned pages_per_chunk);
390 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
393 + * Grow/shrink a dm memory cache client resources
394 + * by @objects amount of objects.
396 +int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
397 +int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
400 + * Allocate/free a memory object
402 + * On allocation one object with an amount of chunks and
403 + * an amount of pages per chunk will be returned on success.
405 +struct dm_mem_cache_object *
406 +dm_mem_cache_alloc(struct dm_mem_cache_client *client);
407 +void dm_mem_cache_free(struct dm_mem_cache_client *client,
408 + struct dm_mem_cache_object *object);
412 +++ b/drivers/md/dm-message.c
415 + * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved.
417 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
419 + * General device-mapper message interface argument parser.
421 + * This file is released under the GPL.
423 + * device-mapper message parser.
428 +#include "dm-message.h"
429 +#include <linux/kernel.h>
431 +#define DM_MSG_PREFIX "dm_message"
433 +/* Basename of a path. */
434 +static inline char *
437 + char *p = strrchr(s, '/');
439 + return p ? p + 1 : s;
442 +/* Get an argument depending on type. */
444 +message_arguments(struct dm_msg *msg, int argc, char **argv)
449 + struct dm_message_argument *args = msg->spec->args;
451 + for (i = 0; i < args->num_args; i++) {
453 + unsigned long **ptr = args->ptr;
454 + enum dm_message_argument_type type = args->types[i];
457 + case dm_msg_base_t:
458 + ((char **) ptr)[i] = basename(argv[i]);
462 + ((char **) ptr)[i] = argv[i];
466 + r = sscanf(argv[i], "%d", ((int **) ptr)[i]);
469 + case dm_msg_uint_t:
470 + r = sscanf(argv[i], "%u",
471 + ((unsigned **) ptr)[i]);
474 + case dm_msg_uint64_t:
475 + r = sscanf(argv[i], "%llu",
476 + ((unsigned long long **) ptr)[i]);
480 + set_bit(dm_msg_ret_undef, &msg->ret);
481 + set_bit(dm_msg_ret_arg, &msg->ret);
488 +/* Parse message options. */
490 +message_options_parse(struct dm_msg *msg, int argc, char **argv)
493 + unsigned long *action;
494 + size_t l1 = strlen(*argv), l_hit = 0;
495 + struct dm_message_option *o = msg->spec->options;
496 + char **option, **option_end = o->options + o->num_options;
498 + for (option = o->options, action = o->actions;
499 + option < option_end; option++, action++) {
500 + size_t l2 = strlen(*option);
502 + if (!strnicmp(*argv, *option, min(l1, l2))) {
505 + set_bit(*action, &msg->action);
509 + /* Assume error. */
511 + set_bit(dm_msg_ret_option, &msg->ret);
512 + if (!hit || l1 > l_hit)
513 + set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */
515 + set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/
517 + clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */
518 + message_arguments(msg, --argc, ++argv);
523 +print_ret(const char *caller, unsigned long ret)
527 + const char *err_str;
528 + } static err_msg[] = {
529 + { dm_msg_ret_ambiguous, "message ambiguous" },
530 + { dm_msg_ret_inval, "message invalid" },
531 + { dm_msg_ret_undef, "message undefined" },
532 + { dm_msg_ret_arg, "message argument" },
533 + { dm_msg_ret_argcount, "message argument count" },
534 + { dm_msg_ret_option, "option" },
535 + }, *e = ARRAY_END(err_msg);
537 + while (e-- > err_msg) {
538 + if (test_bit(e->err, &ret))
539 + DMERR("%s %s", caller, e->err_str);
543 +/* Parse a message action. */
545 +dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
546 + int argc, char **argv)
549 + size_t l1 = strlen(*argv), l_hit = 0;
550 + struct dm_msg_spec *s, *s_hit = NULL,
551 + *s_end = msg->specs + msg->num_specs;
556 + for (s = msg->specs; s < s_end; s++) {
557 + size_t l2 = strlen(s->cmd);
559 + if (!strnicmp(*argv, s->cmd, min(l1, l2))) {
567 + if (!hit || l1 > l_hit) /* No hit or message string too long. */
568 + set_bit(dm_msg_ret_undef, &msg->ret);
569 + else if (hit > 1) /* Ambiguous message. */
570 + set_bit(dm_msg_ret_ambiguous, &msg->ret);
571 + else if (argc - 2 != s_hit->args->num_args) {
572 + set_bit(dm_msg_ret_undef, &msg->ret);
573 + set_bit(dm_msg_ret_argcount, &msg->ret);
581 + set_bit(s_hit->action, &msg->action);
582 + message_options_parse(msg, --argc, ++argv);
585 + return msg->spec->f(msg, context);
588 + print_ret(caller, msg->ret);
591 +EXPORT_SYMBOL(dm_message_parse);
593 +MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser");
594 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
595 +MODULE_LICENSE("GPL");
597 +++ b/drivers/md/dm-message.h
600 + * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved.
602 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de>
604 + * General device-mapper message interface argument parser.
606 + * This file is released under the GPL.
610 +#ifndef DM_MESSAGE_H
611 +#define DM_MESSAGE_H
613 +/* Factor out to dm.h. */
614 +/* Reference to array end. */
615 +#define ARRAY_END(a) ((a) + ARRAY_SIZE(a))
617 +/* Message return bits. */
618 +enum dm_message_return {
619 + dm_msg_ret_ambiguous, /* Action ambiguous. */
620 + dm_msg_ret_inval, /* Action invalid. */
621 + dm_msg_ret_undef, /* Action undefined. */
623 + dm_msg_ret_option, /* Option error. */
624 + dm_msg_ret_arg, /* Argument error. */
625 + dm_msg_ret_argcount, /* Argument count error. */
628 +/* Message argument type conversions. */
629 +enum dm_message_argument_type {
630 + dm_msg_base_t, /* Basename string. */
631 + dm_msg_str_t, /* String. */
632 + dm_msg_int_t, /* Signed int. */
633 + dm_msg_uint_t, /* Unsigned int. */
634 + dm_msg_uint64_t, /* Unsigned int 64. */
637 +/* A message option. */
638 +struct dm_message_option {
639 + unsigned num_options;
641 + unsigned long *actions;
644 +/* Message arguments and types. */
645 +struct dm_message_argument {
647 + unsigned long **ptr;
648 + enum dm_message_argument_type types[];
651 +/* Client message. */
653 + unsigned long action; /* Identified action. */
654 + unsigned long ret; /* Return bits. */
655 +	unsigned num_specs;	/* # of specifications listed. */
656 + struct dm_msg_spec *specs; /* Specification list. */
657 + struct dm_msg_spec *spec; /* Specification selected. */
660 +/* Specification of the message. */
661 +struct dm_msg_spec {
662 + const char *cmd; /* Name of the command (i.e. 'bandwidth'). */
663 + unsigned long action;
664 + struct dm_message_option *options;
665 + struct dm_message_argument *args;
666 + unsigned long parm; /* Parameter to pass through to callback. */
667 + /* Function to process for action. */
668 + int (*f) (struct dm_msg *msg, void *context);
671 +/* Parameter access macros. */
672 +#define DM_MSG_PARM(msg) ((msg)->spec->parm)
674 +#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx])
675 +#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx])
676 +#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARG(msg, idx))
677 +#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx])
679 +#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0)
680 +#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0)
681 +#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0)
682 +#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0)
685 +/* Parse a message and its options and optionally call a function back. */
686 +int dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
687 + int argc, char **argv);
691 +++ b/drivers/md/dm-raid45.c
694 + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
696 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
698 + * This file is released under the GPL.
701 + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
704 + * o RAID4 with dedicated and selectable parity device
705 + * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
706 + * o run time optimization of xor algorithm used to calculate parity
709 + * Thanks to MD for:
710 + * o the raid address calculation algorithm
711 + * o the base of the biovec <-> page list copier.
714 + * Uses region hash to keep track of how many writes are in flight to
715 + * regions in order to use dirty log to keep state of regions to recover:
717 + * o clean regions (those which are synchronized
718 + * and don't have write io in flight)
719 + * o dirty regions (those with write io in flight)
722 + * On startup, any dirty regions are migrated to the 'nosync' state
723 + * and are subject to recovery by the daemon.
725 + * See raid_ctr() for table definition.
729 + * o add virtual interface for locking
730 + * o remove instrumentation (REMOVEME:)
734 +static const char *version = "v0.2431";
737 +#include "dm-bio-list.h"
738 +#include "dm-memcache.h"
739 +#include "dm-message.h"
740 +#include "dm-raid45.h"
742 +#include <linux/kernel.h>
743 +#include <linux/vmalloc.h>
745 +#include <linux/dm-io.h>
746 +#include <linux/dm-dirty-log.h>
747 +#include <linux/dm-regions.h>
749 +/* # of parallel recovered regions */
750 +/* FIXME: cope with multiple recovery stripes in raid_set struct. */
751 +#define MAX_RECOVER 1 /* needs to be 1! */
754 + * Configurable parameters
758 +/* Default # of stripes if not set in constructor. */
761 +/* Minimum/maximum # of selectable stripes. */
762 +#define STRIPES_MIN 8
763 +#define STRIPES_MAX 16384
765 +/* Default chunk size in sectors if not set in constructor. */
766 +#define CHUNK_SIZE 64
768 +/* Default io size in sectors if not set in constructor. */
769 +#define IO_SIZE_MIN SECTORS_PER_PAGE
770 +#define IO_SIZE IO_SIZE_MIN
772 +/* Maximum setable chunk size in sectors. */
773 +#define CHUNK_SIZE_MAX 16384
775 +/* Recover io size default in sectors. */
776 +#define RECOVER_IO_SIZE_MIN 64
777 +#define RECOVER_IO_SIZE 256
779 +/* Default percentage recover io bandwidth. */
780 +#define BANDWIDTH 10
781 +#define BANDWIDTH_MIN 1
782 +#define BANDWIDTH_MAX 100
784 + * END Configurable parameters
787 +#define TARGET "dm-raid45"
788 +#define DAEMON "kraid45d"
789 +#define DM_MSG_PREFIX TARGET
791 +#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
793 +/* Amount/size for __xor(). */
794 +#define SECTORS_PER_XOR SECTORS_PER_PAGE
795 +#define XOR_SIZE PAGE_SIZE
797 +/* Derive raid_set from stripe_cache pointer. */
798 +#define RS(x) container_of(x, struct raid_set, sc)
800 +/* Check value in range. */
801 +#define range_ok(i, min, max) (i >= min && i <= max)
803 +/* Page reference. */
804 +#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
806 +/* Bio list reference. */
807 +#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
809 +/* Page list reference. */
810 +#define PL(stripe, p) (stripe->obj[p].pl)
812 +/* Check argument is power of 2. */
813 +#define POWER_OF_2(a) (!(a & (a - 1)))
815 +/* Factor out to dm-bio-list.h */
816 +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
818 + bio->bi_next = bl->head;
825 +/* Factor out to dm.h */
826 +#define TI_ERR_RET(str, ret) \
827 + do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
828 +#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
830 +/*-----------------------------------------------------------------
833 + * Cache for all reads and writes to raid sets (operational or degraded)
835 + * We need to run all data to and from a RAID set through this cache,
836 + * because parity chunks need to get calculated from data chunks
837 + * or, in the degraded/resynchronization case, missing chunks need
838 + * to be reconstructed using the other chunks of the stripe.
839 + *---------------------------------------------------------------*/
840 +/* Protect kmem cache # counter. */
841 +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
843 +/* A stripe set (holds bios hanging off). */
845 + struct stripe *stripe; /* Backpointer to stripe for endio(). */
846 + struct bio_list bl[3]; /* Reads, writes, and writes merged. */
847 +#define WRITE_MERGED 2
850 +#if READ != 0 || WRITE != 1
851 +#error dm-raid45: READ/WRITE != 0/1 used as index!!!
855 + * Stripe linked list indexes. Keep order, because the stripe
856 + * and the stripe cache rely on the first 3!
859 + LIST_IO = 0, /* Stripes with io pending. */
860 + LIST_ENDIO, /* Stripes to endio. */
861 + LIST_LRU, /* Least recently used stripes. */
862 + LIST_HASH, /* Hashed stripes. */
863 + LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
864 + NR_LISTS, /* To size array in struct stripe. */
868 + LOCK_ENDIO = 0, /* Protect endio list. */
869 + LOCK_LRU, /* Protect lru list. */
870 + NR_LOCKS, /* To size array in struct stripe_cache. */
873 +/* A stripe: the io object to handle all reads and writes to a RAID set. */
875 + struct stripe_cache *sc; /* Backpointer to stripe cache. */
877 + sector_t key; /* Hash key. */
878 + sector_t region; /* Region stripe is mapped to. */
880 + /* Reference count. */
884 + unsigned long flags; /* flags (see below). */
887 + * Pending ios in flight:
889 + * used as a 'lock' to control move of stripe to endio list
891 + atomic_t pending; /* Pending ios in flight. */
893 + /* Sectors to read and write for multi page stripe sets. */
897 + /* Lock on stripe (for clustering). */
902 + * o io list to flush io
904 + * o LRU list to put stripes w/o reference count on
905 + * o stripe cache hash
907 + struct list_head lists[NR_LISTS];
910 + unsigned short parity; /* Parity chunk index. */
911 + short recover; /* Recovery chunk index. */
914 + /* This sets memory cache object (dm-mem-cache). */
915 + struct dm_mem_cache_object *obj;
917 + /* Array of stripe sets (dynamically allocated). */
918 + struct stripe_set ss[0];
921 +/* States stripes can be in (flags field). */
922 +enum stripe_states {
923 + STRIPE_ACTIVE, /* Active io on stripe. */
924 + STRIPE_ERROR, /* io error on stripe. */
925 + STRIPE_MERGED, /* Writes got merged. */
926 + STRIPE_READ, /* Read. */
927 + STRIPE_RBW, /* Read-before-write. */
928 + STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
929 + STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
932 +/* ... and macros to access them. */
933 +#define BITOPS(name, what, var, flag) \
934 +static inline int TestClear ## name ## what(struct var *v) \
935 +{ return test_and_clear_bit(flag, &v->io.flags); } \
936 +static inline int TestSet ## name ## what(struct var *v) \
937 +{ return test_and_set_bit(flag, &v->io.flags); } \
938 +static inline void Clear ## name ## what(struct var *v) \
939 +{ clear_bit(flag, &v->io.flags); } \
940 +static inline void Set ## name ## what(struct var *v) \
941 +{ set_bit(flag, &v->io.flags); } \
942 +static inline int name ## what(struct var *v) \
943 +{ return test_bit(flag, &v->io.flags); }
946 +BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
947 +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
948 +BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
949 +BITOPS(Stripe, Read, stripe, STRIPE_READ)
950 +BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
951 +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
952 +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
954 +/* A stripe hash. */
955 +struct stripe_hash {
956 + struct list_head *hash;
963 +/* A stripe cache. */
964 +struct stripe_cache {
966 + struct stripe_hash hash;
968 + /* Stripes with io to flush, stripes to endio and LRU lists. */
969 + struct list_head lists[3];
971 + /* Locks to protect endio and lru lists. */
972 + spinlock_t locks[NR_LOCKS];
974 + /* Slab cache to allocate stripes from. */
976 + struct kmem_cache *cache; /* Cache itself. */
977 + char name[32]; /* Unique name. */
980 + struct dm_io_client *dm_io_client; /* dm-io client resource context. */
982 + /* dm-mem-cache client resource context. */
983 + struct dm_mem_cache_client *mem_cache_client;
985 + int stripes_parm; /* # stripes parameter from constructor. */
986 + atomic_t stripes; /* actual # of stripes in cache. */
987 + atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
988 + atomic_t stripes_last; /* last # of stripes in cache. */
989 + atomic_t active_stripes; /* actual # of active stripes in cache. */
992 +	atomic_t max_active_stripes; /* maximum # of active stripes in cache. */
995 +/* Flag specs for raid_dev */ ;
996 +enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
998 +/* The raid device in a set. */
1000 + struct dm_dev *dev;
1001 + unsigned long flags; /* raid_dev_flags. */
1002 + sector_t start; /* offset to map to. */
1005 +/* Flags spec for raid_set. */
1006 +enum raid_set_flags {
1007 + RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
1008 + RS_DEAD, /* RAID set inoperational. */
1009 + RS_DEVEL_STATS, /* REMOVEME: display status information. */
1010 + RS_IO_ERROR, /* io error on set. */
1011 + RS_RECOVER, /* Do recovery. */
1012 + RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
1013 + RS_REGION_GET, /* get a region to recover. */
1014 + RS_SC_BUSY, /* stripe cache busy -> send an event. */
1015 +	RS_SUSPENDED,		/* RAID set suspended. */
1018 +/* REMOVEME: devel stats counters. */
1021 + S_BIOS_ADDED_READ,
1022 + S_BIOS_ENDIO_READ,
1024 + S_BIOS_ADDED_WRITE,
1025 + S_BIOS_ENDIO_WRITE,
1034 + S_BIO_COPY_PL_NEXT,
1043 + S_MERGE_PAGE_LOCKED,
1050 + S_RECONSTRUCT_DEV,
1054 + S_SUM_DELAYED_BIOS,
1056 + S_NR_STATS, /* # of stats counters. */
1059 +/* Status type -> string mappings. */
1061 + const enum stats_types type;
1065 +static struct stats_map stats_map[] = {
1066 + { S_BIOS_READ, "r=" },
1067 + { S_BIOS_ADDED_READ, "/" },
1068 + { S_BIOS_ENDIO_READ, "/" },
1069 + { S_BIOS_WRITE, " w=" },
1070 + { S_BIOS_ADDED_WRITE, "/" },
1071 + { S_BIOS_ENDIO_WRITE, "/" },
1072 + { S_DM_IO_READ, " rc=" },
1073 + { S_DM_IO_WRITE, " wc=" },
1074 + { S_ACTIVE_READS, " active_reads=" },
1075 + { S_BANDWIDTH, " bandwidth=" },
1076 + { S_NO_BANDWIDTH, " no_bandwidth=" },
1077 + { S_BARRIER, " barrier=" },
1078 + { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
1079 + { S_CAN_MERGE, " can_merge=" },
1080 + { S_MERGE_PAGE_LOCKED, "/page_locked=" },
1081 + { S_CANT_MERGE, "/cant_merge=" },
1082 + { S_CONGESTED, " congested=" },
1083 + { S_NOT_CONGESTED, "/not_congested=" },
1084 + { S_DEGRADED, " degraded=" },
1085 + { S_DELAYED_BIOS, " delayed_bios=" },
1086 + { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
1087 + { S_EVICT, " evict=" },
1088 + { S_FLUSHS, " flushs=" },
1089 + { S_HITS_1ST, " hits_1st=" },
1090 + { S_IOS_POST, " ios_post=" },
1091 + { S_INSCACHE, " inscache=" },
1092 + { S_MAX_LOOKUP, " max_lookup=" },
1093 + { S_NO_RW, " no_rw=" },
1094 + { S_NOSYNC, " nosync=" },
1095 + { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
1096 + { S_RECONSTRUCT_EI, " reconstruct_ei=" },
1097 + { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
1098 + { S_REDO, " redo=" },
1099 + { S_REQUEUE, " requeue=" },
1100 + { S_STRIPE_ERROR, " stripe_error=" },
1101 + { S_XORS, " xors=" },
1107 +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
1109 + struct dm_target *ti; /* Target pointer. */
1112 + unsigned long flags; /* State flags. */
1113 + spinlock_t in_lock; /* Protects central input list below. */
1114 + struct bio_list in; /* Pending ios (central input list). */
1115 + struct bio_list work; /* ios work set. */
1116 + wait_queue_head_t suspendq; /* suspend synchronization. */
1117 + atomic_t in_process; /* counter of queued bios (suspendq). */
1118 + atomic_t in_process_max;/* counter of queued bios max. */
1121 + struct workqueue_struct *wq;
1122 + struct delayed_work dws;
1125 + /* External locking. */
1126 + struct dm_raid45_locking_type *locking;
1128 + struct stripe_cache sc; /* Stripe cache for this set. */
1130 + /* Xor optimization. */
1132 + struct xor_func *f;
1137 + /* Recovery parameters. */
1139 + struct dm_dirty_log *dl; /* Dirty log. */
1140 + struct dm_rh_client *rh; /* Region hash. */
1142 + /* dm-mem-cache client resource context for recovery stripes. */
1143 + struct dm_mem_cache_client *mem_cache_client;
1145 + struct list_head stripes; /* List of recovery stripes. */
1147 + region_t nr_regions;
1148 + region_t nr_regions_to_recover;
1149 + region_t nr_regions_recovered;
1150 + unsigned long start_jiffies;
1151 + unsigned long end_jiffies;
1153 + unsigned bandwidth; /* Recovery bandwidth [%]. */
1154 + unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
1155 + unsigned bandwidth_parm; /* " constructor parm. */
1156 + unsigned io_size; /* io size <= chunk size. */
1157 + unsigned io_size_parm; /* io size ctr parameter. */
1159 + /* recovery io throttling. */
1160 + atomic_t io_count[2]; /* counter recover/regular io. */
1161 + unsigned long last_jiffies;
1163 + struct dm_region *reg; /* Actual region to recover. */
1164 + sector_t pos; /* Position within region to recover. */
1165 + sector_t end; /* End of region to recover. */
1168 + /* RAID set parameters. */
1170 + struct raid_type *raid_type; /* RAID type (eg, RAID4). */
1171 + unsigned raid_parms; /* # variable raid parameters. */
1173 + unsigned chunk_size; /* Sectors per chunk. */
1174 + unsigned chunk_size_parm;
1175 + unsigned chunk_mask; /* Mask for amount. */
1176 + unsigned chunk_shift; /* rsector chunk size shift. */
1178 + unsigned io_size; /* Sectors per io. */
1179 + unsigned io_size_parm;
1180 + unsigned io_mask; /* Mask for amount. */
1181 + unsigned io_shift_mask; /* Mask for raid_address(). */
1182 + unsigned io_shift; /* rsector io size shift. */
1183 + unsigned pages_per_io; /* Pages per io. */
1185 + sector_t sectors_per_dev; /* Sectors per device. */
1187 + atomic_t failed_devs; /* Amount of devices failed. */
1189 + /* Index of device to initialize. */
1191 + int dev_to_init_parm;
1193 + /* Raid devices dynamically allocated. */
1194 + unsigned raid_devs; /* # of RAID devices below. */
1195 + unsigned data_devs; /* # of RAID data devices. */
1197 + int ei; /* index of failed RAID device. */
1199 + /* index of dedicated parity device (i.e. RAID4). */
1201 + int pi_parm; /* constructor parm for status output. */
1204 + /* REMOVEME: devel stats counters. */
1205 + atomic_t stats[S_NR_STATS];
1207 + /* Dynamically allocated temporary pointers for xor(). */
1208 + unsigned long **data;
1210 + /* Dynamically allocated RAID devices. Alignment? */
1211 + struct raid_dev dev[0];
1215 +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
1216 +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
1217 +BITOPS(RS, Dead, raid_set, RS_DEAD)
1218 +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
1219 +BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
1220 +BITOPS(RS, Recover, raid_set, RS_RECOVER)
1221 +BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
1222 +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
1223 +BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
1226 +#define PageIO(page) PageChecked(page)
1227 +#define AllowPageIO(page) SetPageChecked(page)
1228 +#define ProhibitPageIO(page) ClearPageChecked(page)
1230 +/*-----------------------------------------------------------------
1231 + * Raid-4/5 set structures.
1232 + *---------------------------------------------------------------*/
1233 +/* RAID level definitions. */
1239 +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
1240 +enum raid_algorithm {
1249 + const char *name; /* RAID algorithm. */
1250 + const char *descr; /* Descriptor text for logging. */
1251 + const unsigned parity_devs; /* # of parity devices. */
1252 + const unsigned minimal_devs; /* minimal # of devices in set. */
1253 + const enum raid_level level; /* RAID level. */
1254 + const enum raid_algorithm algorithm; /* RAID algorithm. */
1257 +/* Supported raid types and properties. */
1258 +static struct raid_type raid_types[] = {
1259 + {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
1260 + {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
1261 + {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
1262 + {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
1263 + {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
1266 +/* Address as calculated by raid_address(). */
1268 + sector_t key; /* Hash key (start address of stripe). */
1269 + unsigned di, pi; /* Data and parity disks index. */
1272 +/* REMOVEME: reset statistics counters. */
1273 +static void stats_reset(struct raid_set *rs)
1275 + unsigned s = S_NR_STATS;
1278 + atomic_set(rs->stats + s, 0);
1281 +/*----------------------------------------------------------------
1282 + * RAID set management routines.
1283 + *--------------------------------------------------------------*/
1285 + * Begin small helper functions.
1287 +/* Queue (optionally delayed) io work. */
1288 +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
1290 + struct delayed_work *dws = &rs->io.dws;
1292 + cancel_delayed_work(dws);
1293 + queue_delayed_work(rs->io.wq, dws, delay);
1296 +/* Queue io work immediately (called from region hash too). */
1297 +static INLINE void wake_do_raid(void *context)
1299 + wake_do_raid_delayed(context, 0);
1302 +/* Wait until all io has been processed. */
1303 +static INLINE void wait_ios(struct raid_set *rs)
1305 + wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
1308 +/* Declare io queued to device. */
1309 +static INLINE void io_dev_queued(struct raid_dev *dev)
1311 + set_bit(IO_QUEUED, &dev->flags);
1314 +/* Io on device and reset ? */
1315 +static inline int io_dev_clear(struct raid_dev *dev)
1317 + return test_and_clear_bit(IO_QUEUED, &dev->flags);
1320 +/* Get an io reference. */
1321 +static INLINE void io_get(struct raid_set *rs)
1323 + int p = atomic_inc_return(&rs->io.in_process);
1325 + if (p > atomic_read(&rs->io.in_process_max))
1326 + atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
1329 +/* Put the io reference and conditionally wake io waiters. */
1330 +static INLINE void io_put(struct raid_set *rs)
1332 + /* Intel: rebuild data corrupter? */
1333 + if (!atomic_read(&rs->io.in_process)) {
1334 + DMERR("%s would go negative!!!", __func__);
1338 + if (atomic_dec_and_test(&rs->io.in_process))
1339 + wake_up(&rs->io.suspendq);
1342 +/* Calculate device sector offset. */
1343 +static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
1345 + sector_t sector = bio->bi_sector;
1347 + sector_div(sector, rs->set.data_devs);
1351 +/* Test device operational. */
1352 +static INLINE int dev_operational(struct raid_set *rs, unsigned p)
1354 + return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
1357 +/* Return # of active stripes in stripe cache. */
1358 +static INLINE int sc_active(struct stripe_cache *sc)
1360 + return atomic_read(&sc->active_stripes);
1363 +/* Test io pending on stripe. */
1364 +static INLINE int stripe_io(struct stripe *stripe)
1366 + return atomic_read(&stripe->io.pending);
1369 +static INLINE void stripe_io_inc(struct stripe *stripe)
1371 + atomic_inc(&stripe->io.pending);
1374 +static INLINE void stripe_io_dec(struct stripe *stripe)
1376 + atomic_dec(&stripe->io.pending);
1379 +/* Wrapper needed by for_each_io_dev(). */
1380 +static void _stripe_io_inc(struct stripe *stripe, unsigned p)
1382 + stripe_io_inc(stripe);
1385 +/* Error a stripe. */
1386 +static INLINE void stripe_error(struct stripe *stripe, struct page *page)
1388 + SetStripeError(stripe);
1389 + SetPageError(page);
1390 + atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
1393 +/* Page IOed ok. */
1394 +enum dirty_type { CLEAN, DIRTY };
1395 +static INLINE void page_set(struct page *page, enum dirty_type type)
1399 + SetPageDirty(page);
1400 + AllowPageIO(page);
1404 + ClearPageDirty(page);
1411 + SetPageUptodate(page);
1412 + ClearPageError(page);
1415 +/* Return region state for a sector. */
1417 +region_state(struct raid_set *rs, sector_t sector, unsigned long state)
1419 + struct dm_rh_client *rh = rs->recover.rh;
1421 + return RSRecover(rs) ?
1422 + (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
1426 +/* Check maximum devices which may fail in a raid set. */
1427 +static inline int raid_set_degraded(struct raid_set *rs)
1429 + return RSIoError(rs);
1432 +/* Check # of devices which may fail in a raid set. */
1433 +static INLINE int raid_set_operational(struct raid_set *rs)
1435 + /* Too many failed devices -> BAD. */
1436 + return atomic_read(&rs->set.failed_devs) <=
1437 + rs->set.raid_type->parity_devs;
1441 + * Return true in case a page_list should be read/written
1443 + * Conditions to read/write:
1444 + * o 1st page in list not uptodate
1445 + * o 1st page in list dirty
1446 + * o if we optimized io away, we flag it using the pages checked bit.
1448 +static INLINE unsigned page_io(struct page *page)
1450 + /* Optimization: page was flagged to need io during first run. */
1451 + if (PagePrivate(page)) {
1452 + ClearPagePrivate(page);
1456 + /* Avoid io if prohibited or a locked page. */
1457 + if (!PageIO(page) || PageLocked(page))
1460 + if (!PageUptodate(page) || PageDirty(page)) {
1461 + /* Flag page needs io for second run optimization. */
1462 + SetPagePrivate(page);
1469 +/* Call a function on each page list needing io. */
1470 +static INLINE unsigned
1471 +for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
1472 + void (*f_io)(struct stripe *stripe, unsigned p))
1474 + unsigned p = rs->set.raid_devs, r = 0;
1477 + if (page_io(PAGE(stripe, p))) {
 1486 +/* Reconstruct a particular device? */
1487 +static INLINE int dev_to_init(struct raid_set *rs)
1489 + return rs->set.dev_to_init > -1;
1493 + * Index of device to calculate parity on.
1494 + * Either the parity device index *or* the selected device to init
1495 + * after a spare replacement.
1497 +static INLINE unsigned dev_for_parity(struct stripe *stripe)
1499 + struct raid_set *rs = RS(stripe->sc);
1501 + return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
1504 +/* Return the index of the device to be recovered. */
1505 +static int idx_get(struct raid_set *rs)
 1507 +	/* Avoid reading in the pages to be reconstructed anyway. */
1508 + if (dev_to_init(rs))
1509 + return rs->set.dev_to_init;
1510 + else if (rs->set.raid_type->level == raid4)
1511 + return rs->set.pi;
1516 +/* RAID set congested function. */
1517 +static int raid_set_congested(void *congested_data, int bdi_bits)
1519 + struct raid_set *rs = congested_data;
1520 + int r = 0; /* Assume uncongested. */
1521 + unsigned p = rs->set.raid_devs;
1523 + /* If any of our component devices are overloaded. */
1525 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
1527 + r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1530 + /* REMOVEME: statistics. */
1531 + atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
1535 +/* Display RAID set dead message once. */
1536 +static void raid_set_dead(struct raid_set *rs)
1538 + if (!TestSetRSDead(rs)) {
1540 + char buf[BDEVNAME_SIZE];
1542 + DMERR("FATAL: too many devices failed -> RAID set dead");
1544 + for (p = 0; p < rs->set.raid_devs; p++) {
1545 + if (!dev_operational(rs, p))
1546 + DMERR("device /dev/%s failed",
1547 + bdevname(rs->dev[p].dev->bdev, buf));
1552 +/* RAID set degrade check. */
1554 +raid_set_check_and_degrade(struct raid_set *rs,
1555 + struct stripe *stripe, unsigned p)
1557 + if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
 1560 +	 * Throw an event in case of member device errors.
1561 + dm_table_event(rs->ti->table);
1562 + atomic_inc(&rs->set.failed_devs);
1564 + /* Only log the first member error. */
1565 + if (!TestSetRSIoError(rs)) {
1566 + char buf[BDEVNAME_SIZE];
1568 + /* Store index for recovery. */
1573 + DMERR("CRITICAL: %sio error on device /dev/%s "
1574 + "in region=%llu; DEGRADING RAID set",
1575 + stripe ? "" : "FAKED ",
1576 + bdevname(rs->dev[p].dev->bdev, buf),
1577 + (unsigned long long) (stripe ? stripe->key : 0));
1578 + DMERR("further device error messages suppressed");
1585 +raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
1587 + unsigned p = rs->set.raid_devs;
1590 + struct page *page = PAGE(stripe, p);
1592 + if (PageError(page)) {
1593 + ClearPageError(page);
1594 + raid_set_check_and_degrade(rs, stripe, p);
1599 +/* RAID set upgrade check. */
1600 +static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
1602 + if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
1605 + if (atomic_dec_and_test(&rs->set.failed_devs)) {
1606 + ClearRSIoError(rs);
1613 +/* Lookup a RAID device by name or by major:minor number. */
1615 + const char *dev_name;
1616 + struct raid_dev *dev;
1618 +enum lookup_type { byname, bymajmin, bynumber };
1619 +static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
1620 + union dev_lookup *dl)
1625 + * Must be an incremental loop, because the device array
1626 + * can have empty slots still on calls from raid_ctr()
1628 + for (p = 0; p < rs->set.raid_devs; p++) {
1629 + char buf[BDEVNAME_SIZE];
1630 + struct raid_dev *dev = rs->dev + p;
1635 + /* Format dev string appropriately if necessary. */
1637 + bdevname(dev->dev->bdev, buf);
1638 + else if (by == bymajmin)
1639 + format_dev_t(buf, dev->dev->bdev->bd_dev);
1641 + /* Do the actual check. */
1642 + if (by == bynumber) {
1643 + if (dl->dev->dev->bdev->bd_dev ==
1644 + dev->dev->bdev->bd_dev)
1646 + } else if (!strcmp(dl->dev_name, buf))
1653 +/* End io wrapper. */
1655 +_bio_endio(struct raid_set *rs, struct bio *bio, int error)
1657 + /* REMOVEME: statistics. */
1658 + atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
1659 + S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
1660 + bio_endio(bio, error);
1661 + io_put(rs); /* Wake any suspend waiters. */
1665 + * End small helper functions.
1670 + * Stripe hash functions
1672 +/* Initialize/destroy stripe hash. */
1673 +static int hash_init(struct stripe_hash *hash, unsigned stripes)
1675 + unsigned buckets = 2, max_buckets = stripes / 4;
1676 + unsigned hash_primes[] = {
1677 + /* Table of primes for hash_fn/table size optimization. */
1678 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
1679 + 1543, 3079, 6151, 12289, 24593,
1682 + /* Calculate number of buckets (2^^n <= stripes / 4). */
1683 + while (buckets < max_buckets)
1686 + /* Allocate stripe hash. */
1687 + hash->hash = vmalloc(buckets * sizeof(*hash->hash));
1691 + hash->buckets = buckets;
1692 + hash->mask = buckets - 1;
1693 + hash->shift = ffs(buckets);
1694 + if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
1695 + hash->shift = ARRAY_SIZE(hash_primes) + 1;
1697 + BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
1698 + hash->prime = hash_primes[hash->shift - 2];
1700 + /* Initialize buckets. */
1702 + INIT_LIST_HEAD(hash->hash + buckets);
1707 +static INLINE void hash_exit(struct stripe_hash *hash)
1710 + vfree(hash->hash);
1711 + hash->hash = NULL;
1715 +/* List add (head/tail/locked/unlocked) inlines. */
1716 +enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
1717 +#define LIST_DEL(name, list) \
1718 +static void stripe_ ## name ## _del(struct stripe *stripe, \
1719 + enum list_lock_type lock) { \
1720 + struct list_head *lh = stripe->lists + (list); \
1721 + spinlock_t *l = NULL; \
1723 + if (lock == LIST_LOCKED) { \
1724 + l = stripe->sc->locks + LOCK_LRU; \
1725 + spin_lock_irq(l); \
1729 + if (!list_empty(lh)) \
1730 + list_del_init(lh); \
1732 + if (lock == LIST_LOCKED) \
1733 + spin_unlock_irq(l); \
1736 +LIST_DEL(hash, LIST_HASH)
1737 +LIST_DEL(lru, LIST_LRU)
1740 +enum list_pos_type { POS_HEAD, POS_TAIL };
1741 +#define LIST_ADD(name, list) \
1742 +static void stripe_ ## name ## _add(struct stripe *stripe, \
1743 + enum list_pos_type pos, \
1744 + enum list_lock_type lock) { \
1745 + struct list_head *lh = stripe->lists + (list); \
1746 + struct stripe_cache *sc = stripe->sc; \
1747 + spinlock_t *l = NULL; \
1749 + if (lock == LIST_LOCKED) { \
1750 + l = sc->locks + LOCK_LRU; \
1751 + spin_lock_irq(l); \
1754 + if (list_empty(lh)) { \
1755 + if (pos == POS_HEAD) \
1756 + list_add(lh, sc->lists + (list)); \
1758 + list_add_tail(lh, sc->lists + (list)); \
1761 + if (lock == LIST_LOCKED) \
1762 + spin_unlock_irq(l); \
1765 +LIST_ADD(endio, LIST_ENDIO)
1766 +LIST_ADD(io, LIST_IO)
1767 +LIST_ADD(lru, LIST_LRU)
1770 +#define POP(list) \
1772 + if (list_empty(sc->lists + list)) \
1775 + stripe = list_first_entry(&sc->lists[list], \
1778 + list_del_init(&stripe->lists[list]); \
1782 +/* Pop an available stripe off the lru list. */
1783 +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1785 + struct stripe *stripe;
1786 + spinlock_t *lock = sc->locks + LOCK_LRU;
1788 + spin_lock_irq(lock);
1790 + spin_unlock_irq(lock);
1793 + /* Remove from hash before reuse. */
1794 + stripe_hash_del(stripe, LIST_UNLOCKED);
1799 +static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
1801 + return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
1804 +static inline struct list_head *
1805 +hash_bucket(struct stripe_hash *hash, sector_t key)
1807 + return hash->hash + hash_fn(hash, key);
1810 +/* Insert an entry into a hash. */
1811 +static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
1813 + list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
1816 +/* Insert an entry into the stripe hash. */
1818 +sc_insert(struct stripe_cache *sc, struct stripe *stripe)
1820 + hash_insert(&sc->hash, stripe);
1823 +/* Lookup an entry in the stripe hash. */
1824 +static inline struct stripe *
1825 +stripe_lookup(struct stripe_cache *sc, sector_t key)
1828 + struct stripe *stripe;
1829 + struct list_head *bucket = hash_bucket(&sc->hash, key);
1831 + list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
 1832 +		/* REMOVEME: statistics. */
1833 + if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1834 + atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
1836 + if (stripe->key == key)
1843 +/* Resize the stripe cache hash on size changes. */
1844 +static int hash_resize(struct stripe_cache *sc)
1846 + /* Resize threshold reached? */
1847 + if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
1848 + || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
1850 + struct stripe_hash hash, hash_tmp;
1853 + r = hash_init(&hash, atomic_read(&sc->stripes));
1857 + lock = sc->locks + LOCK_LRU;
1858 + spin_lock_irq(lock);
1859 + if (sc->hash.hash) {
1860 + unsigned b = sc->hash.buckets;
1861 + struct list_head *pos, *tmp;
1863 + /* Walk old buckets and insert into new. */
1865 + list_for_each_safe(pos, tmp, sc->hash.hash + b)
1866 + hash_insert(&hash,
1867 + list_entry(pos, struct stripe,
1868 + lists[LIST_HASH]));
1873 + memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
1874 + memcpy(&sc->hash, &hash, sizeof(sc->hash));
1875 + atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1876 + spin_unlock_irq(lock);
1878 + hash_exit(&hash_tmp);
1885 + * Stripe cache locking functions
1887 +/* Dummy lock function for local RAID4+5. */
1888 +static void *no_lock(sector_t key, enum dm_lock_type type)
1893 +/* Dummy unlock function for local RAID4+5. */
1894 +static void no_unlock(void *lock_handle)
1898 +/* No locking (for local RAID 4+5). */
1899 +static struct dm_raid45_locking_type locking_none = {
1901 + .unlock = no_unlock,
1904 +/* Clustered RAID 4+5. */
1905 +/* FIXME: code this. */
1906 +static struct dm_raid45_locking_type locking_cluster = {
1908 + .unlock = no_unlock,
1911 +/* Lock a stripe (for clustering). */
1913 +stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
1915 + stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
1917 + return stripe->lock ? 0 : -EPERM;
1920 +/* Unlock a stripe (for clustering). */
1921 +static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
1923 + rs->locking->unlock(stripe->lock);
1924 + stripe->lock = NULL;
1928 + * Stripe cache functions.
1931 + * Invalidate all page lists pages of a stripe.
1933 + * I only keep state for the whole list in the first page.
1936 +stripe_pages_invalidate(struct stripe *stripe)
1938 + unsigned p = RS(stripe->sc)->set.raid_devs;
1941 + struct page *page = PAGE(stripe, p);
1943 + ProhibitPageIO(page);
1944 + ClearPageChecked(page);
1945 + ClearPageDirty(page);
1946 + ClearPageError(page);
1947 + clear_page_locked(page);
1948 + ClearPagePrivate(page);
1949 + ClearPageUptodate(page);
1953 +/* Prepare stripe for (re)use. */
1954 +static INLINE void stripe_invalidate(struct stripe *stripe)
1956 + stripe->io.flags = 0;
1957 + stripe_pages_invalidate(stripe);
1960 +/* Allow io on all chunks of a stripe. */
1961 +static INLINE void stripe_allow_io(struct stripe *stripe)
1963 + unsigned p = RS(stripe->sc)->set.raid_devs;
1966 + AllowPageIO(PAGE(stripe, p));
1969 +/* Initialize a stripe. */
1971 +stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1973 + unsigned p = RS(sc)->set.raid_devs;
1976 + /* Work all io chunks. */
1978 + struct stripe_set *ss = stripe->ss + p;
1980 + stripe->obj[p].private = ss;
1981 + ss->stripe = stripe;
1983 + i = ARRAY_SIZE(ss->bl);
1985 + bio_list_init(ss->bl + i);
1990 + i = ARRAY_SIZE(stripe->lists);
1992 + INIT_LIST_HEAD(stripe->lists + i);
1994 + atomic_set(&stripe->cnt, 0);
1995 + atomic_set(&stripe->io.pending, 0);
1997 + stripe_invalidate(stripe);
2000 +/* Number of pages per chunk. */
2001 +static inline unsigned chunk_pages(unsigned io_size)
2003 + return dm_div_up(io_size, SECTORS_PER_PAGE);
2006 +/* Number of pages per stripe. */
2007 +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
2009 + return chunk_pages(io_size) * rs->set.raid_devs;
2012 +/* Initialize part of page_list (recovery). */
2013 +static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
2014 + unsigned start, unsigned count)
2016 + unsigned pages = chunk_pages(count);
2017 + /* Get offset into the page_list. */
2018 + struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
2021 + while (pl && pages--) {
2022 + BUG_ON(!pl->page);
2023 + memset(page_address(pl->page), 0, PAGE_SIZE);
2028 +/* Initialize parity chunk of stripe. */
2029 +static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
2031 + stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
2034 +/* Return dynamic stripe structure size. */
2035 +static INLINE size_t stripe_size(struct raid_set *rs)
2037 + return sizeof(struct stripe) +
2038 + rs->set.raid_devs * sizeof(struct stripe_set);
2041 +/* Allocate a stripe and its memory object. */
2042 +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
2043 +enum grow { SC_GROW, SC_KEEP };
2044 +static struct stripe *stripe_alloc(struct stripe_cache *sc,
2045 + struct dm_mem_cache_client *mc,
2049 + struct stripe *stripe;
2051 + stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
2053 + /* Grow the dm-mem-cache by one object. */
2054 + if (grow == SC_GROW) {
2055 + r = dm_mem_cache_grow(mc, 1);
2060 + stripe->obj = dm_mem_cache_alloc(mc);
2064 + stripe_init(sc, stripe);
2070 + if (grow == SC_GROW)
2071 + dm_mem_cache_shrink(mc, 1);
2073 + kmem_cache_free(sc->kc.cache, stripe);
2078 + * Free a stripes memory object, shrink the
2079 + * memory cache and free the stripe itself
2081 +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
2083 + dm_mem_cache_free(mc, stripe->obj);
2084 + dm_mem_cache_shrink(mc, 1);
2085 + kmem_cache_free(stripe->sc->kc.cache, stripe);
2088 +/* Free the recovery stripe. */
2089 +static void stripe_recover_free(struct raid_set *rs)
2091 + struct recover *rec = &rs->recover;
2092 + struct list_head *stripes = &rec->stripes;
2094 + while (!list_empty(stripes)) {
2095 + struct stripe *stripe = list_first_entry(stripes, struct stripe,
2096 + lists[LIST_RECOVER]);
2097 + list_del(stripe->lists + LIST_RECOVER);
2098 + stripe_free(stripe, rec->mem_cache_client);
2102 +/* Push a stripe safely onto the endio list to be handled by do_endios(). */
2103 +static INLINE void stripe_endio_push(struct stripe *stripe)
2106 + unsigned long flags;
2107 + struct stripe_cache *sc = stripe->sc;
2108 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2110 + spin_lock_irqsave(lock, flags);
2111 + wake = list_empty(sc->lists + LIST_ENDIO);
2112 + stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
2113 + spin_unlock_irqrestore(lock, flags);
2116 + wake_do_raid(RS(sc));
2119 +/* Protected check for stripe cache endio list empty. */
2120 +static INLINE int stripe_endio_empty(struct stripe_cache *sc)
2123 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2125 + spin_lock_irq(lock);
2126 + r = list_empty(sc->lists + LIST_ENDIO);
2127 + spin_unlock_irq(lock);
 2132 +/* Pop a stripe safely off the endio list. */
2133 +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
2135 + struct stripe *stripe;
2136 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2138 + /* This runs in parallel with endio(). */
2139 + spin_lock_irq(lock);
2141 + spin_unlock_irq(lock);
2147 +/* Evict stripe from cache. */
2148 +static void stripe_evict(struct stripe *stripe)
2150 + struct raid_set *rs = RS(stripe->sc);
2151 + stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
2153 + if (list_empty(stripe->lists + LIST_LRU)) {
2154 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2155 + atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
2159 +/* Grow stripe cache. */
2161 +sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
2164 + struct raid_set *rs = RS(sc);
2166 + /* Try to allocate this many (additional) stripes. */
2167 + while (stripes--) {
2168 + struct stripe *stripe =
2169 + stripe_alloc(sc, sc->mem_cache_client, grow);
2171 + if (likely(stripe)) {
2172 + stripe->io.size = rs->set.io_size;
2173 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2174 + atomic_inc(&sc->stripes);
2181 + ClearRSScBusy(rs);
2182 + return r ? r : hash_resize(sc);
2185 +/* Shrink stripe cache. */
2186 +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
2190 + /* Try to get unused stripe from LRU list. */
2191 + while (stripes--) {
2192 + struct stripe *stripe;
2194 + stripe = stripe_lru_pop(sc);
2196 + /* An lru stripe may never have ios pending! */
2197 + BUG_ON(stripe_io(stripe));
2198 + stripe_free(stripe, sc->mem_cache_client);
2199 + atomic_dec(&sc->stripes);
2206 + /* Check if stats are still sane. */
2207 + if (atomic_read(&sc->max_active_stripes) >
2208 + atomic_read(&sc->stripes))
2209 + atomic_set(&sc->max_active_stripes, 0);
2214 + ClearRSScBusy(RS(sc));
2215 + return hash_resize(sc);
2218 +/* Create stripe cache. */
2219 +static int sc_init(struct raid_set *rs, unsigned stripes)
2222 + struct stripe_cache *sc = &rs->sc;
2223 + struct stripe *stripe;
2224 + struct recover *rec = &rs->recover;
2226 + /* Initialize lists and locks. */
2227 + i = ARRAY_SIZE(sc->lists);
2229 + INIT_LIST_HEAD(sc->lists + i);
2233 + spin_lock_init(sc->locks + i);
2235 + /* Initialize atomic variables. */
2236 + atomic_set(&sc->stripes, 0);
2237 + atomic_set(&sc->stripes_last, 0);
2238 + atomic_set(&sc->stripes_to_shrink, 0);
2239 + atomic_set(&sc->active_stripes, 0);
2240 + atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
2243 + * We need a runtime unique # to suffix the kmem cache name
2244 + * because we'll have one for each active RAID set.
2246 + nr = atomic_inc_return(&_stripe_sc_nr);
2247 + sprintf(sc->kc.name, "%s_%d", TARGET, nr);
2248 + sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
2250 + if (!sc->kc.cache)
2253 + /* Create memory cache client context for RAID stripe cache. */
2254 + sc->mem_cache_client =
2255 + dm_mem_cache_client_create(stripes, rs->set.raid_devs,
2256 + chunk_pages(rs->set.io_size));
2257 + if (IS_ERR(sc->mem_cache_client))
2258 + return PTR_ERR(sc->mem_cache_client);
2260 + /* Create memory cache client context for RAID recovery stripe(s). */
2261 + rec->mem_cache_client =
2262 + dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
2263 + chunk_pages(rec->io_size));
2264 + if (IS_ERR(rec->mem_cache_client))
2265 + return PTR_ERR(rec->mem_cache_client);
2267 + /* Allocate stripe for set recovery. */
 2268 +	/* XXX: cope with MAX_RECOVER. */
2269 + INIT_LIST_HEAD(&rec->stripes);
2270 + for (i = 0; i < MAX_RECOVER; i++) {
2271 + stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
2275 + SetStripeRecover(stripe);
2276 + stripe->io.size = rec->io_size;
2277 + list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
 2281 +	 * Allocate the stripe objects from the
2282 + * cache and add them to the LRU list.
2284 + return sc_grow(sc, stripes, SC_KEEP);
2287 +/* Destroy the stripe cache. */
2288 +static void sc_exit(struct stripe_cache *sc)
2290 + if (sc->kc.cache) {
2291 + BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
2292 + kmem_cache_destroy(sc->kc.cache);
2295 + if (sc->mem_cache_client)
2296 + dm_mem_cache_client_destroy(sc->mem_cache_client);
2298 + ClearRSRecover(RS(sc));
2299 + stripe_recover_free(RS(sc));
2300 + if (RS(sc)->recover.mem_cache_client)
2301 + dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
2303 + hash_exit(&sc->hash);
2307 + * Calculate RAID address
2309 + * Delivers tuple with the index of the data disk holding the chunk
2310 + * in the set, the parity disks index and the start of the stripe
2311 + * within the address space of the set (used as the stripe cache hash key).
2314 +static struct address *
2315 +raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
2317 + unsigned data_devs = rs->set.data_devs, di, pi,
2318 + raid_devs = rs->set.raid_devs;
2319 + sector_t stripe, tmp;
2322 + * chunk_number = sector / chunk_size
2323 + * stripe = chunk_number / data_devs
2324 + * di = stripe % data_devs;
2326 + stripe = sector >> rs->set.chunk_shift;
2327 + di = sector_div(stripe, data_devs);
2329 + switch (rs->set.raid_type->level) {
2332 + pi = sector_div(tmp, raid_devs);
2334 + switch (rs->set.raid_type->algorithm) {
2335 + case left_asym: /* Left asymmetric. */
2336 + pi = data_devs - pi;
2337 + case right_asym: /* Right asymmetric. */
2342 + case left_sym: /* Left symmetric. */
2343 + pi = data_devs - pi;
2344 + case right_sym: /* Right symmetric. */
2345 + di = (pi + di + 1) % raid_devs;
2349 + DMERR("Unknown RAID algorithm %d",
2350 + rs->set.raid_type->algorithm);
2363 + DMERR("Unknown RAID level %d", rs->set.raid_type->level);
2368 + * Hash key = start offset on any single device of the RAID set;
2369 + * adjusted in case io size differs from chunk size.
2371 + addr->key = (stripe << rs->set.chunk_shift) +
2372 + (sector & rs->set.io_shift_mask);
2381 + * Copy data across between stripe pages and bio vectors.
2383 + * Pay attention to data alignment in stripe and bio pages.
2386 +bio_copy_page_list(int rw, struct stripe *stripe,
2387 + struct page_list *pl, struct bio *bio)
2389 + unsigned i, page_offset;
2391 + struct raid_set *rs = RS(stripe->sc);
2392 + struct bio_vec *bv;
2394 + /* Get start page in page list for this sector. */
2395 + i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
2396 + pl = pl_elem(pl, i);
2398 + page_addr = page_address(pl->page);
2399 + page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
2401 + /* Walk all segments and copy data across between bio_vecs and pages. */
2402 + bio_for_each_segment(bv, bio, i) {
2403 + int len = bv->bv_len, size;
2404 + unsigned bio_offset = 0;
2405 + void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
2407 + size = (page_offset + len > PAGE_SIZE) ?
2408 + PAGE_SIZE - page_offset : len;
2411 + memcpy(bio_addr + bio_offset,
2412 + page_addr + page_offset, size);
2414 + memcpy(page_addr + page_offset,
2415 + bio_addr + bio_offset, size);
2417 + page_offset += size;
2418 + if (page_offset == PAGE_SIZE) {
2420 + * We reached the end of the chunk page ->
2421 + * need refer to the next one to copy more data.
2425 + /* Get next page. */
2428 + page_addr = page_address(pl->page);
2430 + bio_offset += size;
2431 + /* REMOVEME: statistics. */
2432 + atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
2437 + __bio_kunmap_atomic(bio_addr, KM_USER0);
2442 + * Xor optimization macros.
2444 +/* Xor data pointer declaration and initialization macros. */
2445 +#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
2446 +#define DECLARE_3 DECLARE_2, *d2 = data[2]
2447 +#define DECLARE_4 DECLARE_3, *d3 = data[3]
2448 +#define DECLARE_5 DECLARE_4, *d4 = data[4]
2449 +#define DECLARE_6 DECLARE_5, *d5 = data[5]
2450 +#define DECLARE_7 DECLARE_6, *d6 = data[6]
2451 +#define DECLARE_8 DECLARE_7, *d7 = data[7]
 2453 +/* Xor unroll macros. */
2454 +#define D2(n) d0[n] = d0[n] ^ d1[n]
2455 +#define D3(n) D2(n) ^ d2[n]
2456 +#define D4(n) D3(n) ^ d3[n]
2457 +#define D5(n) D4(n) ^ d4[n]
2458 +#define D6(n) D5(n) ^ d5[n]
2459 +#define D7(n) D6(n) ^ d6[n]
2460 +#define D8(n) D7(n) ^ d7[n]
2462 +#define X_2(macro, offset) macro(offset); macro(offset + 1);
2463 +#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
2464 +#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
2465 +#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
2466 +#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
2467 +#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
2469 +/* Define a _xor_#chunks_#xors_per_run() function. */
2470 +#define _XOR(chunks, xors_per_run) \
2471 +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
2473 + unsigned end = XOR_SIZE / sizeof(data[0]), i; \
2474 + DECLARE_ ## chunks; \
2476 + for (i = 0; i < end; i += xors_per_run) { \
2477 + X_ ## xors_per_run(D ## chunks, i); \
2481 +/* Define xor functions for 2 - 8 chunks. */
2482 +#define MAKE_XOR_PER_RUN(xors_per_run) \
2483 + _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
2484 + _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
2485 + _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
2486 + _XOR(8, xors_per_run);
2488 +MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
2489 +MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
2490 +MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
2491 +MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
2493 +#define MAKE_XOR(xors_per_run) \
2495 + void (*f)(unsigned long **); \
2496 +} static xor_funcs ## xors_per_run[] = { \
2499 + { _xor2_ ## xors_per_run }, \
2500 + { _xor3_ ## xors_per_run }, \
2501 + { _xor4_ ## xors_per_run }, \
2502 + { _xor5_ ## xors_per_run }, \
2503 + { _xor6_ ## xors_per_run }, \
2504 + { _xor7_ ## xors_per_run }, \
2505 + { _xor8_ ## xors_per_run }, \
2508 +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
2510 + /* Call respective function for amount of chunks. */ \
2511 + xor_funcs ## xors_per_run[n].f(data); \
2514 +/* Define xor_8() - xor_64 functions. */
2520 +/* Maximum number of chunks, which can be xor'ed in one go. */
2521 +#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
2526 +} static xor_funcs[] = {
2528 + {xor_16, "xor_16"},
2529 + {xor_32, "xor_32"},
2530 + {xor_64, "xor_64"},
2536 + * This indexes into the page list of the stripe.
2538 + * All chunks will be xored into the parity chunk
2539 + * in maximum groups of xor.chunks.
2541 + * FIXME: try mapping the pages on discontiguous memory.
2543 +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
2545 + struct raid_set *rs = RS(stripe->sc);
2546 + unsigned max_chunks = rs->xor.chunks, n, p;
2547 + unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
2548 + unsigned long **d = rs->data;
2549 + xor_function_t xor_f = rs->xor.f->f;
2551 + /* Address of parity page to xor into. */
2552 + d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
2554 + /* Preset pointers to data pages. */
2555 + for (n = 1, p = rs->set.raid_devs; p--; ) {
2556 + if (p != pi && PageIO(PAGE(stripe, p)))
2557 + d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
 2559 +		/* If max chunks -> xor. */
2560 + if (n == max_chunks) {
2566 + /* If chunks -> xor. */
2570 + /* Set parity page uptodate and clean. */
2571 + page_set(PAGE(stripe, pi), CLEAN);
2574 +/* Common xor loop through all stripe page lists. */
2575 +static void common_xor(struct stripe *stripe, sector_t count,
2576 + unsigned off, unsigned p)
2580 + for (sector = off; sector < count; sector += SECTORS_PER_XOR)
2581 + xor(stripe, p, sector);
2583 + atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
2587 + * Calculate parity sectors on intact stripes.
2589 + * Need to calculate raid address for recover stripe, because its
2590 + * chunk sizes differs and is typically larger than io chunk size.
2592 +static void parity_xor(struct stripe *stripe)
2594 + struct raid_set *rs = RS(stripe->sc);
2595 + unsigned chunk_size = rs->set.chunk_size,
2596 + io_size = stripe->io.size,
2597 + xor_size = chunk_size > io_size ? io_size : chunk_size;
2600 + /* This can be the recover stripe with a larger io size. */
2601 + for (off = 0; off < io_size; off += xor_size) {
2605 + * Recover stripe likely is bigger than regular io
2606 + * ones and has no precalculated parity disk index ->
2607 + * need to calculate RAID address.
2609 + if (unlikely(StripeRecover(stripe))) {
2610 + struct address addr;
2613 + (stripe->key + off) * rs->set.data_devs,
2616 + stripe_zero_pl_part(stripe, pi, off,
2617 + rs->set.chunk_size);
2619 + pi = stripe->idx.parity;
2621 + common_xor(stripe, xor_size, off, pi);
2622 + page_set(PAGE(stripe, pi), DIRTY);
2626 +/* Reconstruct missing chunk. */
2627 +static void reconstruct_xor(struct stripe *stripe)
2629 + struct raid_set *rs = RS(stripe->sc);
2630 + int p = stripe->idx.recover;
2634 + /* REMOVEME: statistics. */
2635 + atomic_inc(rs->stats + (raid_set_degraded(rs) ?
2636 + S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
2638 + /* Zero chunk to be reconstructed. */
2639 + stripe_zero_chunk(stripe, p);
2640 + common_xor(stripe, stripe->io.size, 0, p);
2644 + * Try getting a stripe either from the hash or from the lru list
2646 +static inline void _stripe_get(struct stripe *stripe)
2648 + atomic_inc(&stripe->cnt);
2651 +static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
2653 + struct stripe_cache *sc = &rs->sc;
2654 + struct stripe *stripe;
2656 + stripe = stripe_lookup(sc, addr->key);
2658 + _stripe_get(stripe);
2659 + /* Remove from the lru list if on. */
2660 + stripe_lru_del(stripe, LIST_LOCKED);
2661 + atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2663 + /* Second try to get an LRU stripe. */
2664 + stripe = stripe_lru_pop(sc);
2666 + _stripe_get(stripe);
2667 + /* Invalidate before reinserting with changed key. */
2668 + stripe_invalidate(stripe);
2669 + stripe->key = addr->key;
2670 + stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2672 + stripe->idx.parity = addr->pi;
2673 + sc_insert(sc, stripe);
2674 + /* REMOVEME: statistics. */
2675 + atomic_inc(rs->stats + S_INSCACHE);
2683 + * Decrement reference count on a stripe.
2685 + * Move it to list of LRU stripes if zero.
2687 +static void stripe_put(struct stripe *stripe)
2689 + if (atomic_dec_and_test(&stripe->cnt)) {
2690 + if (TestClearStripeActive(stripe))
2691 + atomic_dec(&stripe->sc->active_stripes);
2693 + /* Put stripe onto the LRU list. */
2694 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2697 + BUG_ON(atomic_read(&stripe->cnt) < 0);
2703 + * I need to do it here because I can't in interrupt
2705 + * Read and write functions are split in order to avoid
2706 + * conditionals in the main loop for performance reasons.
2709 +/* Helper read bios on a page list. */
2710 +static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
2713 + bio_copy_page_list(READ, stripe, pl, bio);
2716 +/* Helper write bios on a page list. */
2717 +static void _rh_dec(struct stripe *stripe, struct page_list *pl,
2720 + dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
2723 +/* End io all bios on a page list. */
2725 +page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
2728 + struct bio_list *bl = BL(stripe, p, rw);
2730 + if (!bio_list_empty(bl)) {
2731 + struct page_list *pl = PL(stripe, p);
2732 + struct page *page = pl->page;
2734 + if (PageLocked(page))
2737 + * FIXME: PageUptodate() not cleared
2738 + * properly for missing chunks ?
2740 + else if (PageUptodate(page)) {
2742 + struct raid_set *rs = RS(stripe->sc);
2743 + void (*h_f)(struct stripe *, struct page_list *,
2745 + (rw == READ) ? _bio_copy_page_list : _rh_dec;
2747 + while ((bio = bio_list_pop(bl))) {
2748 + h_f(stripe, pl, bio);
2749 + _bio_endio(rs, bio, 0);
2750 + stripe_put(stripe);
2762 + * End io all reads/writes on a stripe copying
2763 + * read data across from stripe to bios.
2765 +static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
2768 + unsigned p = RS(stripe->sc)->set.raid_devs;
2771 + int rr = page_list_endio(rw, stripe, p, count);
2773 + if (rr && r != -EIO)
2780 +/* Fail all ios on a bio list and return # of bios. */
2782 +bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
2787 + raid_set_dead(rs);
2789 + /* Update region counters. */
2791 + struct dm_rh_client *rh = rs->recover.rh;
2793 + bio_list_for_each(bio, bl) {
2794 + if (bio_data_dir(bio) == WRITE)
2795 + dm_rh_dec(rh, stripe->region);
2799 + /* Error end io all bios. */
2800 + for (r = 0; (bio = bio_list_pop(bl)); r++)
2801 + _bio_endio(rs, bio, -EIO);
2806 +/* Fail all ios of a bio list of a stripe and drop io pending count. */
2808 +stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
2809 + struct bio_list *bl)
2811 + unsigned put = bio_list_fail(rs, stripe, bl);
2814 + stripe_put(stripe);
2817 +/* Fail all ios hanging off all bio lists of a stripe. */
2818 +static void stripe_fail_io(struct stripe *stripe)
2820 + struct raid_set *rs = RS(stripe->sc);
2821 + unsigned p = rs->set.raid_devs;
2823 + stripe_evict(stripe);
2826 + struct stripe_set *ss = stripe->ss + p;
2827 + int i = ARRAY_SIZE(ss->bl);
2830 + stripe_bio_list_fail(rs, stripe, ss->bl + i);
2835 + * Handle all stripes by handing them to the daemon, because we can't
2836 + * map their pages to copy the data in interrupt context.
2838 + * We don't want to handle them here either, while interrupts are disabled.
2841 +/* Read/write endio function for dm-io (interrupt context). */
2842 +static void endio(unsigned long error, void *context)
2844 + struct dm_mem_cache_object *obj = context;
2845 + struct stripe_set *ss = obj->private;
2846 + struct stripe *stripe = ss->stripe;
2847 + struct page *page = obj->pl->page;
2849 + if (unlikely(error))
2850 + stripe_error(stripe, page);
2852 + page_set(page, CLEAN);
2854 + clear_page_locked(page);
2855 + stripe_io_dec(stripe);
2857 + /* Add stripe to endio list and wake daemon. */
2858 + stripe_endio_push(stripe);
2862 + * Recovery io throttling
2864 +/* Conditionally reset io counters. */
2865 +enum count_type { IO_WORK = 0, IO_RECOVER };
2866 +static int recover_io_reset(struct raid_set *rs)
2868 + unsigned long j = jiffies;
2870 + /* Pay attention to jiffies overflows. */
2871 + if (j > rs->recover.last_jiffies + HZ
2872 + || j < rs->recover.last_jiffies) {
2873 + rs->recover.last_jiffies = j;
2874 + atomic_set(rs->recover.io_count + IO_WORK, 0);
2875 + atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2884 +recover_io_count(struct raid_set *rs, struct stripe *stripe)
2886 + if (RSRecover(rs)) {
2887 + recover_io_reset(rs);
2888 + atomic_inc(rs->recover.io_count +
2889 + (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2893 +/* Read/Write a page_list asynchronously. */
2894 +static void page_list_rw(struct stripe *stripe, unsigned p)
2896 + struct stripe_cache *sc = stripe->sc;
2897 + struct raid_set *rs = RS(sc);
2898 + struct dm_mem_cache_object *obj = stripe->obj + p;
2899 + struct page_list *pl = obj->pl;
2900 + struct page *page = pl->page;
2901 + struct raid_dev *dev = rs->dev + p;
2902 + struct dm_io_region io = {
2903 + .bdev = dev->dev->bdev,
2904 + .sector = stripe->key,
2905 + .count = stripe->io.size,
2907 + struct dm_io_request control = {
2908 + .bi_rw = PageDirty(page) ? WRITE : READ,
2909 + .mem.type = DM_IO_PAGE_LIST,
2912 + .notify.fn = endio,
2913 + .notify.context = obj,
2914 + .client = sc->dm_io_client,
2917 + BUG_ON(PageLocked(page));
2920 + * Don't rw past end of device, which can happen, because
2921 + * typically sectors_per_dev isn't divisible by io_size.
2923 + if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2924 + io.count = rs->set.sectors_per_dev - io.sector;
2926 + io.sector += dev->start; /* Add <offset>. */
2927 + recover_io_count(rs, stripe); /* Recovery io accounting. */
2929 + /* REMOVEME: statistics. */
2930 + atomic_inc(rs->stats +
2931 + (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
2933 + ClearPageError(page);
2934 + set_page_locked(page);
2935 + io_dev_queued(dev);
2936 + BUG_ON(dm_io(&control, 1, &io, NULL));
2940 + * Write dirty / read not uptodate page lists of a stripe.
2942 +static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
2947 + * Increment the pending count on the stripe
2948 + * first, so that we don't race in endio().
2950 + * An inc (IO) is needed for any page:
2953 + * o dirtied by writes merged
2954 + * o dirtied by parity calculations
2956 + r = for_each_io_dev(rs, stripe, _stripe_io_inc);
2958 + /* io needed: chunks are not uptodate/dirty. */
2959 + int max; /* REMOVEME: */
2960 + struct stripe_cache *sc = &rs->sc;
2962 + if (!TestSetStripeActive(stripe))
2963 + atomic_inc(&sc->active_stripes);
2965 + /* Take off the lru list in case it got added there. */
2966 + stripe_lru_del(stripe, LIST_LOCKED);
2968 + /* Submit actual io. */
2969 + for_each_io_dev(rs, stripe, page_list_rw);
2971 + /* REMOVEME: statistics */
2972 + max = sc_active(sc);
2973 + if (atomic_read(&sc->max_active_stripes) < max)
2974 + atomic_set(&sc->max_active_stripes, max);
2976 + atomic_inc(rs->stats + S_FLUSHS);
2977 + /* END REMOVEME: statistics */
2983 +/* Work in all pending writes. */
2984 +static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
2986 + struct bio_list *write = BL(stripe, p, WRITE);
2988 + if (!bio_list_empty(write)) {
2989 + struct page_list *pl = stripe->obj[p].pl;
2991 + struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
2994 + * We can play with the lists without holding a lock,
2995 + * because it is just us accessing them anyway.
2997 + bio_list_for_each(bio, write)
2998 + bio_copy_page_list(WRITE, stripe, pl, bio);
3000 + bio_list_merge(write_merged, write);
3001 + bio_list_init(write);
3002 + page_set(pl->page, DIRTY);
3006 +/* Merge in all writes hence dirtying respective pages. */
3007 +static INLINE void writes_merge(struct stripe *stripe)
3009 + unsigned p = RS(stripe->sc)->set.raid_devs;
3012 + _writes_merge(stripe, p);
3015 +/* Check, if a chunk gets completely overwritten. */
3016 +static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
3018 + unsigned sectors = 0;
3020 + struct bio_list *bl = BL(stripe, p, WRITE);
3022 + bio_list_for_each(bio, bl)
3023 + sectors += bio_sectors(bio);
3025 + return sectors == RS(stripe->sc)->set.io_size;
3029 + * Prepare stripe to avoid io on broken/reconstructed
3030 + * drive in order to reconstruct data on endio.
3032 +enum prepare_type { IO_ALLOW, IO_PROHIBIT };
3033 +static void stripe_prepare(struct stripe *stripe, unsigned p,
3034 + enum prepare_type type)
3036 + struct page *page = PAGE(stripe, p);
3041 + * In case we prohibit, we gotta make sure, that
3042 + * io on all other chunks than the one which failed
3043 + * or is being reconstructed is allowed and that it
3044 + * doesn't have state uptodate.
3046 + stripe_allow_io(stripe);
3047 + ClearPageUptodate(page);
3048 + ProhibitPageIO(page);
3050 + /* REMOVEME: statistics. */
3051 + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
3052 + stripe->idx.recover = p;
3053 + SetStripeReconstruct(stripe);
3057 + AllowPageIO(page);
3058 + stripe->idx.recover = -1;
3059 + ClearStripeReconstruct(stripe);
3068 + * Degraded/reconstruction mode.
3070 + * Check stripe state to figure which chunks don't need IO.
3072 +static INLINE void stripe_check_reconstruct(struct stripe *stripe,
3075 + struct raid_set *rs = RS(stripe->sc);
3078 + * Degraded mode (device(s) failed) ->
3079 + * avoid io on the failed device.
3081 + if (unlikely(raid_set_degraded(rs))) {
3082 + /* REMOVEME: statistics. */
3083 + atomic_inc(rs->stats + S_DEGRADED);
3084 + stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
3088 + * Reconstruction mode (ie. a particular device or
3089 + * some (rotating) parity chunk is being resynchronized) ->
3090 + * o make sure all needed pages are read in
3091 + * o writes are allowed to go through
3093 + int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
3096 + /* REMOVEME: statistics. */
3097 + atomic_inc(rs->stats + S_NOSYNC);
3098 + stripe_prepare(stripe, dev_for_parity(stripe),
3105 + * All disks good. Avoid reading parity chunk and reconstruct it
3106 + * unless we have prohibited io to chunk(s).
3108 + if (!prohibited) {
3109 + if (StripeMerged(stripe))
3110 + stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
3112 + stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
3115 + * Overrule stripe_prepare to reconstruct the
3116 + * parity chunk, because it'll be created new anyway.
3118 + ClearStripeReconstruct(stripe);
3123 +/* Check, if stripe is ready to merge writes. */
3124 +static INLINE int stripe_check_merge(struct stripe *stripe)
3126 + struct raid_set *rs = RS(stripe->sc);
3127 + int prohibited = 0;
3128 + unsigned chunks = 0, p = rs->set.raid_devs;
3130 + /* Walk all chunks. */
3132 + struct page *page = PAGE(stripe, p);
3134 + /* Can't merge active chunks. */
3135 + if (PageLocked(page)) {
3136 + /* REMOVEME: statistics. */
3137 + atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
3141 + /* Can merge uptodate chunks and have to count parity chunk. */
3142 + if (PageUptodate(page) || p == stripe->idx.parity) {
3147 + /* Read before write ordering. */
3148 + if (RSCheckOverwrite(rs) &&
3149 + bio_list_empty(BL(stripe, p, READ))) {
3150 + int r = stripe_check_overwrite(stripe, p);
3154 + /* REMOVEME: statistics. */
3155 + atomic_inc(RS(stripe->sc)->stats +
3156 + S_PROHIBITPAGEIO);
3157 + ProhibitPageIO(page);
3163 + if (chunks == rs->set.raid_devs) {
3164 + /* All pages are uptodate or get written over or mixture. */
3165 + /* REMOVEME: statistics. */
3166 + atomic_inc(rs->stats + S_CAN_MERGE);
3169 + /* REMOVEME: statistics.*/
3170 + atomic_inc(rs->stats + S_CANT_MERGE);
3172 + return prohibited ? 1 : -EPERM;
3175 +/* Check, if stripe is ready to read. */
3176 +static INLINE int stripe_check_read(struct stripe *stripe)
3179 + unsigned p = RS(stripe->sc)->set.raid_devs;
3181 + /* Walk all chunks. */
3183 + struct page *page = PAGE(stripe, p);
3185 + if (!PageLocked(page) &&
3186 + bio_list_empty(BL(stripe, p, READ))) {
3187 + ProhibitPageIO(page);
3196 + * Read/write a stripe.
3198 + * All stripe read/write activity goes through this function.
3200 + * States to cover:
3201 + * o stripe to read and/or write
3202 + * o stripe with error to reconstruct
3204 +static int stripe_rw(struct stripe *stripe)
3206 + struct raid_set *rs = RS(stripe->sc);
3207 + int prohibited = 0, r;
3210 + * Check the state of the RAID set and if degraded (or
3211 + * resynchronizing for reads), read in all other chunks but
3212 + * the one on the dead/resynchronizing device in order to be
3213 + * able to reconstruct the missing one.
3215 + * Merge all writes hanging off uptodate pages of the stripe.
3218 + /* Initially allow io on all chunks and prohibit below, if necessary. */
3219 + stripe_allow_io(stripe);
3221 + if (StripeRBW(stripe)) {
3222 + r = stripe_check_merge(stripe);
3225 + * If I could rely on valid parity (which would only
3226 + * be sure in case of a full synchronization),
3227 + * I could xor a fraction of chunks out of
3228 + * parity and back in.
3230 + * For the time being, I got to redo parity...
3232 + /* parity_xor(stripe); */ /* Xor chunks out. */
3233 + stripe_zero_chunk(stripe, stripe->idx.parity);
3234 + writes_merge(stripe); /* Merge writes in. */
3235 + parity_xor(stripe); /* Update parity. */
3236 + ClearStripeRBW(stripe); /* Disable RBW. */
3237 + SetStripeMerged(stripe); /* Writes merged. */
3242 + } else if (!raid_set_degraded(rs))
3243 + /* Only allow for read avoidance if not degraded. */
3244 + prohibited = stripe_check_read(stripe);
3247 + * Check, if io needs to be allowed/prohibited on certain chunks
3248 + * because of a degraded set or reconstruction on a region.
3250 + stripe_check_reconstruct(stripe, prohibited);
3252 + /* Now submit any reads/writes. */
3253 + r = stripe_page_lists_rw(rs, stripe);
3256 + * No io submitted because of chunk io prohibited or
3257 + * locked pages -> push to end io list for processing.
3259 + atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
3260 + stripe_endio_push(stripe);
3261 + wake_do_raid(rs); /* Wake myself. */
3267 +/* Flush stripe either via flush list or immediately. */
3268 +enum flush_type { FLUSH_DELAY, FLUSH_NOW };
3269 +static int stripe_flush(struct stripe *stripe, enum flush_type type)
3273 + stripe_lru_del(stripe, LIST_LOCKED);
3275 + /* Immediately flush. */
3276 + if (type == FLUSH_NOW) {
3277 + if (likely(raid_set_operational(RS(stripe->sc))))
3278 + r = stripe_rw(stripe); /* Read/write stripe. */
3280 + /* Optimization: Fail early on failed sets. */
3281 + stripe_fail_io(stripe);
3282 + /* Delay flush by putting it on io list for later processing. */
3283 + } else if (type == FLUSH_DELAY)
3284 + stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
3292 + * Queue reads and writes to a stripe by hanging
3293 + * their bios off the stripe sets' read/write lists.
3295 + * Endio reads on uptodate chunks.
3297 +static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
3298 + struct bio_list *reject)
3301 + struct address addr;
3302 + struct stripe *stripe =
3303 + stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
3306 + int rr, rw = bio_data_dir(bio);
3308 + rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
3310 + stripe_put(stripe);
3314 + /* Distinguish read and write cases. */
3315 + bio_list_add(BL(stripe, addr.di, rw), bio);
3317 + /* REMOVEME: statistics */
3318 + atomic_inc(rs->stats + (rw == WRITE ?
3319 + S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
3322 + SetStripeRead(stripe);
3324 + SetStripeRBW(stripe);
3326 + /* Increment pending write count on region. */
3327 + dm_rh_inc(rs->recover.rh, stripe->region);
3328 + r = 1; /* Region hash needs a flush. */
3332 + * Optimize stripe flushing:
3334 + * o directly start io for read stripes.
3336 + * o put stripe onto stripe caches io_list for RBW,
3337 + * so that do_flush() can belabour it after we put
3338 + * more bios to the stripe for overwrite optimization.
3340 + stripe_flush(stripe,
3341 + StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
3343 + /* Got no stripe from cache -> reject bio. */
3346 + bio_list_add(reject, bio);
3347 + /* REMOVEME: statistics. */
3348 + atomic_inc(rs->stats + S_IOS_POST);
3355 + * Recovery functions
3357 +/* Read a stripe off a raid set for recovery. */
3358 +static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
3360 + /* Invalidate all pages so that they get read in. */
3361 + stripe_pages_invalidate(stripe);
3363 + /* Allow io on all recovery chunks. */
3364 + stripe_allow_io(stripe);
3367 + ProhibitPageIO(PAGE(stripe, idx));
3369 + stripe->key = rs->recover.pos;
3370 + return stripe_page_lists_rw(rs, stripe);
3373 +/* Write a stripe to a raid set for recovery. */
3374 +static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
3377 + * If this is a reconstruct of a particular device, then
3378 + * reconstruct the respective page(s), else create parity page(s).
3381 + struct page *page = PAGE(stripe, idx);
3383 + AllowPageIO(page);
3384 + stripe_zero_chunk(stripe, idx);
3385 + common_xor(stripe, stripe->io.size, 0, idx);
3386 + page_set(page, DIRTY);
3388 + parity_xor(stripe);
3390 + return stripe_page_lists_rw(rs, stripe);
3393 +/* Recover bandwidth available ?. */
3394 +static int recover_bandwidth(struct raid_set *rs)
3398 + /* On reset -> allow recovery. */
3399 + r = recover_io_reset(rs);
3400 + if (r || RSBandwidth(rs))
3403 + work = atomic_read(rs->recover.io_count + IO_WORK);
3405 + /* Pay attention to larger recover stripe size. */
3407 + atomic_read(rs->recover.io_count + IO_RECOVER) *
3408 + rs->recover.io_size /
3412 + * Don't use more than given bandwidth of
3413 + * the work io for recovery.
3415 + if (recover > work / rs->recover.bandwidth_work) {
3416 + /* REMOVEME: statistics. */
3417 + atomic_inc(rs->stats + S_NO_BANDWIDTH);
3423 + atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
3427 +/* Try to get a region to recover. */
3428 +static int recover_get_region(struct raid_set *rs)
3430 + struct recover *rec = &rs->recover;
3431 + struct dm_rh_client *rh = rec->rh;
3433 + /* Start quiescing some regions. */
3434 + if (!RSRegionGet(rs)) {
3435 + int r = recover_bandwidth(rs); /* Enough bandwidth ?. */
3438 + r = dm_rh_recovery_prepare(rh);
3440 + DMINFO("No %sregions to recover",
3441 + rec->nr_regions_to_recover ?
3448 + SetRSRegionGet(rs);
3452 + rec->reg = dm_rh_recovery_start(rh);
3455 + * A reference for the region I'll
3456 + * keep till I've completely synced it.
3459 + rec->pos = dm_rh_region_to_sector(rh,
3460 + dm_rh_get_region_key(rec->reg));
3461 + rec->end = rec->pos + dm_rh_get_region_size(rh);
3470 +/* Read/write a recovery stripe. */
3471 +static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
3473 + /* Read/write flip-flop. */
3474 + if (TestClearStripeRBW(stripe)) {
3475 + SetStripeRead(stripe);
3476 + return recover_read(rs, stripe, idx_get(rs));
3477 + } else if (TestClearStripeRead(stripe))
3478 + return recover_write(rs, stripe, idx_get(rs));
3483 +/* Reset recovery variables. */
3484 +static void recovery_region_reset(struct raid_set *rs)
3486 + rs->recover.reg = NULL;
3487 + ClearRSRegionGet(rs);
3490 +/* Update region hash state. */
3491 +static void recover_rh_update(struct raid_set *rs, int error)
3493 + struct recover *rec = &rs->recover;
3494 + struct dm_rh_client *rh = rec->rh;
3495 + struct dm_region *reg = rec->reg;
3498 + dm_rh_recovery_end(rh, reg, error);
3500 + rec->nr_regions_recovered++;
3502 + recovery_region_reset(rs);
3505 + dm_rh_update_states(rh, 1);
3507 + io_put(rs); /* Release the io reference for the region. */
3510 +/* Called by main io daemon to recover regions. */
3511 +/* FIXME: cope with MAX_RECOVER > 1. */
3512 +static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
3515 + struct recover *rec = &rs->recover;
3517 + /* If recovery is active -> return. */
3518 + if (StripeActive(stripe))
3521 + /* io error is fatal for recovery -> stop it. */
3522 + if (unlikely(StripeError(stripe)))
3525 + /* Get a region to recover. */
3526 + r = recover_get_region(rs);
3528 + case 1: /* Got a new region. */
3529 + /* Flag read before write. */
3530 + ClearStripeRead(stripe);
3531 + SetStripeRBW(stripe);
3535 + /* Got a region in the works. */
3536 + r = recover_bandwidth(rs);
3537 + if (r) /* Got enough bandwidth. */
3541 + /* No bandwidth/quiesced region yet, try later. */
3542 + wake_do_raid_delayed(rs, HZ / 10);
3545 + case -ENOENT: /* No more regions. */
3546 + dm_table_event(rs->ti->table);
3550 + /* Read/write a recover stripe. */
3551 + r = recover_stripe_rw(rs, stripe);
3553 + /* IO initiated, get another reference for the IO. */
3558 + /* Update recovery position within region. */
3559 + rec->pos += stripe->io.size;
3561 + /* If we're at end of region, update region hash. */
3562 + if (rec->pos >= rec->end ||
3563 + rec->pos >= rs->set.sectors_per_dev)
3564 + recover_rh_update(rs, 0);
3566 + SetStripeRBW(stripe);
3568 + /* Schedule myself for another round... */
3573 + raid_set_check_degrade(rs, stripe);
3576 + char buf[BDEVNAME_SIZE];
3578 + DMERR("stopping recovery due to "
3579 + "ERROR on /dev/%s, stripe at offset %llu",
3580 + bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3581 + (unsigned long long) stripe->key);
3585 + /* Make sure, that all quiesced regions get released. */
3588 + dm_rh_recovery_end(rec->rh, rec->reg, -EIO);
3590 + rec->reg = dm_rh_recovery_start(rec->rh);
3591 + } while (rec->reg);
3593 + recover_rh_update(rs, -EIO);
3595 + rs->set.dev_to_init = -1;
3597 + /* Check for jiffies overrun. */
3598 + rs->recover.end_jiffies = jiffies;
3599 + if (rs->recover.end_jiffies < rs->recover.start_jiffies)
3600 + rs->recover.end_jiffies = ~0;
3602 + ClearRSRecover(rs);
3605 +static INLINE void do_recovery(struct raid_set *rs)
3607 + struct stripe *stripe;
3609 + list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
3610 + _do_recovery(rs, stripe);
3612 + if (!RSRecover(rs))
3613 + stripe_recover_free(rs);
3617 + * END recovery functions
3620 +/* End io process all stripes handed in by endio() callback. */
3621 +static void do_endios(struct raid_set *rs)
3623 + struct stripe_cache *sc = &rs->sc;
3624 + struct stripe *stripe;
3626 + while ((stripe = stripe_endio_pop(sc))) {
3629 + /* Recovery stripe special case. */
3630 + if (unlikely(StripeRecover(stripe))) {
3631 + if (stripe_io(stripe))
3634 + io_put(rs); /* Release region io reference. */
3635 + ClearStripeActive(stripe);
3637 + /* REMOVEME: statistics*/
3638 + atomic_dec(&sc->active_stripes);
3642 + /* Early end io all reads on any uptodate chunks. */
3643 + stripe_endio(READ, stripe, (count = 0, &count));
3644 + if (stripe_io(stripe)) {
3645 + if (count) /* REMOVEME: statistics. */
3646 + atomic_inc(rs->stats + S_ACTIVE_READS);
3651 + /* Set stripe inactive after all io got processed. */
3652 + if (TestClearStripeActive(stripe))
3653 + atomic_dec(&sc->active_stripes);
3655 + /* Unlock stripe (for clustering). */
3656 + stripe_unlock(rs, stripe);
3659 + * If an io error on a stripe occurred and the RAID set
3660 + * is still operational, requeue the stripe for io.
3662 + if (TestClearStripeError(stripe)) {
3663 + raid_set_check_degrade(rs, stripe);
3664 + ClearStripeReconstruct(stripe);
3666 + if (!StripeMerged(stripe) &&
3667 + raid_set_operational(rs)) {
3668 + stripe_pages_invalidate(stripe);
3669 + stripe_flush(stripe, FLUSH_DELAY);
3670 + /* REMOVEME: statistics. */
3671 + atomic_inc(rs->stats + S_REQUEUE);
3676 + /* Check if the RAID set is inoperational to error ios. */
3677 + if (!raid_set_operational(rs)) {
3678 + ClearStripeReconstruct(stripe);
3679 + stripe_fail_io(stripe);
3680 + BUG_ON(atomic_read(&stripe->cnt));
3684 + /* Got to reconstruct a missing chunk. */
3685 + if (TestClearStripeReconstruct(stripe))
3686 + reconstruct_xor(stripe);
3689 + * Now that we've got a complete stripe, we can
3690 + * process the rest of the end ios on reads.
3692 + BUG_ON(stripe_endio(READ, stripe, NULL));
3693 + ClearStripeRead(stripe);
3696 + * Read-before-write stripes need to be flushed again in
3697 + * order to work the write data into the pages *after*
3698 + * they were read in.
3700 + if (TestClearStripeMerged(stripe))
3701 + /* End io all bios which got merged already. */
3702 + BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
3704 + /* Got to put on flush list because of new writes. */
3705 + if (StripeRBW(stripe))
3706 + stripe_flush(stripe, FLUSH_DELAY);
3711 + * Stripe cache shrinking.
3713 +static INLINE void do_sc_shrink(struct raid_set *rs)
3715 + unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
3718 + unsigned cur = atomic_read(&rs->sc.stripes);
3720 + sc_shrink(&rs->sc, shrink);
3721 + shrink -= cur - atomic_read(&rs->sc.stripes);
3722 + atomic_set(&rs->sc.stripes_to_shrink, shrink);
3725 + * Wake myself up in case we failed to shrink the
3726 + * requested amount in order to try again later.
3737 + * We do different things with the io depending on the
3738 + * state of the region that it's in:
3740 + * o reads: hang off stripe cache or postpone if full
3744 + * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3745 + * In case stripe cache is full or busy, postpone the io.
3747 + * RECOVERING: delay the io until recovery of the region completes.
3750 +static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
3753 + unsigned flush = 0;
3754 + struct dm_rh_client *rh = rs->recover.rh;
3756 + struct bio_list delay, reject;
3758 + bio_list_init(&delay);
3759 + bio_list_init(&reject);
3762 + * Classify each io:
3763 + * o delay to recovering regions
3764 + * o queue to all other regions
3766 + while ((bio = bio_list_pop(ios))) {
3768 + * In case we get a barrier bio, push it back onto
3769 + * the input queue unless all work queues are empty
3770 + * and the stripe cache is inactive.
3772 + if (unlikely(bio_barrier(bio))) {
3773 + /* REMOVEME: statistics. */
3774 + atomic_inc(rs->stats + S_BARRIER);
3775 + if (!list_empty(rs->sc.lists + LIST_IO) ||
3776 + !bio_list_empty(&delay) ||
3777 + !bio_list_empty(&reject) ||
3778 + sc_active(&rs->sc)) {
3779 + bio_list_push(ios, bio);
3784 + r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
3785 + if (unlikely(r)) {
3786 + /* Got to wait for recovering regions. */
3787 + bio_list_add(&delay, bio);
3788 + SetRSBandwidth(rs);
3791 + * Process ios to non-recovering regions by queueing
3792 + * them to stripes (does rh_inc()) for writes).
3794 + flush += stripe_queue_bio(rs, bio, &reject);
3799 + r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3801 + DMERR("dirty log flush");
3804 + /* Delay ios to regions which are recovering. */
3805 + while ((bio = bio_list_pop(&delay))) {
3806 + /* REMOVEME: statistics.*/
3807 + atomic_inc(rs->stats + S_DELAYED_BIOS);
3808 + atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3809 + dm_rh_delay_by_region(rh, bio,
3810 + dm_rh_sector_to_region(rh, _sector(rs, bio)));
3814 + /* Merge any rejected bios back to the head of the input list. */
3815 + bio_list_merge_head(ios, &reject);
3818 +/* Flush any stripes on the io list. */
3819 +static INLINE void do_flush(struct raid_set *rs)
3821 + struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
3823 + list_for_each_safe(pos, tmp, list) {
3824 + int r = stripe_flush(list_entry(pos, struct stripe,
3825 + lists[LIST_IO]), FLUSH_NOW);
3827 + /* Remove from the list only if the stripe got processed. */
3829 + list_del_init(pos);
3833 +/* Send an event in case we're getting too busy. */
3834 +static INLINE void do_busy_event(struct raid_set *rs)
3836 + if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
3837 + if (!TestSetRSScBusy(rs))
3838 + dm_table_event(rs->ti->table);
3840 + ClearRSScBusy(rs);
3843 +/* Unplug: let the io roll on the set's devices. */
3844 +static INLINE void do_unplug(struct raid_set *rs)
3846 + struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3848 + while (dev-- > rs->dev) {
3849 + /* Only call any device unplug function, if io got queued. */
3850 + if (io_dev_clear(dev))
3851 + blk_unplug(bdev_get_queue(dev->dev->bdev));
3855 +/*-----------------------------------------------------------------
3857 + *---------------------------------------------------------------*/
3859 + * o belabour all end ios
3860 + * o optionally shrink the stripe cache
3861 + * o update the region hash states
3862 + * o optionally do recovery
3863 + * o grab the input queue
3864 + * o work on all requeued or new ios and perform stripe cache flushes
3865 + * unless the RAID set is inoperational (when we error ios)
3866 + * o check, if the stripe cache gets too busy and throw an event if so
3867 + * o unplug any component raid devices with queued bios
3869 +static void do_raid(struct work_struct *ws)
3871 + struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
3872 + struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3873 + spinlock_t *lock = &rs->io.in_lock;
3876 + * We always need to end io, so that ios
3877 + * can get errored in case the set failed
3878 + * and the region counters get decremented
3879 + * before we update the region hash states.
3885 + * Now that we've end io'd, which may have put stripes on
3886 + * the LRU list, we shrink the stripe cache if requested.
3890 + /* Update region hash states before we go any further. */
3891 + dm_rh_update_states(rs->recover.rh, 1);
3893 + /* Try to recover regions. */
3894 + if (RSRecover(rs))
3897 + /* More endios -> process. */
3898 + if (!stripe_endio_empty(&rs->sc)) {
3899 + atomic_inc(rs->stats + S_REDO);
3903 + /* Quickly grab all new ios queued and add them to the work list. */
3904 + spin_lock_irq(lock);
3905 + bio_list_merge(ios, ios_in);
3906 + bio_list_init(ios_in);
3907 + spin_unlock_irq(lock);
3909 + /* Let's assume we're operational most of the time ;-). */
3910 + if (likely(raid_set_operational(rs))) {
3911 + /* If we got ios, work them into the cache. */
3912 + if (!bio_list_empty(ios)) {
3914 + do_unplug(rs); /* Unplug the sets device queues. */
3917 + do_flush(rs); /* Flush any stripes on io list. */
3918 + do_unplug(rs); /* Unplug the sets device queues. */
3919 + do_busy_event(rs); /* Check if we got too busy. */
3921 + /* More endios -> process. */
3922 + if (!stripe_endio_empty(&rs->sc)) {
3923 + atomic_inc(rs->stats + S_REDO);
3927 + /* No way to reconstruct data with too many devices failed. */
3928 + bio_list_fail(rs, NULL, ios);
3932 + * Callback for region hash to dispatch
3933 + * delayed bios queued to recovered regions
3934 + * (Gets called via rh_update_states()).
3936 +static void dispatch_delayed_bios(void *context, struct bio_list *bl, int dummy)
3938 + struct raid_set *rs = context;
3941 + /* REMOVEME: decrement pending delayed bios counter. */
3942 + bio_list_for_each(bio, bl)
3943 + atomic_dec(rs->stats + S_DELAYED_BIOS);
3945 + /* Merge region hash private list to work list. */
3946 + bio_list_merge_head(&rs->io.work, bl);
3947 + bio_list_init(bl);
3948 + ClearRSBandwidth(rs);
3951 +/*************************************************************
3952 + * Constructor helpers
3953 + *************************************************************/
3954 +/* Calculate MB/sec. */
3955 +static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
3957 + return to_bytes(speed * rs->set.data_devs *
3958 + rs->recover.io_size * HZ >> 10) >> 10;
3962 + * Discover fastest xor algorithm and # of chunks combination.
3964 +/* Calculate speed for algorithm and # of chunks. */
3965 +static INLINE unsigned xor_speed(struct stripe *stripe)
3970 + /* Wait for next tick. */
3971 + for (j = jiffies; j == jiffies;)
3974 + /* Do xors for a full tick. */
3975 + for (j = jiffies; j == jiffies;) {
3977 + common_xor(stripe, stripe->io.size, 0, 0);
3986 +/* Optimize xor algorithm for this RAID set. */
3987 +static unsigned xor_optimize(struct raid_set *rs)
3989 + unsigned chunks_max = 2, speed_max = 0;
3990 + struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3991 + struct stripe *stripe;
3993 + BUG_ON(list_empty(&rs->recover.stripes));
3994 + stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3995 + lists[LIST_RECOVER]);
3998 + * Got to allow io on all chunks, so that
3999 + * xor() will actually work on them.
4001 + stripe_allow_io(stripe);
4003 + /* Try all xor functions. */
4004 + while (f-- > xor_funcs) {
4007 + /* Set actual xor function for common_xor(). */
4009 + rs->xor.chunks = XOR_CHUNKS_MAX + 1;
4011 + while (rs->xor.chunks-- > 2) {
4012 + speed = xor_speed(stripe);
4013 + if (speed > speed_max) {
4014 + speed_max = speed;
4015 + chunks_max = rs->xor.chunks;
4021 + /* Memorize optimum parameters. */
4022 + rs->xor.f = f_max;
4023 + rs->xor.chunks = chunks_max;
4028 + * Allocate a RAID context (a RAID set)
4031 +context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
4032 + unsigned stripes, unsigned chunk_size, unsigned io_size,
4033 + unsigned recover_io_size, unsigned raid_devs,
4034 + sector_t sectors_per_dev,
4035 + struct dm_target *ti, unsigned dl_parms, char **argv)
4040 + sector_t region_size, ti_len;
4041 + struct raid_set *rs = NULL;
4042 + struct dm_dirty_log *dl;
4043 + struct recover *rec;
4046 + * Create the dirty log
4048 + * We need to change length for the dirty log constructor,
4049 + * because we want an amount of regions for all stripes derived
4050 + * from the single device size, so that we can keep region
4051 + * size = 2^^n independent of the number of devices
4054 + ti->len = sectors_per_dev;
4055 + dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
4058 + goto bad_dirty_log;
4060 + /* Chunk size *must* be smaller than region size. */
4061 + region_size = dl->type->get_region_size(dl);
4062 + if (chunk_size > region_size)
4063 + goto bad_chunk_size;
4065 + /* Recover io size *must* be smaller than region size as well. */
4066 + if (recover_io_size > region_size)
4067 + goto bad_recover_io_size;
4069 + /* Size and allocate the RAID set structure. */
4070 + len = sizeof(*rs->data) + sizeof(*rs->dev);
4071 + if (array_too_big(sizeof(*rs), len, raid_devs))
4074 + len = sizeof(*rs) + raid_devs * len;
4075 + rs = kzalloc(len, GFP_KERNEL);
4079 + rec = &rs->recover;
4080 + atomic_set(&rs->io.in_process, 0);
4081 + atomic_set(&rs->io.in_process_max, 0);
4082 + rec->io_size = recover_io_size;
4084 + /* Pointer to data array. */
4085 + rs->data = (unsigned long **)
4086 + ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
4088 + rs->set.raid_devs = p = raid_devs;
4089 + rs->set.data_devs = raid_devs - raid_type->parity_devs;
4090 + rs->set.raid_type = raid_type;
4093 + * Set chunk and io size and respective shifts
4094 + * (used to avoid divisions)
4096 + rs->set.chunk_size = chunk_size;
4097 + rs->set.chunk_mask = chunk_size - 1;
4098 + rs->set.chunk_shift = ffs(chunk_size) - 1;
4100 + rs->set.io_size = io_size;
4101 + rs->set.io_mask = io_size - 1;
4102 + rs->set.io_shift = ffs(io_size) - 1;
4103 + rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
4105 + rs->set.pages_per_io = chunk_pages(io_size);
4106 + rs->set.sectors_per_dev = sectors_per_dev;
4108 + rs->set.ei = -1; /* Indicate no failed device. */
4109 + atomic_set(&rs->set.failed_devs, 0);
4113 + atomic_set(rec->io_count + IO_WORK, 0);
4114 + atomic_set(rec->io_count + IO_RECOVER, 0);
4116 + /* Initialize io lock and queues. */
4117 + spin_lock_init(&rs->io.in_lock);
4118 + bio_list_init(&rs->io.in);
4119 + bio_list_init(&rs->io.work);
4121 + init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
4123 + rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
4124 + rec->rh = dm_rh_client_create(MAX_RECOVER, dispatch_delayed_bios, rs,
4125 + wake_do_raid, rs, dl, region_size,
4126 + rs->recover.nr_regions);
4127 + if (IS_ERR(rec->rh))
4130 + /* Initialize stripe cache. */
4131 + r = sc_init(rs, stripes);
4135 + /* Create dm-io client context. */
4136 + rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
4137 + rs->set.pages_per_io);
4138 + if (IS_ERR(rs->sc.dm_io_client))
4139 + goto bad_dm_io_client;
4141 + /* REMOVEME: statistics. */
4143 + ClearRSDevelStats(rs); /* Disable development status. */
4149 + TI_ERR_RET("Error creating dirty log", -ENOMEM);
4153 + dm_dirty_log_destroy(dl);
4154 + TI_ERR("Chunk size larger than region size");
4156 +bad_recover_io_size:
4157 + dm_dirty_log_destroy(dl);
4158 + TI_ERR("Recover stripe io size larger than region size");
4161 + dm_dirty_log_destroy(dl);
4162 + TI_ERR("Arry too big");
4165 + dm_dirty_log_destroy(dl);
4166 + TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
4169 + dm_dirty_log_destroy(dl);
4170 + ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
4174 + ti->error = DM_MSG_PREFIX "Error creating stripe cache";
4178 + ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
4180 + dm_rh_client_destroy(rec->rh);
4182 + dm_rh_client_destroy(rec->rh); /* Destroys dirty log as well. */
4188 +/* Free a RAID context (a RAID set). */
4190 +context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
4193 + dm_put_device(ti, rs->dev[r].dev);
4195 + dm_io_client_destroy(rs->sc.dm_io_client);
4197 + dm_rh_client_destroy(rs->recover.rh);
4198 + dm_dirty_log_destroy(rs->recover.dl);
4202 +/* Create work queue and initialize work. */
4203 +static int rs_workqueue_init(struct raid_set *rs)
4205 + struct dm_target *ti = rs->ti;
4207 + rs->io.wq = create_singlethread_workqueue(DAEMON);
4209 + TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
4211 + INIT_DELAYED_WORK(&rs->io.dws, do_raid);
4215 +/* Return pointer to raid_type structure for raid name. */
4216 +static struct raid_type *get_raid_type(char *name)
4218 + struct raid_type *r = ARRAY_END(raid_types);
4220 + while (r-- > raid_types) {
4221 + if (!strnicmp(STR_LEN(r->name, name)))
4228 +/* FIXME: factor out to dm core. */
4229 +static int multiple(sector_t a, sector_t b, sector_t *n)
4235 + return a == r * b;
4238 +/* Log RAID set information to kernel log. */
4239 +static void raid_set_log(struct raid_set *rs, unsigned speed)
4242 + char buf[BDEVNAME_SIZE];
4244 + for (p = 0; p < rs->set.raid_devs; p++)
4245 + DMINFO("/dev/%s is raid disk %u",
4246 + bdevname(rs->dev[p].dev->bdev, buf), p);
4248 + DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
4249 + rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
4250 + atomic_read(&rs->sc.stripes));
4251 + DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
4252 + rs->xor.chunks, mbpers(rs, speed));
4253 + DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
4254 + rs->set.data_devs, rs->set.raid_devs);
4257 +/* Get all devices and offsets. */
4259 +dev_parms(struct dm_target *ti, struct raid_set *rs,
4260 + char **argv, int *p)
4262 + for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
4264 + unsigned long long tmp;
4265 + struct raid_dev *dev = rs->dev + *p;
4266 + union dev_lookup dl = {.dev = dev };
4268 + /* Get offset and device. */
4269 + r = sscanf(argv[1], "%llu", &tmp);
4271 + TI_ERR("Invalid RAID device offset parameter");
4274 + r = dm_get_device(ti, argv[0], dev->start,
4275 + rs->set.sectors_per_dev,
4276 + dm_table_get_mode(ti->table), &dev->dev);
4278 + TI_ERR_RET("RAID device lookup failure", r);
4280 + r = raid_dev_lookup(rs, bynumber, &dl);
4281 + if (r != -ENODEV && r < *p) {
4282 + (*p)++; /* Ensure dm_put_device() on actual device. */
4283 + TI_ERR_RET("Duplicate RAID device", -ENXIO);
4290 +/* Set recovery bandwidth. */
4292 +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
4294 + rs->recover.bandwidth = bandwidth;
4295 + rs->recover.bandwidth_work = 100 / bandwidth;
4298 +/* Handle variable number of RAID parameters. */
4300 +raid_variable_parms(struct dm_target *ti, char **argv,
4301 + unsigned i, int *raid_parms,
4302 + int *chunk_size, int *chunk_size_parm,
4303 + int *stripes, int *stripes_parm,
4304 + int *io_size, int *io_size_parm,
4305 + int *recover_io_size, int *recover_io_size_parm,
4306 + int *bandwidth, int *bandwidth_parm)
4308 + /* Fetch # of variable raid parameters. */
4309 + if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
4310 + !range_ok(*raid_parms, 0, 5))
4311 + TI_ERR("Bad variable raid parameters number");
4313 + if (*raid_parms) {
4315 + * If we've got variable RAID parameters,
4316 + * chunk size is the first one
4318 + if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
4319 + (*chunk_size != -1 &&
4320 + (!POWER_OF_2(*chunk_size) ||
4321 + !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
4322 + TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
4324 + *chunk_size_parm = *chunk_size;
4325 + if (*chunk_size == -1)
4326 + *chunk_size = CHUNK_SIZE;
4329 + * In case we've got 2 or more variable raid
4330 + * parameters, the number of stripes is the second one
4332 + if (*raid_parms > 1) {
4333 + if (sscanf(argv[i++], "%d", stripes) != 1 ||
4334 + (*stripes != -1 &&
4335 + !range_ok(*stripes, STRIPES_MIN,
4337 + TI_ERR("Invalid number of stripes: must "
4338 + "be >= 8 and <= 8192");
4341 + *stripes_parm = *stripes;
4342 + if (*stripes == -1)
4343 + *stripes = STRIPES;
4346 + * In case we've got 3 or more variable raid
4347 + * parameters, the io size is the third one.
4349 + if (*raid_parms > 2) {
4350 + if (sscanf(argv[i++], "%d", io_size) != 1 ||
4351 + (*io_size != -1 &&
4352 + (!POWER_OF_2(*io_size) ||
4353 + !range_ok(*io_size, IO_SIZE_MIN,
4354 + min(BIO_MAX_SECTORS / 2,
4356 + TI_ERR("Invalid io size; must "
4357 + "be 2^^n and less equal "
4358 + "min(BIO_MAX_SECTORS/2, chunk size)");
4360 + *io_size = *chunk_size;
4362 + *io_size_parm = *io_size;
4363 + if (*io_size == -1)
4364 + *io_size = *chunk_size;
4367 + * In case we've got 4 variable raid parameters,
4368 + * the recovery stripe io_size is the fourth one
4370 + if (*raid_parms > 3) {
4371 + if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
4372 + (*recover_io_size != -1 &&
4373 + (!POWER_OF_2(*recover_io_size) ||
4374 + !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
4375 + BIO_MAX_SECTORS / 2))))
4376 + TI_ERR("Invalid recovery io size; must be "
4377 + "2^^n and less equal BIO_MAX_SECTORS/2");
4380 + *recover_io_size_parm = *recover_io_size;
4381 + if (*recover_io_size == -1)
4382 + *recover_io_size = RECOVER_IO_SIZE;
4385 + * In case we've got 5 variable raid parameters,
4386 + * the recovery io bandwidth is the fifth one
4388 + if (*raid_parms > 4) {
4389 + if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
4390 + (*bandwidth != -1 &&
4391 + !range_ok(*bandwidth, BANDWIDTH_MIN,
4393 + TI_ERR("Invalid recovery bandwidth "
4394 + "percentage; must be > 0 and <= 100");
4397 + *bandwidth_parm = *bandwidth;
4398 + if (*bandwidth == -1)
4399 + *bandwidth = BANDWIDTH;
4405 +/* Parse optional locking parameters. */
4407 +raid_locking_parms(struct dm_target *ti, char **argv,
4408 + unsigned i, int *locking_parms,
4409 + struct dm_raid45_locking_type **locking_type)
4411 + *locking_parms = 0;
4412 + *locking_type = &locking_none;
4414 + if (!strnicmp(argv[i], "none", strlen(argv[i])))
4415 + *locking_parms = 1;
4416 + else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
4417 + *locking_type = &locking_none;
4418 + *locking_parms = 2;
4419 + } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
4420 + *locking_type = &locking_cluster;
4421 + /* FIXME: namespace. */
4422 + *locking_parms = 3;
4425 + return *locking_parms == 1 ? -EINVAL : 0;
4428 +/* Set backing device information properties of RAID set. */
4429 +static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
4431 + unsigned p, ra_pages;
4432 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4433 + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
4435 + /* Set read-ahead for the RAID set and the component devices. */
4436 + bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
4437 + ra_pages = chunks * chunk_pages(rs->set.io_size);
4438 + for (p = rs->set.raid_devs; p--; ) {
4439 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
4441 + q->backing_dev_info.ra_pages = ra_pages;
4444 + /* Set congested function and data. */
4445 + bdi->congested_fn = raid_set_congested;
4446 + bdi->congested_data = rs;
4451 +/* Get backing device information properties of RAID set. */
4452 +static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
4454 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4456 + *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
4457 + / stripe_pages(rs, rs->set.io_size);
4458 + *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
4459 + / chunk_pages(rs->set.io_size);
4465 + * Construct a RAID4/5 mapping:
4467 + * log_type #log_params <log_params> \
4468 + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
4469 + * [locking "none"/"cluster"]
4470 + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
4472 + * log_type = "core"/"disk",
4473 + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
4474 + * log_params = [dirty_log_path] region_size [[no]sync])
4476 + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
4478 + * #parity_dev = N if raid_type = "raid4"
4479 + * o N = -1: pick default = last device
4480 + * o N >= 0 and < #raid_devs: parity device index
4482 + * #raid_variable_params = 0-5; raid_params (-1 = default):
4483 + * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
4484 + * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
4485 + * and <= CHUNK_SIZE_MAX)
4486 + * o #stripes is number of stripes allocated to stripe cache
4487 + * (must be > 1 and < STRIPES_MAX)
4488 + * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
4489 + * o recover_io_size (io unit size per device for recovery in sectors;
4490 + must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4491 + * o %recovery_bandwidth is the maximum amount spent for recovery during
4492 + * application io (1-100%)
4493 + * If raid_variable_params = 0, defaults will be used.
4494 + * Any raid_variable_param can be set to -1 to apply a default
4496 + * #raid_devs = N (N >= 3)
4498 + * #dev_to_initialize = N
4499 + * -1: initialize parity on all devices
4500 + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
4501 + * of a failed devices content after replacement
4503 + * <dev_path> = device_path (eg, /dev/sdd1)
4504 + * <offset> = begin at offset on <dev_path>
4507 +#define MIN_PARMS 13
4508 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4510 + int bandwidth = BANDWIDTH, bandwidth_parm = -1,
4511 + chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
4512 + dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
4513 + i, io_size = IO_SIZE, io_size_parm = -1,
4514 + r, raid_devs, raid_parms,
4515 + recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
4516 + stripes = STRIPES, stripes_parm = -1;
4518 + sector_t tmp, sectors_per_dev;
4519 + struct dm_raid45_locking_type *locking;
4520 + struct raid_set *rs;
4521 + struct raid_type *raid_type;
4523 + /* Ensure minimum number of parameters. */
4524 + if (argc < MIN_PARMS)
4525 + TI_ERR("Not enough parameters");
4527 + /* Fetch # of dirty log parameters. */
4528 + if (sscanf(argv[1], "%d", &dl_parms) != 1
4529 + || !range_ok(dl_parms, 1, 4711))
4530 + TI_ERR("Bad dirty log parameters number");
4532 + /* Check raid_type. */
4533 + raid_type = get_raid_type(argv[dl_parms + 2]);
4535 + TI_ERR("Bad raid type");
4537 + /* In case of RAID4, parity drive is selectable. */
4538 + parity_parm = !!(raid_type->level == raid4);
4540 + /* Handle variable number of RAID parameters. */
4541 + r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
4543 + &chunk_size, &chunk_size_parm,
4544 + &stripes, &stripes_parm,
4545 + &io_size, &io_size_parm,
4546 + &recover_io_size, &recover_io_size_parm,
4547 + &bandwidth, &bandwidth_parm);
4551 + r = raid_locking_parms(ti, argv,
4552 + dl_parms + parity_parm + raid_parms + 4,
4553 + &locking_parms, &locking);
4557 + /* # of raid devices. */
4558 + i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
4559 + if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4560 + raid_devs < raid_type->minimal_devs)
4561 + TI_ERR("Invalid number of raid devices");
4563 + /* In case of RAID4, check parity drive index is in limits. */
4564 + if (raid_type->level == raid4) {
4565 + /* Fetch index of parity device. */
4566 + if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4567 + !range_ok(pi, 0, raid_devs - 1))
4568 + TI_ERR("Invalid RAID4 parity device index");
4572 + * Index of device to initialize starts at 0
4574 + * o -1 -> don't initialize a particular device,
4575 + * o 0..raid_devs-1 -> initialize respective device
4576 + * (used for reconstruction of a replaced device)
4579 + (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
4580 + "%d", &dev_to_init) != 1
4581 + || !range_ok(dev_to_init, -1, raid_devs - 1))
4582 + TI_ERR("Invalid number for raid device to initialize");
4584 + /* Check # of raid device arguments. */
4585 + if (argc - dl_parms - parity_parm - raid_parms - 6 !=
4587 + TI_ERR("Wrong number of raid device/offset arguments");
4590 + * Check that the table length is divisible
4591 + * w/o rest by (raid_devs - parity_devs)
4593 + if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4594 + §ors_per_dev))
4596 + ("Target length not divisable by number of data devices");
4599 + * Check that the device size is
4600 + * divisible w/o rest by chunk size
4602 + if (!multiple(sectors_per_dev, chunk_size, &tmp))
4603 + TI_ERR("Device length not divisable by chunk_size");
4605 + /****************************************************************
4606 + * Now that we checked the constructor arguments ->
4607 + * let's allocate the RAID set
4608 + ****************************************************************/
4609 + r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
4610 + recover_io_size, raid_devs, sectors_per_dev,
4611 + ti, dl_parms, argv);
4616 + * Set these here in order to avoid passing
4617 + * too many arguments to context_alloc()
4619 + rs->set.dev_to_init_parm = dev_to_init;
4620 + rs->set.dev_to_init = dev_to_init;
4621 + rs->set.pi_parm = pi;
4622 + rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4623 + rs->set.raid_parms = raid_parms;
4624 + rs->set.chunk_size_parm = chunk_size_parm;
4625 + rs->set.io_size_parm = io_size_parm;
4626 + rs->sc.stripes_parm = stripes_parm;
4627 + rs->recover.io_size_parm = recover_io_size_parm;
4628 + rs->recover.bandwidth_parm = bandwidth_parm;
4629 + recover_set_bandwidth(rs, bandwidth);
4631 + /* Use locking type to lock stripe access. */
4632 + rs->locking = locking;
4634 + /* Get the device/offset tupels. */
4635 + argv += dl_parms + 6 + parity_parm + raid_parms;
4636 + r = dev_parms(ti, rs, argv, &i);
4640 + /* Initialize recovery. */
4641 + rs->recover.start_jiffies = jiffies;
4642 + rs->recover.end_jiffies = 0;
4643 + recovery_region_reset(rs);
4645 + /* Allow for recovery of any nosync regions. */
4648 + /* Set backing device information (eg. read ahead). */
4649 + rs_set_bdi(rs, chunk_size * 2, io_size * 4);
4650 + SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4652 + speed = xor_optimize(rs); /* Select best xor algorithm. */
4654 + /* Initialize work queue to handle this RAID set's io. */
4655 + r = rs_workqueue_init(rs);
4659 + raid_set_log(rs, speed); /* Log information about RAID set. */
4662 + * Make sure that dm core only hands maximum io size
4663 + * length down and pays attention to io boundaries.
4665 + ti->split_io = rs->set.io_size;
4670 + context_free(rs, ti, i);
4675 + * Destruct a raid mapping
4677 +static void raid_dtr(struct dm_target *ti)
4679 + struct raid_set *rs = ti->private;
4681 + /* Indicate recovery end so that ios in flight drain. */
4682 + ClearRSRecover(rs);
4684 + wake_do_raid(rs); /* Wake daemon. */
4685 + wait_ios(rs); /* Wait for any io still being processed. */
4686 + destroy_workqueue(rs->io.wq);
4687 + context_free(rs, ti, rs->set.raid_devs);
4690 +/* Queues ios to RAID sets. */
4691 +static inline void queue_bio(struct raid_set *rs, struct bio *bio)
4694 + struct bio_list *in = &rs->io.in;
4695 + spinlock_t *in_lock = &rs->io.in_lock;
4697 + spin_lock_irq(in_lock);
4698 + wake = bio_list_empty(in);
4699 + bio_list_add(in, bio);
4700 + spin_unlock_irq(in_lock);
4702 + /* Wake daemon if input list was empty. */
4707 +/* Raid mapping function. */
4708 +static int raid_map(struct dm_target *ti, struct bio *bio,
4709 + union map_info *map_context)
4711 + /* I don't want to waste stripe cache capacity. */
4712 + if (bio_rw(bio) == READA)
4715 + struct raid_set *rs = ti->private;
4717 + /* REMOVEME: statistics. */
4718 + atomic_inc(rs->stats +
4719 + (bio_data_dir(bio) == WRITE ?
4720 + S_BIOS_WRITE : S_BIOS_READ));
4723 + * Get io reference to be waiting for to drop
4724 + * to zero on device suspension/destruction.
4727 + bio->bi_sector -= ti->begin; /* Remap sector. */
4728 + queue_bio(rs, bio); /* Queue to the daemon. */
4729 + return DM_MAPIO_SUBMITTED; /* Handle later. */
4733 +/* Device suspend. */
4734 +static void raid_postsuspend(struct dm_target *ti)
4736 + struct raid_set *rs = ti->private;
4737 + struct dm_dirty_log *dl = rs->recover.dl;
4739 + SetRSSuspended(rs);
4741 + if (RSRecover(rs))
4742 + dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
4746 + wait_ios(rs); /* Wait for completion of all ios being processed. */
4747 + if (dl->type->postsuspend && dl->type->postsuspend(dl))
4748 + /* Suspend dirty log. */
4749 + /* FIXME: need better error handling. */
4750 + DMWARN("log suspend failed");
4753 +/* Device resume. */
4754 +static void raid_resume(struct dm_target *ti)
4756 + struct raid_set *rs = ti->private;
4757 + struct recover *rec = &rs->recover;
4758 + struct dm_dirty_log *dl = rec->dl;
4760 + if (dl->type->resume && dl->type->resume(dl))
4761 + /* Resume dirty log. */
4762 + /* FIXME: need better error handling. */
4763 + DMWARN("log resume failed");
4765 + rec->nr_regions_to_recover =
4766 + rec->nr_regions - dl->type->get_sync_count(dl);
4768 + ClearRSSuspended(rs);
4770 + /* Reset any unfinished recovery. */
4771 + if (RSRecover(rs)) {
4772 + recovery_region_reset(rs);
4773 + dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
4778 +static INLINE unsigned sc_size(struct raid_set *rs)
4780 + return to_sector(atomic_read(&rs->sc.stripes) *
4781 + (sizeof(struct stripe) +
4782 + (sizeof(struct stripe_set) +
4783 + (sizeof(struct page_list) +
4784 + to_bytes(rs->set.io_size) *
4785 + rs->set.raid_devs)) +
4787 + end_jiffies ? 0 : to_bytes(rs->set.raid_devs *
4792 +/* REMOVEME: status output for development. */
4794 +raid_devel_stats(struct dm_target *ti, char *result,
4795 + unsigned *size, unsigned maxlen)
4797 + unsigned chunks, stripes, sz = *size;
4799 + char buf[BDEVNAME_SIZE], *p;
4800 + struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
4801 + struct raid_set *rs = ti->private;
4802 + struct recover *rec = &rs->recover;
4803 + struct timespec ts;
4805 + DMEMIT("%s ", version);
4806 + DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
4807 + DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
4809 + for (sm = stats_map; sm < sm_end; sm++)
4810 + DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4812 + DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
4813 + DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
4814 + atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
4817 + j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4818 + rec->start_jiffies;
4819 + jiffies_to_timespec(j, &ts);
4820 + sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4821 + p = strchr(buf, '.');
4824 + DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
4825 + (unsigned long long) rec->nr_regions_recovered,
4826 + RSRegionGet(rs) ? "+" : "",
4827 + (unsigned long long) rec->nr_regions_to_recover,
4828 + (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4830 + rs_get_ra(rs, &stripes, &chunks);
4831 + DMEMIT("ra=%u/%u ", stripes, chunks);
4837 +raid_status(struct dm_target *ti, status_type_t type,
4838 + char *result, unsigned maxlen)
4840 + unsigned i, sz = 0;
4841 + char buf[BDEVNAME_SIZE];
4842 + struct raid_set *rs = ti->private;
4845 + case STATUSTYPE_INFO:
4846 + /* REMOVEME: statistics. */
4847 + if (RSDevelStats(rs))
4848 + raid_devel_stats(ti, result, &sz, maxlen);
4850 + DMEMIT("%u ", rs->set.raid_devs);
4852 + for (i = 0; i < rs->set.raid_devs; i++)
4854 + format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
4857 + for (i = 0; i < rs->set.raid_devs; i++) {
4858 + DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
4860 + if (rs->set.raid_type->level == raid4 &&
4864 + if (rs->set.dev_to_init == i)
4870 + case STATUSTYPE_TABLE:
4871 + sz = rs->recover.dl->type->status(rs->recover.dl, type,
4873 + DMEMIT("%s %u ", rs->set.raid_type->name,
4874 + rs->set.raid_parms);
4876 + if (rs->set.raid_type->level == raid4)
4877 + DMEMIT("%d ", rs->set.pi_parm);
4879 + if (rs->set.raid_parms)
4880 + DMEMIT("%d ", rs->set.chunk_size_parm);
4882 + if (rs->set.raid_parms > 1)
4883 + DMEMIT("%d ", rs->sc.stripes_parm);
4885 + if (rs->set.raid_parms > 2)
4886 + DMEMIT("%d ", rs->set.io_size_parm);
4888 + if (rs->set.raid_parms > 3)
4889 + DMEMIT("%d ", rs->recover.io_size_parm);
4891 + if (rs->set.raid_parms > 4)
4892 + DMEMIT("%d ", rs->recover.bandwidth_parm);
4894 + DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4896 + for (i = 0; i < rs->set.raid_devs; i++)
4897 + DMEMIT("%s %llu ",
4899 + rs->dev[i].dev->bdev->bd_dev),
4900 + (unsigned long long) rs->dev[i].start);
4907 + * Message interface
4909 +enum raid_msg_actions {
4910 + act_bw, /* Recovery bandwidth switch. */
4911 + act_dev, /* Device failure switch. */
4912 + act_overwrite, /* Stripe overwrite check. */
4913 + act_read_ahead, /* Set read ahead. */
4914 + act_stats, /* Development statistics switch. */
4915 + act_sc, /* Stripe cache switch. */
4917 + act_on, /* Set entity on. */
4918 + act_off, /* Set entity off. */
4919 + act_reset, /* Reset entity. */
4921 + act_set = act_on, /* Set # absolute. */
4922 + act_grow = act_off, /* Grow # by an amount. */
4923 + act_shrink = act_reset, /* Shrink # by an amount. */
4926 +/* Turn a delta to absolute. */
4927 +static int _absolute(unsigned long action, int act, int r)
4929 + /* Make delta absolute. */
4930 + if (test_bit(act_set, &action))
4932 + else if (test_bit(act_grow, &action))
4934 + else if (test_bit(act_shrink, &action))
4942 + /* Change recovery io bandwidth. */
4943 +static int bandwidth_change(struct dm_msg *msg, void *context)
4945 + struct raid_set *rs = context;
4946 + int act = rs->recover.bandwidth;
4947 + int bandwidth = DM_MSG_INT_ARG(msg);
4949 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4950 + /* Make delta bandwidth absolute. */
4951 + bandwidth = _absolute(msg->action, act, bandwidth);
4953 + /* Check range. */
4954 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4955 + recover_set_bandwidth(rs, bandwidth);
4960 + set_bit(dm_msg_ret_arg, &msg->ret);
4961 + set_bit(dm_msg_ret_inval, &msg->ret);
4965 +/* Change state of a device (running/offline). */
4966 +/* FIXME: this only works while recovering!. */
4967 +static int device_state(struct dm_msg *msg, void *context)
4970 + const char *str = "is already ";
4971 + union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
4972 + struct raid_set *rs = context;
4974 + r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
4975 + bymajmin : byname, &dl);
4976 + if (r == -ENODEV) {
4977 + DMERR("device %s is no member of this set", dl.dev_name);
4981 + if (test_bit(act_off, &msg->action)) {
4982 + if (dev_operational(rs, r))
4984 + } else if (!dev_operational(rs, r))
4987 + DMINFO("/dev/%s %s%s", dl.dev_name, str,
4988 + test_bit(act_off, &msg->action) ? "offline" : "running");
4990 + return test_bit(act_off, &msg->action) ?
4991 + raid_set_check_and_degrade(rs, NULL, r) :
4992 + raid_set_check_and_upgrade(rs, r);
4995 +/* Set/reset development feature flags. */
4996 +static int devel_flags(struct dm_msg *msg, void *context)
4998 + struct raid_set *rs = context;
5000 + if (test_bit(act_on, &msg->action))
5001 + return test_and_set_bit(msg->spec->parm,
5002 + &rs->io.flags) ? -EPERM : 0;
5003 + else if (test_bit(act_off, &msg->action))
5004 + return test_and_clear_bit(msg->spec->parm,
5005 + &rs->io.flags) ? 0 : -EPERM;
5006 + else if (test_bit(act_reset, &msg->action)) {
5007 + if (test_bit(act_stats, &msg->action)) {
5010 + } else if (test_bit(act_overwrite, &msg->action)) {
5012 + set_bit(msg->spec->parm, &rs->io.flags);
5020 + /* Set stripe and chunk read ahead pages. */
5021 +static int read_ahead_set(struct dm_msg *msg, void *context)
5023 + int stripes = DM_MSG_INT_ARGS(msg, 0);
5024 + int chunks = DM_MSG_INT_ARGS(msg, 1);
5026 + if (range_ok(stripes, 1, 512) &&
5027 + range_ok(chunks, 1, 512)) {
5028 + rs_set_bdi(context, stripes, chunks);
5032 + set_bit(dm_msg_ret_arg, &msg->ret);
5033 + set_bit(dm_msg_ret_inval, &msg->ret);
5037 +/* Resize the stripe cache. */
5038 +static int stripecache_resize(struct dm_msg *msg, void *context)
5041 + struct raid_set *rs = context;
5043 + /* Deny permission in case the daemon is still shrinking!. */
5044 + if (atomic_read(&rs->sc.stripes_to_shrink))
5047 + stripes = DM_MSG_INT_ARG(msg);
5048 + if (stripes > 0) {
5049 + act = atomic_read(&rs->sc.stripes);
5051 + /* Make delta stripes absolute. */
5052 + stripes = _absolute(msg->action, act, stripes);
5055 + * Check range and that the # of stripes changes.
5056 + * We can grow from here but need to leave any
5057 + * shrinking to the worker for synchronization.
5059 + if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
5060 + if (stripes > act)
5061 + return sc_grow(&rs->sc, stripes - act, SC_GROW);
5062 + else if (stripes < act) {
5063 + atomic_set(&rs->sc.stripes_to_shrink,
5072 + set_bit(dm_msg_ret_arg, &msg->ret);
5073 + set_bit(dm_msg_ret_inval, &msg->ret);
5077 +/* Parse the RAID message action. */
5079 + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
5080 + * 'de[vice] o[ffline]/r[unning] DevName/maj:min' # e.g. 'device o /dev/sda'
5081 + * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
5082 + * 'r[ead_ahead] set #stripes #chunks' # e.g. 'r se 3 2'
5083 + * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
5084 + * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
5088 +raid_message(struct dm_target *ti, unsigned argc, char **argv)
5090 + /* Variables to store the parsed parameters in. */
5092 + static unsigned long *i_arg[] = {
5093 + (unsigned long *) i + 0,
5094 + (unsigned long *) i + 1,
5097 + static unsigned long *p_arg[] = { (unsigned long *) &p };
5099 + /* Declare all message option strings. */
5100 + static char *str_sgs[] = { "set", "grow", "shrink" };
5101 + static char *str_dev[] = { "running", "offline" };
5102 + static char *str_oor[] = { "on", "off", "reset" };
5104 + /* Declare all actions. */
5105 + static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
5106 + static unsigned long act_oor[] = { act_on, act_off, act_reset };
5108 + /* Bandwidth option. */
5109 + static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
5110 + static struct dm_message_argument bw_args = {
5111 + 1, i_arg, { dm_msg_int_t }
5114 + /* Device option. */
5115 + static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
5116 + static struct dm_message_argument dev_args = {
5117 + 1, p_arg, { dm_msg_base_t }
5120 + /* Read ahead option. */
5121 + static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
5122 + static struct dm_message_argument ra_args = {
5123 + 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
5126 + static struct dm_message_argument null_args = {
5127 + 0, NULL, { dm_msg_int_t }
5130 + /* Overwrite and statistics option. */
5131 + static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
5133 + /* Stripecache option. */
5134 + static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
5136 + /* Declare messages. */
5137 + static struct dm_msg_spec specs[] = {
5138 + { "bandwidth", act_bw, &bw_opt, &bw_args,
5139 + 0, bandwidth_change },
5140 + { "device", act_dev, &dev_opt, &dev_args,
5141 + 0, device_state },
5142 + { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
5143 + RS_CHECK_OVERWRITE, devel_flags },
5144 + { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
5145 + 0, read_ahead_set },
5146 + { "statistics", act_stats, &ovr_stats_opt, &null_args,
5147 + RS_DEVEL_STATS, devel_flags },
5148 + { "stripecache", act_sc, &stripe_opt, &bw_args,
5149 + 0, stripecache_resize },
5152 + /* The message for the parser. */
5153 + struct dm_msg msg = {
5154 + .num_specs = ARRAY_SIZE(specs),
5158 + return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
5161 + * END message interface
5164 +static struct target_type raid_target = {
5166 + .version = {1, 0, 0},
5167 + .module = THIS_MODULE,
5171 + .postsuspend = raid_postsuspend,
5172 + .resume = raid_resume,
5173 + .status = raid_status,
5174 + .message = raid_message,
5177 +static void init_exit(const char *bad_msg, const char *good_msg, int r)
5180 + DMERR("Failed to %sregister target [%d]", bad_msg, r);
5182 + DMINFO("%s %s", good_msg, version);
5185 +static int __init dm_raid_init(void)
5189 + r = dm_register_target(&raid_target);
5190 + init_exit("", "initialized", r);
5194 +static void __exit dm_raid_exit(void)
5198 + r = dm_unregister_target(&raid_target);
5199 + init_exit("un", "exit", r);
5202 +/* Module hooks. */
5203 +module_init(dm_raid_init);
5204 +module_exit(dm_raid_exit);
5206 +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
5207 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
5208 +MODULE_LICENSE("GPL");
5210 +++ b/drivers/md/dm-raid45.h
5213 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
5215 + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
5217 + * Locking definitions for the device-mapper RAID45 target.
5219 + * This file is released under the GPL.
5223 +#ifndef _DM_RAID45_H
5224 +#define _DM_RAID45_H
5226 +/* Factor out to dm.h! */
5227 +#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr))
5229 +enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
5231 +struct dm_raid45_locking_type {
5232 + /* Request a lock on a stripe. */
5233 + void* (*lock)(sector_t key, enum dm_lock_type type);
5235 + /* Release a lock on a stripe. */
5236 + void (*unlock)(void *lock_handle);
5241 +++ b/drivers/md/dm-regions.c
5244 + * Copyright (C) 2003 Sistina Software Limited.
5245 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5247 + * This file is released under the GPL.
5250 +#include <linux/dm-dirty-log.h>
5251 +#include <linux/dm-regions.h>
5253 +#include <linux/ctype.h>
5254 +#include <linux/init.h>
5255 +#include <linux/module.h>
5256 +#include <linux/vmalloc.h>
5259 +#include "dm-bio-list.h"
5261 +#define DM_MSG_PREFIX "region hash"
5263 +/*-----------------------------------------------------------------
5266 + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions.
5267 + * Each region can be in one of three states:
5273 + * There is no need to put clean regions in the hash.
5276 + * In addition to being present in the hash table a region _may_
5277 + * be present on one of three lists.
5279 + * clean_regions: Regions on this list have no io pending to
5280 + * them, they are in sync, we are no longer interested in them,
5281 + * they are dull. dm_rh_update_states() will remove them from the
5284 + * quiesced_regions: These regions have been spun down, ready
5285 + * for recovery. dm_rh_recovery_start() will remove regions from
5286 + * this list and hand them to the caller, which will schedule the
5289 + * recovered_regions: Regions that the caller has successfully
5290 + * recovered. dm_rh_update_states() will now schedule any delayed
5291 + * io, up the recovery_count, and remove the region from the hash.
5293 + * There are 2 locks:
5294 + * A rw spin lock 'hash_lock' protects just the hash table,
5295 + * this is never held in write mode from interrupt context,
5296 + * which I believe means that we only have to disable irqs when
5297 + * doing a write lock.
5299 + * An ordinary spin lock 'region_lock' that protects the three
5300 + * lists in the region_hash, with the 'state', 'list' and
5301 + * 'delayed_bios' fields of the regions. This is used from irq
5302 + * context, so all other uses will have to suspend local irqs.
5303 + *---------------------------------------------------------------*/
5304 +struct region_hash {
5305 + unsigned max_recovery; /* Max # of regions to recover in parallel */
5307 + /* Callback function to dispatch queued writes on recovered regions. */
5308 + void (*dispatch)(void *context, struct bio_list *bios, int error);
5309 + void *dispatch_context;
5311 + /* Callback function to wakeup callers worker thread. */
5312 + void (*wake)(void *context);
5313 + void *wake_context;
5315 + uint32_t region_size;
5316 + unsigned region_shift;
5318 + /* holds persistent region state */
5319 + struct dm_dirty_log *log;
5322 + rwlock_t hash_lock;
5323 + mempool_t *region_pool;
5325 + unsigned nr_buckets;
5328 + struct list_head *buckets;
5330 + spinlock_t region_lock;
5331 + atomic_t recovery_in_flight;
5332 + struct semaphore recovery_count;
5333 + struct list_head clean_regions;
5334 + struct list_head quiesced_regions;
5335 + struct list_head recovered_regions;
5336 + struct list_head failed_recovered_regions;
5341 + enum dm_rh_region_states state;
5342 + void *context; /* Caller context. */
5344 + struct list_head hash_list;
5345 + struct list_head list;
5348 + struct bio_list delayed_bios;
5354 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector)
5356 + return sector >> ((struct region_hash *) rh)->region_shift;
5358 +EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
5360 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio)
5362 + return dm_rh_sector_to_region(rh, bio->bi_sector);
5364 +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
5366 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region)
5368 + return region << ((struct region_hash *) rh)->region_shift;
5370 +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
5375 +region_t dm_rh_get_region_key(struct dm_region *reg)
5377 + return ((struct region *) reg)->key;
5379 +EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
5381 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh)
5383 + return ((struct region_hash *) rh)->region_size;
5385 +EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
5387 +/* Squirrel a context with a region. */
5388 +void *dm_rh_reg_get_context(struct dm_region *reg)
5390 + return ((struct region *) reg)->context;
5392 +EXPORT_SYMBOL_GPL(dm_rh_reg_get_context);
5394 +void dm_rh_reg_set_context(struct dm_region *reg, void *context)
5396 + ((struct region *) reg)->context = context;
5398 +EXPORT_SYMBOL_GPL(dm_rh_reg_set_context);
5401 + * Create region hash client.
5403 +#define MIN_REGIONS 64
5404 +struct dm_rh_client *dm_rh_client_create(
5405 + unsigned max_recovery,
5406 + void (*dispatch)(void *dispatch_context,
5407 + struct bio_list *bios, int error),
5408 + void *dispatch_context,
5409 + void (*wake)(void *wake_context), void *wake_context,
5410 + struct dm_dirty_log *log, uint32_t region_size,
5411 + region_t nr_regions)
5414 + unsigned nr_buckets, max_buckets;
5415 + unsigned hash_primes[] = {
5416 + /* Table of primes for rh_hash/table size optimization. */
5417 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
5418 + 1543, 3079, 6151, 12289, 24593,
5420 + struct region_hash *rh;
5422 + if (region_size & (region_size - 1)) {
5423 + DMERR("region size must be 2^^n");
5424 + return ERR_PTR(-EINVAL);
5427 + /* Calculate a suitable number of buckets for our hash table. */
5428 + max_buckets = nr_regions >> 6;
5429 + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
5433 + rh = kmalloc(sizeof(*rh), GFP_KERNEL);
5435 + DMERR("unable to allocate region hash memory");
5436 + return ERR_PTR(-ENOMEM);
5439 + rh->max_recovery = max_recovery;
5440 + rh->dispatch = dispatch;
5441 + rh->dispatch_context = dispatch_context;
5443 + rh->wake_context = wake_context;
5445 + rh->region_size = region_size;
5446 + rh->region_shift = ffs(region_size) - 1;
5447 + rwlock_init(&rh->hash_lock);
5448 + rh->mask = nr_buckets - 1;
5449 + rh->nr_buckets = nr_buckets;
5450 + rh->shift = ffs(nr_buckets);
5452 + /* Check prime array limits. */
5453 + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ?
5454 + ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2;
5455 + rh->prime = hash_primes[i];
5457 + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
5458 + if (!rh->buckets) {
5459 + DMERR("unable to allocate region hash bucket memory");
5461 + return ERR_PTR(-ENOMEM);
5464 + for (i = 0; i < nr_buckets; i++)
5465 + INIT_LIST_HEAD(rh->buckets + i);
5467 + spin_lock_init(&rh->region_lock);
5468 + sema_init(&rh->recovery_count, 0);
5469 + atomic_set(&rh->recovery_in_flight, 0);
5470 + INIT_LIST_HEAD(&rh->clean_regions);
5471 + INIT_LIST_HEAD(&rh->quiesced_regions);
5472 + INIT_LIST_HEAD(&rh->recovered_regions);
5473 + INIT_LIST_HEAD(&rh->failed_recovered_regions);
5475 + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
5476 + sizeof(struct region));
5477 + if (!rh->region_pool) {
5478 + vfree(rh->buckets);
5480 + rh = ERR_PTR(-ENOMEM);
5483 + return (struct dm_rh_client *) rh;
5485 +EXPORT_SYMBOL_GPL(dm_rh_client_create);
5487 +void dm_rh_client_destroy(struct dm_rh_client *rh_in)
5490 + struct region_hash *rh = (struct region_hash *) rh_in;
5491 + struct region *reg, *tmp;
5493 + BUG_ON(!list_empty(&rh->quiesced_regions));
5495 + for (h = 0; h < rh->nr_buckets; h++) {
5496 + list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
5497 + BUG_ON(atomic_read(&reg->pending));
5498 + mempool_free(reg, rh->region_pool);
5502 + if (rh->region_pool)
5503 + mempool_destroy(rh->region_pool);
5505 + vfree(rh->buckets);
5508 +EXPORT_SYMBOL_GPL(dm_rh_client_destroy);
5510 +static inline unsigned rh_hash(struct region_hash *rh, region_t region)
5512 + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
5515 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
5517 + struct region *reg;
5518 + struct list_head *bucket = rh->buckets + rh_hash(rh, region);
5520 + list_for_each_entry(reg, bucket, hash_list) {
5521 + if (reg->key == region)
5528 +static void __rh_insert(struct region_hash *rh, struct region *reg)
5530 + list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
5533 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
5535 + struct region *reg, *nreg;
5537 + read_unlock(&rh->hash_lock);
5538 + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
5539 + if (unlikely(!nreg))
5540 + nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
5542 + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
5543 + DM_RH_CLEAN : DM_RH_NOSYNC;
5544 + nreg->key = region;
5545 + INIT_LIST_HEAD(&nreg->list);
5546 + atomic_set(&nreg->pending, 0);
5547 + bio_list_init(&nreg->delayed_bios);
5549 + write_lock_irq(&rh->hash_lock);
5550 + reg = __rh_lookup(rh, region);
5552 + /* We lost the race. */
5553 + mempool_free(nreg, rh->region_pool);
5555 + __rh_insert(rh, nreg);
5556 + if (nreg->state == DM_RH_CLEAN) {
5557 + spin_lock(&rh->region_lock);
5558 + list_add(&nreg->list, &rh->clean_regions);
5559 + spin_unlock(&rh->region_lock);
5565 + write_unlock_irq(&rh->hash_lock);
5566 + read_lock(&rh->hash_lock);
5570 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
5572 + struct region *reg;
5574 + reg = __rh_lookup(rh, region);
5575 + return reg ? reg : __rh_alloc(rh, region);
5578 +int dm_rh_get_state(struct dm_rh_client *rh_in, region_t region, int may_block)
5581 + struct region_hash *rh = (struct region_hash *) rh_in;
5582 + struct region *reg;
5584 + read_lock(&rh->hash_lock);
5585 + reg = __rh_lookup(rh, region);
5586 + read_unlock(&rh->hash_lock);
5589 + return reg->state;
5592 + * The region wasn't in the hash, so we fall back to the dirty log.
5594 + r = rh->log->type->in_sync(rh->log, region, may_block);
5597 + * Any error from the dirty log (eg. -EWOULDBLOCK)
5598 + * gets taken as a DM_RH_NOSYNC
5600 + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
5602 +EXPORT_SYMBOL_GPL(dm_rh_get_state);
5604 +void dm_rh_set_state(struct dm_rh_client *rh_in, region_t region,
5605 + enum dm_rh_region_states state, int may_block)
5607 + struct region_hash *rh = (struct region_hash *) rh_in;
5608 + struct region *reg;
5609 + struct dm_dirty_log *log = rh->log;
5611 + if (state == DM_RH_NOSYNC)
5612 + log->type->set_region_sync(log, region, 0);
5613 + else if (state == DM_RH_CLEAN)
5614 + log->type->clear_region(log, region);
5615 + else if (state == DM_RH_DIRTY)
5616 + log->type->mark_region(log, region);
5618 + read_lock(&rh->hash_lock);
5619 + reg = __rh_find(rh, region);
5620 + reg->state = state;
5621 + read_unlock(&rh->hash_lock);
5623 +EXPORT_SYMBOL_GPL(dm_rh_set_state);
5625 +void dm_rh_update_states(struct dm_rh_client *rh_in, int errors_handled)
5627 + struct region_hash *rh = (struct region_hash *) rh_in;
5628 + struct region *reg, *next;
5630 + LIST_HEAD(recovered);
5631 + LIST_HEAD(failed_recovered);
5634 + * Quickly grab the lists and remove any regions from hash.
5636 + write_lock_irq(&rh->hash_lock);
5637 + spin_lock(&rh->region_lock);
5638 + if (!list_empty(&rh->clean_regions)) {
5639 + list_splice_init(&rh->clean_regions, &clean);
5641 + list_for_each_entry(reg, &clean, list)
5642 + list_del(&reg->hash_list);
5645 + if (!list_empty(&rh->recovered_regions)) {
5646 + list_splice_init(&rh->recovered_regions, &recovered);
5648 + list_for_each_entry(reg, &recovered, list)
5649 + list_del(&reg->hash_list);
5652 + if (!list_empty(&rh->failed_recovered_regions)) {
5653 + list_splice_init(&rh->failed_recovered_regions,
5654 + &failed_recovered);
5656 + list_for_each_entry(reg, &failed_recovered, list)
5657 + list_del(&reg->hash_list);
5660 + spin_unlock(&rh->region_lock);
5661 + write_unlock_irq(&rh->hash_lock);
5664 + * All the regions on the recovered and clean lists have
5665 + * now been pulled out of the system, so no need to do
5666 + * any more locking.
5668 + list_for_each_entry_safe(reg, next, &recovered, list) {
5669 + rh->log->type->clear_region(rh->log, reg->key);
5670 + rh->log->type->set_region_sync(rh->log, reg->key, 1);
5672 + if (reg->delayed_bios.head)
5673 + rh->dispatch(rh->dispatch_context,
5674 + &reg->delayed_bios, 0);
5676 + up(&rh->recovery_count);
5677 + mempool_free(reg, rh->region_pool);
5680 + list_for_each_entry_safe(reg, next, &failed_recovered, list) {
5681 + rh->log->type->set_region_sync(rh->log, reg->key,
5682 + errors_handled ? 0 : 1);
5683 + if (reg->delayed_bios.head)
5684 + rh->dispatch(rh->dispatch_context,
5685 + &reg->delayed_bios, -EIO);
5687 + up(&rh->recovery_count);
5688 + mempool_free(reg, rh->region_pool);
5691 + list_for_each_entry_safe(reg, next, &clean, list) {
5692 + rh->log->type->clear_region(rh->log, reg->key);
5693 + mempool_free(reg, rh->region_pool);
5696 + dm_rh_flush(rh_in);
5698 +EXPORT_SYMBOL_GPL(dm_rh_update_states);
5700 +void dm_rh_inc(struct dm_rh_client *rh_in, region_t region)
5702 + struct region_hash *rh = (struct region_hash *) rh_in;
5703 + struct region *reg;
5705 + read_lock(&rh->hash_lock);
5706 + reg = __rh_find(rh, region);
5707 + if (reg->state == DM_RH_CLEAN) {
5708 + rh->log->type->mark_region(rh->log, reg->key);
5710 + spin_lock_irq(&rh->region_lock);
5711 + reg->state = DM_RH_DIRTY;
5712 + list_del_init(&reg->list); /* Take off the clean list. */
5713 + spin_unlock_irq(&rh->region_lock);
5716 + atomic_inc(&reg->pending);
5717 + read_unlock(&rh->hash_lock);
5719 +EXPORT_SYMBOL_GPL(dm_rh_inc);
5721 +void dm_rh_inc_pending(struct dm_rh_client *rh_in, struct bio_list *bios)
5725 + for (bio = bios->head; bio; bio = bio->bi_next)
5726 + dm_rh_inc(rh_in, dm_rh_bio_to_region(rh_in, bio));
5728 +EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
5730 +int dm_rh_dec(struct dm_rh_client *rh_in, region_t region)
5733 + struct region_hash *rh = (struct region_hash *) rh_in;
5734 + struct region *reg;
5736 + read_lock(&rh->hash_lock);
5737 + reg = __rh_lookup(rh, region);
5738 + read_unlock(&rh->hash_lock);
5742 + if (atomic_dec_and_test(&reg->pending)) {
5743 + unsigned long flags;
5746 + * There is no pending I/O for this region.
5747 + * We can move the region to corresponding list for next action.
5748 + * At this point, the region is not yet connected to any list.
5750 + * If the state is DM_RH_NOSYNC, the region should be kept off
5751 + * from clean list.
5752 + * The hash entry for DM_RH_NOSYNC will remain in memory
5753 + * until the region is recovered or the map is reloaded.
5756 + spin_lock_irqsave(&rh->region_lock, flags);
5757 + if (reg->state == DM_RH_RECOVERING)
5758 + list_add_tail(&reg->list, &rh->quiesced_regions);
5760 + reg->state = DM_RH_CLEAN;
5761 + list_add(&reg->list, &rh->clean_regions);
5763 + spin_unlock_irqrestore(&rh->region_lock, flags);
5770 +EXPORT_SYMBOL_GPL(dm_rh_dec);
5773 + * Starts quiescing a region in preparation for recovery.
5775 +static int __rh_recovery_prepare(struct region_hash *rh)
5779 + struct region *reg;
5782 + * Ask the dirty log what's next.
5784 + r = rh->log->type->get_resync_work(rh->log, &region);
5789 + * Get this region, and start it quiescing
5790 + * by setting the recovering flag.
5792 + read_lock(&rh->hash_lock);
5793 + reg = __rh_find(rh, region);
5794 + read_unlock(&rh->hash_lock);
5796 + spin_lock_irq(&rh->region_lock);
5798 + reg->state = DM_RH_RECOVERING;
5800 + /* Already quiesced ? */
5801 + list_del_init(&reg->list);
5802 + if (!atomic_read(&reg->pending))
5803 + list_add(&reg->list, &rh->quiesced_regions);
5805 + spin_unlock_irq(&rh->region_lock);
5809 +int dm_rh_recovery_prepare(struct dm_rh_client *rh_in)
5812 + struct region_hash *rh = (struct region_hash *) rh_in;
5814 + /* Extra reference to avoid race with rh_stop_recovery */
5815 + atomic_inc(&rh->recovery_in_flight);
5817 + while (!down_trylock(&rh->recovery_count)) {
5818 + atomic_inc(&rh->recovery_in_flight);
5820 + if (__rh_recovery_prepare(rh) <= 0) {
5821 + atomic_dec(&rh->recovery_in_flight);
5822 + up(&rh->recovery_count);
5828 + /* Drop the extra reference. */
5829 + if (atomic_dec_and_test(&rh->recovery_in_flight))
5834 +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
5837 + * Returns any quiesced regions.
5839 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh_in)
5841 + struct region_hash *rh = (struct region_hash *) rh_in;
5842 + struct region *reg = NULL;
5844 + spin_lock_irq(&rh->region_lock);
5845 + if (!list_empty(&rh->quiesced_regions)) {
5846 + reg = list_entry(rh->quiesced_regions.next,
5847 + struct region, list);
5848 + list_del_init(&reg->list); /* Remove from the quiesced list. */
5851 + spin_unlock_irq(&rh->region_lock);
5852 + return (struct dm_region *) reg;
5854 +EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
5857 + * Put region on list of recovered ones.
5859 +void dm_rh_recovery_end(struct dm_rh_client *rh_in, struct dm_region *reg_in,
5862 + struct region_hash *rh = (struct region_hash *) rh_in;
5863 + struct region *reg = (struct region *) reg_in;
5865 + spin_lock_irq(&rh->region_lock);
5867 + reg->state = DM_RH_NOSYNC;
5868 + list_add(&reg->list, &rh->failed_recovered_regions);
5870 + list_add(&reg->list, &rh->recovered_regions);
5872 + atomic_dec(&rh->recovery_in_flight);
5873 + spin_unlock_irq(&rh->region_lock);
5875 + rh->wake(rh->wake_context);
5876 + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0);
5878 +EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
5880 +/* Return recovery in flight count. */
5881 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh_in)
5883 + return atomic_read(&((struct region_hash *) rh_in)->recovery_in_flight);
5885 +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
5887 +int dm_rh_flush(struct dm_rh_client *rh_in)
5889 + struct region_hash *rh = (struct region_hash *) rh_in;
5891 + return rh->log->type->flush(rh->log);
5893 +EXPORT_SYMBOL_GPL(dm_rh_flush);
5895 +void dm_rh_delay_by_region(struct dm_rh_client *rh_in,
5896 + struct bio *bio, region_t region)
5898 + struct region_hash *rh = (struct region_hash *) rh_in;
5899 + struct region *reg;
5901 + /* FIXME: locking. */
5902 + read_lock(&rh->hash_lock);
5903 + reg = __rh_find(rh, region);
5904 + bio_list_add(&reg->delayed_bios, bio);
5905 + read_unlock(&rh->hash_lock);
5907 +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
5909 +void dm_rh_delay(struct dm_rh_client *rh_in, struct bio *bio)
5911 + return dm_rh_delay_by_region(rh_in, bio,
5912 + dm_rh_bio_to_region(rh_in, bio));
5914 +EXPORT_SYMBOL_GPL(dm_rh_delay);
5916 +void dm_rh_dispatch_bios(struct dm_rh_client *rh_in,
5917 + region_t region, int error)
5919 + struct region_hash *rh = (struct region_hash *) rh_in;
5920 + struct region *reg;
5921 + struct bio_list delayed_bios;
5923 + /* FIXME: locking. */
5924 + read_lock(&rh->hash_lock);
5925 + reg = __rh_find(rh, region);
5927 + delayed_bios = reg->delayed_bios;
5928 + bio_list_init(&reg->delayed_bios);
5929 + read_unlock(&rh->hash_lock);
5931 + if (delayed_bios.head)
5932 + rh->dispatch(rh->dispatch_context, &delayed_bios, error);
5934 + up(&rh->recovery_count);
5936 +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios);
5938 +void dm_rh_stop_recovery(struct dm_rh_client *rh_in)
5941 + struct region_hash *rh = (struct region_hash *) rh_in;
5943 + rh->wake(rh->wake_context);
5945 + /* wait for any recovering regions */
5946 + for (i = 0; i < rh->max_recovery; i++)
5947 + down(&rh->recovery_count);
5949 +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
5951 +void dm_rh_start_recovery(struct dm_rh_client *rh_in)
5954 + struct region_hash *rh = (struct region_hash *) rh_in;
5956 + for (i = 0; i < rh->max_recovery; i++)
5957 + up(&rh->recovery_count);
5959 + rh->wake(rh->wake_context);
5961 +EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
5963 +MODULE_DESCRIPTION(DM_NAME " region hash");
5964 +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <hjm@redhat.com>");
5965 +MODULE_LICENSE("GPL");
5966 --- a/drivers/md/Kconfig
5967 +++ b/drivers/md/Kconfig
5968 @@ -269,6 +269,14 @@ config DM_DELAY
5973 + tristate "RAID 4/5 target (EXPERIMENTAL)"
5974 + depends on BLK_DEV_DM && EXPERIMENTAL
5976 + A target that supports RAID4 and RAID5 mappings.
5981 bool "DM uevents (EXPERIMENTAL)"
5982 depends on BLK_DEV_DM && EXPERIMENTAL
5983 --- a/drivers/md/Makefile
5984 +++ b/drivers/md/Makefile
5985 @@ -34,7 +34,9 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
5986 obj-$(CONFIG_DM_DELAY) += dm-delay.o
5987 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
5988 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
5989 -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
5990 +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o
5991 +obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
5992 + dm-regions.o dm-message.o
5993 obj-$(CONFIG_DM_ZERO) += dm-zero.o
5995 quiet_cmd_unroll = UNROLL $@
5997 +++ b/include/linux/dm-regions.h
6000 + * Copyright (C) 2003 Sistina Software Limited.
6001 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6003 + * Device-Mapper dirty region hash interface.
6005 + * This file is released under the GPL.
6008 +#ifndef DM_REGION_HASH_H
6009 +#define DM_REGION_HASH_H
6011 +#include <linux/dm-dirty-log.h>
6013 +/*-----------------------------------------------------------------
6015 + *----------------------------------------------------------------*/
6016 +struct dm_rh_client;
6020 + * States a region can have.
6022 +enum dm_rh_region_states {
6023 + DM_RH_CLEAN = 0x01, /* No writes in flight. */
6024 + DM_RH_DIRTY = 0x02, /* Writes in flight. */
6025 + DM_RH_NOSYNC = 0x04, /* Out of sync. */
6026 + DM_RH_RECOVERING = 0x08, /* Under resynchronization. */
6030 + * Region hash create/destroy.
6033 +struct dm_rh_client *dm_rh_client_create(
6034 + unsigned max_recovery,
6035 + void (*dispatch)(void *dispatch_context,
6036 + struct bio_list *bios, int error),
6037 + void *dispatch_context,
6038 + void (*wake)(void *wake_context), void *wake_context,
6039 + struct dm_dirty_log *log, uint32_t region_size,
6040 + region_t nr_regions);
6041 +void dm_rh_client_destroy(struct dm_rh_client *rh);
6047 + * sector -> region
6048 + * region -> sector
6050 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio);
6051 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector);
6052 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region);
6055 + * Functions to set a caller context in a region.
6057 +void *dm_rh_reg_get_context(struct dm_region *reg);
6058 +void dm_rh_reg_set_context(struct dm_region *reg, void *context);
6061 + * Get region size and key (ie. number of the region).
6063 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh);
6064 +sector_t dm_rh_get_region_key(struct dm_region *reg);
6067 + * Get/set/update region state (and dirty log).
6069 + * dm_rh_update_states
6070 + * @errors_handled != 0 influences
6071 + * that the state of the region will be kept NOSYNC
6073 +int dm_rh_get_state(struct dm_rh_client *rh, region_t region, int may_block);
6074 +void dm_rh_set_state(struct dm_rh_client *rh, region_t region,
6075 + enum dm_rh_region_states state, int may_block);
6076 +void dm_rh_update_states(struct dm_rh_client *rh, int errors_handled);
6078 +/* Flush the region hash and dirty log. */
6079 +int dm_rh_flush(struct dm_rh_client *rh);
6081 +/* Inc/dec pending count on regions. */
6082 +void dm_rh_inc(struct dm_rh_client *rh, region_t region);
6083 +void dm_rh_inc_pending(struct dm_rh_client *rh, struct bio_list *bios);
6084 +int dm_rh_dec(struct dm_rh_client *rh, region_t region);
6086 +/* Delay bios on regions. */
6087 +void dm_rh_delay(struct dm_rh_client *rh, struct bio *bio);
6088 +void dm_rh_delay_by_region(struct dm_rh_client *rh,
6089 + struct bio *bio, region_t region);
6092 + * Normally, the region hash will automatically call the dispatch function.
6093 + * dm_rh_dispatch_bios() is for intentional dispatching of bios.
6095 +void dm_rh_dispatch_bios(struct dm_rh_client *rh, region_t region, int error);
6098 + * Region recovery control.
6100 +/* Prepare some regions for recovery by starting to quiesce them. */
6101 +int dm_rh_recovery_prepare(struct dm_rh_client *rh);
6102 +/* Try fetching a quiesced region for recovery. */
6103 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh);
6104 +/* Report recovery end on a region. */
6105 +void dm_rh_recovery_end(struct dm_rh_client *rh, struct dm_region *reg,
6107 +/* Check for amount of recoveries in flight. */
6108 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh);
6109 +/* Start/stop recovery. */
6110 +void dm_rh_stop_recovery(struct dm_rh_client *rh);
6111 +void dm_rh_start_recovery(struct dm_rh_client *rh);
6113 +#endif /* DM_REGION_HASH_H */