1From: "Heinz Mauelshagen <hjm@redhat.de>
2Subject: DMRAID45 module
3X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/
4
5 DM-RAID 45 module.
6
7 This driver is used for "Fake RAID" devices.
8
9Acked-by: Jeff Mahoney <jeffm@suse.com>
10
11---
12
13 drivers/md/Kconfig | 8
14 drivers/md/Makefile | 4
15 drivers/md/dm-memcache.c | 301 ++
16 drivers/md/dm-memcache.h | 68
17 drivers/md/dm-message.c | 182 +
18 drivers/md/dm-message.h | 91
19 drivers/md/dm-raid45.c | 4516 +++++++++++++++++++++++++++++++++++++++++++++
20 drivers/md/dm-raid45.h | 28
21 drivers/md/dm-regions.c | 723 +++++++
22 drivers/md/dm.c | 1
23 include/linux/dm-regions.h | 115 +
24 11 files changed, 6036 insertions(+), 1 deletion(-)
25
26--- a/drivers/md/dm.c
27+++ b/drivers/md/dm.c
28@@ -1680,6 +1680,7 @@ struct gendisk *dm_disk(struct mapped_de
29 {
30 return md->disk;
31 }
32+EXPORT_SYMBOL_GPL(dm_disk);
33
34 int dm_suspended(struct mapped_device *md)
35 {
36--- /dev/null
37+++ b/drivers/md/dm-memcache.c
38@@ -0,0 +1,301 @@
39+/*
40+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
41+ *
42+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
43+ *
44+ * Device-mapper memory object handling:
45+ *
46+ * o allocate/free total_pages in a per client page pool.
47+ *
48+ * o allocate/free memory objects with chunks (1..n) of
49+ * pages_per_chunk pages hanging off.
50+ *
51+ * This file is released under the GPL.
52+ */
53+
54+#define DM_MEM_CACHE_VERSION "0.2"
55+
56+#include "dm.h"
57+#include "dm-memcache.h"
58+#include <linux/dm-io.h>
59+
60+struct dm_mem_cache_client {
61+ spinlock_t lock;
62+ mempool_t *objs_pool;
63+ struct page_list *free_list;
64+ unsigned objects;
65+ unsigned chunks;
66+ unsigned pages_per_chunk;
67+ unsigned free_pages;
68+ unsigned total_pages;
69+};
70+
71+/*
72+ * Free pages and page_list elements of client.
73+ */
74+static void free_cache_pages(struct page_list *list)
75+{
76+ while (list) {
77+ struct page_list *pl = list;
78+
79+ list = pl->next;
80+ BUG_ON(!pl->page);
81+ __free_page(pl->page);
82+ kfree(pl);
83+ }
84+}
85+
86+/*
87+ * Alloc number of pages and page_list elements as required by client.
88+ */
89+static struct page_list *alloc_cache_pages(unsigned pages)
90+{
91+ struct page_list *pl, *ret = NULL;
92+ struct page *page;
93+
94+ while (pages--) {
95+ page = alloc_page(GFP_NOIO);
96+ if (!page)
97+ goto err;
98+
99+ pl = kmalloc(sizeof(*pl), GFP_NOIO);
100+ if (!pl) {
101+ __free_page(page);
102+ goto err;
103+ }
104+
105+ pl->page = page;
106+ pl->next = ret;
107+ ret = pl;
108+ }
109+
110+ return ret;
111+
112+err:
113+ free_cache_pages(ret);
114+ return NULL;
115+}
116+
117+/*
118+ * Allocate page_list elements from the pool to chunks of the memory object.
119+ */
120+static void alloc_chunks(struct dm_mem_cache_client *cl,
121+ struct dm_mem_cache_object *obj)
122+{
123+ unsigned chunks = cl->chunks;
124+ unsigned long flags;
125+
126+ local_irq_save(flags);
127+ local_irq_disable();
128+ while (chunks--) {
129+ unsigned p = cl->pages_per_chunk;
130+
131+ obj[chunks].pl = NULL;
132+
133+ while (p--) {
134+ struct page_list *pl;
135+
136+ /* Take next element from free list */
137+ spin_lock(&cl->lock);
138+ pl = cl->free_list;
139+ BUG_ON(!pl);
140+ cl->free_list = pl->next;
141+ spin_unlock(&cl->lock);
142+
143+ pl->next = obj[chunks].pl;
144+ obj[chunks].pl = pl;
145+ }
146+ }
147+
148+ local_irq_restore(flags);
149+}
150+
151+/*
152+ * Free page_list elements putting them back onto free list
153+ */
154+static void free_chunks(struct dm_mem_cache_client *cl,
155+ struct dm_mem_cache_object *obj)
156+{
157+ unsigned chunks = cl->chunks;
158+ unsigned long flags;
159+ struct page_list *next, *pl;
160+
161+ local_irq_save(flags);
162+ local_irq_disable();
163+ while (chunks--) {
164+ for (pl = obj[chunks].pl; pl; pl = next) {
165+ next = pl->next;
166+
167+ spin_lock(&cl->lock);
168+ pl->next = cl->free_list;
169+ cl->free_list = pl;
170+ cl->free_pages++;
171+ spin_unlock(&cl->lock);
172+ }
173+ }
174+
175+ local_irq_restore(flags);
176+}
177+
178+/*
179+ * Create/destroy dm memory cache client resources.
180+ */
181+struct dm_mem_cache_client *
182+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
183+ unsigned pages_per_chunk)
184+{
185+ unsigned total_pages = objects * chunks * pages_per_chunk;
186+ struct dm_mem_cache_client *client;
187+
188+ BUG_ON(!total_pages);
189+ client = kzalloc(sizeof(*client), GFP_KERNEL);
190+ if (!client)
191+ return ERR_PTR(-ENOMEM);
192+
193+ client->objs_pool = mempool_create_kmalloc_pool(objects,
194+ chunks * sizeof(struct dm_mem_cache_object));
195+ if (!client->objs_pool)
196+ goto err;
197+
198+ client->free_list = alloc_cache_pages(total_pages);
199+ if (!client->free_list)
200+ goto err1;
201+
202+ spin_lock_init(&client->lock);
203+ client->objects = objects;
204+ client->chunks = chunks;
205+ client->pages_per_chunk = pages_per_chunk;
206+ client->free_pages = client->total_pages = total_pages;
207+ return client;
208+
209+err1:
210+ mempool_destroy(client->objs_pool);
211+err:
212+ kfree(client);
213+ return ERR_PTR(-ENOMEM);
214+}
215+EXPORT_SYMBOL(dm_mem_cache_client_create);
216+
217+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
218+{
219+ BUG_ON(cl->free_pages != cl->total_pages);
220+ free_cache_pages(cl->free_list);
221+ mempool_destroy(cl->objs_pool);
222+ kfree(cl);
223+}
224+EXPORT_SYMBOL(dm_mem_cache_client_destroy);
225+
226+/*
 227+ * Grow a client's cache by an amount of pages.
228+ *
229+ * Don't call from interrupt context!
230+ */
231+int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
232+{
233+ unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
234+ struct page_list *pl, *last;
235+
236+ BUG_ON(!pages);
237+ pl = alloc_cache_pages(pages);
238+ if (!pl)
239+ return -ENOMEM;
240+
241+ last = pl;
242+ while (last->next)
243+ last = last->next;
244+
245+ spin_lock_irq(&cl->lock);
246+ last->next = cl->free_list;
247+ cl->free_list = pl;
248+ cl->free_pages += pages;
249+ cl->total_pages += pages;
250+ cl->objects++;
251+ spin_unlock_irq(&cl->lock);
252+
253+ mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
254+ return 0;
255+}
256+EXPORT_SYMBOL(dm_mem_cache_grow);
257+
 258+/* Shrink a client's cache by an amount of pages. */
259+int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
260+{
261+ int r;
262+ unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
263+ unsigned long flags;
264+ struct page_list *last = NULL, *pl, *pos;
265+
266+ BUG_ON(!pages);
267+
268+ spin_lock_irqsave(&cl->lock, flags);
269+ pl = pos = cl->free_list;
270+ while (p-- && pos->next) {
271+ last = pos;
272+ pos = pos->next;
273+ }
274+
275+ if (++p)
276+ r = -ENOMEM;
277+ else {
278+ r = 0;
279+ cl->free_list = pos;
280+ cl->free_pages -= pages;
281+ cl->total_pages -= pages;
282+ cl->objects--;
283+ last->next = NULL;
284+ }
285+ spin_unlock_irqrestore(&cl->lock, flags);
286+
287+ if (!r) {
288+ free_cache_pages(pl);
289+ mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
290+ }
291+
292+ return r;
293+}
294+EXPORT_SYMBOL(dm_mem_cache_shrink);
295+
296+/*
297+ * Allocate/free a memory object
298+ *
299+ * Can be called from interrupt context
300+ */
301+struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
302+{
303+ int r = 0;
304+ unsigned pages = cl->chunks * cl->pages_per_chunk;
305+ unsigned long flags;
306+ struct dm_mem_cache_object *obj;
307+
308+ obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
309+ if (!obj)
310+ return ERR_PTR(-ENOMEM);
311+
312+ spin_lock_irqsave(&cl->lock, flags);
313+ if (pages > cl->free_pages)
314+ r = -ENOMEM;
315+ else
316+ cl->free_pages -= pages;
317+ spin_unlock_irqrestore(&cl->lock, flags);
318+
319+ if (r) {
320+ mempool_free(obj, cl->objs_pool);
321+ return ERR_PTR(r);
322+ }
323+
324+ alloc_chunks(cl, obj);
325+ return obj;
326+}
327+EXPORT_SYMBOL(dm_mem_cache_alloc);
328+
329+void dm_mem_cache_free(struct dm_mem_cache_client *cl,
330+ struct dm_mem_cache_object *obj)
331+{
332+ free_chunks(cl, obj);
333+ mempool_free(obj, cl->objs_pool);
334+}
335+EXPORT_SYMBOL(dm_mem_cache_free);
336+
337+MODULE_DESCRIPTION(DM_NAME " dm memory cache");
338+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
339+MODULE_LICENSE("GPL");
340--- /dev/null
341+++ b/drivers/md/dm-memcache.h
342@@ -0,0 +1,68 @@
343+/*
344+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
345+ *
346+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
347+ *
348+ * Device-mapper memory object handling:
349+ *
350+ * o allocate/free total_pages in a per client page pool.
351+ *
352+ * o allocate/free memory objects with chunks (1..n) of
353+ * pages_per_chunk pages hanging off.
354+ *
355+ * This file is released under the GPL.
356+ */
357+
358+#ifndef _DM_MEM_CACHE_H
359+#define _DM_MEM_CACHE_H
360+
361+#define DM_MEM_CACHE_H_VERSION "0.1"
362+
363+#include "dm.h"
364+#include <linux/dm-io.h>
365+
366+static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
367+{
368+ while (pl && p--)
369+ pl = pl->next;
370+
371+ return pl;
372+}
373+
374+struct dm_mem_cache_object {
375+ struct page_list *pl; /* Dynamically allocated array */
376+ void *private; /* Caller context reference */
377+};
378+
379+struct dm_mem_cache_client;
380+
381+/*
382+ * Create/destroy dm memory cache client resources.
383+ *
384+ * On creation, a number of @objects with @chunks of
385+ * @pages_per_chunk pages will be allocated.
386+ */
387+struct dm_mem_cache_client *
388+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
389+ unsigned pages_per_chunk);
390+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
391+
392+/*
 393+ * Grow/shrink a dm memory cache client's resources
 394+ * by @objects objects.
395+ */
396+int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
397+int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
398+
399+/*
400+ * Allocate/free a memory object
401+ *
402+ * On allocation one object with an amount of chunks and
403+ * an amount of pages per chunk will be returned on success.
404+ */
405+struct dm_mem_cache_object *
406+dm_mem_cache_alloc(struct dm_mem_cache_client *client);
407+void dm_mem_cache_free(struct dm_mem_cache_client *client,
408+ struct dm_mem_cache_object *object);
409+
410+#endif
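
For orientation, a minimal usage sketch of the client API declared above (editorial, not part of the patch). The caller name and the object/chunk/page counts are invented; error handling uses the kernel's IS_ERR()/PTR_ERR() helpers, since the create and alloc calls return ERR_PTR() values on failure:

	/* Hypothetical caller: 4 objects of 3 chunks x 2 pages each. */
	static int memcache_usage_sketch(void)
	{
		struct dm_mem_cache_client *mc;
		struct dm_mem_cache_object *obj;

		mc = dm_mem_cache_client_create(4, 3, 2);
		if (IS_ERR(mc))
			return PTR_ERR(mc);

		/* Returns an array of 3 chunk objects; obj[i].pl holds 2 pages. */
		obj = dm_mem_cache_alloc(mc);
		if (IS_ERR(obj)) {
			dm_mem_cache_client_destroy(mc);
			return PTR_ERR(obj);
		}

		/* ... walk the pages via pl_elem(obj[i].pl, n) and do io ... */

		dm_mem_cache_free(mc, obj);	 /* pages go back to the free list */
		dm_mem_cache_client_destroy(mc); /* all pages must be free again */
		return 0;
	}

dm_mem_cache_grow()/dm_mem_cache_shrink() resize the page pool by whole objects at run time; stripe_alloc()/stripe_free() further down in dm-raid45.c use exactly that to grow and shrink the memory behind the stripe cache one stripe at a time.
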
411--- /dev/null
412+++ b/drivers/md/dm-message.c
413@@ -0,0 +1,182 @@
414+/*
415+ * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved.
416+ *
417+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
418+ *
419+ * General device-mapper message interface argument parser.
420+ *
421+ * This file is released under the GPL.
422+ *
423+ * device-mapper message parser.
424+ *
425+ */
426+
427+#include "dm.h"
428+#include "dm-message.h"
429+#include <linux/kernel.h>
430+
431+#define DM_MSG_PREFIX "dm_message"
432+
433+/* Basename of a path. */
434+static inline char *
435+basename(char *s)
436+{
437+ char *p = strrchr(s, '/');
438+
439+ return p ? p + 1 : s;
440+}
441+
442+/* Get an argument depending on type. */
443+static void
444+message_arguments(struct dm_msg *msg, int argc, char **argv)
445+{
446+
447+ if (argc) {
448+ int i;
449+ struct dm_message_argument *args = msg->spec->args;
450+
451+ for (i = 0; i < args->num_args; i++) {
452+ int r;
453+ unsigned long **ptr = args->ptr;
454+ enum dm_message_argument_type type = args->types[i];
455+
456+ switch (type) {
457+ case dm_msg_base_t:
458+ ((char **) ptr)[i] = basename(argv[i]);
459+ break;
460+
461+ case dm_msg_str_t:
462+ ((char **) ptr)[i] = argv[i];
463+ break;
464+
465+ case dm_msg_int_t:
466+ r = sscanf(argv[i], "%d", ((int **) ptr)[i]);
467+ goto check;
468+
469+ case dm_msg_uint_t:
470+ r = sscanf(argv[i], "%u",
471+ ((unsigned **) ptr)[i]);
472+ goto check;
473+
474+ case dm_msg_uint64_t:
475+ r = sscanf(argv[i], "%llu",
476+ ((unsigned long long **) ptr)[i]);
477+
478+check:
479+ if (r != 1) {
480+ set_bit(dm_msg_ret_undef, &msg->ret);
481+ set_bit(dm_msg_ret_arg, &msg->ret);
482+ }
483+ }
484+ }
485+ }
486+}
487+
488+/* Parse message options. */
489+static void
490+message_options_parse(struct dm_msg *msg, int argc, char **argv)
491+{
492+ int hit = 0;
493+ unsigned long *action;
494+ size_t l1 = strlen(*argv), l_hit = 0;
495+ struct dm_message_option *o = msg->spec->options;
496+ char **option, **option_end = o->options + o->num_options;
497+
498+ for (option = o->options, action = o->actions;
499+ option < option_end; option++, action++) {
500+ size_t l2 = strlen(*option);
501+
502+ if (!strnicmp(*argv, *option, min(l1, l2))) {
503+ hit++;
504+ l_hit = l2;
505+ set_bit(*action, &msg->action);
506+ }
507+ }
508+
509+ /* Assume error. */
510+ msg->ret = 0;
511+ set_bit(dm_msg_ret_option, &msg->ret);
512+ if (!hit || l1 > l_hit)
513+ set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */
514+ else if (hit > 1)
515+ set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/
516+ else {
517+ clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */
518+ message_arguments(msg, --argc, ++argv);
519+ }
520+}
521+
522+static inline void
523+print_ret(const char *caller, unsigned long ret)
524+{
525+ struct {
526+ unsigned long err;
527+ const char *err_str;
528+ } static err_msg[] = {
529+ { dm_msg_ret_ambiguous, "message ambiguous" },
530+ { dm_msg_ret_inval, "message invalid" },
531+ { dm_msg_ret_undef, "message undefined" },
532+ { dm_msg_ret_arg, "message argument" },
533+ { dm_msg_ret_argcount, "message argument count" },
534+ { dm_msg_ret_option, "option" },
535+ }, *e = ARRAY_END(err_msg);
536+
537+ while (e-- > err_msg) {
538+ if (test_bit(e->err, &ret))
539+ DMERR("%s %s", caller, e->err_str);
540+ }
541+}
542+
543+/* Parse a message action. */
544+int
545+dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
546+ int argc, char **argv)
547+{
548+ int hit = 0;
549+ size_t l1 = strlen(*argv), l_hit = 0;
550+ struct dm_msg_spec *s, *s_hit = NULL,
551+ *s_end = msg->specs + msg->num_specs;
552+
553+ if (argc < 2)
554+ return -EINVAL;
555+
556+ for (s = msg->specs; s < s_end; s++) {
557+ size_t l2 = strlen(s->cmd);
558+
559+ if (!strnicmp(*argv, s->cmd, min(l1, l2))) {
560+ hit++;
561+ l_hit = l2;
562+ s_hit = s;
563+ }
564+ }
565+
566+ msg->ret = 0;
567+ if (!hit || l1 > l_hit) /* No hit or message string too long. */
568+ set_bit(dm_msg_ret_undef, &msg->ret);
569+ else if (hit > 1) /* Ambiguous message. */
570+ set_bit(dm_msg_ret_ambiguous, &msg->ret);
571+ else if (argc - 2 != s_hit->args->num_args) {
572+ set_bit(dm_msg_ret_undef, &msg->ret);
573+ set_bit(dm_msg_ret_argcount, &msg->ret);
574+ }
575+
576+ if (msg->ret)
577+ goto bad;
578+
579+ msg->action = 0;
580+ msg->spec = s_hit;
581+ set_bit(s_hit->action, &msg->action);
582+ message_options_parse(msg, --argc, ++argv);
583+
584+ if (!msg->ret)
585+ return msg->spec->f(msg, context);
586+
587+bad:
588+ print_ret(caller, msg->ret);
589+ return -EINVAL;
590+}
591+EXPORT_SYMBOL(dm_message_parse);
592+
593+MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser");
594+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
595+MODULE_LICENSE("GPL");
596--- /dev/null
597+++ b/drivers/md/dm-message.h
598@@ -0,0 +1,91 @@
599+/*
600+ * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved.
601+ *
602+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de>
603+ *
604+ * General device-mapper message interface argument parser.
605+ *
606+ * This file is released under the GPL.
607+ *
608+ */
609+
610+#ifndef DM_MESSAGE_H
611+#define DM_MESSAGE_H
612+
613+/* Factor out to dm.h. */
614+/* Reference to array end. */
615+#define ARRAY_END(a) ((a) + ARRAY_SIZE(a))
616+
617+/* Message return bits. */
618+enum dm_message_return {
619+ dm_msg_ret_ambiguous, /* Action ambiguous. */
620+ dm_msg_ret_inval, /* Action invalid. */
621+ dm_msg_ret_undef, /* Action undefined. */
622+
623+ dm_msg_ret_option, /* Option error. */
624+ dm_msg_ret_arg, /* Argument error. */
625+ dm_msg_ret_argcount, /* Argument count error. */
626+};
627+
628+/* Message argument type conversions. */
629+enum dm_message_argument_type {
630+ dm_msg_base_t, /* Basename string. */
631+ dm_msg_str_t, /* String. */
632+ dm_msg_int_t, /* Signed int. */
633+ dm_msg_uint_t, /* Unsigned int. */
634+ dm_msg_uint64_t, /* Unsigned int 64. */
635+};
636+
637+/* A message option. */
638+struct dm_message_option {
639+ unsigned num_options;
640+ char **options;
641+ unsigned long *actions;
642+};
643+
644+/* Message arguments and types. */
645+struct dm_message_argument {
646+ unsigned num_args;
647+ unsigned long **ptr;
648+ enum dm_message_argument_type types[];
649+};
650+
651+/* Client message. */
652+struct dm_msg {
653+ unsigned long action; /* Identified action. */
654+ unsigned long ret; /* Return bits. */
 655+ unsigned num_specs; /* # of specifications listed. */
656+ struct dm_msg_spec *specs; /* Specification list. */
657+ struct dm_msg_spec *spec; /* Specification selected. */
658+};
659+
 660+/* Specification of the message. */
661+struct dm_msg_spec {
662+ const char *cmd; /* Name of the command (i.e. 'bandwidth'). */
663+ unsigned long action;
664+ struct dm_message_option *options;
665+ struct dm_message_argument *args;
666+ unsigned long parm; /* Parameter to pass through to callback. */
667+ /* Function to process for action. */
668+ int (*f) (struct dm_msg *msg, void *context);
669+};
670+
671+/* Parameter access macros. */
672+#define DM_MSG_PARM(msg) ((msg)->spec->parm)
673+
674+#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx])
675+#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx])
 676+#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARGS(msg, idx))
677+#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx])
678+
679+#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0)
680+#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0)
681+#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0)
682+#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0)
683+
684+
685+/* Parse a message and its options and optionally call a function back. */
686+int dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
687+ int argc, char **argv);
688+
689+#endif
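
To make the structures above concrete, a hedged sketch (editorial, names invented) of how a target could wire up a single "bandwidth set <percent>" message; the positional initializer of types[] relies on GCC's extension for statically initializing flexible array members:

	/* Hypothetical wiring for a "bandwidth set <percent>" message. */
	static unsigned bw_percent;
	static unsigned long *bw_ptrs[] = { (unsigned long *) &bw_percent };

	static struct dm_message_argument bw_args = {
		1, bw_ptrs, { dm_msg_uint_t },
	};

	static char *bw_option_strs[] = { "set" };
	static unsigned long bw_option_actions[] = { 0 };

	static struct dm_message_option bw_options = {
		1, bw_option_strs, bw_option_actions,
	};

	static int bw_message(struct dm_msg *msg, void *context)
	{
		/* bw_percent now holds the parsed percentage. */
		return 0;
	}

	static struct dm_msg_spec bw_specs[] = {
		{ "bandwidth", 0, &bw_options, &bw_args, 0, bw_message },
	};

	static struct dm_msg bw_msg = {
		.num_specs = ARRAY_SIZE(bw_specs),
		.specs = bw_specs,
	};

A target's message method would then call dm_message_parse("raid45", &bw_msg, ti, argc, argv) with argv such as { "bandwidth", "set", "20" }: the parser matches command and option by unambiguous prefix, converts "20" through the dm_msg_uint_t entry into bw_percent and finally invokes bw_message().
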
690--- /dev/null
691+++ b/drivers/md/dm-raid45.c
692@@ -0,0 +1,4516 @@
693+/*
694+ * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
695+ *
696+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
697+ *
698+ * This file is released under the GPL.
699+ *
700+ *
701+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
702+ *
703+ * Supports:
704+ * o RAID4 with dedicated and selectable parity device
705+ * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
706+ * o run time optimization of xor algorithm used to calculate parity
707+ *
708+ *
709+ * Thanks to MD for:
710+ * o the raid address calculation algorithm
711+ * o the base of the biovec <-> page list copier.
712+ *
713+ *
714+ * Uses region hash to keep track of how many writes are in flight to
715+ * regions in order to use dirty log to keep state of regions to recover:
716+ *
717+ * o clean regions (those which are synchronized
718+ * and don't have write io in flight)
719+ * o dirty regions (those with write io in flight)
720+ *
721+ *
722+ * On startup, any dirty regions are migrated to the 'nosync' state
723+ * and are subject to recovery by the daemon.
724+ *
725+ * See raid_ctr() for table definition.
726+ *
727+ *
728+ * FIXME:
729+ * o add virtual interface for locking
730+ * o remove instrumentation (REMOVEME:)
731+ *
732+ */
733+
734+static const char *version = "v0.2431";
735+
736+#include "dm.h"
737+#include "dm-bio-list.h"
738+#include "dm-memcache.h"
739+#include "dm-message.h"
740+#include "dm-raid45.h"
741+
742+#include <linux/kernel.h>
743+#include <linux/vmalloc.h>
744+
745+#include <linux/dm-io.h>
746+#include <linux/dm-dirty-log.h>
747+#include <linux/dm-regions.h>
748+
749+/* # of parallel recovered regions */
750+/* FIXME: cope with multiple recovery stripes in raid_set struct. */
751+#define MAX_RECOVER 1 /* needs to be 1! */
752+
753+/*
754+ * Configurable parameters
755+ */
756+#define INLINE
757+
758+/* Default # of stripes if not set in constructor. */
759+#define STRIPES 64
760+
761+/* Minimum/maximum # of selectable stripes. */
762+#define STRIPES_MIN 8
763+#define STRIPES_MAX 16384
764+
765+/* Default chunk size in sectors if not set in constructor. */
766+#define CHUNK_SIZE 64
767+
768+/* Default io size in sectors if not set in constructor. */
769+#define IO_SIZE_MIN SECTORS_PER_PAGE
770+#define IO_SIZE IO_SIZE_MIN
771+
772+/* Maximum setable chunk size in sectors. */
773+#define CHUNK_SIZE_MAX 16384
774+
775+/* Recover io size default in sectors. */
776+#define RECOVER_IO_SIZE_MIN 64
777+#define RECOVER_IO_SIZE 256
778+
779+/* Default percentage recover io bandwidth. */
780+#define BANDWIDTH 10
781+#define BANDWIDTH_MIN 1
782+#define BANDWIDTH_MAX 100
783+/*
784+ * END Configurable parameters
785+ */
786+
787+#define TARGET "dm-raid45"
788+#define DAEMON "kraid45d"
789+#define DM_MSG_PREFIX TARGET
790+
791+#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
792+
793+/* Amount/size for __xor(). */
794+#define SECTORS_PER_XOR SECTORS_PER_PAGE
795+#define XOR_SIZE PAGE_SIZE
796+
797+/* Derive raid_set from stripe_cache pointer. */
798+#define RS(x) container_of(x, struct raid_set, sc)
799+
800+/* Check value in range. */
801+#define range_ok(i, min, max) (i >= min && i <= max)
802+
803+/* Page reference. */
804+#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
805+
806+/* Bio list reference. */
807+#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
808+
809+/* Page list reference. */
810+#define PL(stripe, p) (stripe->obj[p].pl)
811+
812+/* Check argument is power of 2. */
813+#define POWER_OF_2(a) (!(a & (a - 1)))
814+
815+/* Factor out to dm-bio-list.h */
816+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
817+{
818+ bio->bi_next = bl->head;
819+ bl->head = bio;
820+
821+ if (!bl->tail)
822+ bl->tail = bio;
823+}
824+
825+/* Factor out to dm.h */
826+#define TI_ERR_RET(str, ret) \
827+ do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
828+#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
829+
830+/*-----------------------------------------------------------------
831+ * Stripe cache
832+ *
833+ * Cache for all reads and writes to raid sets (operational or degraded)
834+ *
835+ * We need to run all data to and from a RAID set through this cache,
836+ * because parity chunks need to get calculated from data chunks
837+ * or, in the degraded/resynchronization case, missing chunks need
838+ * to be reconstructed using the other chunks of the stripe.
839+ *---------------------------------------------------------------*/
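
To spell out the arithmetic behind that statement for a four-drive set with data chunks D0..D2 and parity P: a full-stripe write computes P = D0 ^ D1 ^ D2; a partial write of D1 needs either the old D1 and P read into the cache first (read-before-write: Pnew = Pold ^ D1old ^ D1new) or the other data chunks (Pnew = D0 ^ D1new ^ D2); and on a degraded set a missing D1 is reconstructed as D1 = P ^ D0 ^ D2. Holding every chunk of a stripe in one cache entry is what lets these combinations be formed without extra copying.
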
840+/* Protect kmem cache # counter. */
841+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
842+
843+/* A stripe set (holds bios hanging off). */
844+struct stripe_set {
845+ struct stripe *stripe; /* Backpointer to stripe for endio(). */
846+ struct bio_list bl[3]; /* Reads, writes, and writes merged. */
847+#define WRITE_MERGED 2
848+};
849+
850+#if READ != 0 || WRITE != 1
851+#error dm-raid45: READ/WRITE != 0/1 used as index!!!
852+#endif
853+
854+/*
855+ * Stripe linked list indexes. Keep order, because the stripe
856+ * and the stripe cache rely on the first 3!
857+ */
858+enum list_types {
859+ LIST_IO = 0, /* Stripes with io pending. */
860+ LIST_ENDIO, /* Stripes to endio. */
861+ LIST_LRU, /* Least recently used stripes. */
862+ LIST_HASH, /* Hashed stripes. */
863+ LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
864+ NR_LISTS, /* To size array in struct stripe. */
865+};
866+
867+enum lock_types {
868+ LOCK_ENDIO = 0, /* Protect endio list. */
869+ LOCK_LRU, /* Protect lru list. */
870+ NR_LOCKS, /* To size array in struct stripe_cache. */
871+};
872+
873+/* A stripe: the io object to handle all reads and writes to a RAID set. */
874+struct stripe {
875+ struct stripe_cache *sc; /* Backpointer to stripe cache. */
876+
877+ sector_t key; /* Hash key. */
878+ sector_t region; /* Region stripe is mapped to. */
879+
880+ /* Reference count. */
881+ atomic_t cnt;
882+
883+ struct {
884+ unsigned long flags; /* flags (see below). */
885+
886+ /*
887+ * Pending ios in flight:
888+ *
889+ * used as a 'lock' to control move of stripe to endio list
890+ */
891+ atomic_t pending; /* Pending ios in flight. */
892+
893+ /* Sectors to read and write for multi page stripe sets. */
894+ unsigned size;
895+ } io;
896+
897+ /* Lock on stripe (for clustering). */
898+ void *lock;
899+
900+ /*
901+ * 4 linked lists:
902+ * o io list to flush io
903+ * o endio list
904+ * o LRU list to put stripes w/o reference count on
905+ * o stripe cache hash
906+ */
907+ struct list_head lists[NR_LISTS];
908+
909+ struct {
910+ unsigned short parity; /* Parity chunk index. */
911+ short recover; /* Recovery chunk index. */
912+ } idx;
913+
914+ /* This sets memory cache object (dm-mem-cache). */
915+ struct dm_mem_cache_object *obj;
916+
917+ /* Array of stripe sets (dynamically allocated). */
918+ struct stripe_set ss[0];
919+};
920+
921+/* States stripes can be in (flags field). */
922+enum stripe_states {
923+ STRIPE_ACTIVE, /* Active io on stripe. */
924+ STRIPE_ERROR, /* io error on stripe. */
925+ STRIPE_MERGED, /* Writes got merged. */
926+ STRIPE_READ, /* Read. */
927+ STRIPE_RBW, /* Read-before-write. */
928+ STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
929+ STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
930+};
931+
932+/* ... and macros to access them. */
933+#define BITOPS(name, what, var, flag) \
934+static inline int TestClear ## name ## what(struct var *v) \
935+{ return test_and_clear_bit(flag, &v->io.flags); } \
936+static inline int TestSet ## name ## what(struct var *v) \
937+{ return test_and_set_bit(flag, &v->io.flags); } \
938+static inline void Clear ## name ## what(struct var *v) \
939+{ clear_bit(flag, &v->io.flags); } \
940+static inline void Set ## name ## what(struct var *v) \
941+{ set_bit(flag, &v->io.flags); } \
942+static inline int name ## what(struct var *v) \
943+{ return test_bit(flag, &v->io.flags); }
944+
945+
946+BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
947+BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
948+BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
949+BITOPS(Stripe, Read, stripe, STRIPE_READ)
950+BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
951+BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
952+BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
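
For readers new to this generator pattern, one invocation, BITOPS(Stripe, RBW, stripe, STRIPE_RBW), expands to the following accessors (an editorial illustration of the expansion, not additional source):

	static inline int TestClearStripeRBW(struct stripe *v)
	{ return test_and_clear_bit(STRIPE_RBW, &v->io.flags); }
	static inline int TestSetStripeRBW(struct stripe *v)
	{ return test_and_set_bit(STRIPE_RBW, &v->io.flags); }
	static inline void ClearStripeRBW(struct stripe *v)
	{ clear_bit(STRIPE_RBW, &v->io.flags); }
	static inline void SetStripeRBW(struct stripe *v)
	{ set_bit(STRIPE_RBW, &v->io.flags); }
	static inline int StripeRBW(struct stripe *v)
	{ return test_bit(STRIPE_RBW, &v->io.flags); }

The same generator is reused further down for struct raid_set (the BITOPS(RS, ...) lines), which likewise keeps its flags in an io.flags member.
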
953+
954+/* A stripe hash. */
955+struct stripe_hash {
956+ struct list_head *hash;
957+ unsigned buckets;
958+ unsigned mask;
959+ unsigned prime;
960+ unsigned shift;
961+};
962+
963+/* A stripe cache. */
964+struct stripe_cache {
965+ /* Stripe hash. */
966+ struct stripe_hash hash;
967+
968+ /* Stripes with io to flush, stripes to endio and LRU lists. */
969+ struct list_head lists[3];
970+
971+ /* Locks to protect endio and lru lists. */
972+ spinlock_t locks[NR_LOCKS];
973+
974+ /* Slab cache to allocate stripes from. */
975+ struct {
976+ struct kmem_cache *cache; /* Cache itself. */
977+ char name[32]; /* Unique name. */
978+ } kc;
979+
980+ struct dm_io_client *dm_io_client; /* dm-io client resource context. */
981+
982+ /* dm-mem-cache client resource context. */
983+ struct dm_mem_cache_client *mem_cache_client;
984+
985+ int stripes_parm; /* # stripes parameter from constructor. */
986+ atomic_t stripes; /* actual # of stripes in cache. */
987+ atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
988+ atomic_t stripes_last; /* last # of stripes in cache. */
989+ atomic_t active_stripes; /* actual # of active stripes in cache. */
990+
991+ /* REMOVEME: */
 992+ atomic_t max_active_stripes; /* maximum # of active stripes in cache. */
993+};
994+
 995+/* Flag specs for raid_dev. */
996+enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
997+
998+/* The raid device in a set. */
999+struct raid_dev {
1000+ struct dm_dev *dev;
1001+ unsigned long flags; /* raid_dev_flags. */
1002+ sector_t start; /* offset to map to. */
1003+};
1004+
1005+/* Flags spec for raid_set. */
1006+enum raid_set_flags {
1007+ RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
1008+ RS_DEAD, /* RAID set inoperational. */
1009+ RS_DEVEL_STATS, /* REMOVEME: display status information. */
1010+ RS_IO_ERROR, /* io error on set. */
1011+ RS_RECOVER, /* Do recovery. */
1012+ RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
1013+ RS_REGION_GET, /* get a region to recover. */
1014+ RS_SC_BUSY, /* stripe cache busy -> send an event. */
 1015+ RS_SUSPENDED, /* RAID set suspended. */
1016+};
1017+
1018+/* REMOVEME: devel stats counters. */
1019+enum stats_types {
1020+ S_BIOS_READ,
1021+ S_BIOS_ADDED_READ,
1022+ S_BIOS_ENDIO_READ,
1023+ S_BIOS_WRITE,
1024+ S_BIOS_ADDED_WRITE,
1025+ S_BIOS_ENDIO_WRITE,
1026+ S_CAN_MERGE,
1027+ S_CANT_MERGE,
1028+ S_CONGESTED,
1029+ S_DM_IO_READ,
1030+ S_DM_IO_WRITE,
1031+ S_ACTIVE_READS,
1032+ S_BANDWIDTH,
1033+ S_BARRIER,
1034+ S_BIO_COPY_PL_NEXT,
1035+ S_DEGRADED,
1036+ S_DELAYED_BIOS,
1037+ S_EVICT,
1038+ S_FLUSHS,
1039+ S_HITS_1ST,
1040+ S_IOS_POST,
1041+ S_INSCACHE,
1042+ S_MAX_LOOKUP,
1043+ S_MERGE_PAGE_LOCKED,
1044+ S_NO_BANDWIDTH,
1045+ S_NOT_CONGESTED,
1046+ S_NO_RW,
1047+ S_NOSYNC,
1048+ S_PROHIBITPAGEIO,
1049+ S_RECONSTRUCT_EI,
1050+ S_RECONSTRUCT_DEV,
1051+ S_REDO,
1052+ S_REQUEUE,
1053+ S_STRIPE_ERROR,
1054+ S_SUM_DELAYED_BIOS,
1055+ S_XORS,
1056+ S_NR_STATS, /* # of stats counters. */
1057+};
1058+
1059+/* Status type -> string mappings. */
1060+struct stats_map {
1061+ const enum stats_types type;
1062+ const char *str;
1063+};
1064+
1065+static struct stats_map stats_map[] = {
1066+ { S_BIOS_READ, "r=" },
1067+ { S_BIOS_ADDED_READ, "/" },
1068+ { S_BIOS_ENDIO_READ, "/" },
1069+ { S_BIOS_WRITE, " w=" },
1070+ { S_BIOS_ADDED_WRITE, "/" },
1071+ { S_BIOS_ENDIO_WRITE, "/" },
1072+ { S_DM_IO_READ, " rc=" },
1073+ { S_DM_IO_WRITE, " wc=" },
1074+ { S_ACTIVE_READS, " active_reads=" },
1075+ { S_BANDWIDTH, " bandwidth=" },
1076+ { S_NO_BANDWIDTH, " no_bandwidth=" },
1077+ { S_BARRIER, " barrier=" },
1078+ { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
1079+ { S_CAN_MERGE, " can_merge=" },
1080+ { S_MERGE_PAGE_LOCKED, "/page_locked=" },
1081+ { S_CANT_MERGE, "/cant_merge=" },
1082+ { S_CONGESTED, " congested=" },
1083+ { S_NOT_CONGESTED, "/not_congested=" },
1084+ { S_DEGRADED, " degraded=" },
1085+ { S_DELAYED_BIOS, " delayed_bios=" },
1086+ { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
1087+ { S_EVICT, " evict=" },
1088+ { S_FLUSHS, " flushs=" },
1089+ { S_HITS_1ST, " hits_1st=" },
1090+ { S_IOS_POST, " ios_post=" },
1091+ { S_INSCACHE, " inscache=" },
1092+ { S_MAX_LOOKUP, " max_lookup=" },
1093+ { S_NO_RW, " no_rw=" },
1094+ { S_NOSYNC, " nosync=" },
1095+ { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
1096+ { S_RECONSTRUCT_EI, " reconstruct_ei=" },
1097+ { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
1098+ { S_REDO, " redo=" },
1099+ { S_REQUEUE, " requeue=" },
1100+ { S_STRIPE_ERROR, " stripe_error=" },
1101+ { S_XORS, " xors=" },
1102+};
1103+
1104+/*
1105+ * A RAID set.
1106+ */
1107+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
1108+struct raid_set {
1109+ struct dm_target *ti; /* Target pointer. */
1110+
1111+ struct {
1112+ unsigned long flags; /* State flags. */
1113+ spinlock_t in_lock; /* Protects central input list below. */
1114+ struct bio_list in; /* Pending ios (central input list). */
1115+ struct bio_list work; /* ios work set. */
1116+ wait_queue_head_t suspendq; /* suspend synchronization. */
1117+ atomic_t in_process; /* counter of queued bios (suspendq). */
1118+ atomic_t in_process_max;/* counter of queued bios max. */
1119+
1120+ /* io work. */
1121+ struct workqueue_struct *wq;
1122+ struct delayed_work dws;
1123+ } io;
1124+
1125+ /* External locking. */
1126+ struct dm_raid45_locking_type *locking;
1127+
1128+ struct stripe_cache sc; /* Stripe cache for this set. */
1129+
1130+ /* Xor optimization. */
1131+ struct {
1132+ struct xor_func *f;
1133+ unsigned chunks;
1134+ unsigned speed;
1135+ } xor;
1136+
1137+ /* Recovery parameters. */
1138+ struct recover {
1139+ struct dm_dirty_log *dl; /* Dirty log. */
1140+ struct dm_rh_client *rh; /* Region hash. */
1141+
1142+ /* dm-mem-cache client resource context for recovery stripes. */
1143+ struct dm_mem_cache_client *mem_cache_client;
1144+
1145+ struct list_head stripes; /* List of recovery stripes. */
1146+
1147+ region_t nr_regions;
1148+ region_t nr_regions_to_recover;
1149+ region_t nr_regions_recovered;
1150+ unsigned long start_jiffies;
1151+ unsigned long end_jiffies;
1152+
1153+ unsigned bandwidth; /* Recovery bandwidth [%]. */
1154+ unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
1155+ unsigned bandwidth_parm; /* " constructor parm. */
1156+ unsigned io_size; /* io size <= chunk size. */
1157+ unsigned io_size_parm; /* io size ctr parameter. */
1158+
1159+ /* recovery io throttling. */
1160+ atomic_t io_count[2]; /* counter recover/regular io. */
1161+ unsigned long last_jiffies;
1162+
1163+ struct dm_region *reg; /* Actual region to recover. */
1164+ sector_t pos; /* Position within region to recover. */
1165+ sector_t end; /* End of region to recover. */
1166+ } recover;
1167+
1168+ /* RAID set parameters. */
1169+ struct {
1170+ struct raid_type *raid_type; /* RAID type (eg, RAID4). */
1171+ unsigned raid_parms; /* # variable raid parameters. */
1172+
1173+ unsigned chunk_size; /* Sectors per chunk. */
1174+ unsigned chunk_size_parm;
1175+ unsigned chunk_mask; /* Mask for amount. */
1176+ unsigned chunk_shift; /* rsector chunk size shift. */
1177+
1178+ unsigned io_size; /* Sectors per io. */
1179+ unsigned io_size_parm;
1180+ unsigned io_mask; /* Mask for amount. */
1181+ unsigned io_shift_mask; /* Mask for raid_address(). */
1182+ unsigned io_shift; /* rsector io size shift. */
1183+ unsigned pages_per_io; /* Pages per io. */
1184+
1185+ sector_t sectors_per_dev; /* Sectors per device. */
1186+
1187+ atomic_t failed_devs; /* Amount of devices failed. */
1188+
1189+ /* Index of device to initialize. */
1190+ int dev_to_init;
1191+ int dev_to_init_parm;
1192+
1193+ /* Raid devices dynamically allocated. */
1194+ unsigned raid_devs; /* # of RAID devices below. */
1195+ unsigned data_devs; /* # of RAID data devices. */
1196+
1197+ int ei; /* index of failed RAID device. */
1198+
1199+ /* index of dedicated parity device (i.e. RAID4). */
1200+ int pi;
1201+ int pi_parm; /* constructor parm for status output. */
1202+ } set;
1203+
1204+ /* REMOVEME: devel stats counters. */
1205+ atomic_t stats[S_NR_STATS];
1206+
1207+ /* Dynamically allocated temporary pointers for xor(). */
1208+ unsigned long **data;
1209+
1210+ /* Dynamically allocated RAID devices. Alignment? */
1211+ struct raid_dev dev[0];
1212+};
1213+
1214+
1215+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
1216+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
1217+BITOPS(RS, Dead, raid_set, RS_DEAD)
1218+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
1219+BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
1220+BITOPS(RS, Recover, raid_set, RS_RECOVER)
1221+BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
1222+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
1223+BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
1224+#undef BITOPS
1225+
1226+#define PageIO(page) PageChecked(page)
1227+#define AllowPageIO(page) SetPageChecked(page)
1228+#define ProhibitPageIO(page) ClearPageChecked(page)
1229+
1230+/*-----------------------------------------------------------------
1231+ * Raid-4/5 set structures.
1232+ *---------------------------------------------------------------*/
1233+/* RAID level definitions. */
1234+enum raid_level {
1235+ raid4,
1236+ raid5,
1237+};
1238+
1239+/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
1240+enum raid_algorithm {
1241+ none,
1242+ left_asym,
1243+ right_asym,
1244+ left_sym,
1245+ right_sym,
1246+};
1247+
1248+struct raid_type {
1249+ const char *name; /* RAID algorithm. */
1250+ const char *descr; /* Descriptor text for logging. */
1251+ const unsigned parity_devs; /* # of parity devices. */
1252+ const unsigned minimal_devs; /* minimal # of devices in set. */
1253+ const enum raid_level level; /* RAID level. */
1254+ const enum raid_algorithm algorithm; /* RAID algorithm. */
1255+};
1256+
1257+/* Supported raid types and properties. */
1258+static struct raid_type raid_types[] = {
1259+ {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
1260+ {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
1261+ {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
1262+ {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
1263+ {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
1264+};
1265+
1266+/* Address as calculated by raid_address(). */
1267+struct address {
1268+ sector_t key; /* Hash key (start address of stripe). */
1269+ unsigned di, pi; /* Data and parity disks index. */
1270+};
1271+
1272+/* REMOVEME: reset statistics counters. */
1273+static void stats_reset(struct raid_set *rs)
1274+{
1275+ unsigned s = S_NR_STATS;
1276+
1277+ while (s--)
1278+ atomic_set(rs->stats + s, 0);
1279+}
1280+
1281+/*----------------------------------------------------------------
1282+ * RAID set management routines.
1283+ *--------------------------------------------------------------*/
1284+/*
1285+ * Begin small helper functions.
1286+ */
1287+/* Queue (optionally delayed) io work. */
1288+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
1289+{
1290+ struct delayed_work *dws = &rs->io.dws;
1291+
1292+ cancel_delayed_work(dws);
1293+ queue_delayed_work(rs->io.wq, dws, delay);
1294+}
1295+
1296+/* Queue io work immediately (called from region hash too). */
1297+static INLINE void wake_do_raid(void *context)
1298+{
1299+ wake_do_raid_delayed(context, 0);
1300+}
1301+
1302+/* Wait until all io has been processed. */
1303+static INLINE void wait_ios(struct raid_set *rs)
1304+{
1305+ wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
1306+}
1307+
1308+/* Declare io queued to device. */
1309+static INLINE void io_dev_queued(struct raid_dev *dev)
1310+{
1311+ set_bit(IO_QUEUED, &dev->flags);
1312+}
1313+
1314+/* Io on device and reset ? */
1315+static inline int io_dev_clear(struct raid_dev *dev)
1316+{
1317+ return test_and_clear_bit(IO_QUEUED, &dev->flags);
1318+}
1319+
1320+/* Get an io reference. */
1321+static INLINE void io_get(struct raid_set *rs)
1322+{
1323+ int p = atomic_inc_return(&rs->io.in_process);
1324+
1325+ if (p > atomic_read(&rs->io.in_process_max))
1326+ atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
1327+}
1328+
1329+/* Put the io reference and conditionally wake io waiters. */
1330+static INLINE void io_put(struct raid_set *rs)
1331+{
1332+ /* Intel: rebuild data corrupter? */
1333+ if (!atomic_read(&rs->io.in_process)) {
1334+ DMERR("%s would go negative!!!", __func__);
1335+ return;
1336+ }
1337+
1338+ if (atomic_dec_and_test(&rs->io.in_process))
1339+ wake_up(&rs->io.suspendq);
1340+}
1341+
1342+/* Calculate device sector offset. */
1343+static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
1344+{
1345+ sector_t sector = bio->bi_sector;
1346+
1347+ sector_div(sector, rs->set.data_devs);
1348+ return sector;
1349+}
1350+
1351+/* Test device operational. */
1352+static INLINE int dev_operational(struct raid_set *rs, unsigned p)
1353+{
1354+ return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
1355+}
1356+
1357+/* Return # of active stripes in stripe cache. */
1358+static INLINE int sc_active(struct stripe_cache *sc)
1359+{
1360+ return atomic_read(&sc->active_stripes);
1361+}
1362+
1363+/* Test io pending on stripe. */
1364+static INLINE int stripe_io(struct stripe *stripe)
1365+{
1366+ return atomic_read(&stripe->io.pending);
1367+}
1368+
1369+static INLINE void stripe_io_inc(struct stripe *stripe)
1370+{
1371+ atomic_inc(&stripe->io.pending);
1372+}
1373+
1374+static INLINE void stripe_io_dec(struct stripe *stripe)
1375+{
1376+ atomic_dec(&stripe->io.pending);
1377+}
1378+
1379+/* Wrapper needed by for_each_io_dev(). */
1380+static void _stripe_io_inc(struct stripe *stripe, unsigned p)
1381+{
1382+ stripe_io_inc(stripe);
1383+}
1384+
1385+/* Error a stripe. */
1386+static INLINE void stripe_error(struct stripe *stripe, struct page *page)
1387+{
1388+ SetStripeError(stripe);
1389+ SetPageError(page);
1390+ atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
1391+}
1392+
1393+/* Page IOed ok. */
1394+enum dirty_type { CLEAN, DIRTY };
1395+static INLINE void page_set(struct page *page, enum dirty_type type)
1396+{
1397+ switch (type) {
1398+ case DIRTY:
1399+ SetPageDirty(page);
1400+ AllowPageIO(page);
1401+ break;
1402+
1403+ case CLEAN:
1404+ ClearPageDirty(page);
1405+ break;
1406+
1407+ default:
1408+ BUG();
1409+ }
1410+
1411+ SetPageUptodate(page);
1412+ ClearPageError(page);
1413+}
1414+
1415+/* Return region state for a sector. */
1416+static INLINE int
1417+region_state(struct raid_set *rs, sector_t sector, unsigned long state)
1418+{
1419+ struct dm_rh_client *rh = rs->recover.rh;
1420+
1421+ return RSRecover(rs) ?
1422+ (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
1423+ state) : 0;
1424+}
1425+
1426+/* Check maximum devices which may fail in a raid set. */
1427+static inline int raid_set_degraded(struct raid_set *rs)
1428+{
1429+ return RSIoError(rs);
1430+}
1431+
1432+/* Check # of devices which may fail in a raid set. */
1433+static INLINE int raid_set_operational(struct raid_set *rs)
1434+{
1435+ /* Too many failed devices -> BAD. */
1436+ return atomic_read(&rs->set.failed_devs) <=
1437+ rs->set.raid_type->parity_devs;
1438+}
1439+
1440+/*
1441+ * Return true in case a page_list should be read/written
1442+ *
1443+ * Conditions to read/write:
1444+ * o 1st page in list not uptodate
1445+ * o 1st page in list dirty
1446+ * o if we optimized io away, we flag it using the pages checked bit.
1447+ */
1448+static INLINE unsigned page_io(struct page *page)
1449+{
1450+ /* Optimization: page was flagged to need io during first run. */
1451+ if (PagePrivate(page)) {
1452+ ClearPagePrivate(page);
1453+ return 1;
1454+ }
1455+
1456+ /* Avoid io if prohibited or a locked page. */
1457+ if (!PageIO(page) || PageLocked(page))
1458+ return 0;
1459+
1460+ if (!PageUptodate(page) || PageDirty(page)) {
1461+ /* Flag page needs io for second run optimization. */
1462+ SetPagePrivate(page);
1463+ return 1;
1464+ }
1465+
1466+ return 0;
1467+}
1468+
1469+/* Call a function on each page list needing io. */
1470+static INLINE unsigned
1471+for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
1472+ void (*f_io)(struct stripe *stripe, unsigned p))
1473+{
1474+ unsigned p = rs->set.raid_devs, r = 0;
1475+
1476+ while (p--) {
1477+ if (page_io(PAGE(stripe, p))) {
1478+ f_io(stripe, p);
1479+ r++;
1480+ }
1481+ }
1482+
1483+ return r;
1484+}
1485+
 1486+/* Reconstruct a particular device? */
1487+static INLINE int dev_to_init(struct raid_set *rs)
1488+{
1489+ return rs->set.dev_to_init > -1;
1490+}
1491+
1492+/*
1493+ * Index of device to calculate parity on.
1494+ * Either the parity device index *or* the selected device to init
1495+ * after a spare replacement.
1496+ */
1497+static INLINE unsigned dev_for_parity(struct stripe *stripe)
1498+{
1499+ struct raid_set *rs = RS(stripe->sc);
1500+
1501+ return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
1502+}
1503+
1504+/* Return the index of the device to be recovered. */
1505+static int idx_get(struct raid_set *rs)
1506+{
 1507+ /* Avoid reading in the pages that are to be reconstructed anyway. */
1508+ if (dev_to_init(rs))
1509+ return rs->set.dev_to_init;
1510+ else if (rs->set.raid_type->level == raid4)
1511+ return rs->set.pi;
1512+
1513+ return -1;
1514+}
1515+
1516+/* RAID set congested function. */
1517+static int raid_set_congested(void *congested_data, int bdi_bits)
1518+{
1519+ struct raid_set *rs = congested_data;
1520+ int r = 0; /* Assume uncongested. */
1521+ unsigned p = rs->set.raid_devs;
1522+
1523+ /* If any of our component devices are overloaded. */
1524+ while (p--) {
1525+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
1526+
1527+ r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1528+ }
1529+
1530+ /* REMOVEME: statistics. */
1531+ atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
1532+ return r;
1533+}
1534+
1535+/* Display RAID set dead message once. */
1536+static void raid_set_dead(struct raid_set *rs)
1537+{
1538+ if (!TestSetRSDead(rs)) {
1539+ unsigned p;
1540+ char buf[BDEVNAME_SIZE];
1541+
1542+ DMERR("FATAL: too many devices failed -> RAID set dead");
1543+
1544+ for (p = 0; p < rs->set.raid_devs; p++) {
1545+ if (!dev_operational(rs, p))
1546+ DMERR("device /dev/%s failed",
1547+ bdevname(rs->dev[p].dev->bdev, buf));
1548+ }
1549+ }
1550+}
1551+
1552+/* RAID set degrade check. */
1553+static INLINE int
1554+raid_set_check_and_degrade(struct raid_set *rs,
1555+ struct stripe *stripe, unsigned p)
1556+{
1557+ if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
1558+ return -EPERM;
1559+
 1560+ /* Throw an event in case of member device errors. */
1561+ dm_table_event(rs->ti->table);
1562+ atomic_inc(&rs->set.failed_devs);
1563+
1564+ /* Only log the first member error. */
1565+ if (!TestSetRSIoError(rs)) {
1566+ char buf[BDEVNAME_SIZE];
1567+
1568+ /* Store index for recovery. */
1569+ mb();
1570+ rs->set.ei = p;
1571+ mb();
1572+
1573+ DMERR("CRITICAL: %sio error on device /dev/%s "
1574+ "in region=%llu; DEGRADING RAID set",
1575+ stripe ? "" : "FAKED ",
1576+ bdevname(rs->dev[p].dev->bdev, buf),
1577+ (unsigned long long) (stripe ? stripe->key : 0));
1578+ DMERR("further device error messages suppressed");
1579+ }
1580+
1581+ return 0;
1582+}
1583+
1584+static void
1585+raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
1586+{
1587+ unsigned p = rs->set.raid_devs;
1588+
1589+ while (p--) {
1590+ struct page *page = PAGE(stripe, p);
1591+
1592+ if (PageError(page)) {
1593+ ClearPageError(page);
1594+ raid_set_check_and_degrade(rs, stripe, p);
1595+ }
1596+ }
1597+}
1598+
1599+/* RAID set upgrade check. */
1600+static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
1601+{
1602+ if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
1603+ return -EPERM;
1604+
1605+ if (atomic_dec_and_test(&rs->set.failed_devs)) {
1606+ ClearRSIoError(rs);
1607+ rs->set.ei = -1;
1608+ }
1609+
1610+ return 0;
1611+}
1612+
1613+/* Lookup a RAID device by name or by major:minor number. */
1614+union dev_lookup {
1615+ const char *dev_name;
1616+ struct raid_dev *dev;
1617+};
1618+enum lookup_type { byname, bymajmin, bynumber };
1619+static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
1620+ union dev_lookup *dl)
1621+{
1622+ unsigned p;
1623+
1624+ /*
1625+ * Must be an incremental loop, because the device array
1626+ * can have empty slots still on calls from raid_ctr()
1627+ */
1628+ for (p = 0; p < rs->set.raid_devs; p++) {
1629+ char buf[BDEVNAME_SIZE];
1630+ struct raid_dev *dev = rs->dev + p;
1631+
1632+ if (!dev->dev)
1633+ break;
1634+
1635+ /* Format dev string appropriately if necessary. */
1636+ if (by == byname)
1637+ bdevname(dev->dev->bdev, buf);
1638+ else if (by == bymajmin)
1639+ format_dev_t(buf, dev->dev->bdev->bd_dev);
1640+
1641+ /* Do the actual check. */
1642+ if (by == bynumber) {
1643+ if (dl->dev->dev->bdev->bd_dev ==
1644+ dev->dev->bdev->bd_dev)
1645+ return p;
1646+ } else if (!strcmp(dl->dev_name, buf))
1647+ return p;
1648+ }
1649+
1650+ return -ENODEV;
1651+}
1652+
1653+/* End io wrapper. */
1654+static INLINE void
1655+_bio_endio(struct raid_set *rs, struct bio *bio, int error)
1656+{
1657+ /* REMOVEME: statistics. */
1658+ atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
1659+ S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
1660+ bio_endio(bio, error);
1661+ io_put(rs); /* Wake any suspend waiters. */
1662+}
1663+
1664+/*
1665+ * End small helper functions.
1666+ */
1667+
1668+
1669+/*
1670+ * Stripe hash functions
1671+ */
1672+/* Initialize/destroy stripe hash. */
1673+static int hash_init(struct stripe_hash *hash, unsigned stripes)
1674+{
1675+ unsigned buckets = 2, max_buckets = stripes / 4;
1676+ unsigned hash_primes[] = {
1677+ /* Table of primes for hash_fn/table size optimization. */
1678+ 3, 7, 13, 27, 53, 97, 193, 389, 769,
1679+ 1543, 3079, 6151, 12289, 24593,
1680+ };
1681+
 1682+ /* Calculate number of buckets (2^n <= stripes / 4). */
1683+ while (buckets < max_buckets)
1684+ buckets <<= 1;
1685+
1686+ /* Allocate stripe hash. */
1687+ hash->hash = vmalloc(buckets * sizeof(*hash->hash));
1688+ if (!hash->hash)
1689+ return -ENOMEM;
1690+
1691+ hash->buckets = buckets;
1692+ hash->mask = buckets - 1;
1693+ hash->shift = ffs(buckets);
1694+ if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
1695+ hash->shift = ARRAY_SIZE(hash_primes) + 1;
1696+
1697+ BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
1698+ hash->prime = hash_primes[hash->shift - 2];
1699+
1700+ /* Initialize buckets. */
1701+ while (buckets--)
1702+ INIT_LIST_HEAD(hash->hash + buckets);
1703+
1704+ return 0;
1705+}
1706+
1707+static INLINE void hash_exit(struct stripe_hash *hash)
1708+{
1709+ if (hash->hash) {
1710+ vfree(hash->hash);
1711+ hash->hash = NULL;
1712+ }
1713+}
1714+
1715+/* List add (head/tail/locked/unlocked) inlines. */
1716+enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
1717+#define LIST_DEL(name, list) \
1718+static void stripe_ ## name ## _del(struct stripe *stripe, \
1719+ enum list_lock_type lock) { \
1720+ struct list_head *lh = stripe->lists + (list); \
1721+ spinlock_t *l = NULL; \
1722+\
1723+ if (lock == LIST_LOCKED) { \
1724+ l = stripe->sc->locks + LOCK_LRU; \
1725+ spin_lock_irq(l); \
1726+ } \
1727+\
1728+\
1729+ if (!list_empty(lh)) \
1730+ list_del_init(lh); \
1731+\
1732+ if (lock == LIST_LOCKED) \
1733+ spin_unlock_irq(l); \
1734+}
1735+
1736+LIST_DEL(hash, LIST_HASH)
1737+LIST_DEL(lru, LIST_LRU)
1738+#undef LIST_DEL
1739+
1740+enum list_pos_type { POS_HEAD, POS_TAIL };
1741+#define LIST_ADD(name, list) \
1742+static void stripe_ ## name ## _add(struct stripe *stripe, \
1743+ enum list_pos_type pos, \
1744+ enum list_lock_type lock) { \
1745+ struct list_head *lh = stripe->lists + (list); \
1746+ struct stripe_cache *sc = stripe->sc; \
1747+ spinlock_t *l = NULL; \
1748+\
1749+ if (lock == LIST_LOCKED) { \
1750+ l = sc->locks + LOCK_LRU; \
1751+ spin_lock_irq(l); \
1752+ } \
1753+\
1754+ if (list_empty(lh)) { \
1755+ if (pos == POS_HEAD) \
1756+ list_add(lh, sc->lists + (list)); \
1757+ else \
1758+ list_add_tail(lh, sc->lists + (list)); \
1759+ } \
1760+\
1761+ if (lock == LIST_LOCKED) \
1762+ spin_unlock_irq(l); \
1763+}
1764+
1765+LIST_ADD(endio, LIST_ENDIO)
1766+LIST_ADD(io, LIST_IO)
1767+LIST_ADD(lru, LIST_LRU)
1768+#undef LIST_ADD
1769+
1770+#define POP(list) \
1771+ do { \
1772+ if (list_empty(sc->lists + list)) \
1773+ stripe = NULL; \
1774+ else { \
1775+ stripe = list_first_entry(&sc->lists[list], \
1776+ struct stripe, \
1777+ lists[list]); \
1778+ list_del_init(&stripe->lists[list]); \
1779+ } \
1780+ } while (0);
1781+
1782+/* Pop an available stripe off the lru list. */
1783+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1784+{
1785+ struct stripe *stripe;
1786+ spinlock_t *lock = sc->locks + LOCK_LRU;
1787+
1788+ spin_lock_irq(lock);
1789+ POP(LIST_LRU);
1790+ spin_unlock_irq(lock);
1791+
1792+ if (stripe)
1793+ /* Remove from hash before reuse. */
1794+ stripe_hash_del(stripe, LIST_UNLOCKED);
1795+
1796+ return stripe;
1797+}
1798+
1799+static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
1800+{
1801+ return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
1802+}
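
A worked example (editorial, assuming the default of 64 stripes): hash_init() doubles buckets from 2 until it reaches stripes / 4 = 16, giving mask = 15, shift = ffs(16) = 5 and prime = hash_primes[shift - 2] = 27. hash_fn() then files a stripe keyed by sector 2048 under ((2048 * 27) >> 5) & 15 = bucket 0, and one keyed by sector 2112 under ((2112 * 27) >> 5) & 15 = bucket 6.
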
1803+
1804+static inline struct list_head *
1805+hash_bucket(struct stripe_hash *hash, sector_t key)
1806+{
1807+ return hash->hash + hash_fn(hash, key);
1808+}
1809+
1810+/* Insert an entry into a hash. */
1811+static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
1812+{
1813+ list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
1814+}
1815+
1816+/* Insert an entry into the stripe hash. */
1817+static inline void
1818+sc_insert(struct stripe_cache *sc, struct stripe *stripe)
1819+{
1820+ hash_insert(&sc->hash, stripe);
1821+}
1822+
1823+/* Lookup an entry in the stripe hash. */
1824+static inline struct stripe *
1825+stripe_lookup(struct stripe_cache *sc, sector_t key)
1826+{
1827+ unsigned c = 0;
1828+ struct stripe *stripe;
1829+ struct list_head *bucket = hash_bucket(&sc->hash, key);
1830+
1831+ list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
 1832+ /* REMOVEME: statistics. */
1833+ if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1834+ atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
1835+
1836+ if (stripe->key == key)
1837+ return stripe;
1838+ }
1839+
1840+ return NULL;
1841+}
1842+
1843+/* Resize the stripe cache hash on size changes. */
1844+static int hash_resize(struct stripe_cache *sc)
1845+{
1846+ /* Resize threshold reached? */
1847+ if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
1848+ || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
1849+ int r;
1850+ struct stripe_hash hash, hash_tmp;
1851+ spinlock_t *lock;
1852+
1853+ r = hash_init(&hash, atomic_read(&sc->stripes));
1854+ if (r)
1855+ return r;
1856+
1857+ lock = sc->locks + LOCK_LRU;
1858+ spin_lock_irq(lock);
1859+ if (sc->hash.hash) {
1860+ unsigned b = sc->hash.buckets;
1861+ struct list_head *pos, *tmp;
1862+
1863+ /* Walk old buckets and insert into new. */
1864+ while (b--) {
1865+ list_for_each_safe(pos, tmp, sc->hash.hash + b)
1866+ hash_insert(&hash,
1867+ list_entry(pos, struct stripe,
1868+ lists[LIST_HASH]));
1869+ }
1870+
1871+ }
1872+
1873+ memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
1874+ memcpy(&sc->hash, &hash, sizeof(sc->hash));
1875+ atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1876+ spin_unlock_irq(lock);
1877+
1878+ hash_exit(&hash_tmp);
1879+ }
1880+
1881+ return 0;
1882+}
1883+
1884+/*
1885+ * Stripe cache locking functions
1886+ */
1887+/* Dummy lock function for local RAID4+5. */
1888+static void *no_lock(sector_t key, enum dm_lock_type type)
1889+{
1890+ return &no_lock;
1891+}
1892+
1893+/* Dummy unlock function for local RAID4+5. */
1894+static void no_unlock(void *lock_handle)
1895+{
1896+}
1897+
1898+/* No locking (for local RAID 4+5). */
1899+static struct dm_raid45_locking_type locking_none = {
1900+ .lock = no_lock,
1901+ .unlock = no_unlock,
1902+};
1903+
1904+/* Clustered RAID 4+5. */
1905+/* FIXME: code this. */
1906+static struct dm_raid45_locking_type locking_cluster = {
1907+ .lock = no_lock,
1908+ .unlock = no_unlock,
1909+};
1910+
1911+/* Lock a stripe (for clustering). */
1912+static int
1913+stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
1914+{
1915+ stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
1916+ DM_RAID45_EX);
1917+ return stripe->lock ? 0 : -EPERM;
1918+}
1919+
1920+/* Unlock a stripe (for clustering). */
1921+static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
1922+{
1923+ rs->locking->unlock(stripe->lock);
1924+ stripe->lock = NULL;
1925+}
1926+
1927+/*
1928+ * Stripe cache functions.
1929+ */
1930+/*
1931+ * Invalidate all page lists pages of a stripe.
1932+ *
1933+ * I only keep state for the whole list in the first page.
1934+ */
1935+static INLINE void
1936+stripe_pages_invalidate(struct stripe *stripe)
1937+{
1938+ unsigned p = RS(stripe->sc)->set.raid_devs;
1939+
1940+ while (p--) {
1941+ struct page *page = PAGE(stripe, p);
1942+
1943+ ProhibitPageIO(page);
1944+ ClearPageChecked(page);
1945+ ClearPageDirty(page);
1946+ ClearPageError(page);
1947+ clear_page_locked(page);
1948+ ClearPagePrivate(page);
1949+ ClearPageUptodate(page);
1950+ }
1951+}
1952+
1953+/* Prepare stripe for (re)use. */
1954+static INLINE void stripe_invalidate(struct stripe *stripe)
1955+{
1956+ stripe->io.flags = 0;
1957+ stripe_pages_invalidate(stripe);
1958+}
1959+
1960+/* Allow io on all chunks of a stripe. */
1961+static INLINE void stripe_allow_io(struct stripe *stripe)
1962+{
1963+ unsigned p = RS(stripe->sc)->set.raid_devs;
1964+
1965+ while (p--)
1966+ AllowPageIO(PAGE(stripe, p));
1967+}
1968+
1969+/* Initialize a stripe. */
1970+static void
1971+stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1972+{
1973+ unsigned p = RS(sc)->set.raid_devs;
1974+ unsigned i;
1975+
1976+ /* Work all io chunks. */
1977+ while (p--) {
1978+ struct stripe_set *ss = stripe->ss + p;
1979+
1980+ stripe->obj[p].private = ss;
1981+ ss->stripe = stripe;
1982+
1983+ i = ARRAY_SIZE(ss->bl);
1984+ while (i--)
1985+ bio_list_init(ss->bl + i);
1986+ }
1987+
1988+ stripe->sc = sc;
1989+
1990+ i = ARRAY_SIZE(stripe->lists);
1991+ while (i--)
1992+ INIT_LIST_HEAD(stripe->lists + i);
1993+
1994+ atomic_set(&stripe->cnt, 0);
1995+ atomic_set(&stripe->io.pending, 0);
1996+
1997+ stripe_invalidate(stripe);
1998+}
1999+
2000+/* Number of pages per chunk. */
2001+static inline unsigned chunk_pages(unsigned io_size)
2002+{
2003+ return dm_div_up(io_size, SECTORS_PER_PAGE);
2004+}
2005+
2006+/* Number of pages per stripe. */
2007+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
2008+{
2009+ return chunk_pages(io_size) * rs->set.raid_devs;
2010+}
2011+
2012+/* Initialize part of page_list (recovery). */
2013+static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
2014+ unsigned start, unsigned count)
2015+{
2016+ unsigned pages = chunk_pages(count);
2017+ /* Get offset into the page_list. */
2018+ struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
2019+
2020+ BUG_ON(!pl);
2021+ while (pl && pages--) {
2022+ BUG_ON(!pl->page);
2023+ memset(page_address(pl->page), 0, PAGE_SIZE);
2024+ pl = pl->next;
2025+ }
2026+}
2027+
2028+/* Initialize parity chunk of stripe. */
2029+static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
2030+{
2031+ stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
2032+}
2033+
2034+/* Return dynamic stripe structure size. */
2035+static INLINE size_t stripe_size(struct raid_set *rs)
2036+{
2037+ return sizeof(struct stripe) +
2038+ rs->set.raid_devs * sizeof(struct stripe_set);
2039+}
2040+
2041+/* Allocate a stripe and its memory object. */
2042+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
2043+enum grow { SC_GROW, SC_KEEP };
2044+static struct stripe *stripe_alloc(struct stripe_cache *sc,
2045+ struct dm_mem_cache_client *mc,
2046+ enum grow grow)
2047+{
2048+ int r;
2049+ struct stripe *stripe;
2050+
2051+ stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
2052+ if (stripe) {
2053+ /* Grow the dm-mem-cache by one object. */
2054+ if (grow == SC_GROW) {
2055+ r = dm_mem_cache_grow(mc, 1);
2056+ if (r)
2057+ goto err_free;
2058+ }
2059+
2060+ stripe->obj = dm_mem_cache_alloc(mc);
2061+ if (!stripe->obj)
2062+ goto err_shrink;
2063+
2064+ stripe_init(sc, stripe);
2065+ }
2066+
2067+ return stripe;
2068+
2069+err_shrink:
2070+ if (grow == SC_GROW)
2071+ dm_mem_cache_shrink(mc, 1);
2072+err_free:
2073+ kmem_cache_free(sc->kc.cache, stripe);
2074+ return NULL;
2075+}
2076+
2077+/*
2078+ * Free a stripe's memory object, shrink the
2079+ * memory cache and free the stripe itself.
2080+ */
2081+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
2082+{
2083+ dm_mem_cache_free(mc, stripe->obj);
2084+ dm_mem_cache_shrink(mc, 1);
2085+ kmem_cache_free(stripe->sc->kc.cache, stripe);
2086+}
2087+
2088+/* Free the recovery stripe. */
2089+static void stripe_recover_free(struct raid_set *rs)
2090+{
2091+ struct recover *rec = &rs->recover;
2092+ struct list_head *stripes = &rec->stripes;
2093+
2094+ while (!list_empty(stripes)) {
2095+ struct stripe *stripe = list_first_entry(stripes, struct stripe,
2096+ lists[LIST_RECOVER]);
2097+ list_del(stripe->lists + LIST_RECOVER);
2098+ stripe_free(stripe, rec->mem_cache_client);
2099+ }
2100+}
2101+
2102+/* Push a stripe safely onto the endio list to be handled by do_endios(). */
2103+static INLINE void stripe_endio_push(struct stripe *stripe)
2104+{
2105+ int wake;
2106+ unsigned long flags;
2107+ struct stripe_cache *sc = stripe->sc;
2108+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
2109+
2110+ spin_lock_irqsave(lock, flags);
2111+ wake = list_empty(sc->lists + LIST_ENDIO);
2112+ stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
2113+ spin_unlock_irqrestore(lock, flags);
2114+
2115+ if (wake)
2116+ wake_do_raid(RS(sc));
2117+}
2118+
2119+/* Protected check for stripe cache endio list empty. */
2120+static INLINE int stripe_endio_empty(struct stripe_cache *sc)
2121+{
2122+ int r;
2123+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
2124+
2125+ spin_lock_irq(lock);
2126+ r = list_empty(sc->lists + LIST_ENDIO);
2127+ spin_unlock_irq(lock);
2128+
2129+ return r;
2130+}
2131+
2132+/* Pop a stripe safely off the endio list. */
2133+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
2134+{
2135+ struct stripe *stripe;
2136+ spinlock_t *lock = sc->locks + LOCK_ENDIO;
2137+
2138+ /* This runs in parallel with endio(). */
2139+ spin_lock_irq(lock);
2140+ POP(LIST_ENDIO)
2141+ spin_unlock_irq(lock);
2142+ return stripe;
2143+}
2144+
2145+#undef POP
2146+
2147+/* Evict stripe from cache. */
2148+static void stripe_evict(struct stripe *stripe)
2149+{
2150+ struct raid_set *rs = RS(stripe->sc);
2151+ stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
2152+
2153+ if (list_empty(stripe->lists + LIST_LRU)) {
2154+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2155+ atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
2156+ }
2157+}
2158+
2159+/* Grow stripe cache. */
2160+static int
2161+sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
2162+{
2163+ int r = 0;
2164+ struct raid_set *rs = RS(sc);
2165+
2166+ /* Try to allocate this many (additional) stripes. */
2167+ while (stripes--) {
2168+ struct stripe *stripe =
2169+ stripe_alloc(sc, sc->mem_cache_client, grow);
2170+
2171+ if (likely(stripe)) {
2172+ stripe->io.size = rs->set.io_size;
2173+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2174+ atomic_inc(&sc->stripes);
2175+ } else {
2176+ r = -ENOMEM;
2177+ break;
2178+ }
2179+ }
2180+
2181+ ClearRSScBusy(rs);
2182+ return r ? r : hash_resize(sc);
2183+}
2184+
2185+/* Shrink stripe cache. */
2186+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
2187+{
2188+ int r = 0;
2189+
2190+ /* Try to get unused stripe from LRU list. */
2191+ while (stripes--) {
2192+ struct stripe *stripe;
2193+
2194+ stripe = stripe_lru_pop(sc);
2195+ if (stripe) {
2196+ /* An lru stripe may never have ios pending! */
2197+ BUG_ON(stripe_io(stripe));
2198+ stripe_free(stripe, sc->mem_cache_client);
2199+ atomic_dec(&sc->stripes);
2200+ } else {
2201+ r = -ENOENT;
2202+ break;
2203+ }
2204+ }
2205+
2206+ /* Check if stats are still sane. */
2207+ if (atomic_read(&sc->max_active_stripes) >
2208+ atomic_read(&sc->stripes))
2209+ atomic_set(&sc->max_active_stripes, 0);
2210+
2211+ if (r)
2212+ return r;
2213+
2214+ ClearRSScBusy(RS(sc));
2215+ return hash_resize(sc);
2216+}
2217+
2218+/* Create stripe cache. */
2219+static int sc_init(struct raid_set *rs, unsigned stripes)
2220+{
2221+ unsigned i, nr;
2222+ struct stripe_cache *sc = &rs->sc;
2223+ struct stripe *stripe;
2224+ struct recover *rec = &rs->recover;
2225+
2226+ /* Initialize lists and locks. */
2227+ i = ARRAY_SIZE(sc->lists);
2228+ while (i--)
2229+ INIT_LIST_HEAD(sc->lists + i);
2230+
2231+ i = NR_LOCKS;
2232+ while (i--)
2233+ spin_lock_init(sc->locks + i);
2234+
2235+ /* Initialize atomic variables. */
2236+ atomic_set(&sc->stripes, 0);
2237+ atomic_set(&sc->stripes_last, 0);
2238+ atomic_set(&sc->stripes_to_shrink, 0);
2239+ atomic_set(&sc->active_stripes, 0);
2240+ atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
2241+
2242+ /*
2243+ * We need a runtime unique # to suffix the kmem cache name
2244+ * because we'll have one for each active RAID set.
2245+ */
2246+ nr = atomic_inc_return(&_stripe_sc_nr);
2247+ sprintf(sc->kc.name, "%s_%d", TARGET, nr);
2248+ sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
2249+ 0, 0, NULL);
2250+ if (!sc->kc.cache)
2251+ return -ENOMEM;
2252+
2253+ /* Create memory cache client context for RAID stripe cache. */
2254+ sc->mem_cache_client =
2255+ dm_mem_cache_client_create(stripes, rs->set.raid_devs,
2256+ chunk_pages(rs->set.io_size));
2257+ if (IS_ERR(sc->mem_cache_client))
2258+ return PTR_ERR(sc->mem_cache_client);
2259+
2260+ /* Create memory cache client context for RAID recovery stripe(s). */
2261+ rec->mem_cache_client =
2262+ dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
2263+ chunk_pages(rec->io_size));
2264+ if (IS_ERR(rec->mem_cache_client))
2265+ return PTR_ERR(rec->mem_cache_client);
2266+
2267+ /* Allocate stripe for set recovery. */
2268+ /* XXX: cope with MAX_RECOVERY. */
2269+ INIT_LIST_HEAD(&rec->stripes);
2270+ for (i = 0; i < MAX_RECOVER; i++) {
2271+ stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
2272+ if (!stripe)
2273+ return -ENOMEM;
2274+
2275+ SetStripeRecover(stripe);
2276+ stripe->io.size = rec->io_size;
2277+ list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
2278+ }
2279+
2280+ /*
2281+ * Allocate the stripe objects from the
2282+ * cache and add them to the LRU list.
2283+ */
2284+ return sc_grow(sc, stripes, SC_KEEP);
2285+}
2286+
2287+/* Destroy the stripe cache. */
2288+static void sc_exit(struct stripe_cache *sc)
2289+{
2290+ if (sc->kc.cache) {
2291+ BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
2292+ kmem_cache_destroy(sc->kc.cache);
2293+ }
2294+
2295+ if (sc->mem_cache_client)
2296+ dm_mem_cache_client_destroy(sc->mem_cache_client);
2297+
2298+ ClearRSRecover(RS(sc));
2299+ stripe_recover_free(RS(sc));
2300+ if (RS(sc)->recover.mem_cache_client)
2301+ dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
2302+
2303+ hash_exit(&sc->hash);
2304+}
2305+
2306+/*
2307+ * Calculate RAID address
2308+ *
2309+ * Delivers tuple with the index of the data disk holding the chunk
2310+ * in the set, the parity disk's index and the start of the stripe
2311+ * within the address space of the set (used as the stripe cache hash key).
2312+ */
2313+/* thx MD. */
2314+static struct address *
2315+raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
2316+{
2317+ unsigned data_devs = rs->set.data_devs, di, pi,
2318+ raid_devs = rs->set.raid_devs;
2319+ sector_t stripe, tmp;
2320+
2321+ /*
2322+ * chunk_number = sector / chunk_size
2323+ * stripe = chunk_number / data_devs
2324+ * di = stripe % data_devs;
2325+ */
2326+ stripe = sector >> rs->set.chunk_shift;
2327+ di = sector_div(stripe, data_devs);
2328+
2329+ switch (rs->set.raid_type->level) {
2330+ case raid5:
2331+ tmp = stripe;
2332+ pi = sector_div(tmp, raid_devs);
2333+
2334+ switch (rs->set.raid_type->algorithm) {
2335+ case left_asym: /* Left asymmetric. */
2336+ pi = data_devs - pi;
2337+ case right_asym: /* Right asymmetric. */
2338+ if (di >= pi)
2339+ di++;
2340+ break;
2341+
2342+ case left_sym: /* Left symmetric. */
2343+ pi = data_devs - pi;
2344+ case right_sym: /* Right symmetric. */
2345+ di = (pi + di + 1) % raid_devs;
2346+ break;
2347+
2348+ default:
2349+ DMERR("Unknown RAID algorithm %d",
2350+ rs->set.raid_type->algorithm);
2351+ goto out;
2352+ }
2353+
2354+ break;
2355+
2356+ case raid4:
2357+ pi = rs->set.pi;
2358+ if (di >= pi)
2359+ di++;
2360+ break;
2361+
2362+ default:
2363+ DMERR("Unknown RAID level %d", rs->set.raid_type->level);
2364+ goto out;
2365+ }
2366+
2367+ /*
2368+ * Hash key = start offset on any single device of the RAID set;
2369+ * adjusted in case io size differs from chunk size.
2370+ */
2371+ addr->key = (stripe << rs->set.chunk_shift) +
2372+ (sector & rs->set.io_shift_mask);
2373+ addr->di = di;
2374+ addr->pi = pi;
2375+
2376+out:
2377+ return addr;
2378+}
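+
+/*
+ * Worked example of raid_address() (values chosen for illustration):
+ * a left-symmetric RAID5 set with raid_devs = 4 (3 data + parity),
+ * chunk_size = 8 sectors (chunk_shift = 3) and sector = 100:
+ *
+ *   chunk number = 100 >> 3        = 12
+ *   stripe       = 12 / 3          = 4,  di = 12 % 3 = 0
+ *   pi           = 3 - (4 % 4)     = 3
+ *   di           = (3 + 0 + 1) % 4 = 0
+ *
+ * i.e. the chunk lives on device 0, parity on device 3, and the hash
+ * key is stripe << chunk_shift = 32 plus the io size offset adjustment.
+ */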
2379+
2380+/*
2381+ * Copy data across between stripe pages and bio vectors.
2382+ *
2383+ * Pay attention to data alignment in stripe and bio pages.
2384+ */
2385+static void
2386+bio_copy_page_list(int rw, struct stripe *stripe,
2387+ struct page_list *pl, struct bio *bio)
2388+{
2389+ unsigned i, page_offset;
2390+ void *page_addr;
2391+ struct raid_set *rs = RS(stripe->sc);
2392+ struct bio_vec *bv;
2393+
2394+ /* Get start page in page list for this sector. */
2395+ i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
2396+ pl = pl_elem(pl, i);
2397+
2398+ page_addr = page_address(pl->page);
2399+ page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
2400+
2401+ /* Walk all segments and copy data across between bio_vecs and pages. */
2402+ bio_for_each_segment(bv, bio, i) {
2403+ int len = bv->bv_len, size;
2404+ unsigned bio_offset = 0;
2405+ void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
2406+redo:
2407+ size = (page_offset + len > PAGE_SIZE) ?
2408+ PAGE_SIZE - page_offset : len;
2409+
2410+ if (rw == READ)
2411+ memcpy(bio_addr + bio_offset,
2412+ page_addr + page_offset, size);
2413+ else
2414+ memcpy(page_addr + page_offset,
2415+ bio_addr + bio_offset, size);
2416+
2417+ page_offset += size;
2418+ if (page_offset == PAGE_SIZE) {
2419+ /*
2420+ * We reached the end of the chunk page ->
2421+ * need to refer to the next one to copy more data.
2422+ */
2423+ len -= size;
2424+ if (len) {
2425+ /* Get next page. */
2426+ pl = pl->next;
2427+ BUG_ON(!pl);
2428+ page_addr = page_address(pl->page);
2429+ page_offset = 0;
2430+ bio_offset += size;
2431+ /* REMOVEME: statistics. */
2432+ atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
2433+ goto redo;
2434+ }
2435+ }
2436+
2437+ __bio_kunmap_atomic(bio_addr, KM_USER0);
2438+ }
2439+}
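+
+/*
+ * E.g. (assuming 4 KiB pages) a 4 KiB bio segment whose destination
+ * starts 1 KiB into a chunk page is copied in two pieces by the loop
+ * above: 3 KiB up to the end of that page, then the remaining 1 KiB
+ * into the next page_list element via the "redo" path.
+ */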
2440+
2441+/*
2442+ * Xor optimization macros.
2443+ */
2444+/* Xor data pointer declaration and initialization macros. */
2445+#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
2446+#define DECLARE_3 DECLARE_2, *d2 = data[2]
2447+#define DECLARE_4 DECLARE_3, *d3 = data[3]
2448+#define DECLARE_5 DECLARE_4, *d4 = data[4]
2449+#define DECLARE_6 DECLARE_5, *d5 = data[5]
2450+#define DECLARE_7 DECLARE_6, *d6 = data[6]
2451+#define DECLARE_8 DECLARE_7, *d7 = data[7]
2452+
2453+/* Xor unroll macros. */
2454+#define D2(n) d0[n] = d0[n] ^ d1[n]
2455+#define D3(n) D2(n) ^ d2[n]
2456+#define D4(n) D3(n) ^ d3[n]
2457+#define D5(n) D4(n) ^ d4[n]
2458+#define D6(n) D5(n) ^ d5[n]
2459+#define D7(n) D6(n) ^ d6[n]
2460+#define D8(n) D7(n) ^ d7[n]
2461+
2462+#define X_2(macro, offset) macro(offset); macro(offset + 1);
2463+#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
2464+#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
2465+#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
2466+#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
2467+#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
2468+
2469+/* Define a _xor_#chunks_#xors_per_run() function. */
2470+#define _XOR(chunks, xors_per_run) \
2471+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
2472+{ \
2473+ unsigned end = XOR_SIZE / sizeof(data[0]), i; \
2474+ DECLARE_ ## chunks; \
2475+\
2476+ for (i = 0; i < end; i += xors_per_run) { \
2477+ X_ ## xors_per_run(D ## chunks, i); \
2478+ } \
2479+}
2480+
2481+/* Define xor functions for 2 - 8 chunks. */
2482+#define MAKE_XOR_PER_RUN(xors_per_run) \
2483+ _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
2484+ _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
2485+ _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
2486+ _XOR(8, xors_per_run);
2487+
2488+MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
2489+MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
2490+MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
2491+MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
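+
+/*
+ * For illustration, _XOR(3, 8) above expands to roughly:
+ *
+ *	static void _xor3_8(unsigned long **data)
+ *	{
+ *		unsigned end = XOR_SIZE / sizeof(data[0]), i;
+ *		unsigned long *d0 = data[0], *d1 = data[1], *d2 = data[2];
+ *
+ *		for (i = 0; i < end; i += 8) {
+ *			d0[i] = d0[i] ^ d1[i] ^ d2[i];
+ *			...
+ *			d0[i + 7] = d0[i + 7] ^ d1[i + 7] ^ d2[i + 7];
+ *		}
+ *	}
+ *
+ * i.e. chunk 0 accumulates the xor of all chunks, 8 words per iteration.
+ */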
2492+
2493+#define MAKE_XOR(xors_per_run) \
2494+struct { \
2495+ void (*f)(unsigned long **); \
2496+} static xor_funcs ## xors_per_run[] = { \
2497+ { NULL }, \
2498+ { NULL }, \
2499+ { _xor2_ ## xors_per_run }, \
2500+ { _xor3_ ## xors_per_run }, \
2501+ { _xor4_ ## xors_per_run }, \
2502+ { _xor5_ ## xors_per_run }, \
2503+ { _xor6_ ## xors_per_run }, \
2504+ { _xor7_ ## xors_per_run }, \
2505+ { _xor8_ ## xors_per_run }, \
2506+}; \
2507+\
2508+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
2509+{ \
2510+ /* Call respective function for amount of chunks. */ \
2511+ xor_funcs ## xors_per_run[n].f(data); \
2512+}
2513+
2514+/* Define xor_8() - xor_64 functions. */
2515+MAKE_XOR(8)
2516+MAKE_XOR(16)
2517+MAKE_XOR(32)
2518+MAKE_XOR(64)
2519+
2520+/* Maximum number of chunks, which can be xor'ed in one go. */
2521+#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
2522+
2523+struct xor_func {
2524+ xor_function_t f;
2525+ const char *name;
2526+} static xor_funcs[] = {
2527+ {xor_8, "xor_8"},
2528+ {xor_16, "xor_16"},
2529+ {xor_32, "xor_32"},
2530+ {xor_64, "xor_64"},
2531+};
2532+
2533+/*
2534+ * Calculate parity.
2535+ *
2536+ * This indexes into the page list of the stripe.
2537+ *
2538+ * All chunks will be xored into the parity chunk
2539+ * in maximum groups of xor.chunks.
2540+ *
2541+ * FIXME: try mapping the pages on discontiguous memory.
2542+ */
2543+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
2544+{
2545+ struct raid_set *rs = RS(stripe->sc);
2546+ unsigned max_chunks = rs->xor.chunks, n, p;
2547+ unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
2548+ unsigned long **d = rs->data;
2549+ xor_function_t xor_f = rs->xor.f->f;
2550+
2551+ /* Address of parity page to xor into. */
2552+ d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
2553+
2554+ /* Preset pointers to data pages. */
2555+ for (n = 1, p = rs->set.raid_devs; p--; ) {
2556+ if (p != pi && PageIO(PAGE(stripe, p)))
2557+ d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
2558+
2559+ /* If max chunks -> xor. */
2560+ if (n == max_chunks) {
2561+ xor_f(n, d);
2562+ n = 1;
2563+ }
2564+ }
2565+
2566+ /* If chunks -> xor. */
2567+ if (n > 1)
2568+ xor_f(n, d);
2569+
2570+ /* Set parity page uptodate and clean. */
2571+ page_set(PAGE(stripe, pi), CLEAN);
2572+}
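+
+/*
+ * Grouping example for xor() above (illustrative set size): with 8 raid
+ * devices and xor.chunks = 4, the 7 data chunks are folded into the
+ * parity page in three calls of the xor function (3 + 3 + 1 data
+ * chunks), because slot 0 of the pointer array is always the parity
+ * page itself.
+ */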
2573+
2574+/* Common xor loop through all stripe page lists. */
2575+static void common_xor(struct stripe *stripe, sector_t count,
2576+ unsigned off, unsigned p)
2577+{
2578+ unsigned sector;
2579+
2580+ for (sector = off; sector < count; sector += SECTORS_PER_XOR)
2581+ xor(stripe, p, sector);
2582+
2583+ atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
2584+}
2585+
2586+/*
2587+ * Calculate parity sectors on intact stripes.
2588+ *
2589+ * Need to calculate the raid address for the recovery stripe, because its
2590+ * chunk size differs from, and is typically larger than, the io chunk size.
2591+ */
2592+static void parity_xor(struct stripe *stripe)
2593+{
2594+ struct raid_set *rs = RS(stripe->sc);
2595+ unsigned chunk_size = rs->set.chunk_size,
2596+ io_size = stripe->io.size,
2597+ xor_size = chunk_size > io_size ? io_size : chunk_size;
2598+ sector_t off;
2599+
2600+ /* This can be the recover stripe with a larger io size. */
2601+ for (off = 0; off < io_size; off += xor_size) {
2602+ unsigned pi;
2603+
2604+ /*
2605+ * The recovery stripe is likely bigger than regular io
2606+ * stripes and has no precalculated parity disk index ->
2607+ * need to calculate RAID address.
2608+ */
2609+ if (unlikely(StripeRecover(stripe))) {
2610+ struct address addr;
2611+
2612+ raid_address(rs,
2613+ (stripe->key + off) * rs->set.data_devs,
2614+ &addr);
2615+ pi = addr.pi;
2616+ stripe_zero_pl_part(stripe, pi, off,
2617+ rs->set.chunk_size);
2618+ } else
2619+ pi = stripe->idx.parity;
2620+
2621+ common_xor(stripe, xor_size, off, pi);
2622+ page_set(PAGE(stripe, pi), DIRTY);
2623+ }
2624+}
2625+
2626+/* Reconstruct missing chunk. */
2627+static void reconstruct_xor(struct stripe *stripe)
2628+{
2629+ struct raid_set *rs = RS(stripe->sc);
2630+ int p = stripe->idx.recover;
2631+
2632+ BUG_ON(p < 0);
2633+
2634+ /* REMOVEME: statistics. */
2635+ atomic_inc(rs->stats + (raid_set_degraded(rs) ?
2636+ S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
2637+
2638+ /* Zero chunk to be reconstructed. */
2639+ stripe_zero_chunk(stripe, p);
2640+ common_xor(stripe, stripe->io.size, 0, p);
2641+}
2642+
2643+/*
2644+ * Try getting a stripe either from the hash or from the lru list
2645+ */
2646+static inline void _stripe_get(struct stripe *stripe)
2647+{
2648+ atomic_inc(&stripe->cnt);
2649+}
2650+
2651+static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
2652+{
2653+ struct stripe_cache *sc = &rs->sc;
2654+ struct stripe *stripe;
2655+
2656+ stripe = stripe_lookup(sc, addr->key);
2657+ if (stripe) {
2658+ _stripe_get(stripe);
2659+ /* Remove from the lru list if on. */
2660+ stripe_lru_del(stripe, LIST_LOCKED);
2661+ atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2662+ } else {
2663+ /* Second try to get an LRU stripe. */
2664+ stripe = stripe_lru_pop(sc);
2665+ if (stripe) {
2666+ _stripe_get(stripe);
2667+ /* Invalidate before reinserting with changed key. */
2668+ stripe_invalidate(stripe);
2669+ stripe->key = addr->key;
2670+ stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2671+ addr->key);
2672+ stripe->idx.parity = addr->pi;
2673+ sc_insert(sc, stripe);
2674+ /* REMOVEME: statistics. */
2675+ atomic_inc(rs->stats + S_INSCACHE);
2676+ }
2677+ }
2678+
2679+ return stripe;
2680+}
2681+
2682+/*
2683+ * Decrement reference count on a stripe.
2684+ *
2685+ * Move it to list of LRU stripes if zero.
2686+ */
2687+static void stripe_put(struct stripe *stripe)
2688+{
2689+ if (atomic_dec_and_test(&stripe->cnt)) {
2690+ if (TestClearStripeActive(stripe))
2691+ atomic_dec(&stripe->sc->active_stripes);
2692+
2693+ /* Put stripe onto the LRU list. */
2694+ stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2695+ }
2696+
2697+ BUG_ON(atomic_read(&stripe->cnt) < 0);
2698+}
2699+
2700+/*
2701+ * Process end io
2702+ *
2703+ * I need to do it here because I can't do it in interrupt context.
2704+ *
2705+ * Read and write functions are split in order to avoid
2706+ * conditionals in the main loop for performance reasons.
2707+ */
2708+
2709+/* Helper read bios on a page list. */
2710+static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
2711+ struct bio *bio)
2712+{
2713+ bio_copy_page_list(READ, stripe, pl, bio);
2714+}
2715+
2716+/* Helper write bios on a page list. */
2717+static void _rh_dec(struct stripe *stripe, struct page_list *pl,
2718+ struct bio *bio)
2719+{
2720+ dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
2721+}
2722+
2723+/* End io all bios on a page list. */
2724+static inline int
2725+page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
2726+{
2727+ int r = 0;
2728+ struct bio_list *bl = BL(stripe, p, rw);
2729+
2730+ if (!bio_list_empty(bl)) {
2731+ struct page_list *pl = PL(stripe, p);
2732+ struct page *page = pl->page;
2733+
2734+ if (PageLocked(page))
2735+ r = -EBUSY;
2736+ /*
2737+ * FIXME: PageUptodate() not cleared
2738+ * properly for missing chunks ?
2739+ */
2740+ else if (PageUptodate(page)) {
2741+ struct bio *bio;
2742+ struct raid_set *rs = RS(stripe->sc);
2743+ void (*h_f)(struct stripe *, struct page_list *,
2744+ struct bio *) =
2745+ (rw == READ) ? _bio_copy_page_list : _rh_dec;
2746+
2747+ while ((bio = bio_list_pop(bl))) {
2748+ h_f(stripe, pl, bio);
2749+ _bio_endio(rs, bio, 0);
2750+ stripe_put(stripe);
2751+ if (count)
2752+ (*count)++;
2753+ }
2754+ } else
2755+ r = -EAGAIN;
2756+ }
2757+
2758+ return r;
2759+}
2760+
2761+/*
2762+ * End io all reads/writes on a stripe copying
2763+ * read data across from stripe to bios.
2764+ */
2765+static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
2766+{
2767+ int r = 0;
2768+ unsigned p = RS(stripe->sc)->set.raid_devs;
2769+
2770+ while (p--) {
2771+ int rr = page_list_endio(rw, stripe, p, count);
2772+
2773+ if (rr && r != -EIO)
2774+ r = rr;
2775+ }
2776+
2777+ return r;
2778+}
2779+
2780+/* Fail all ios on a bio list and return # of bios. */
2781+static unsigned
2782+bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
2783+{
2784+ unsigned r;
2785+ struct bio *bio;
2786+
2787+ raid_set_dead(rs);
2788+
2789+ /* Update region counters. */
2790+ if (stripe) {
2791+ struct dm_rh_client *rh = rs->recover.rh;
2792+
2793+ bio_list_for_each(bio, bl) {
2794+ if (bio_data_dir(bio) == WRITE)
2795+ dm_rh_dec(rh, stripe->region);
2796+ }
2797+ }
2798+
2799+ /* Error end io all bios. */
2800+ for (r = 0; (bio = bio_list_pop(bl)); r++)
2801+ _bio_endio(rs, bio, -EIO);
2802+
2803+ return r;
2804+}
2805+
2806+/* Fail all ios of a bio list of a stripe and drop io pending count. */
2807+static void
2808+stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
2809+ struct bio_list *bl)
2810+{
2811+ unsigned put = bio_list_fail(rs, stripe, bl);
2812+
2813+ while (put--)
2814+ stripe_put(stripe);
2815+}
2816+
2817+/* Fail all ios hanging off all bio lists of a stripe. */
2818+static void stripe_fail_io(struct stripe *stripe)
2819+{
2820+ struct raid_set *rs = RS(stripe->sc);
2821+ unsigned p = rs->set.raid_devs;
2822+
2823+ stripe_evict(stripe);
2824+
2825+ while (p--) {
2826+ struct stripe_set *ss = stripe->ss + p;
2827+ int i = ARRAY_SIZE(ss->bl);
2828+
2829+ while (i--)
2830+ stripe_bio_list_fail(rs, stripe, ss->bl + i);
2831+ }
2832+}
2833+
2834+/*
2835+ * Handle all stripes by handing them to the daemon, because we can't
2836+ * map their pages to copy the data in interrupt context.
2837+ *
2838+ * We don't want to handle them here either, while interrupts are disabled.
2839+ */
2840+
2841+/* Read/write endio function for dm-io (interrupt context). */
2842+static void endio(unsigned long error, void *context)
2843+{
2844+ struct dm_mem_cache_object *obj = context;
2845+ struct stripe_set *ss = obj->private;
2846+ struct stripe *stripe = ss->stripe;
2847+ struct page *page = obj->pl->page;
2848+
2849+ if (unlikely(error))
2850+ stripe_error(stripe, page);
2851+ else
2852+ page_set(page, CLEAN);
2853+
2854+ clear_page_locked(page);
2855+ stripe_io_dec(stripe);
2856+
2857+ /* Add stripe to endio list and wake daemon. */
2858+ stripe_endio_push(stripe);
2859+}
2860+
2861+/*
2862+ * Recovery io throttling
2863+ */
2864+/* Conditionally reset io counters. */
2865+enum count_type { IO_WORK = 0, IO_RECOVER };
2866+static int recover_io_reset(struct raid_set *rs)
2867+{
2868+ unsigned long j = jiffies;
2869+
2870+ /* Pay attention to jiffies overflows. */
2871+ if (j > rs->recover.last_jiffies + HZ
2872+ || j < rs->recover.last_jiffies) {
2873+ rs->recover.last_jiffies = j;
2874+ atomic_set(rs->recover.io_count + IO_WORK, 0);
2875+ atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2876+ return 1;
2877+ }
2878+
2879+ return 0;
2880+}
2881+
2882+/* Count ios. */
2883+static INLINE void
2884+recover_io_count(struct raid_set *rs, struct stripe *stripe)
2885+{
2886+ if (RSRecover(rs)) {
2887+ recover_io_reset(rs);
2888+ atomic_inc(rs->recover.io_count +
2889+ (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2890+ }
2891+}
2892+
2893+/* Read/Write a page_list asynchronously. */
2894+static void page_list_rw(struct stripe *stripe, unsigned p)
2895+{
2896+ struct stripe_cache *sc = stripe->sc;
2897+ struct raid_set *rs = RS(sc);
2898+ struct dm_mem_cache_object *obj = stripe->obj + p;
2899+ struct page_list *pl = obj->pl;
2900+ struct page *page = pl->page;
2901+ struct raid_dev *dev = rs->dev + p;
2902+ struct dm_io_region io = {
2903+ .bdev = dev->dev->bdev,
2904+ .sector = stripe->key,
2905+ .count = stripe->io.size,
2906+ };
2907+ struct dm_io_request control = {
2908+ .bi_rw = PageDirty(page) ? WRITE : READ,
2909+ .mem.type = DM_IO_PAGE_LIST,
2910+ .mem.ptr.pl = pl,
2911+ .mem.offset = 0,
2912+ .notify.fn = endio,
2913+ .notify.context = obj,
2914+ .client = sc->dm_io_client,
2915+ };
2916+
2917+ BUG_ON(PageLocked(page));
2918+
2919+ /*
2920+ * Don't rw past end of device, which can happen, because
2921+ * typically sectors_per_dev isn't divisible by io_size.
2922+ */
2923+ if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2924+ io.count = rs->set.sectors_per_dev - io.sector;
2925+
2926+ io.sector += dev->start; /* Add <offset>. */
2927+ recover_io_count(rs, stripe); /* Recovery io accounting. */
2928+
2929+ /* REMOVEME: statistics. */
2930+ atomic_inc(rs->stats +
2931+ (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
2932+
2933+ ClearPageError(page);
2934+ set_page_locked(page);
2935+ io_dev_queued(dev);
2936+ BUG_ON(dm_io(&control, 1, &io, NULL));
2937+}
2938+
2939+/*
2940+ * Write dirty / read not uptodate page lists of a stripe.
2941+ */
2942+static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
2943+{
2944+ unsigned r;
2945+
2946+ /*
2947+ * Increment the pending count on the stripe
2948+ * first, so that we don't race in endio().
2949+ *
2950+ * An inc (IO) is needed for any page:
2951+ *
2952+ * o not uptodate
2953+ * o dirtied by writes merged
2954+ * o dirtied by parity calculations
2955+ */
2956+ r = for_each_io_dev(rs, stripe, _stripe_io_inc);
2957+ if (r) {
2958+ /* io needed: chunks are not uptodate/dirty. */
2959+ int max; /* REMOVEME: */
2960+ struct stripe_cache *sc = &rs->sc;
2961+
2962+ if (!TestSetStripeActive(stripe))
2963+ atomic_inc(&sc->active_stripes);
2964+
2965+ /* Take off the lru list in case it got added there. */
2966+ stripe_lru_del(stripe, LIST_LOCKED);
2967+
2968+ /* Submit actual io. */
2969+ for_each_io_dev(rs, stripe, page_list_rw);
2970+
2971+ /* REMOVEME: statistics */
2972+ max = sc_active(sc);
2973+ if (atomic_read(&sc->max_active_stripes) < max)
2974+ atomic_set(&sc->max_active_stripes, max);
2975+
2976+ atomic_inc(rs->stats + S_FLUSHS);
2977+ /* END REMOVEME: statistics */
2978+ }
2979+
2980+ return r;
2981+}
2982+
2983+/* Work in all pending writes. */
2984+static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
2985+{
2986+ struct bio_list *write = BL(stripe, p, WRITE);
2987+
2988+ if (!bio_list_empty(write)) {
2989+ struct page_list *pl = stripe->obj[p].pl;
2990+ struct bio *bio;
2991+ struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
2992+
2993+ /*
2994+ * We can play with the lists without holding a lock,
2995+ * because it is just us accessing them anyway.
2996+ */
2997+ bio_list_for_each(bio, write)
2998+ bio_copy_page_list(WRITE, stripe, pl, bio);
2999+
3000+ bio_list_merge(write_merged, write);
3001+ bio_list_init(write);
3002+ page_set(pl->page, DIRTY);
3003+ }
3004+}
3005+
3006+/* Merge in all writes hence dirtying respective pages. */
3007+static INLINE void writes_merge(struct stripe *stripe)
3008+{
3009+ unsigned p = RS(stripe->sc)->set.raid_devs;
3010+
3011+ while (p--)
3012+ _writes_merge(stripe, p);
3013+}
3014+
3015+/* Check if a chunk gets completely overwritten. */
3016+static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
3017+{
3018+ unsigned sectors = 0;
3019+ struct bio *bio;
3020+ struct bio_list *bl = BL(stripe, p, WRITE);
3021+
3022+ bio_list_for_each(bio, bl)
3023+ sectors += bio_sectors(bio);
3024+
3025+ return sectors == RS(stripe->sc)->set.io_size;
3026+}
3027+
3028+/*
3029+ * Prepare stripe to avoid io on broken/reconstructed
3030+ * drive in order to reconstruct data on endio.
3031+ */
3032+enum prepare_type { IO_ALLOW, IO_PROHIBIT };
3033+static void stripe_prepare(struct stripe *stripe, unsigned p,
3034+ enum prepare_type type)
3035+{
3036+ struct page *page = PAGE(stripe, p);
3037+
3038+ switch (type) {
3039+ case IO_PROHIBIT:
3040+ /*
3041+ * In case we prohibit, we have to make sure that
3042+ * io on all chunks other than the one which failed
3043+ * or is being reconstructed is allowed and that the
3044+ * failed/reconstructed chunk doesn't have state uptodate.
3045+ */
3046+ stripe_allow_io(stripe);
3047+ ClearPageUptodate(page);
3048+ ProhibitPageIO(page);
3049+
3050+ /* REMOVEME: statistics. */
3051+ atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
3052+ stripe->idx.recover = p;
3053+ SetStripeReconstruct(stripe);
3054+ break;
3055+
3056+ case IO_ALLOW:
3057+ AllowPageIO(page);
3058+ stripe->idx.recover = -1;
3059+ ClearStripeReconstruct(stripe);
3060+ break;
3061+
3062+ default:
3063+ BUG();
3064+ }
3065+}
3066+
3067+/*
3068+ * Degraded/reconstruction mode.
3069+ *
3070+ * Check stripe state to figure which chunks don't need IO.
3071+ */
3072+static INLINE void stripe_check_reconstruct(struct stripe *stripe,
3073+ int prohibited)
3074+{
3075+ struct raid_set *rs = RS(stripe->sc);
3076+
3077+ /*
3078+ * Degraded mode (device(s) failed) ->
3079+ * avoid io on the failed device.
3080+ */
3081+ if (unlikely(raid_set_degraded(rs))) {
3082+ /* REMOVEME: statistics. */
3083+ atomic_inc(rs->stats + S_DEGRADED);
3084+ stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
3085+ return;
3086+ } else {
3087+ /*
3088+ * Reconstruction mode (i.e. a particular device or
3089+ * some (rotating) parity chunk is being resynchronized) ->
3090+ * o make sure all needed pages are read in
3091+ * o writes are allowed to go through
3092+ */
3093+ int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
3094+
3095+ if (r) {
3096+ /* REMOVEME: statistics. */
3097+ atomic_inc(rs->stats + S_NOSYNC);
3098+ stripe_prepare(stripe, dev_for_parity(stripe),
3099+ IO_PROHIBIT);
3100+ return;
3101+ }
3102+ }
3103+
3104+ /*
3105+ * All disks good. Avoid reading parity chunk and reconstruct it
3106+ * unless we have prohibited io to chunk(s).
3107+ */
3108+ if (!prohibited) {
3109+ if (StripeMerged(stripe))
3110+ stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
3111+ else {
3112+ stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
3113+
3114+ /*
3115+ * Overrule stripe_prepare to reconstruct the
3116+ * parity chunk, because it'll be created new anyway.
3117+ */
3118+ ClearStripeReconstruct(stripe);
3119+ }
3120+ }
3121+}
3122+
3123+/* Check if stripe is ready to merge writes. */
3124+static INLINE int stripe_check_merge(struct stripe *stripe)
3125+{
3126+ struct raid_set *rs = RS(stripe->sc);
3127+ int prohibited = 0;
3128+ unsigned chunks = 0, p = rs->set.raid_devs;
3129+
3130+ /* Walk all chunks. */
3131+ while (p--) {
3132+ struct page *page = PAGE(stripe, p);
3133+
3134+ /* Can't merge active chunks. */
3135+ if (PageLocked(page)) {
3136+ /* REMOVEME: statistics. */
3137+ atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
3138+ break;
3139+ }
3140+
3141+ /* Can merge uptodate chunks and have to count parity chunk. */
3142+ if (PageUptodate(page) || p == stripe->idx.parity) {
3143+ chunks++;
3144+ continue;
3145+ }
3146+
3147+ /* Read before write ordering. */
3148+ if (RSCheckOverwrite(rs) &&
3149+ bio_list_empty(BL(stripe, p, READ))) {
3150+ int r = stripe_check_overwrite(stripe, p);
3151+
3152+ if (r) {
3153+ chunks++;
3154+ /* REMOVEME: statistics. */
3155+ atomic_inc(RS(stripe->sc)->stats +
3156+ S_PROHIBITPAGEIO);
3157+ ProhibitPageIO(page);
3158+ prohibited = 1;
3159+ }
3160+ }
3161+ }
3162+
3163+ if (chunks == rs->set.raid_devs) {
3164+ /* All pages are uptodate or get written over or mixture. */
3165+ /* REMOVEME: statistics. */
3166+ atomic_inc(rs->stats + S_CAN_MERGE);
3167+ return 0;
3168+ } else
3169+ /* REMOVEME: statistics.*/
3170+ atomic_inc(rs->stats + S_CANT_MERGE);
3171+
3172+ return prohibited ? 1 : -EPERM;
3173+}
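+
+/*
+ * stripe_check_merge() above returns 0 when writes can be merged right
+ * away, 1 when chunks which get completely overwritten had io prohibited
+ * on them (while other chunks still need io), and -EPERM when the stripe
+ * isn't ready to merge at all yet.
+ */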
3174+
3175+/* Prohibit io on chunks which have no reads queued (read avoidance). */
3176+static INLINE int stripe_check_read(struct stripe *stripe)
3177+{
3178+ int r = 0;
3179+ unsigned p = RS(stripe->sc)->set.raid_devs;
3180+
3181+ /* Walk all chunks. */
3182+ while (p--) {
3183+ struct page *page = PAGE(stripe, p);
3184+
3185+ if (!PageLocked(page) &&
3186+ bio_list_empty(BL(stripe, p, READ))) {
3187+ ProhibitPageIO(page);
3188+ r = 1;
3189+ }
3190+ }
3191+
3192+ return r;
3193+}
3194+
3195+/*
3196+ * Read/write a stripe.
3197+ *
3198+ * All stripe read/write activity goes through this function.
3199+ *
3200+ * States to cover:
3201+ * o stripe to read and/or write
3202+ * o stripe with error to reconstruct
3203+ */
3204+static int stripe_rw(struct stripe *stripe)
3205+{
3206+ struct raid_set *rs = RS(stripe->sc);
3207+ int prohibited = 0, r;
3208+
3209+ /*
3210+ * Check the state of the RAID set and if degraded (or
3211+ * resynchronizing for reads), read in all other chunks but
3212+ * the one on the dead/resynchronizing device in order to be
3213+ * able to reconstruct the missing one.
3214+ *
3215+ * Merge all writes hanging off uptodate pages of the stripe.
3216+ */
3217+
3218+ /* Initially allow io on all chunks and prohibit below, if necessary. */
3219+ stripe_allow_io(stripe);
3220+
3221+ if (StripeRBW(stripe)) {
3222+ r = stripe_check_merge(stripe);
3223+ if (!r) {
3224+ /*
3225+ * If I could rely on valid parity (which would only
3226+ * be sure in case of a full synchronization),
3227+ * I could xor a fraction of chunks out of
3228+ * parity and back in.
3229+ *
3230+ * For the time being, I got to redo parity...
3231+ */
3232+ /* parity_xor(stripe); */ /* Xor chunks out. */
3233+ stripe_zero_chunk(stripe, stripe->idx.parity);
3234+ writes_merge(stripe); /* Merge writes in. */
3235+ parity_xor(stripe); /* Update parity. */
3236+ ClearStripeRBW(stripe); /* Disable RBW. */
3237+ SetStripeMerged(stripe); /* Writes merged. */
3238+ }
3239+
3240+ if (r > 0)
3241+ prohibited = 1;
3242+ } else if (!raid_set_degraded(rs))
3243+ /* Only allow for read avoidance if not degraded. */
3244+ prohibited = stripe_check_read(stripe);
3245+
3246+ /*
3247+ * Check if io needs to be allowed/prohibited on certain chunks
3248+ * because of a degraded set or reconstruction on a region.
3249+ */
3250+ stripe_check_reconstruct(stripe, prohibited);
3251+
3252+ /* Now submit any reads/writes. */
3253+ r = stripe_page_lists_rw(rs, stripe);
3254+ if (!r) {
3255+ /*
3256+ * No io submitted because of chunk io prohibited or
3257+ * locked pages -> push to end io list for processing.
3258+ */
3259+ atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
3260+ stripe_endio_push(stripe);
3261+ wake_do_raid(rs); /* Wake myself. */
3262+ }
3263+
3264+ return 0;
3265+}
3266+
3267+/* Flush stripe either via flush list or immediately. */
3268+enum flush_type { FLUSH_DELAY, FLUSH_NOW };
3269+static int stripe_flush(struct stripe *stripe, enum flush_type type)
3270+{
3271+ int r = 0;
3272+
3273+ stripe_lru_del(stripe, LIST_LOCKED);
3274+
3275+ /* Immediately flush. */
3276+ if (type == FLUSH_NOW) {
3277+ if (likely(raid_set_operational(RS(stripe->sc))))
3278+ r = stripe_rw(stripe); /* Read/write stripe. */
3279+ else
3280+ /* Optimization: Fail early on failed sets. */
3281+ stripe_fail_io(stripe);
3282+ /* Delay flush by putting it on io list for later processing. */
3283+ } else if (type == FLUSH_DELAY)
3284+ stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
3285+ else
3286+ BUG();
3287+
3288+ return r;
3289+}
3290+
3291+/*
3292+ * Queue reads and writes to a stripe by hanging
3293+ * their bios off the stripe sets' read/write lists.
3294+ *
3295+ * Endio reads on uptodate chunks.
3296+ */
3297+static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
3298+ struct bio_list *reject)
3299+{
3300+ int r = 0;
3301+ struct address addr;
3302+ struct stripe *stripe =
3303+ stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
3304+
3305+ if (stripe) {
3306+ int rr, rw = bio_data_dir(bio);
3307+
3308+ rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
3309+ if (rr) {
3310+ stripe_put(stripe);
3311+ goto out;
3312+ }
3313+
3314+ /* Distinguish read and write cases. */
3315+ bio_list_add(BL(stripe, addr.di, rw), bio);
3316+
3317+ /* REMOVEME: statistics */
3318+ atomic_inc(rs->stats + (rw == WRITE ?
3319+ S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
3320+
3321+ if (rw == READ)
3322+ SetStripeRead(stripe);
3323+ else {
3324+ SetStripeRBW(stripe);
3325+
3326+ /* Increment pending write count on region. */
3327+ dm_rh_inc(rs->recover.rh, stripe->region);
3328+ r = 1; /* Region hash needs a flush. */
3329+ }
3330+
3331+ /*
3332+ * Optimize stripe flushing:
3333+ *
3334+ * o directly start io for read stripes.
3335+ *
3336+ * o put stripe onto stripe caches io_list for RBW,
3337+ * so that do_flush() can belabour it after we put
3338+ * more bios to the stripe for overwrite optimization.
3339+ */
3340+ stripe_flush(stripe,
3341+ StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
3342+
3343+ /* Got no stripe from cache -> reject bio. */
3344+ } else {
3345+out:
3346+ bio_list_add(reject, bio);
3347+ /* REMOVEME: statistics. */
3348+ atomic_inc(rs->stats + S_IOS_POST);
3349+ }
3350+
3351+ return r;
3352+}
3353+
3354+/*
3355+ * Recovery functions
3356+ */
3357+/* Read a stripe off a raid set for recovery. */
3358+static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
3359+{
3360+ /* Invalidate all pages so that they get read in. */
3361+ stripe_pages_invalidate(stripe);
3362+
3363+ /* Allow io on all recovery chunks. */
3364+ stripe_allow_io(stripe);
3365+
3366+ if (idx > -1)
3367+ ProhibitPageIO(PAGE(stripe, idx));
3368+
3369+ stripe->key = rs->recover.pos;
3370+ return stripe_page_lists_rw(rs, stripe);
3371+}
3372+
3373+/* Write a stripe to a raid set for recovery. */
3374+static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
3375+{
3376+ /*
3377+ * If this is a reconstruct of a particular device, then
3378+ * reconstruct the respective page(s), else create parity page(s).
3379+ */
3380+ if (idx > -1) {
3381+ struct page *page = PAGE(stripe, idx);
3382+
3383+ AllowPageIO(page);
3384+ stripe_zero_chunk(stripe, idx);
3385+ common_xor(stripe, stripe->io.size, 0, idx);
3386+ page_set(page, DIRTY);
3387+ } else
3388+ parity_xor(stripe);
3389+
3390+ return stripe_page_lists_rw(rs, stripe);
3391+}
3392+
3393+/* Recovery bandwidth available? */
3394+static int recover_bandwidth(struct raid_set *rs)
3395+{
3396+ int r, work;
3397+
3398+ /* On reset -> allow recovery. */
3399+ r = recover_io_reset(rs);
3400+ if (r || RSBandwidth(rs))
3401+ goto out;
3402+
3403+ work = atomic_read(rs->recover.io_count + IO_WORK);
3404+ if (work) {
3405+ /* Pay attention to larger recover stripe size. */
3406+ int recover =
3407+ atomic_read(rs->recover.io_count + IO_RECOVER) *
3408+ rs->recover.io_size /
3409+ rs->set.io_size;
3410+
3411+ /*
3412+ * Don't use more than given bandwidth of
3413+ * the work io for recovery.
3414+ */
3415+ if (recover > work / rs->recover.bandwidth_work) {
3416+ /* REMOVEME: statistics. */
3417+ atomic_inc(rs->stats + S_NO_BANDWIDTH);
3418+ return 0;
3419+ }
3420+ }
3421+
3422+out:
3423+ atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
3424+ return 1;
3425+}
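+
+/*
+ * Throttle example (illustrative numbers): with bandwidth_work = 10,
+ * set.io_size = 64 and recover.io_size = 256, each recovery io counts
+ * as 256 / 64 = 4 normalized ios; once that normalized count exceeds
+ * work_ios / 10 for the current interval, recover_bandwidth() returns
+ * 0 and recovery is deferred until more work ios have been counted or
+ * the counters get reset.
+ */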
3426+
3427+/* Try to get a region to recover. */
3428+static int recover_get_region(struct raid_set *rs)
3429+{
3430+ struct recover *rec = &rs->recover;
3431+ struct dm_rh_client *rh = rec->rh;
3432+
3433+ /* Start quiescing some regions. */
3434+ if (!RSRegionGet(rs)) {
3435+ int r = recover_bandwidth(rs); /* Enough bandwidth? */
3436+
3437+ if (r) {
3438+ r = dm_rh_recovery_prepare(rh);
3439+ if (r < 0) {
3440+ DMINFO("No %sregions to recover",
3441+ rec->nr_regions_to_recover ?
3442+ "more " : "");
3443+ return -ENOENT;
3444+ }
3445+ } else
3446+ return -EAGAIN;
3447+
3448+ SetRSRegionGet(rs);
3449+ }
3450+
3451+ if (!rec->reg) {
3452+ rec->reg = dm_rh_recovery_start(rh);
3453+ if (rec->reg) {
3454+ /*
3455+ * A reference for the region I'll
3456+ * keep till I've completely synced it.
3457+ */
3458+ io_get(rs);
3459+ rec->pos = dm_rh_region_to_sector(rh,
3460+ dm_rh_get_region_key(rec->reg));
3461+ rec->end = rec->pos + dm_rh_get_region_size(rh);
3462+ return 1;
3463+ } else
3464+ return -EAGAIN;
3465+ }
3466+
3467+ return 0;
3468+}
3469+
3470+/* Read/write a recovery stripe. */
3471+static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
3472+{
3473+ /* Read/write flip-flop. */
3474+ if (TestClearStripeRBW(stripe)) {
3475+ SetStripeRead(stripe);
3476+ return recover_read(rs, stripe, idx_get(rs));
3477+ } else if (TestClearStripeRead(stripe))
3478+ return recover_write(rs, stripe, idx_get(rs));
3479+
3480+ return 0;
3481+}
3482+
3483+/* Reset recovery variables. */
3484+static void recovery_region_reset(struct raid_set *rs)
3485+{
3486+ rs->recover.reg = NULL;
3487+ ClearRSRegionGet(rs);
3488+}
3489+
3490+/* Update region hash state. */
3491+static void recover_rh_update(struct raid_set *rs, int error)
3492+{
3493+ struct recover *rec = &rs->recover;
3494+ struct dm_rh_client *rh = rec->rh;
3495+ struct dm_region *reg = rec->reg;
3496+
3497+ if (reg) {
3498+ dm_rh_recovery_end(rh, reg, error);
3499+ if (!error)
3500+ rec->nr_regions_recovered++;
3501+
3502+ recovery_region_reset(rs);
3503+ }
3504+
3505+ dm_rh_update_states(rh, 1);
3506+ dm_rh_flush(rh);
3507+ io_put(rs); /* Release the io reference for the region. */
3508+}
3509+
3510+/* Called by main io daemon to recover regions. */
3511+/* FIXME: cope with MAX_RECOVER > 1. */
3512+static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
3513+{
3514+ int r;
3515+ struct recover *rec = &rs->recover;
3516+
3517+ /* If recovery is active -> return. */
3518+ if (StripeActive(stripe))
3519+ return;
3520+
3521+ /* io error is fatal for recovery -> stop it. */
3522+ if (unlikely(StripeError(stripe)))
3523+ goto err;
3524+
3525+ /* Get a region to recover. */
3526+ r = recover_get_region(rs);
3527+ switch (r) {
3528+ case 1: /* Got a new region. */
3529+ /* Flag read before write. */
3530+ ClearStripeRead(stripe);
3531+ SetStripeRBW(stripe);
3532+ break;
3533+
3534+ case 0:
3535+ /* Got a region in the works. */
3536+ r = recover_bandwidth(rs);
3537+ if (r) /* Got enough bandwidth. */
3538+ break;
3539+
3540+ case -EAGAIN:
3541+ /* No bandwidth/quiesced region yet, try later. */
3542+ wake_do_raid_delayed(rs, HZ / 10);
3543+ return;
3544+
3545+ case -ENOENT: /* No more regions. */
3546+ dm_table_event(rs->ti->table);
3547+ goto free;
3548+ }
3549+
3550+ /* Read/write a recover stripe. */
3551+ r = recover_stripe_rw(rs, stripe);
3552+ if (r) {
3553+ /* IO initiated, get another reference for the IO. */
3554+ io_get(rs);
3555+ return;
3556+ }
3557+
3558+ /* Update recovery position within region. */
3559+ rec->pos += stripe->io.size;
3560+
3561+ /* If we're at end of region, update region hash. */
3562+ if (rec->pos >= rec->end ||
3563+ rec->pos >= rs->set.sectors_per_dev)
3564+ recover_rh_update(rs, 0);
3565+ else
3566+ SetStripeRBW(stripe);
3567+
3568+ /* Schedule myself for another round... */
3569+ wake_do_raid(rs);
3570+ return;
3571+
3572+err:
3573+ raid_set_check_degrade(rs, stripe);
3574+
3575+ {
3576+ char buf[BDEVNAME_SIZE];
3577+
3578+ DMERR("stopping recovery due to "
3579+ "ERROR on /dev/%s, stripe at offset %llu",
3580+ bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3581+ (unsigned long long) stripe->key);
3582+
3583+ }
3584+
3585+ /* Make sure that all quiesced regions get released. */
3586+ do {
3587+ if (rec->reg)
3588+ dm_rh_recovery_end(rec->rh, rec->reg, -EIO);
3589+
3590+ rec->reg = dm_rh_recovery_start(rec->rh);
3591+ } while (rec->reg);
3592+
3593+ recover_rh_update(rs, -EIO);
3594+free:
3595+ rs->set.dev_to_init = -1;
3596+
3597+ /* Check for jiffies overrun. */
3598+ rs->recover.end_jiffies = jiffies;
3599+ if (rs->recover.end_jiffies < rs->recover.start_jiffies)
3600+ rs->recover.end_jiffies = ~0;
3601+
3602+ ClearRSRecover(rs);
3603+}
3604+
3605+static INLINE void do_recovery(struct raid_set *rs)
3606+{
3607+ struct stripe *stripe;
3608+
3609+ list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
3610+ _do_recovery(rs, stripe);
3611+
3612+ if (!RSRecover(rs))
3613+ stripe_recover_free(rs);
3614+}
3615+
3616+/*
3617+ * END recovery functions
3618+ */
3619+
3620+/* End io process all stripes handed in by endio() callback. */
3621+static void do_endios(struct raid_set *rs)
3622+{
3623+ struct stripe_cache *sc = &rs->sc;
3624+ struct stripe *stripe;
3625+
3626+ while ((stripe = stripe_endio_pop(sc))) {
3627+ unsigned count;
3628+
3629+ /* Recovery stripe special case. */
3630+ if (unlikely(StripeRecover(stripe))) {
3631+ if (stripe_io(stripe))
3632+ continue;
3633+
3634+ io_put(rs); /* Release region io reference. */
3635+ ClearStripeActive(stripe);
3636+
3637+ /* REMOVEME: statistics*/
3638+ atomic_dec(&sc->active_stripes);
3639+ continue;
3640+ }
3641+
3642+ /* Early end io all reads on any uptodate chunks. */
3643+ stripe_endio(READ, stripe, (count = 0, &count));
3644+ if (stripe_io(stripe)) {
3645+ if (count) /* REMOVEME: statistics. */
3646+ atomic_inc(rs->stats + S_ACTIVE_READS);
3647+
3648+ continue;
3649+ }
3650+
3651+ /* Set stripe inactive after all io got processed. */
3652+ if (TestClearStripeActive(stripe))
3653+ atomic_dec(&sc->active_stripes);
3654+
3655+ /* Unlock stripe (for clustering). */
3656+ stripe_unlock(rs, stripe);
3657+
3658+ /*
3659+ * If an io error on a stripe occurred and the RAID set
3660+ * is still operational, requeue the stripe for io.
3661+ */
3662+ if (TestClearStripeError(stripe)) {
3663+ raid_set_check_degrade(rs, stripe);
3664+ ClearStripeReconstruct(stripe);
3665+
3666+ if (!StripeMerged(stripe) &&
3667+ raid_set_operational(rs)) {
3668+ stripe_pages_invalidate(stripe);
3669+ stripe_flush(stripe, FLUSH_DELAY);
3670+ /* REMOVEME: statistics. */
3671+ atomic_inc(rs->stats + S_REQUEUE);
3672+ continue;
3673+ }
3674+ }
3675+
3676+ /* Check if the RAID set is inoperational to error ios. */
3677+ if (!raid_set_operational(rs)) {
3678+ ClearStripeReconstruct(stripe);
3679+ stripe_fail_io(stripe);
3680+ BUG_ON(atomic_read(&stripe->cnt));
3681+ continue;
3682+ }
3683+
3684+ /* Got to reconstruct a missing chunk. */
3685+ if (TestClearStripeReconstruct(stripe))
3686+ reconstruct_xor(stripe);
3687+
3688+ /*
3689+ * Now that we've got a complete stripe, we can
3690+ * process the rest of the end ios on reads.
3691+ */
3692+ BUG_ON(stripe_endio(READ, stripe, NULL));
3693+ ClearStripeRead(stripe);
3694+
3695+ /*
3696+ * Read-before-write stripes need to be flushed again in
3697+ * order to work the write data into the pages *after*
3698+ * they were read in.
3699+ */
3700+ if (TestClearStripeMerged(stripe))
3701+ /* End io all bios which got merged already. */
3702+ BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
3703+
3704+ /* Got to put on flush list because of new writes. */
3705+ if (StripeRBW(stripe))
3706+ stripe_flush(stripe, FLUSH_DELAY);
3707+ }
3708+}
3709+
3710+/*
3711+ * Stripe cache shrinking.
3712+ */
3713+static INLINE void do_sc_shrink(struct raid_set *rs)
3714+{
3715+ unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
3716+
3717+ if (shrink) {
3718+ unsigned cur = atomic_read(&rs->sc.stripes);
3719+
3720+ sc_shrink(&rs->sc, shrink);
3721+ shrink -= cur - atomic_read(&rs->sc.stripes);
3722+ atomic_set(&rs->sc.stripes_to_shrink, shrink);
3723+
3724+ /*
3725+ * Wake myself up in case we failed to shrink the
3726+ * requested amount in order to try again later.
3727+ */
3728+ if (shrink)
3729+ wake_do_raid(rs);
3730+ }
3731+}
3732+
3733+
3734+/*
3735+ * Process all ios
3736+ *
3737+ * We do different things with the io depending on the
3738+ * state of the region that it's in:
3739+ *
3740+ * o reads: hang off stripe cache or postpone if full
3741+ *
3742+ * o writes:
3743+ *
3744+ * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3745+ * In case stripe cache is full or busy, postpone the io.
3746+ *
3747+ * RECOVERING: delay the io until recovery of the region completes.
3748+ *
3749+ */
3750+static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
3751+{
3752+ int r;
3753+ unsigned flush = 0;
3754+ struct dm_rh_client *rh = rs->recover.rh;
3755+ struct bio *bio;
3756+ struct bio_list delay, reject;
3757+
3758+ bio_list_init(&delay);
3759+ bio_list_init(&reject);
3760+
3761+ /*
3762+ * Classify each io:
3763+ * o delay to recovering regions
3764+ * o queue to all other regions
3765+ */
3766+ while ((bio = bio_list_pop(ios))) {
3767+ /*
3768+ * In case we get a barrier bio, push it back onto
3769+ * the input queue unless all work queues are empty
3770+ * and the stripe cache is inactive.
3771+ */
3772+ if (unlikely(bio_barrier(bio))) {
3773+ /* REMOVEME: statistics. */
3774+ atomic_inc(rs->stats + S_BARRIER);
3775+ if (!list_empty(rs->sc.lists + LIST_IO) ||
3776+ !bio_list_empty(&delay) ||
3777+ !bio_list_empty(&reject) ||
3778+ sc_active(&rs->sc)) {
3779+ bio_list_push(ios, bio);
3780+ break;
3781+ }
3782+ }
3783+
3784+ r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
3785+ if (unlikely(r)) {
3786+ /* Got to wait for recovering regions. */
3787+ bio_list_add(&delay, bio);
3788+ SetRSBandwidth(rs);
3789+ } else {
3790+ /*
3791+ * Process ios to non-recovering regions by queueing
3792+ * them to stripes (does rh_inc() for writes).
3793+ */
3794+ flush += stripe_queue_bio(rs, bio, &reject);
3795+ }
3796+ }
3797+
3798+ if (flush) {
3799+ r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3800+ if (r)
3801+ DMERR("dirty log flush");
3802+ }
3803+
3804+ /* Delay ios to regions which are recovering. */
3805+ while ((bio = bio_list_pop(&delay))) {
3806+ /* REMOVEME: statistics.*/
3807+ atomic_inc(rs->stats + S_DELAYED_BIOS);
3808+ atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3809+ dm_rh_delay_by_region(rh, bio,
3810+ dm_rh_sector_to_region(rh, _sector(rs, bio)));
3811+
3812+ }
3813+
3814+ /* Merge any rejected bios back to the head of the input list. */
3815+ bio_list_merge_head(ios, &reject);
3816+}
3817+
3818+/* Flush any stripes on the io list. */
3819+static INLINE void do_flush(struct raid_set *rs)
3820+{
3821+ struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
3822+
3823+ list_for_each_safe(pos, tmp, list) {
3824+ int r = stripe_flush(list_entry(pos, struct stripe,
3825+ lists[LIST_IO]), FLUSH_NOW);
3826+
3827+ /* Remove from the list only if the stripe got processed. */
3828+ if (!r)
3829+ list_del_init(pos);
3830+ }
3831+}
3832+
3833+/* Send an event in case we're getting too busy. */
3834+static INLINE void do_busy_event(struct raid_set *rs)
3835+{
3836+ if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
3837+ if (!TestSetRSScBusy(rs))
3838+ dm_table_event(rs->ti->table);
3839+ } else
3840+ ClearRSScBusy(rs);
3841+}
3842+
3843+/* Unplug: let the io roll on the set's devices. */
3844+static INLINE void do_unplug(struct raid_set *rs)
3845+{
3846+ struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3847+
3848+ while (dev-- > rs->dev) {
3849+ /* Only call a device's unplug function if io got queued to it. */
3850+ if (io_dev_clear(dev))
3851+ blk_unplug(bdev_get_queue(dev->dev->bdev));
3852+ }
3853+}
3854+
3855+/*-----------------------------------------------------------------
3856+ * RAID daemon
3857+ *---------------------------------------------------------------*/
3858+/*
3859+ * o belabour all end ios
3860+ * o optionally shrink the stripe cache
3861+ * o update the region hash states
3862+ * o optionally do recovery
3863+ * o grab the input queue
3864+ * o work on all requeued or new ios and perform stripe cache flushes
3865+ * unless the RAID set is inoperational (when we error ios)
3866+ * o check if the stripe cache gets too busy and throw an event if so
3867+ * o unplug any component raid devices with queued bios
3868+ */
3869+static void do_raid(struct work_struct *ws)
3870+{
3871+ struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
3872+ struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3873+ spinlock_t *lock = &rs->io.in_lock;
3874+
3875+ /*
3876+ * We always need to end io, so that ios
3877+ * can get errored in case the set failed
3878+ * and the region counters get decremented
3879+ * before we update the region hash states.
3880+ */
3881+redo:
3882+ do_endios(rs);
3883+
3884+ /*
3885+ * Now that we've end io'd, which may have put stripes on
3886+ * the LRU list, we shrink the stripe cache if requested.
3887+ */
3888+ do_sc_shrink(rs);
3889+
3890+ /* Update region hash states before we go any further. */
3891+ dm_rh_update_states(rs->recover.rh, 1);
3892+
3893+ /* Try to recover regions. */
3894+ if (RSRecover(rs))
3895+ do_recovery(rs);
3896+
3897+ /* More endios -> process. */
3898+ if (!stripe_endio_empty(&rs->sc)) {
3899+ atomic_inc(rs->stats + S_REDO);
3900+ goto redo;
3901+ }
3902+
3903+ /* Quickly grab all new ios queued and add them to the work list. */
3904+ spin_lock_irq(lock);
3905+ bio_list_merge(ios, ios_in);
3906+ bio_list_init(ios_in);
3907+ spin_unlock_irq(lock);
3908+
3909+ /* Let's assume we're operational most of the time ;-). */
3910+ if (likely(raid_set_operational(rs))) {
3911+ /* If we got ios, work them into the cache. */
3912+ if (!bio_list_empty(ios)) {
3913+ do_ios(rs, ios);
3914+ do_unplug(rs); /* Unplug the sets device queues. */
3915+ }
3916+
3917+ do_flush(rs); /* Flush any stripes on io list. */
3918+ do_unplug(rs); /* Unplug the sets device queues. */
3919+ do_busy_event(rs); /* Check if we got too busy. */
3920+
3921+ /* More endios -> process. */
3922+ if (!stripe_endio_empty(&rs->sc)) {
3923+ atomic_inc(rs->stats + S_REDO);
3924+ goto redo;
3925+ }
3926+ } else
3927+ /* No way to reconstruct data with too many devices failed. */
3928+ bio_list_fail(rs, NULL, ios);
3929+}
3930+
3931+/*
3932+ * Callback for region hash to dispatch
3933+ * delayed bios queued to recovered regions
3934+ * (Gets called via rh_update_states()).
3935+ */
3936+static void dispatch_delayed_bios(void *context, struct bio_list *bl, int dummy)
3937+{
3938+ struct raid_set *rs = context;
3939+ struct bio *bio;
3940+
3941+ /* REMOVEME: decrement pending delayed bios counter. */
3942+ bio_list_for_each(bio, bl)
3943+ atomic_dec(rs->stats + S_DELAYED_BIOS);
3944+
3945+ /* Merge region hash private list to work list. */
3946+ bio_list_merge_head(&rs->io.work, bl);
3947+ bio_list_init(bl);
3948+ ClearRSBandwidth(rs);
3949+}
3950+
3951+/*************************************************************
3952+ * Constructor helpers
3953+ *************************************************************/
3954+/* Calculate MB/sec. */
3955+static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
3956+{
3957+ return to_bytes(speed * rs->set.data_devs *
3958+ rs->recover.io_size * HZ >> 10) >> 10;
3959+}
3960+
3961+/*
3962+ * Discover fastest xor algorithm and # of chunks combination.
3963+ */
3964+/* Calculate speed for algorithm and # of chunks. */
3965+static INLINE unsigned xor_speed(struct stripe *stripe)
3966+{
3967+ unsigned r = 0;
3968+ unsigned long j;
3969+
3970+ /* Wait for next tick. */
3971+ for (j = jiffies; j == jiffies;)
3972+ ;
3973+
3974+ /* Do xors for a full tick. */
3975+ for (j = jiffies; j == jiffies;) {
3976+ mb();
3977+ common_xor(stripe, stripe->io.size, 0, 0);
3978+ mb();
3979+ r++;
3980+ mb();
3981+ }
3982+
3983+ return r;
3984+}
3985+
3986+/* Optimize xor algorithm for this RAID set. */
3987+static unsigned xor_optimize(struct raid_set *rs)
3988+{
3989+ unsigned chunks_max = 2, speed_max = 0;
3990+ struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3991+ struct stripe *stripe;
3992+
3993+ BUG_ON(list_empty(&rs->recover.stripes));
3994+ stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3995+ lists[LIST_RECOVER]);
3996+
3997+ /*
3998+ * Got to allow io on all chunks, so that
3999+ * xor() will actually work on them.
4000+ */
4001+ stripe_allow_io(stripe);
4002+
4003+ /* Try all xor functions. */
4004+ while (f-- > xor_funcs) {
4005+ unsigned speed;
4006+
4007+ /* Set actual xor function for common_xor(). */
4008+ rs->xor.f = f;
4009+ rs->xor.chunks = XOR_CHUNKS_MAX + 1;
4010+
4011+ while (rs->xor.chunks-- > 2) {
4012+ speed = xor_speed(stripe);
4013+ if (speed > speed_max) {
4014+ speed_max = speed;
4015+ chunks_max = rs->xor.chunks;
4016+ f_max = f;
4017+ }
4018+ }
4019+ }
4020+
4021+ /* Memorize optimum parameters. */
4022+ rs->xor.f = f_max;
4023+ rs->xor.chunks = chunks_max;
4024+ return speed_max;
4025+}
4026+
4027+/*
4028+ * Allocate a RAID context (a RAID set)
4029+ */
4030+static int
4031+context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
4032+ unsigned stripes, unsigned chunk_size, unsigned io_size,
4033+ unsigned recover_io_size, unsigned raid_devs,
4034+ sector_t sectors_per_dev,
4035+ struct dm_target *ti, unsigned dl_parms, char **argv)
4036+{
4037+ int r;
4038+ unsigned p;
4039+ size_t len;
4040+ sector_t region_size, ti_len;
4041+ struct raid_set *rs = NULL;
4042+ struct dm_dirty_log *dl;
4043+ struct recover *rec;
4044+
4045+ /*
4046+ * Create the dirty log
4047+ *
4048+ * We need to change length for the dirty log constructor,
4049+ * because we want the number of regions to be derived from the
4050+ * single device size, so that we can keep region
4051+ * size = 2^^n independent of the number of devices
4052+ */
4053+ ti_len = ti->len;
4054+ ti->len = sectors_per_dev;
4055+ dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
4056+ ti->len = ti_len;
4057+ if (!dl)
4058+ goto bad_dirty_log;
4059+
4060+ /* Chunk size *must* not be larger than the region size. */
4061+ region_size = dl->type->get_region_size(dl);
4062+ if (chunk_size > region_size)
4063+ goto bad_chunk_size;
4064+
4065+ /* Recover io size *must* not be larger than the region size either. */
4066+ if (recover_io_size > region_size)
4067+ goto bad_recover_io_size;
4068+
4069+ /* Size and allocate the RAID set structure. */
4070+ len = sizeof(*rs->data) + sizeof(*rs->dev);
4071+ if (array_too_big(sizeof(*rs), len, raid_devs))
4072+ goto bad_array;
4073+
4074+ len = sizeof(*rs) + raid_devs * len;
4075+ rs = kzalloc(len, GFP_KERNEL);
4076+ if (!rs)
4077+ goto bad_alloc;
4078+
4079+ rec = &rs->recover;
4080+ atomic_set(&rs->io.in_process, 0);
4081+ atomic_set(&rs->io.in_process_max, 0);
4082+ rec->io_size = recover_io_size;
4083+
4084+ /* Pointer to data array. */
4085+ rs->data = (unsigned long **)
4086+ ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
4087+ rec->dl = dl;
4088+ rs->set.raid_devs = p = raid_devs;
4089+ rs->set.data_devs = raid_devs - raid_type->parity_devs;
4090+ rs->set.raid_type = raid_type;
4091+
4092+ /*
4093+ * Set chunk and io size and respective shifts
4094+ * (used to avoid divisions)
4095+ */
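+ /*
+ * Worked example (values illustrative, not taken from this target):
+ * chunk_size = 64 sectors gives chunk_mask = 63 and chunk_shift = 6,
+ * so "sector >> chunk_shift" stands in for "sector / 64" and
+ * "sector & chunk_mask" for "sector % 64".
+ */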
4096+ rs->set.chunk_size = chunk_size;
4097+ rs->set.chunk_mask = chunk_size - 1;
4098+ rs->set.chunk_shift = ffs(chunk_size) - 1;
4099+
4100+ rs->set.io_size = io_size;
4101+ rs->set.io_mask = io_size - 1;
4102+ rs->set.io_shift = ffs(io_size) - 1;
4103+ rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
4104+
4105+ rs->set.pages_per_io = chunk_pages(io_size);
4106+ rs->set.sectors_per_dev = sectors_per_dev;
4107+
4108+ rs->set.ei = -1; /* Indicate no failed device. */
4109+ atomic_set(&rs->set.failed_devs, 0);
4110+
4111+ rs->ti = ti;
4112+
4113+ atomic_set(rec->io_count + IO_WORK, 0);
4114+ atomic_set(rec->io_count + IO_RECOVER, 0);
4115+
4116+ /* Initialize io lock and queues. */
4117+ spin_lock_init(&rs->io.in_lock);
4118+ bio_list_init(&rs->io.in);
4119+ bio_list_init(&rs->io.work);
4120+
4121+ init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
4122+
4123+ rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
4124+ rec->rh = dm_rh_client_create(MAX_RECOVER, dispatch_delayed_bios, rs,
4125+ wake_do_raid, rs, dl, region_size,
4126+ rs->recover.nr_regions);
4127+ if (IS_ERR(rec->rh))
4128+ goto bad_rh;
4129+
4130+ /* Initialize stripe cache. */
4131+ r = sc_init(rs, stripes);
4132+ if (r)
4133+ goto bad_sc;
4134+
4135+ /* Create dm-io client context. */
4136+ rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
4137+ rs->set.pages_per_io);
4138+ if (IS_ERR(rs->sc.dm_io_client))
4139+ goto bad_dm_io_client;
4140+
4141+ /* REMOVEME: statistics. */
4142+ stats_reset(rs);
4143+ ClearRSDevelStats(rs); /* Disable development statistics. */
4144+
4145+ *raid_set = rs;
4146+ return 0;
4147+
4148+bad_dirty_log:
4149+ TI_ERR_RET("Error creating dirty log", -ENOMEM);
4150+
4151+
4152+bad_chunk_size:
4153+ dm_dirty_log_destroy(dl);
4154+ TI_ERR("Chunk size larger than region size");
4155+
4156+bad_recover_io_size:
4157+ dm_dirty_log_destroy(dl);
4158+ TI_ERR("Recover stripe io size larger than region size");
4159+
4160+bad_array:
4161+ dm_dirty_log_destroy(dl);
4162+ TI_ERR("Array too big");
4163+
4164+bad_alloc:
4165+ dm_dirty_log_destroy(dl);
4166+ TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
4167+
4168+bad_rh:
4169+ dm_dirty_log_destroy(dl);
4170+ ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
4171+ goto free_rs;
4172+
4173+bad_sc:
4174+ ti->error = DM_MSG_PREFIX "Error creating stripe cache";
4175+ goto free;
4176+
4177+bad_dm_io_client:
4178+ ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
4179+free:
4180+ sc_exit(&rs->sc);
4181+ dm_rh_client_destroy(rec->rh);
4182+ dm_dirty_log_destroy(dl);
4183+free_rs:
4184+ kfree(rs);
4185+ return -ENOMEM;
4186+}
4187+
4188+/* Free a RAID context (a RAID set). */
4189+static void
4190+context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
4191+{
4192+ while (r--)
4193+ dm_put_device(ti, rs->dev[r].dev);
4194+
4195+ dm_io_client_destroy(rs->sc.dm_io_client);
4196+ sc_exit(&rs->sc);
4197+ dm_rh_client_destroy(rs->recover.rh);
4198+ dm_dirty_log_destroy(rs->recover.dl);
4199+ kfree(rs);
4200+}
4201+
4202+/* Create work queue and initialize work. */
4203+static int rs_workqueue_init(struct raid_set *rs)
4204+{
4205+ struct dm_target *ti = rs->ti;
4206+
4207+ rs->io.wq = create_singlethread_workqueue(DAEMON);
4208+ if (!rs->io.wq)
4209+ TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
4210+
4211+ INIT_DELAYED_WORK(&rs->io.dws, do_raid);
4212+ return 0;
4213+}
4214+
4215+/* Return pointer to raid_type structure for raid name. */
4216+static struct raid_type *get_raid_type(char *name)
4217+{
4218+ struct raid_type *r = ARRAY_END(raid_types);
4219+
4220+ while (r-- > raid_types) {
4221+ if (!strnicmp(STR_LEN(r->name, name)))
4222+ return r;
4223+ }
4224+
4225+ return NULL;
4226+}
4227+
4228+/* FIXME: factor out to dm core. */
4229+static int multiple(sector_t a, sector_t b, sector_t *n)
4230+{
4231+ sector_t r = a;
4232+
4233+ sector_div(r, b);
4234+ *n = r;
4235+ return a == r * b;
4236+}
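+/*
+ * Usage sketch for multiple() with purely illustrative numbers:
+ *
+ *	sector_t n;
+ *	multiple(1000, 8, &n);	-> returns 1 (true), n = 125
+ *	multiple(1001, 8, &n);	-> returns 0 (false), n = 125
+ */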
4237+
4238+/* Log RAID set information to kernel log. */
4239+static void raid_set_log(struct raid_set *rs, unsigned speed)
4240+{
4241+ unsigned p;
4242+ char buf[BDEVNAME_SIZE];
4243+
4244+ for (p = 0; p < rs->set.raid_devs; p++)
4245+ DMINFO("/dev/%s is raid disk %u",
4246+ bdevname(rs->dev[p].dev->bdev, buf), p);
4247+
4248+ DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
4249+ rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
4250+ atomic_read(&rs->sc.stripes));
4251+ DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
4252+ rs->xor.chunks, mbpers(rs, speed));
4253+ DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
4254+ rs->set.data_devs, rs->set.raid_devs);
4255+}
4256+
4257+/* Get all devices and offsets. */
4258+static int
4259+dev_parms(struct dm_target *ti, struct raid_set *rs,
4260+ char **argv, int *p)
4261+{
4262+ for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
4263+ int r;
4264+ unsigned long long tmp;
4265+ struct raid_dev *dev = rs->dev + *p;
4266+ union dev_lookup dl = {.dev = dev };
4267+
4268+ /* Get offset and device. */
4269+ r = sscanf(argv[1], "%llu", &tmp);
4270+ if (r != 1)
4271+ TI_ERR("Invalid RAID device offset parameter");
4272+
4273+ dev->start = tmp;
4274+ r = dm_get_device(ti, argv[0], dev->start,
4275+ rs->set.sectors_per_dev,
4276+ dm_table_get_mode(ti->table), &dev->dev);
4277+ if (r)
4278+ TI_ERR_RET("RAID device lookup failure", r);
4279+
4280+ r = raid_dev_lookup(rs, bynumber, &dl);
4281+ if (r != -ENODEV && r < *p) {
4282+ (*p)++; /* Ensure dm_put_device() on actual device. */
4283+ TI_ERR_RET("Duplicate RAID device", -ENXIO);
4284+ }
4285+ }
4286+
4287+ return 0;
4288+}
4289+
4290+/* Set recovery bandwidth. */
4291+static INLINE void
4292+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
4293+{
4294+ rs->recover.bandwidth = bandwidth;
4295+ rs->recover.bandwidth_work = 100 / bandwidth;
4296+}
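+/*
+ * Simple arithmetic example (illustrative): bandwidth = 25 gives
+ * bandwidth_work = 100 / 25 = 4; bandwidth = 100 gives bandwidth_work = 1.
+ */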
4297+
4298+/* Handle variable number of RAID parameters. */
4299+static int
4300+raid_variable_parms(struct dm_target *ti, char **argv,
4301+ unsigned i, int *raid_parms,
4302+ int *chunk_size, int *chunk_size_parm,
4303+ int *stripes, int *stripes_parm,
4304+ int *io_size, int *io_size_parm,
4305+ int *recover_io_size, int *recover_io_size_parm,
4306+ int *bandwidth, int *bandwidth_parm)
4307+{
4308+ /* Fetch # of variable raid parameters. */
4309+ if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
4310+ !range_ok(*raid_parms, 0, 5))
4311+ TI_ERR("Bad variable raid parameters number");
4312+
4313+ if (*raid_parms) {
4314+ /*
4315+ * If we've got variable RAID parameters,
4316+ * chunk size is the first one
4317+ */
4318+ if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
4319+ (*chunk_size != -1 &&
4320+ (!POWER_OF_2(*chunk_size) ||
4321+ !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
4322+ TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
4323+
4324+ *chunk_size_parm = *chunk_size;
4325+ if (*chunk_size == -1)
4326+ *chunk_size = CHUNK_SIZE;
4327+
4328+ /*
4329+ * In case we've got 2 or more variable raid
4330+ * parameters, the number of stripes is the second one
4331+ */
4332+ if (*raid_parms > 1) {
4333+ if (sscanf(argv[i++], "%d", stripes) != 1 ||
4334+ (*stripes != -1 &&
4335+ !range_ok(*stripes, STRIPES_MIN,
4336+ STRIPES_MAX)))
4337+ TI_ERR("Invalid number of stripes: must "
4338+ "be >= 8 and <= 8192");
4339+ }
4340+
4341+ *stripes_parm = *stripes;
4342+ if (*stripes == -1)
4343+ *stripes = STRIPES;
4344+
4345+ /*
4346+ * In case we've got 3 or more variable raid
4347+ * parameters, the io size is the third one.
4348+ */
4349+ if (*raid_parms > 2) {
4350+ if (sscanf(argv[i++], "%d", io_size) != 1 ||
4351+ (*io_size != -1 &&
4352+ (!POWER_OF_2(*io_size) ||
4353+ !range_ok(*io_size, IO_SIZE_MIN,
4354+ min(BIO_MAX_SECTORS / 2,
4355+ *chunk_size)))))
4356+ TI_ERR("Invalid io size; must "
4357+ "be 2^^n and less than or equal to "
4358+ "min(BIO_MAX_SECTORS/2, chunk size)");
4359+ } else
4360+ *io_size = *chunk_size;
4361+
4362+ *io_size_parm = *io_size;
4363+ if (*io_size == -1)
4364+ *io_size = *chunk_size;
4365+
4366+ /*
4367+ * In case we've got 4 variable raid parameters,
4368+ * the recovery stripe io_size is the fourth one
4369+ */
4370+ if (*raid_parms > 3) {
4371+ if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
4372+ (*recover_io_size != -1 &&
4373+ (!POWER_OF_2(*recover_io_size) ||
4374+ !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
4375+ BIO_MAX_SECTORS / 2))))
4376+ TI_ERR("Invalid recovery io size; must be "
4377+ "2^^n and less than or equal to BIO_MAX_SECTORS/2");
4378+ }
4379+
4380+ *recover_io_size_parm = *recover_io_size;
4381+ if (*recover_io_size == -1)
4382+ *recover_io_size = RECOVER_IO_SIZE;
4383+
4384+ /*
4385+ * In case we've got 5 variable raid parameters,
4386+ * the recovery io bandwidth is the fifth one
4387+ */
4388+ if (*raid_parms > 4) {
4389+ if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
4390+ (*bandwidth != -1 &&
4391+ !range_ok(*bandwidth, BANDWIDTH_MIN,
4392+ BANDWIDTH_MAX)))
4393+ TI_ERR("Invalid recovery bandwidth "
4394+ "percentage; must be > 0 and <= 100");
4395+ }
4396+
4397+ *bandwidth_parm = *bandwidth;
4398+ if (*bandwidth == -1)
4399+ *bandwidth = BANDWIDTH;
4400+ }
4401+
4402+ return 0;
4403+}
4404+
4405+/* Parse optional locking parameters. */
4406+static int
4407+raid_locking_parms(struct dm_target *ti, char **argv,
4408+ unsigned i, int *locking_parms,
4409+ struct dm_raid45_locking_type **locking_type)
4410+{
4411+ *locking_parms = 0;
4412+ *locking_type = &locking_none;
4413+
4414+ if (!strnicmp(argv[i], "none", strlen(argv[i])))
4415+ *locking_parms = 1;
4416+ else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
4417+ *locking_type = &locking_none;
4418+ *locking_parms = 2;
4419+ } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
4420+ *locking_type = &locking_cluster;
4421+ /* FIXME: namespace. */
4422+ *locking_parms = 3;
4423+ }
4424+
4425+ return *locking_parms == 1 ? -EINVAL : 0;
4426+}
4427+
4428+/* Set backing device information properties of RAID set. */
4429+static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
4430+{
4431+ unsigned p, ra_pages;
4432+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
4433+ struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
4434+
4435+ /* Set read-ahead for the RAID set and the component devices. */
4436+ bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
4437+ ra_pages = chunks * chunk_pages(rs->set.io_size);
4438+ for (p = rs->set.raid_devs; p--; ) {
4439+ struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
4440+
4441+ q->backing_dev_info.ra_pages = ra_pages;
4442+ }
4443+
4444+ /* Set congested function and data. */
4445+ bdi->congested_fn = raid_set_congested;
4446+ bdi->congested_data = rs;
4447+
4448+ dm_put(md);
4449+}
4450+
4451+/* Get backing device information properties of RAID set. */
4452+static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
4453+{
4454+ struct mapped_device *md = dm_table_get_md(rs->ti->table);
4455+
4456+ *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
4457+ / stripe_pages(rs, rs->set.io_size);
4458+ *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
4459+ / chunk_pages(rs->set.io_size);
4460+
4461+ dm_put(md);
4462+}
4463+
4464+/*
4465+ * Construct a RAID4/5 mapping:
4466+ *
4467+ * log_type #log_params <log_params> \
4468+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
4469+ * [locking "none"/"cluster"]
4470+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
4471+ *
4472+ * log_type = "core"/"disk",
4473+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
4474+ * log_params = [dirty_log_path] region_size [[no]sync])
4475+ *
4476+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
4477+ *
4478+ * #parity_dev = N if raid_type = "raid4"
4479+ * o N = -1: pick default = last device
4480+ * o N >= 0 and < #raid_devs: parity device index
4481+ *
4482+ * #raid_variable_params = 0-5; raid_params (-1 = default):
4483+ * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
4484+ * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
4485+ * and <= CHUNK_SIZE_MAX)
4486+ * o #stripes is number of stripes allocated to stripe cache
4487+ * (must be > 1 and < STRIPES_MAX)
4488+ * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
4489+ * o recover_io_size (io unit size per device for recovery in sectors;
4490+ *   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4491+ * o %recovery_bandwidth is the maximum amount spent on recovery during
4492+ *   application io (1-100%)
4493+ * If raid_variable_params = 0, defaults will be used.
4494+ * Any raid_variable_param can be set to -1 to apply a default
4495+ *
4496+ * #raid_devs = N (N >= 3)
4497+ *
4498+ * #dev_to_initialize = N
4499+ * -1: initialize parity on all devices
4500+ * >= 0 and < #raid_devs: initialize the respective raid device; used to
4501+ *   force reconstruction of a failed device's content after replacement
4502+ *
4503+ * <dev_path> = device_path (eg, /dev/sdd1)
4504+ * <offset> = begin at offset on <dev_path>
4505+ *
4506+ */
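+/*
+ * Hypothetical example mapping (device names, sizes and layout are
+ * illustrative only and not taken from this patch):
+ *
+ *	echo "0 2097152 raid45 core 1 8192 raid5_la 0 3 -1 \
+ *	      /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0" | dmsetup create r5
+ *
+ * i.e. a core dirty log with one parameter (region size 8192 sectors),
+ * raid5 left asymmetric layout, no variable raid parameters (defaults),
+ * three devices, no device forced to be initialized, and each device
+ * used from offset 0.
+ */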
4507+#define MIN_PARMS 13
4508+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4509+{
4510+ int bandwidth = BANDWIDTH, bandwidth_parm = -1,
4511+ chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
4512+ dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
4513+ i, io_size = IO_SIZE, io_size_parm = -1,
4514+ r, raid_devs, raid_parms,
4515+ recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
4516+ stripes = STRIPES, stripes_parm = -1;
4517+ unsigned speed;
4518+ sector_t tmp, sectors_per_dev;
4519+ struct dm_raid45_locking_type *locking;
4520+ struct raid_set *rs;
4521+ struct raid_type *raid_type;
4522+
4523+ /* Ensure minimum number of parameters. */
4524+ if (argc < MIN_PARMS)
4525+ TI_ERR("Not enough parameters");
4526+
4527+ /* Fetch # of dirty log parameters. */
4528+ if (sscanf(argv[1], "%d", &dl_parms) != 1
4529+ || !range_ok(dl_parms, 1, 4711))
4530+ TI_ERR("Bad dirty log parameters number");
4531+
4532+ /* Check raid_type. */
4533+ raid_type = get_raid_type(argv[dl_parms + 2]);
4534+ if (!raid_type)
4535+ TI_ERR("Bad raid type");
4536+
4537+ /* In case of RAID4, parity drive is selectable. */
4538+ parity_parm = !!(raid_type->level == raid4);
4539+
4540+ /* Handle variable number of RAID parameters. */
4541+ r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
4542+ &raid_parms,
4543+ &chunk_size, &chunk_size_parm,
4544+ &stripes, &stripes_parm,
4545+ &io_size, &io_size_parm,
4546+ &recover_io_size, &recover_io_size_parm,
4547+ &bandwidth, &bandwidth_parm);
4548+ if (r)
4549+ return r;
4550+
4551+ r = raid_locking_parms(ti, argv,
4552+ dl_parms + parity_parm + raid_parms + 4,
4553+ &locking_parms, &locking);
4554+ if (r)
4555+ return r;
4556+
4557+ /* # of raid devices. */
4558+ i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
4559+ if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4560+ raid_devs < raid_type->minimal_devs)
4561+ TI_ERR("Invalid number of raid devices");
4562+
4563+ /* In case of RAID4, check parity drive index is in limits. */
4564+ if (raid_type->level == raid4) {
4565+ /* Fetch index of parity device. */
4566+ if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4567+ !range_ok(pi, 0, raid_devs - 1))
4568+ TI_ERR("Invalid RAID4 parity device index");
4569+ }
4570+
4571+ /*
4572+ * Index of device to initialize starts at 0
4573+ *
4574+ * o -1 -> don't initialize a particular device,
4575+ * o 0..raid_devs-1 -> initialize respective device
4576+ * (used for reconstruction of a replaced device)
4577+ */
4578+ if (sscanf
4579+ (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
4580+ "%d", &dev_to_init) != 1
4581+ || !range_ok(dev_to_init, -1, raid_devs - 1))
4582+ TI_ERR("Invalid number for raid device to initialize");
4583+
4584+ /* Check # of raid device arguments. */
4585+ if (argc - dl_parms - parity_parm - raid_parms - 6 !=
4586+ 2 * raid_devs)
4587+ TI_ERR("Wrong number of raid device/offset arguments");
4588+
4589+ /*
4590+ * Check that the table length is divisible
4591+ * without remainder by (raid_devs - parity_devs)
4592+ */
4593+ if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4594+ &sectors_per_dev))
4595+ TI_ERR
4596+ ("Target length not divisible by number of data devices");
4597+
4598+ /*
4599+ * Check that the device size is
4600+ * divisible without remainder by the chunk size
4601+ */
4602+ if (!multiple(sectors_per_dev, chunk_size, &tmp))
4603+ TI_ERR("Device length not divisible by chunk_size");
4604+
4605+ /****************************************************************
4606+ * Now that we checked the constructor arguments ->
4607+ * let's allocate the RAID set
4608+ ****************************************************************/
4609+ r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
4610+ recover_io_size, raid_devs, sectors_per_dev,
4611+ ti, dl_parms, argv);
4612+ if (r)
4613+ return r;
4614+
4615+ /*
4616+ * Set these here in order to avoid passing
4617+ * too many arguments to context_alloc()
4618+ */
4619+ rs->set.dev_to_init_parm = dev_to_init;
4620+ rs->set.dev_to_init = dev_to_init;
4621+ rs->set.pi_parm = pi;
4622+ rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4623+ rs->set.raid_parms = raid_parms;
4624+ rs->set.chunk_size_parm = chunk_size_parm;
4625+ rs->set.io_size_parm = io_size_parm;
4626+ rs->sc.stripes_parm = stripes_parm;
4627+ rs->recover.io_size_parm = recover_io_size_parm;
4628+ rs->recover.bandwidth_parm = bandwidth_parm;
4629+ recover_set_bandwidth(rs, bandwidth);
4630+
4631+ /* Use locking type to lock stripe access. */
4632+ rs->locking = locking;
4633+
4634+ /* Get the device/offset tupels. */
4635+ argv += dl_parms + 6 + parity_parm + raid_parms;
4636+ r = dev_parms(ti, rs, argv, &i);
4637+ if (r)
4638+ goto err;
4639+
4640+ /* Initialize recovery. */
4641+ rs->recover.start_jiffies = jiffies;
4642+ rs->recover.end_jiffies = 0;
4643+ recovery_region_reset(rs);
4644+
4645+ /* Allow for recovery of any nosync regions. */
4646+ SetRSRecover(rs);
4647+
4648+ /* Set backing device information (eg. read ahead). */
4649+ rs_set_bdi(rs, chunk_size * 2, io_size * 4);
4650+ SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4651+
4652+ speed = xor_optimize(rs); /* Select best xor algorithm. */
4653+
4654+ /* Initialize work queue to handle this RAID set's io. */
4655+ r = rs_workqueue_init(rs);
4656+ if (r)
4657+ goto err;
4658+
4659+ raid_set_log(rs, speed); /* Log information about RAID set. */
4660+
4661+ /*
4662+ * Make sure that dm core only hands maximum io size
4663+ * length down and pays attention to io boundaries.
4664+ */
4665+ ti->split_io = rs->set.io_size;
4666+ ti->private = rs;
4667+ return 0;
4668+
4669+err:
4670+ context_free(rs, ti, i);
4671+ return r;
4672+}
4673+
4674+/*
4675+ * Destruct a raid mapping
4676+ */
4677+static void raid_dtr(struct dm_target *ti)
4678+{
4679+ struct raid_set *rs = ti->private;
4680+
4681+ /* Indicate recovery end so that ios in flight drain. */
4682+ ClearRSRecover(rs);
4683+
4684+ wake_do_raid(rs); /* Wake daemon. */
4685+ wait_ios(rs); /* Wait for any io still being processed. */
4686+ destroy_workqueue(rs->io.wq);
4687+ context_free(rs, ti, rs->set.raid_devs);
4688+}
4689+
4690+/* Queues ios to RAID sets. */
4691+static inline void queue_bio(struct raid_set *rs, struct bio *bio)
4692+{
4693+ int wake;
4694+ struct bio_list *in = &rs->io.in;
4695+ spinlock_t *in_lock = &rs->io.in_lock;
4696+
4697+ spin_lock_irq(in_lock);
4698+ wake = bio_list_empty(in);
4699+ bio_list_add(in, bio);
4700+ spin_unlock_irq(in_lock);
4701+
4702+ /* Wake daemon if input list was empty. */
4703+ if (wake)
4704+ wake_do_raid(rs);
4705+}
4706+
4707+/* Raid mapping function. */
4708+static int raid_map(struct dm_target *ti, struct bio *bio,
4709+ union map_info *map_context)
4710+{
4711+ /* I don't want to waste stripe cache capacity. */
4712+ if (bio_rw(bio) == READA)
4713+ return -EIO;
4714+ else {
4715+ struct raid_set *rs = ti->private;
4716+
4717+ /* REMOVEME: statistics. */
4718+ atomic_inc(rs->stats +
4719+ (bio_data_dir(bio) == WRITE ?
4720+ S_BIOS_WRITE : S_BIOS_READ));
4721+
4722+ /*
4723+ * Get io reference to be waiting for to drop
4724+ * to zero on device suspension/destruction.
4725+ */
4726+ io_get(rs);
4727+ bio->bi_sector -= ti->begin; /* Remap sector. */
4728+ queue_bio(rs, bio); /* Queue to the daemon. */
4729+ return DM_MAPIO_SUBMITTED; /* Handle later. */
4730+ }
4731+}
4732+
4733+/* Device suspend. */
4734+static void raid_postsuspend(struct dm_target *ti)
4735+{
4736+ struct raid_set *rs = ti->private;
4737+ struct dm_dirty_log *dl = rs->recover.dl;
4738+
4739+ SetRSSuspended(rs);
4740+
4741+ if (RSRecover(rs))
4742+ dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
4743+ else
4744+ wake_do_raid(rs);
4745+
4746+ wait_ios(rs); /* Wait for completion of all ios being processed. */
4747+ if (dl->type->postsuspend && dl->type->postsuspend(dl))
4748+ /* Suspend dirty log. */
4749+ /* FIXME: need better error handling. */
4750+ DMWARN("log suspend failed");
4751+}
4752+
4753+/* Device resume. */
4754+static void raid_resume(struct dm_target *ti)
4755+{
4756+ struct raid_set *rs = ti->private;
4757+ struct recover *rec = &rs->recover;
4758+ struct dm_dirty_log *dl = rec->dl;
4759+
4760+ if (dl->type->resume && dl->type->resume(dl))
4761+ /* Resume dirty log. */
4762+ /* FIXME: need better error handling. */
4763+ DMWARN("log resume failed");
4764+
4765+ rec->nr_regions_to_recover =
4766+ rec->nr_regions - dl->type->get_sync_count(dl);
4767+
4768+ ClearRSSuspended(rs);
4769+
4770+ /* Reset any unfinished recovery. */
4771+ if (RSRecover(rs)) {
4772+ recovery_region_reset(rs);
4773+ dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
4774+ } else
4775+ wake_do_raid(rs);
4776+}
4777+
4778+static INLINE unsigned sc_size(struct raid_set *rs)
4779+{
4780+ return to_sector(atomic_read(&rs->sc.stripes) *
4781+ (sizeof(struct stripe) +
4782+ (sizeof(struct stripe_set) +
4783+ (sizeof(struct page_list) +
4784+ to_bytes(rs->set.io_size) *
4785+ rs->set.raid_devs)) +
4786+ (rs->recover.end_jiffies ?
4787+ 0 :
4788+ to_bytes(rs->set.raid_devs *
4789+ rs->recover.io_size))));
4790+}
4791+
4792+/* REMOVEME: status output for development. */
4793+static void
4794+raid_devel_stats(struct dm_target *ti, char *result,
4795+ unsigned *size, unsigned maxlen)
4796+{
4797+ unsigned chunks, stripes, sz = *size;
4798+ unsigned long j;
4799+ char buf[BDEVNAME_SIZE], *p;
4800+ struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
4801+ struct raid_set *rs = ti->private;
4802+ struct recover *rec = &rs->recover;
4803+ struct timespec ts;
4804+
4805+ DMEMIT("%s ", version);
4806+ DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
4807+ DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
4808+
4809+ for (sm = stats_map; sm < sm_end; sm++)
4810+ DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4811+
4812+ DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
4813+ DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
4814+ atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
4815+ sc_size(rs));
4816+
4817+ j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4818+ rec->start_jiffies;
4819+ jiffies_to_timespec(j, &ts);
4820+ sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4821+ p = strchr(buf, '.');
4822+ p[3] = 0;
4823+
4824+ DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
4825+ (unsigned long long) rec->nr_regions_recovered,
4826+ RSRegionGet(rs) ? "+" : "",
4827+ (unsigned long long) rec->nr_regions_to_recover,
4828+ (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4829+
4830+ rs_get_ra(rs, &stripes, &chunks);
4831+ DMEMIT("ra=%u/%u ", stripes, chunks);
4832+
4833+ *size = sz;
4834+}
4835+
4836+static int
4837+raid_status(struct dm_target *ti, status_type_t type,
4838+ char *result, unsigned maxlen)
4839+{
4840+ unsigned i, sz = 0;
4841+ char buf[BDEVNAME_SIZE];
4842+ struct raid_set *rs = ti->private;
4843+
4844+ switch (type) {
4845+ case STATUSTYPE_INFO:
4846+ /* REMOVEME: statistics. */
4847+ if (RSDevelStats(rs))
4848+ raid_devel_stats(ti, result, &sz, maxlen);
4849+
4850+ DMEMIT("%u ", rs->set.raid_devs);
4851+
4852+ for (i = 0; i < rs->set.raid_devs; i++)
4853+ DMEMIT("%s ",
4854+ format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
4855+
4856+ DMEMIT("1 ");
4857+ for (i = 0; i < rs->set.raid_devs; i++) {
4858+ DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
4859+
4860+ if (rs->set.raid_type->level == raid4 &&
4861+ i == rs->set.pi)
4862+ DMEMIT("p");
4863+
4864+ if (rs->set.dev_to_init == i)
4865+ DMEMIT("i");
4866+ }
4867+
4868+ break;
4869+
4870+ case STATUSTYPE_TABLE:
4871+ sz = rs->recover.dl->type->status(rs->recover.dl, type,
4872+ result, maxlen);
4873+ DMEMIT("%s %u ", rs->set.raid_type->name,
4874+ rs->set.raid_parms);
4875+
4876+ if (rs->set.raid_type->level == raid4)
4877+ DMEMIT("%d ", rs->set.pi_parm);
4878+
4879+ if (rs->set.raid_parms)
4880+ DMEMIT("%d ", rs->set.chunk_size_parm);
4881+
4882+ if (rs->set.raid_parms > 1)
4883+ DMEMIT("%d ", rs->sc.stripes_parm);
4884+
4885+ if (rs->set.raid_parms > 2)
4886+ DMEMIT("%d ", rs->set.io_size_parm);
4887+
4888+ if (rs->set.raid_parms > 3)
4889+ DMEMIT("%d ", rs->recover.io_size_parm);
4890+
4891+ if (rs->set.raid_parms > 4)
4892+ DMEMIT("%d ", rs->recover.bandwidth_parm);
4893+
4894+ DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4895+
4896+ for (i = 0; i < rs->set.raid_devs; i++)
4897+ DMEMIT("%s %llu ",
4898+ format_dev_t(buf,
4899+ rs->dev[i].dev->bdev->bd_dev),
4900+ (unsigned long long) rs->dev[i].start);
4901+ }
4902+
4903+ return 0;
4904+}
4905+
4906+/*
4907+ * Message interface
4908+ */
4909+enum raid_msg_actions {
4910+ act_bw, /* Recovery bandwidth switch. */
4911+ act_dev, /* Device failure switch. */
4912+ act_overwrite, /* Stripe overwrite check. */
4913+ act_read_ahead, /* Set read ahead. */
4914+ act_stats, /* Development statistics switch. */
4915+ act_sc, /* Stripe cache switch. */
4916+
4917+ act_on, /* Set entity on. */
4918+ act_off, /* Set entity off. */
4919+ act_reset, /* Reset entity. */
4920+
4921+ act_set = act_on, /* Set # absolute. */
4922+ act_grow = act_off, /* Grow # by an amount. */
4923+ act_shrink = act_reset, /* Shrink # by an amount. */
4924+};
4925+
4926+/* Turn a delta to absolute. */
4927+static int _absolute(unsigned long action, int act, int r)
4928+{
4929+ /* Make delta absolute. */
4930+ if (test_bit(act_set, &action))
4931+ ;
4932+ else if (test_bit(act_grow, &action))
4933+ r += act;
4934+ else if (test_bit(act_shrink, &action))
4935+ r = act - r;
4936+ else
4937+ r = -EINVAL;
4938+
4939+ return r;
4940+}
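+/*
+ * Example with illustrative numbers: for a current value act = 50,
+ * "set 30" yields 30, "grow 10" yields 60, "shrink 10" yields 40;
+ * any other action returns -EINVAL.
+ */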
4941+
4942+ /* Change recovery io bandwidth. */
4943+static int bandwidth_change(struct dm_msg *msg, void *context)
4944+{
4945+ struct raid_set *rs = context;
4946+ int act = rs->recover.bandwidth;
4947+ int bandwidth = DM_MSG_INT_ARG(msg);
4948+
4949+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4950+ /* Make delta bandwidth absolute. */
4951+ bandwidth = _absolute(msg->action, act, bandwidth);
4952+
4953+ /* Check range. */
4954+ if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4955+ recover_set_bandwidth(rs, bandwidth);
4956+ return 0;
4957+ }
4958+ }
4959+
4960+ set_bit(dm_msg_ret_arg, &msg->ret);
4961+ set_bit(dm_msg_ret_inval, &msg->ret);
4962+ return -EINVAL;
4963+}
4964+
4965+/* Change state of a device (running/offline). */
4966+/* FIXME: this only works while recovering!. */
4967+static int device_state(struct dm_msg *msg, void *context)
4968+{
4969+ int r;
4970+ const char *str = "is already ";
4971+ union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
4972+ struct raid_set *rs = context;
4973+
4974+ r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
4975+ bymajmin : byname, &dl);
4976+ if (r == -ENODEV) {
4977+ DMERR("device %s is no member of this set", dl.dev_name);
4978+ return r;
4979+ }
4980+
4981+ if (test_bit(act_off, &msg->action)) {
4982+ if (dev_operational(rs, r))
4983+ str = "";
4984+ } else if (!dev_operational(rs, r))
4985+ str = "";
4986+
4987+ DMINFO("/dev/%s %s%s", dl.dev_name, str,
4988+ test_bit(act_off, &msg->action) ? "offline" : "running");
4989+
4990+ return test_bit(act_off, &msg->action) ?
4991+ raid_set_check_and_degrade(rs, NULL, r) :
4992+ raid_set_check_and_upgrade(rs, r);
4993+}
4994+
4995+/* Set/reset development feature flags. */
4996+static int devel_flags(struct dm_msg *msg, void *context)
4997+{
4998+ struct raid_set *rs = context;
4999+
5000+ if (test_bit(act_on, &msg->action))
5001+ return test_and_set_bit(msg->spec->parm,
5002+ &rs->io.flags) ? -EPERM : 0;
5003+ else if (test_bit(act_off, &msg->action))
5004+ return test_and_clear_bit(msg->spec->parm,
5005+ &rs->io.flags) ? 0 : -EPERM;
5006+ else if (test_bit(act_reset, &msg->action)) {
5007+ if (test_bit(act_stats, &msg->action)) {
5008+ stats_reset(rs);
5009+ goto on;
5010+ } else if (test_bit(act_overwrite, &msg->action)) {
5011+on:
5012+ set_bit(msg->spec->parm, &rs->io.flags);
5013+ return 0;
5014+ }
5015+ }
5016+
5017+ return -EINVAL;
5018+}
5019+
5020+ /* Set stripe and chunk read ahead pages. */
5021+static int read_ahead_set(struct dm_msg *msg, void *context)
5022+{
5023+ int stripes = DM_MSG_INT_ARGS(msg, 0);
5024+ int chunks = DM_MSG_INT_ARGS(msg, 1);
5025+
5026+ if (range_ok(stripes, 1, 512) &&
5027+ range_ok(chunks, 1, 512)) {
5028+ rs_set_bdi(context, stripes, chunks);
5029+ return 0;
5030+ }
5031+
5032+ set_bit(dm_msg_ret_arg, &msg->ret);
5033+ set_bit(dm_msg_ret_inval, &msg->ret);
5034+ return -EINVAL;
5035+}
5036+
5037+/* Resize the stripe cache. */
5038+static int stripecache_resize(struct dm_msg *msg, void *context)
5039+{
5040+ int act, stripes;
5041+ struct raid_set *rs = context;
5042+
5043+ /* Deny permission in case the daemon is still shrinking! */
5044+ if (atomic_read(&rs->sc.stripes_to_shrink))
5045+ return -EPERM;
5046+
5047+ stripes = DM_MSG_INT_ARG(msg);
5048+ if (stripes > 0) {
5049+ act = atomic_read(&rs->sc.stripes);
5050+
5051+ /* Make delta stripes absolute. */
5052+ stripes = _absolute(msg->action, act, stripes);
5053+
5054+ /*
5055+ * Check range and that the # of stripes changes.
5056+ * We can grow from here but need to leave any
5057+ * shrinking to the worker for synchronization.
5058+ */
5059+ if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
5060+ if (stripes > act)
5061+ return sc_grow(&rs->sc, stripes - act, SC_GROW);
5062+ else if (stripes < act) {
5063+ atomic_set(&rs->sc.stripes_to_shrink,
5064+ act - stripes);
5065+ wake_do_raid(rs);
5066+ }
5067+
5068+ return 0;
5069+ }
5070+ }
5071+
5072+ set_bit(dm_msg_ret_arg, &msg->ret);
5073+ set_bit(dm_msg_ret_inval, &msg->ret);
5074+ return -EINVAL;
5075+}
5076+
5077+/* Parse the RAID message action. */
5078+/*
5079+ * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
5080+ * 'de[vice] o[ffline]/r[unning] DevName/maj:min' # e.g. 'device o /dev/sda'
5081+ * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
5082+ * 'r[ead_ahead] set #stripes #chunks' # e.g. 'r se 3 2'
5083+ * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
5084+ * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
5085+ *
5086+ */
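+/*
+ * Hypothetical invocations via dmsetup (mapped device name assumed):
+ *
+ *	dmsetup message r5 0 bandwidth set 50
+ *	dmsetup message r5 0 stripecache grow 256
+ */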
5087+static int
5088+raid_message(struct dm_target *ti, unsigned argc, char **argv)
5089+{
5090+ /* Variables to store the parsed parameters in. */
5091+ static int i[2];
5092+ static unsigned long *i_arg[] = {
5093+ (unsigned long *) i + 0,
5094+ (unsigned long *) i + 1,
5095+ };
5096+ static char *p;
5097+ static unsigned long *p_arg[] = { (unsigned long *) &p };
5098+
5099+ /* Declare all message option strings. */
5100+ static char *str_sgs[] = { "set", "grow", "shrink" };
5101+ static char *str_dev[] = { "running", "offline" };
5102+ static char *str_oor[] = { "on", "off", "reset" };
5103+
5104+ /* Declare all actions. */
5105+ static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
5106+ static unsigned long act_oor[] = { act_on, act_off, act_reset };
5107+
5108+ /* Bandwidth option. */
5109+ static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
5110+ static struct dm_message_argument bw_args = {
5111+ 1, i_arg, { dm_msg_int_t }
5112+ };
5113+
5114+ /* Device option. */
5115+ static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
5116+ static struct dm_message_argument dev_args = {
5117+ 1, p_arg, { dm_msg_base_t }
5118+ };
5119+
5120+ /* Read ahead option. */
5121+ static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
5122+ static struct dm_message_argument ra_args = {
5123+ 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
5124+ };
5125+
5126+ static struct dm_message_argument null_args = {
5127+ 0, NULL, { dm_msg_int_t }
5128+ };
5129+
5130+ /* Overwrite and statistics option. */
5131+ static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
5132+
5133+ /* Stripecache option. */
5134+ static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
5135+
5136+ /* Declare messages. */
5137+ static struct dm_msg_spec specs[] = {
5138+ { "bandwidth", act_bw, &bw_opt, &bw_args,
5139+ 0, bandwidth_change },
5140+ { "device", act_dev, &dev_opt, &dev_args,
5141+ 0, device_state },
5142+ { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
5143+ RS_CHECK_OVERWRITE, devel_flags },
5144+ { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
5145+ 0, read_ahead_set },
5146+ { "statistics", act_stats, &ovr_stats_opt, &null_args,
5147+ RS_DEVEL_STATS, devel_flags },
5148+ { "stripecache", act_sc, &stripe_opt, &bw_args,
5149+ 0, stripecache_resize },
5150+ };
5151+
5152+ /* The message for the parser. */
5153+ struct dm_msg msg = {
5154+ .num_specs = ARRAY_SIZE(specs),
5155+ .specs = specs,
5156+ };
5157+
5158+ return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
5159+}
5160+/*
5161+ * END message interface
5162+ */
5163+
5164+static struct target_type raid_target = {
5165+ .name = "raid45",
5166+ .version = {1, 0, 0},
5167+ .module = THIS_MODULE,
5168+ .ctr = raid_ctr,
5169+ .dtr = raid_dtr,
5170+ .map = raid_map,
5171+ .postsuspend = raid_postsuspend,
5172+ .resume = raid_resume,
5173+ .status = raid_status,
5174+ .message = raid_message,
5175+};
5176+
5177+static void init_exit(const char *bad_msg, const char *good_msg, int r)
5178+{
5179+ if (r)
5180+ DMERR("Failed to %sregister target [%d]", bad_msg, r);
5181+ else
5182+ DMINFO("%s %s", good_msg, version);
5183+}
5184+
5185+static int __init dm_raid_init(void)
5186+{
5187+ int r;
5188+
5189+ r = dm_register_target(&raid_target);
5190+ init_exit("", "initialized", r);
5191+ return r;
5192+}
5193+
5194+static void __exit dm_raid_exit(void)
5195+{
5196+ int r;
5197+
5198+ r = dm_unregister_target(&raid_target);
5199+ init_exit("un", "exit", r);
5200+}
5201+
5202+/* Module hooks. */
5203+module_init(dm_raid_init);
5204+module_exit(dm_raid_exit);
5205+
5206+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
5207+MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
5208+MODULE_LICENSE("GPL");
5209--- /dev/null
5210+++ b/drivers/md/dm-raid45.h
5211@@ -0,0 +1,28 @@
5212+/*
5213+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
5214+ *
5215+ * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
5216+ *
5217+ * Locking definitions for the device-mapper RAID45 target.
5218+ *
5219+ * This file is released under the GPL.
5220+ *
5221+ */
5222+
5223+#ifndef _DM_RAID45_H
5224+#define _DM_RAID45_H
5225+
5226+/* Factor out to dm.h! */
5227+#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr))
5228+
5229+enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
5230+
5231+struct dm_raid45_locking_type {
5232+ /* Request a lock on a stripe. */
5233+ void* (*lock)(sector_t key, enum dm_lock_type type);
5234+
5235+ /* Release a lock on a stripe. */
5236+ void (*unlock)(void *lock_handle);
5237+};
5238+
5239+#endif
5240--- /dev/null
5241+++ b/drivers/md/dm-regions.c
5242@@ -0,0 +1,723 @@
5243+/*
5244+ * Copyright (C) 2003 Sistina Software Limited.
5245+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5246+ *
5247+ * This file is released under the GPL.
5248+ */
5249+
5250+#include <linux/dm-dirty-log.h>
5251+#include <linux/dm-regions.h>
5252+
5253+#include <linux/ctype.h>
5254+#include <linux/init.h>
5255+#include <linux/module.h>
5256+#include <linux/vmalloc.h>
5257+
5258+#include "dm.h"
5259+#include "dm-bio-list.h"
5260+
5261+#define DM_MSG_PREFIX "region hash"
5262+
5263+/*-----------------------------------------------------------------
5264+ * Region hash
5265+ *
5266+ * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions.
5267+ * Each region can be in one of three states:
5268+ *
5269+ * o clean
5270+ * o dirty,
5271+ * o nosync.
5272+ *
5273+ * There is no need to put clean regions in the hash.
5274+ *
5275+ *
5276+ * In addition to being present in the hash table a region _may_
5277+ * be present on one of three lists.
5278+ *
5279+ * clean_regions: Regions on this list have no io pending to
5280+ * them, they are in sync, we are no longer interested in them,
5281+ * they are dull. dm_rh_update_states() will remove them from the
5282+ * hash table.
5283+ *
5284+ * quiesced_regions: These regions have been spun down, ready
5285+ * for recovery. dm_rh_recovery_start() will remove regions from
5286+ * this list and hand them to the caller, which will schedule the
5287+ * recovery io.
5288+ *
5289+ * recovered_regions: Regions that the caller has successfully
5290+ * recovered. dm_rh_update_states() will now schedule any delayed
5291+ * io, up the recovery_count, and remove the region from the hash.
5292+ *
5293+ * There are 2 locks:
5294+ * A rw spin lock 'hash_lock' protects just the hash table,
5295+ * this is never held in write mode from interrupt context,
5296+ * which I believe means that we only have to disable irqs when
5297+ * doing a write lock.
5298+ *
5299+ * An ordinary spin lock 'region_lock' that protects the three
5300+ * lists in the region_hash, with the 'state', 'list' and
5301+ * 'delayed_bios' fields of the regions. This is used from irq
5302+ * context, so all other uses will have to suspend local irqs.
5303+ *---------------------------------------------------------------*/
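+/*
+ * Typical region lifecycle as implemented below (summary only):
+ * a write to a clean region marks it DM_RH_DIRTY via dm_rh_inc();
+ * when dm_rh_dec() sees the pending count drop to zero, the region
+ * goes back onto clean_regions (or onto quiesced_regions if it was
+ * DM_RH_RECOVERING); dm_rh_recovery_start() hands quiesced regions
+ * to the caller and dm_rh_recovery_end() queues them on the
+ * recovered/failed_recovered lists for dm_rh_update_states() to retire.
+ */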
5304+struct region_hash {
5305+ unsigned max_recovery; /* Max # of regions to recover in parallel */
5306+
5307+ /* Callback function to dispatch queued writes on recovered regions. */
5308+ void (*dispatch)(void *context, struct bio_list *bios, int error);
5309+ void *dispatch_context;
5310+
5311+ /* Callback function to wakeup callers worker thread. */
5312+ void (*wake)(void *context);
5313+ void *wake_context;
5314+
5315+ uint32_t region_size;
5316+ unsigned region_shift;
5317+
5318+ /* holds persistent region state */
5319+ struct dm_dirty_log *log;
5320+
5321+ /* hash table */
5322+ rwlock_t hash_lock;
5323+ mempool_t *region_pool;
5324+ unsigned mask;
5325+ unsigned nr_buckets;
5326+ unsigned prime;
5327+ unsigned shift;
5328+ struct list_head *buckets;
5329+
5330+ spinlock_t region_lock;
5331+ atomic_t recovery_in_flight;
5332+ struct semaphore recovery_count;
5333+ struct list_head clean_regions;
5334+ struct list_head quiesced_regions;
5335+ struct list_head recovered_regions;
5336+ struct list_head failed_recovered_regions;
5337+};
5338+
5339+struct region {
5340+ region_t key;
5341+ enum dm_rh_region_states state;
5342+ void *context; /* Caller context. */
5343+
5344+ struct list_head hash_list;
5345+ struct list_head list;
5346+
5347+ atomic_t pending;
5348+ struct bio_list delayed_bios;
5349+};
5350+
5351+/*
5352+ * Conversion fns
5353+ */
5354+region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector)
5355+{
5356+ return sector >> ((struct region_hash *) rh)->region_shift;
5357+}
5358+EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
5359+
5360+region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio)
5361+{
5362+ return dm_rh_sector_to_region(rh, bio->bi_sector);
5363+}
5364+EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
5365+
5366+sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region)
5367+{
5368+ return region << ((struct region_hash *) rh)->region_shift;
5369+}
5370+EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
5371+
5372+/*
5373+ * Retrieval fns.
5374+ */
5375+region_t dm_rh_get_region_key(struct dm_region *reg)
5376+{
5377+ return ((struct region *) reg)->key;
5378+}
5379+EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
5380+
5381+sector_t dm_rh_get_region_size(struct dm_rh_client *rh)
5382+{
5383+ return ((struct region_hash *) rh)->region_size;
5384+}
5385+EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
5386+
5387+/* Squirrel a context with a region. */
5388+void *dm_rh_reg_get_context(struct dm_region *reg)
5389+{
5390+ return ((struct region *) reg)->context;
5391+}
5392+EXPORT_SYMBOL_GPL(dm_rh_reg_get_context);
5393+
5394+void dm_rh_reg_set_context(struct dm_region *reg, void *context)
5395+{
5396+ ((struct region *) reg)->context = context;
5397+}
5398+EXPORT_SYMBOL_GPL(dm_rh_reg_set_context);
5399+
5400+/*
5401+ * Create region hash client.
5402+ */
5403+#define MIN_REGIONS 64
5404+struct dm_rh_client *dm_rh_client_create(
5405+ unsigned max_recovery,
5406+ void (*dispatch)(void *dispatch_context,
5407+ struct bio_list *bios, int error),
5408+ void *dispatch_context,
5409+ void (*wake)(void *wake_context), void *wake_context,
5410+ struct dm_dirty_log *log, uint32_t region_size,
5411+ region_t nr_regions)
5412+{
5413+ unsigned i;
5414+ unsigned nr_buckets, max_buckets;
5415+ unsigned hash_primes[] = {
5416+ /* Table of primes for rh_hash/table size optimization. */
5417+ 3, 7, 13, 27, 53, 97, 193, 389, 769,
5418+ 1543, 3079, 6151, 12289, 24593,
5419+ };
5420+ struct region_hash *rh;
5421+
5422+ if (region_size & (region_size - 1)) {
5423+ DMERR("region size must be 2^^n");
5424+ return ERR_PTR(-EINVAL);
5425+ }
5426+
5427+ /* Calculate a suitable number of buckets for our hash table. */
5428+ max_buckets = nr_regions >> 6;
5429+ for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
5430+ ;
5431+ nr_buckets >>= 1;
5432+
5433+ rh = kmalloc(sizeof(*rh), GFP_KERNEL);
5434+ if (!rh) {
5435+ DMERR("unable to allocate region hash memory");
5436+ return ERR_PTR(-ENOMEM);
5437+ }
5438+
5439+ rh->max_recovery = max_recovery;
5440+ rh->dispatch = dispatch;
5441+ rh->dispatch_context = dispatch_context;
5442+ rh->wake = wake;
5443+ rh->wake_context = wake_context;
5444+ rh->log = log;
5445+ rh->region_size = region_size;
5446+ rh->region_shift = ffs(region_size) - 1;
5447+ rwlock_init(&rh->hash_lock);
5448+ rh->mask = nr_buckets - 1;
5449+ rh->nr_buckets = nr_buckets;
5450+ rh->shift = ffs(nr_buckets);
5451+
5452+ /* Check prime array limits. */
5453+ i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ?
5454+ ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2;
5455+ rh->prime = hash_primes[i];
5456+
5457+ rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
5458+ if (!rh->buckets) {
5459+ DMERR("unable to allocate region hash bucket memory");
5460+ kfree(rh);
5461+ return ERR_PTR(-ENOMEM);
5462+ }
5463+
5464+ for (i = 0; i < nr_buckets; i++)
5465+ INIT_LIST_HEAD(rh->buckets + i);
5466+
5467+ spin_lock_init(&rh->region_lock);
5468+ sema_init(&rh->recovery_count, 0);
5469+ atomic_set(&rh->recovery_in_flight, 0);
5470+ INIT_LIST_HEAD(&rh->clean_regions);
5471+ INIT_LIST_HEAD(&rh->quiesced_regions);
5472+ INIT_LIST_HEAD(&rh->recovered_regions);
5473+ INIT_LIST_HEAD(&rh->failed_recovered_regions);
5474+
5475+ rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
5476+ sizeof(struct region));
5477+ if (!rh->region_pool) {
5478+ vfree(rh->buckets);
5479+ kfree(rh);
5480+ rh = ERR_PTR(-ENOMEM);
5481+ }
5482+
5483+ return (struct dm_rh_client *) rh;
5484+}
5485+EXPORT_SYMBOL_GPL(dm_rh_client_create);
5486+
5487+void dm_rh_client_destroy(struct dm_rh_client *rh_in)
5488+{
5489+ unsigned h;
5490+ struct region_hash *rh = (struct region_hash *) rh_in;
5491+ struct region *reg, *tmp;
5492+
5493+ BUG_ON(!list_empty(&rh->quiesced_regions));
5494+
5495+ for (h = 0; h < rh->nr_buckets; h++) {
5496+ list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
5497+ BUG_ON(atomic_read(&reg->pending));
5498+ mempool_free(reg, rh->region_pool);
5499+ }
5500+ }
5501+
5502+ if (rh->region_pool)
5503+ mempool_destroy(rh->region_pool);
5504+
5505+ vfree(rh->buckets);
5506+ kfree(rh);
5507+}
5508+EXPORT_SYMBOL_GPL(dm_rh_client_destroy);
5509+
5510+static inline unsigned rh_hash(struct region_hash *rh, region_t region)
5511+{
5512+ return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
5513+}
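+/*
+ * Worked example with illustrative numbers: for 128 buckets the
+ * constructor above sets mask = 127, shift = ffs(128) = 8 and
+ * prime = hash_primes[6] = 193, so region 1000 hashes to
+ * ((1000 * 193) >> 8) & 127 = 113.
+ */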
5514+
5515+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
5516+{
5517+ struct region *reg;
5518+ struct list_head *bucket = rh->buckets + rh_hash(rh, region);
5519+
5520+ list_for_each_entry(reg, bucket, hash_list) {
5521+ if (reg->key == region)
5522+ return reg;
5523+ }
5524+
5525+ return NULL;
5526+}
5527+
5528+static void __rh_insert(struct region_hash *rh, struct region *reg)
5529+{
5530+ list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
5531+}
5532+
5533+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
5534+{
5535+ struct region *reg, *nreg;
5536+
5537+ read_unlock(&rh->hash_lock);
5538+ nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
5539+ if (unlikely(!nreg))
5540+ nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
5541+
5542+ nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
5543+ DM_RH_CLEAN : DM_RH_NOSYNC;
5544+ nreg->key = region;
5545+ INIT_LIST_HEAD(&nreg->list);
5546+ atomic_set(&nreg->pending, 0);
5547+ bio_list_init(&nreg->delayed_bios);
5548+
5549+ write_lock_irq(&rh->hash_lock);
5550+ reg = __rh_lookup(rh, region);
5551+ if (reg)
5552+ /* We lost the race. */
5553+ mempool_free(nreg, rh->region_pool);
5554+ else {
5555+ __rh_insert(rh, nreg);
5556+ if (nreg->state == DM_RH_CLEAN) {
5557+ spin_lock(&rh->region_lock);
5558+ list_add(&nreg->list, &rh->clean_regions);
5559+ spin_unlock(&rh->region_lock);
5560+ }
5561+
5562+ reg = nreg;
5563+ }
5564+
5565+ write_unlock_irq(&rh->hash_lock);
5566+ read_lock(&rh->hash_lock);
5567+ return reg;
5568+}
5569+
5570+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
5571+{
5572+ struct region *reg;
5573+
5574+ reg = __rh_lookup(rh, region);
5575+ return reg ? reg : __rh_alloc(rh, region);
5576+}
5577+
5578+int dm_rh_get_state(struct dm_rh_client *rh_in, region_t region, int may_block)
5579+{
5580+ int r;
5581+ struct region_hash *rh = (struct region_hash *) rh_in;
5582+ struct region *reg;
5583+
5584+ read_lock(&rh->hash_lock);
5585+ reg = __rh_lookup(rh, region);
5586+ read_unlock(&rh->hash_lock);
5587+
5588+ if (reg)
5589+ return reg->state;
5590+
5591+ /*
5592+ * The region wasn't in the hash, so we fall back to the dirty log.
5593+ */
5594+ r = rh->log->type->in_sync(rh->log, region, may_block);
5595+
5596+ /*
5597+ * Any error from the dirty log (eg. -EWOULDBLOCK)
5598+ * gets taken as a DM_RH_NOSYNC
5599+ */
5600+ return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
5601+}
5602+EXPORT_SYMBOL_GPL(dm_rh_get_state);
5603+
5604+void dm_rh_set_state(struct dm_rh_client *rh_in, region_t region,
5605+ enum dm_rh_region_states state, int may_block)
5606+{
5607+ struct region_hash *rh = (struct region_hash *) rh_in;
5608+ struct region *reg;
5609+ struct dm_dirty_log *log = rh->log;
5610+
5611+ if (state == DM_RH_NOSYNC)
5612+ log->type->set_region_sync(log, region, 0);
5613+ else if (state == DM_RH_CLEAN)
5614+ log->type->clear_region(log, region);
5615+ else if (state == DM_RH_DIRTY)
5616+ log->type->mark_region(log, region);
5617+
5618+ read_lock(&rh->hash_lock);
5619+ reg = __rh_find(rh, region);
5620+ reg->state = state;
5621+ read_unlock(&rh->hash_lock);
5622+}
5623+EXPORT_SYMBOL_GPL(dm_rh_set_state);
5624+
5625+void dm_rh_update_states(struct dm_rh_client *rh_in, int errors_handled)
5626+{
5627+ struct region_hash *rh = (struct region_hash *) rh_in;
5628+ struct region *reg, *next;
5629+ LIST_HEAD(clean);
5630+ LIST_HEAD(recovered);
5631+ LIST_HEAD(failed_recovered);
5632+
5633+ /*
5634+ * Quickly grab the lists and remove any regions from hash.
5635+ */
5636+ write_lock_irq(&rh->hash_lock);
5637+ spin_lock(&rh->region_lock);
5638+ if (!list_empty(&rh->clean_regions)) {
5639+ list_splice_init(&rh->clean_regions, &clean);
5640+
5641+ list_for_each_entry(reg, &clean, list)
5642+ list_del(&reg->hash_list);
5643+ }
5644+
5645+ if (!list_empty(&rh->recovered_regions)) {
5646+ list_splice_init(&rh->recovered_regions, &recovered);
5647+
5648+ list_for_each_entry(reg, &recovered, list)
5649+ list_del(&reg->hash_list);
5650+ }
5651+
5652+ if (!list_empty(&rh->failed_recovered_regions)) {
5653+ list_splice_init(&rh->failed_recovered_regions,
5654+ &failed_recovered);
5655+
5656+ list_for_each_entry(reg, &failed_recovered, list)
5657+ list_del(&reg->hash_list);
5658+ }
5659+
5660+ spin_unlock(&rh->region_lock);
5661+ write_unlock_irq(&rh->hash_lock);
5662+
5663+ /*
5664+ * All the regions on the recovered and clean lists have
5665+ * now been pulled out of the system, so no need to do
5666+ * any more locking.
5667+ */
5668+ list_for_each_entry_safe(reg, next, &recovered, list) {
5669+ rh->log->type->clear_region(rh->log, reg->key);
5670+ rh->log->type->set_region_sync(rh->log, reg->key, 1);
5671+
5672+ if (reg->delayed_bios.head)
5673+ rh->dispatch(rh->dispatch_context,
5674+ &reg->delayed_bios, 0);
5675+
5676+ up(&rh->recovery_count);
5677+ mempool_free(reg, rh->region_pool);
5678+ }
5679+
5680+ list_for_each_entry_safe(reg, next, &failed_recovered, list) {
5681+ rh->log->type->set_region_sync(rh->log, reg->key,
5682+ errors_handled ? 0 : 1);
5683+ if (reg->delayed_bios.head)
5684+ rh->dispatch(rh->dispatch_context,
5685+ &reg->delayed_bios, -EIO);
5686+
5687+ up(&rh->recovery_count);
5688+ mempool_free(reg, rh->region_pool);
5689+ }
5690+
5691+ list_for_each_entry_safe(reg, next, &clean, list) {
5692+ rh->log->type->clear_region(rh->log, reg->key);
5693+ mempool_free(reg, rh->region_pool);
5694+ }
5695+
5696+ dm_rh_flush(rh_in);
5697+}
5698+EXPORT_SYMBOL_GPL(dm_rh_update_states);
5699+
5700+void dm_rh_inc(struct dm_rh_client *rh_in, region_t region)
5701+{
5702+ struct region_hash *rh = (struct region_hash *) rh_in;
5703+ struct region *reg;
5704+
5705+ read_lock(&rh->hash_lock);
5706+ reg = __rh_find(rh, region);
5707+ if (reg->state == DM_RH_CLEAN) {
5708+ rh->log->type->mark_region(rh->log, reg->key);
5709+
5710+ spin_lock_irq(&rh->region_lock);
5711+ reg->state = DM_RH_DIRTY;
5712+ list_del_init(&reg->list); /* Take off the clean list. */
5713+ spin_unlock_irq(&rh->region_lock);
5714+ }
5715+
5716+ atomic_inc(&reg->pending);
5717+ read_unlock(&rh->hash_lock);
5718+}
5719+EXPORT_SYMBOL_GPL(dm_rh_inc);
5720+
5721+void dm_rh_inc_pending(struct dm_rh_client *rh_in, struct bio_list *bios)
5722+{
5723+ struct bio *bio;
5724+
5725+ for (bio = bios->head; bio; bio = bio->bi_next)
5726+ dm_rh_inc(rh_in, dm_rh_bio_to_region(rh_in, bio));
5727+}
5728+EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
5729+
5730+int dm_rh_dec(struct dm_rh_client *rh_in, region_t region)
5731+{
5732+ int r = 0;
5733+ struct region_hash *rh = (struct region_hash *) rh_in;
5734+ struct region *reg;
5735+
5736+ read_lock(&rh->hash_lock);
5737+ reg = __rh_lookup(rh, region);
5738+ read_unlock(&rh->hash_lock);
5739+
5740+ BUG_ON(!reg);
5741+
5742+ if (atomic_dec_and_test(&reg->pending)) {
5743+ unsigned long flags;
5744+
5745+ /*
5746+ * There is no pending I/O for this region.
5747+ * We can move the region to corresponding list for next action.
5748+ * At this point, the region is not yet connected to any list.
5749+ *
5750+ * If the state is DM_RH_NOSYNC, the region should be kept off
5751+ * from clean list.
5752+ * The hash entry for DM_RH_NOSYNC will remain in memory
5753+ * until the region is recovered or the map is reloaded.
5754+ */
5755+
5756+ spin_lock_irqsave(&rh->region_lock, flags);
5757+ if (reg->state == DM_RH_RECOVERING)
5758+ list_add_tail(&reg->list, &rh->quiesced_regions);
5759+ else {
5760+ reg->state = DM_RH_CLEAN;
5761+ list_add(&reg->list, &rh->clean_regions);
5762+ }
5763+ spin_unlock_irqrestore(&rh->region_lock, flags);
5764+
5765+ r = 1;
5766+ }
5767+
5768+ return r;
5769+}
5770+EXPORT_SYMBOL_GPL(dm_rh_dec);
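dm_rh_inc_pending() and dm_rh_dec() bracket every write: the pending count is raised on all affected regions before the bios are issued, and dropped again from the completion path. The following sketch of that pairing is not from the patch; struct my_raid_set, its 'rh' client and 'worker' kthread members, do_writes() and write_endio() are assumptions (bio_list_pop() is the dm bio-list helper).

/* Sketch only -- everything except the dm_rh_* calls is assumed. */
static void do_writes(struct my_raid_set *rs, struct bio_list *writes)
{
	struct bio *bio;

	/* Mark all target regions dirty and take a pending ref on each. */
	dm_rh_inc_pending(rs->rh, writes);

	while ((bio = bio_list_pop(writes)))
		generic_make_request(bio);
}

static void write_endio(struct my_raid_set *rs, struct bio *bio)
{
	region_t region = dm_rh_bio_to_region(rs->rh, bio);

	/*
	 * dm_rh_dec() returns 1 when the last pending write on the
	 * region has completed, i.e. there is region state for the
	 * worker to update.
	 */
	if (dm_rh_dec(rs->rh, region))
		wake_up_process(rs->worker);	/* assumed worker kthread */
}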
5771+
5772+/*
5773+ * Starts quiescing a region in preparation for recovery.
5774+ */
5775+static int __rh_recovery_prepare(struct region_hash *rh)
5776+{
5777+ int r;
5778+ region_t region;
5779+ struct region *reg;
5780+
5781+ /*
5782+ * Ask the dirty log what's next.
5783+ */
5784+ r = rh->log->type->get_resync_work(rh->log, &region);
5785+ if (r <= 0)
5786+ return r;
5787+
5788+ /*
5789+ * Get this region, and start it quiescing
5790+ * by setting the recovering flag.
5791+ */
5792+ read_lock(&rh->hash_lock);
5793+ reg = __rh_find(rh, region);
5794+ read_unlock(&rh->hash_lock);
5795+
5796+ spin_lock_irq(&rh->region_lock);
5797+
5798+ reg->state = DM_RH_RECOVERING;
5799+
5800+ /* Already quiesced ? */
5801+ list_del_init(&reg->list);
5802+ if (!atomic_read(&reg->pending))
5803+ list_add(&reg->list, &rh->quiesced_regions);
5804+
5805+ spin_unlock_irq(&rh->region_lock);
5806+ return 1;
5807+}
5808+
5809+int dm_rh_recovery_prepare(struct dm_rh_client *rh_in)
5810+{
5811+ int r = 0;
5812+ struct region_hash *rh = (struct region_hash *) rh_in;
5813+
5814+	/* Extra reference to avoid race with dm_rh_stop_recovery(). */
5815+ atomic_inc(&rh->recovery_in_flight);
5816+
5817+ while (!down_trylock(&rh->recovery_count)) {
5818+ atomic_inc(&rh->recovery_in_flight);
5819+
5820+ if (__rh_recovery_prepare(rh) <= 0) {
5821+ atomic_dec(&rh->recovery_in_flight);
5822+ up(&rh->recovery_count);
5823+ r = -ENOENT;
5824+ break;
5825+ }
5826+ }
5827+
5828+ /* Drop the extra reference. */
5829+ if (atomic_dec_and_test(&rh->recovery_in_flight))
5830+ r = -ESRCH;
5831+
5832+ return r;
5833+}
5834+EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
5835+
5836+/*
5837+ * Return the next quiesced region, or NULL if none is available.
5838+ */
5839+struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh_in)
5840+{
5841+ struct region_hash *rh = (struct region_hash *) rh_in;
5842+ struct region *reg = NULL;
5843+
5844+ spin_lock_irq(&rh->region_lock);
5845+ if (!list_empty(&rh->quiesced_regions)) {
5846+ reg = list_entry(rh->quiesced_regions.next,
5847+ struct region, list);
5848+ list_del_init(&reg->list); /* Remove from the quiesced list. */
5849+ }
5850+
5851+ spin_unlock_irq(&rh->region_lock);
5852+ return (struct dm_region *) reg;
5853+}
5854+EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
5855+
5856+/*
5857+ * Put the region on the recovered or the failed-recovered list.
5858+ */
5859+void dm_rh_recovery_end(struct dm_rh_client *rh_in, struct dm_region *reg_in,
5860+ int error)
5861+{
5862+ struct region_hash *rh = (struct region_hash *) rh_in;
5863+ struct region *reg = (struct region *) reg_in;
5864+
5865+ spin_lock_irq(&rh->region_lock);
5866+ if (error) {
5867+ reg->state = DM_RH_NOSYNC;
5868+ list_add(&reg->list, &rh->failed_recovered_regions);
5869+ } else
5870+ list_add(&reg->list, &rh->recovered_regions);
5871+
5872+ atomic_dec(&rh->recovery_in_flight);
5873+ spin_unlock_irq(&rh->region_lock);
5874+
5875+ rh->wake(rh->wake_context);
5876+ BUG_ON(atomic_read(&rh->recovery_in_flight) < 0);
5877+}
5878+EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
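Taken together, dm_rh_recovery_prepare(), dm_rh_recovery_start() and dm_rh_recovery_end() form the recovery protocol: ask the dirty log for regions to resynchronize and quiesce them, pull fully quiesced regions off the list, resync them, and report the result. A sketch of one recovery pass follows; it is not from the patch, and 'rs' as well as the do_resync_io() helper are assumptions.

/* Sketch only -- 'rs' and do_resync_io() are assumptions. */
static void do_recovery(struct my_raid_set *rs)
{
	struct dm_region *reg;

	/* Quiesce up to max_recovery regions the dirty log wants resynced. */
	dm_rh_recovery_prepare(rs->rh);

	/* Resynchronize every region that has fully quiesced by now. */
	while ((reg = dm_rh_recovery_start(rs->rh))) {
		int error = do_resync_io(rs, dm_rh_get_region_key(reg));

		/* A nonzero error leaves the region in DM_RH_NOSYNC. */
		dm_rh_recovery_end(rs->rh, reg, error);
	}
}

A real target would normally issue the resync I/O asynchronously and call dm_rh_recovery_end() from its completion path; the synchronous form above only shows the required call order.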
5879+
5880+/* Return recovery in flight count. */
5881+int dm_rh_recovery_in_flight(struct dm_rh_client *rh_in)
5882+{
5883+ return atomic_read(&((struct region_hash *) rh_in)->recovery_in_flight);
5884+}
5885+EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
5886+
5887+int dm_rh_flush(struct dm_rh_client *rh_in)
5888+{
5889+ struct region_hash *rh = (struct region_hash *) rh_in;
5890+
5891+ return rh->log->type->flush(rh->log);
5892+}
5893+EXPORT_SYMBOL_GPL(dm_rh_flush);
5894+
5895+void dm_rh_delay_by_region(struct dm_rh_client *rh_in,
5896+ struct bio *bio, region_t region)
5897+{
5898+ struct region_hash *rh = (struct region_hash *) rh_in;
5899+ struct region *reg;
5900+
5901+ /* FIXME: locking. */
5902+ read_lock(&rh->hash_lock);
5903+ reg = __rh_find(rh, region);
5904+ bio_list_add(&reg->delayed_bios, bio);
5905+ read_unlock(&rh->hash_lock);
5906+}
5907+EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
5908+
5909+void dm_rh_delay(struct dm_rh_client *rh_in, struct bio *bio)
5910+{
5911+ return dm_rh_delay_by_region(rh_in, bio,
5912+ dm_rh_bio_to_region(rh_in, bio));
5913+}
5914+EXPORT_SYMBOL_GPL(dm_rh_delay);
5915+
5916+void dm_rh_dispatch_bios(struct dm_rh_client *rh_in,
5917+ region_t region, int error)
5918+{
5919+ struct region_hash *rh = (struct region_hash *) rh_in;
5920+ struct region *reg;
5921+ struct bio_list delayed_bios;
5922+
5923+ /* FIXME: locking. */
5924+ read_lock(&rh->hash_lock);
5925+ reg = __rh_find(rh, region);
5926+ BUG_ON(!reg);
5927+ delayed_bios = reg->delayed_bios;
5928+ bio_list_init(&reg->delayed_bios);
5929+ read_unlock(&rh->hash_lock);
5930+
5931+ if (delayed_bios.head)
5932+ rh->dispatch(rh->dispatch_context, &delayed_bios, error);
5933+
5934+ up(&rh->recovery_count);
5935+}
5936+EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios);
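A write aimed at a region that is currently being recovered must not race with the resync I/O; it is parked on that region with dm_rh_delay()/dm_rh_delay_by_region() and released later, either as part of the recovered/failed-recovered handling in dm_rh_update_states() or explicitly via dm_rh_dispatch_bios(). A sketch of the map-path decision, with queue_or_delay_write() and the 'rs' context assumed rather than taken from the patch:

/* Sketch only -- queue_or_delay_write() and 'rs' are assumptions. */
static void queue_or_delay_write(struct my_raid_set *rs, struct bio *bio)
{
	region_t region = dm_rh_bio_to_region(rs->rh, bio);

	if (dm_rh_get_state(rs->rh, region, 1) == DM_RH_RECOVERING) {
		/* Park the bio; it is dispatched once recovery ends. */
		dm_rh_delay_by_region(rs->rh, bio, region);
		return;
	}

	/* Region is usable: account the write and issue it. */
	dm_rh_inc(rs->rh, region);
	generic_make_request(bio);
}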
5937+
5938+void dm_rh_stop_recovery(struct dm_rh_client *rh_in)
5939+{
5940+ int i;
5941+ struct region_hash *rh = (struct region_hash *) rh_in;
5942+
5943+ rh->wake(rh->wake_context);
5944+
5945+ /* wait for any recovering regions */
5946+ for (i = 0; i < rh->max_recovery; i++)
5947+ down(&rh->recovery_count);
5948+}
5949+EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
5950+
5951+void dm_rh_start_recovery(struct dm_rh_client *rh_in)
5952+{
5953+ int i;
5954+ struct region_hash *rh = (struct region_hash *) rh_in;
5955+
5956+ for (i = 0; i < rh->max_recovery; i++)
5957+ up(&rh->recovery_count);
5958+
5959+ rh->wake(rh->wake_context);
5960+}
5961+EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
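dm_rh_stop_recovery() and dm_rh_start_recovery() bracket suspend/resume: stopping takes all max_recovery semaphore slots and therefore only returns once no region is being resynchronized any more; starting gives the slots back. A sketch of the corresponding target hooks, with struct my_raid_set again assumed:

/* Sketch only -- the dm_target wiring and 'rs' are assumptions. */
static void my_raid_presuspend(struct dm_target *ti)
{
	struct my_raid_set *rs = ti->private;

	/* Blocks until every in-flight region recovery has ended. */
	dm_rh_stop_recovery(rs->rh);
}

static void my_raid_resume(struct dm_target *ti)
{
	struct my_raid_set *rs = ti->private;

	dm_rh_start_recovery(rs->rh);
}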
5962+
5963+MODULE_DESCRIPTION(DM_NAME " region hash");
5964+MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <hjm@redhat.com>");
5965+MODULE_LICENSE("GPL");
5966--- a/drivers/md/Kconfig
5967+++ b/drivers/md/Kconfig
5968@@ -269,6 +269,14 @@ config DM_DELAY
5969
5970 If unsure, say N.
5971
5972+config DM_RAID45
5973+ tristate "RAID 4/5 target (EXPERIMENTAL)"
5974+ depends on BLK_DEV_DM && EXPERIMENTAL
5975+ ---help---
5976+ A target that supports RAID4 and RAID5 mappings.
5977+
5978+ If unsure, say N.
5979+
5980 config DM_UEVENT
5981 bool "DM uevents (EXPERIMENTAL)"
5982 depends on BLK_DEV_DM && EXPERIMENTAL
5983--- a/drivers/md/Makefile
5984+++ b/drivers/md/Makefile
5985@@ -34,7 +34,9 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
5986 obj-$(CONFIG_DM_DELAY) += dm-delay.o
5987 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
5988 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
5989-obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
5990+obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o
5991+obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
5992+ dm-regions.o dm-message.o
5993 obj-$(CONFIG_DM_ZERO) += dm-zero.o
5994
5995 quiet_cmd_unroll = UNROLL $@
5996--- /dev/null
5997+++ b/include/linux/dm-regions.h
5998@@ -0,0 +1,115 @@
5999+/*
6000+ * Copyright (C) 2003 Sistina Software Limited.
6001+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6002+ *
6003+ * Device-Mapper dirty region hash interface.
6004+ *
6005+ * This file is released under the GPL.
6006+ */
6007+
6008+#ifndef DM_REGION_HASH_H
6009+#define DM_REGION_HASH_H
6010+
6011+#include <linux/dm-dirty-log.h>
6012+
6013+/*-----------------------------------------------------------------
6014+ * Region hash
6015+ *----------------------------------------------------------------*/
6016+struct dm_rh_client;
6017+struct dm_region;
6018+
6019+/*
6020+ * States a region can have.
6021+ */
6022+enum dm_rh_region_states {
6023+ DM_RH_CLEAN = 0x01, /* No writes in flight. */
6024+ DM_RH_DIRTY = 0x02, /* Writes in flight. */
6025+ DM_RH_NOSYNC = 0x04, /* Out of sync. */
6026+ DM_RH_RECOVERING = 0x08, /* Under resynchronization. */
6027+};
6028+
6029+/*
6030+ * Region hash create/destroy.
6031+ */
6032+struct bio_list;
6033+struct dm_rh_client *dm_rh_client_create(
6034+ unsigned max_recovery,
6035+ void (*dispatch)(void *dispatch_context,
6036+ struct bio_list *bios, int error),
6037+ void *dispatch_context,
6038+ void (*wake)(void *wake_context), void *wake_context,
6039+ struct dm_dirty_log *log, uint32_t region_size,
6040+ region_t nr_regions);
6041+void dm_rh_client_destroy(struct dm_rh_client *rh);
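The two callbacks passed to dm_rh_client_create() are the only way the region hash talks back to its owner: dispatch() receives bio lists that may now proceed (or must be failed with the given error), and wake() pokes the owner's worker whenever region state has changed. The sketch below shows a constructor call; MAX_RECOVERY, the callback bodies and struct my_raid_set are assumptions, and the ERR_PTR error convention should be verified against dm-regions.c.

/* Sketch only -- everything except the dm_rh_* calls is assumed. */
#define MAX_RECOVERY 1	/* regions resynchronized in parallel */

static void rs_dispatch(void *context, struct bio_list *bios, int error)
{
	/* Hand the released bios back to the target's I/O path. */
}

static void rs_wake(void *context)
{
	/* Kick the target's worker. */
}

static int my_raid_rh_create(struct my_raid_set *rs, struct dm_dirty_log *log,
			     uint32_t region_size, region_t nr_regions)
{
	rs->rh = dm_rh_client_create(MAX_RECOVERY, rs_dispatch, rs,
				     rs_wake, rs, log, region_size,
				     nr_regions);

	/* Error convention assumed to be ERR_PTR(); verify in dm-regions.c. */
	return IS_ERR(rs->rh) ? PTR_ERR(rs->rh) : 0;
}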
6042+
6043+/*
6044+ * Conversion fns:
6045+ *
6046+ * bio -> region
6047+ * sector -> region
6048+ * region -> sector
6049+ */
6050+region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio);
6051+region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector);
6052+sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region);
6053+
6054+/*
6055+ * Functions to set a caller context in a region.
6056+ */
6057+ * Functions to get/set a caller context in a region.
6058+void dm_rh_reg_set_context(struct dm_region *reg, void *context);
6059+
6060+/*
6061+ * Get region size and key (i.e. the number of the region).
6062+ */
6063+sector_t dm_rh_get_region_size(struct dm_rh_client *rh);
6064+sector_t dm_rh_get_region_key(struct dm_region *reg);
6065+
6066+/*
6067+ * Get/set/update region state (and dirty log).
6068+ *
6069+ * dm_rh_update_states
6070+ * dm_rh_update_states:
6071+ * @errors_handled != 0 keeps regions that failed recovery in the
6072+ * DM_RH_NOSYNC state instead of marking them in sync.
6073+int dm_rh_get_state(struct dm_rh_client *rh, region_t region, int may_block);
6074+void dm_rh_set_state(struct dm_rh_client *rh, region_t region,
6075+ enum dm_rh_region_states state, int may_block);
6076+void dm_rh_update_states(struct dm_rh_client *rh, int errors_handled);
6077+
6078+/* Flush the region hash and dirty log. */
6079+int dm_rh_flush(struct dm_rh_client *rh);
6080+
6081+/* Inc/dec pending count on regions. */
6082+void dm_rh_inc(struct dm_rh_client *rh, region_t region);
6083+void dm_rh_inc_pending(struct dm_rh_client *rh, struct bio_list *bios);
6084+int dm_rh_dec(struct dm_rh_client *rh, region_t region);
6085+
6086+/* Delay bios on regions. */
6087+void dm_rh_delay(struct dm_rh_client *rh, struct bio *bio);
6088+void dm_rh_delay_by_region(struct dm_rh_client *rh,
6089+ struct bio *bio, region_t region);
6090+
6091+/*
6092+ * Normally, the region hash will automatically call the dispatch function.
6093+ * dm_rh_dispatch_bios() lets the caller dispatch a region's delayed bios explicitly.
6094+ */
6095+void dm_rh_dispatch_bios(struct dm_rh_client *rh, region_t region, int error);
6096+
6097+/*
6098+ * Region recovery control.
6099+ */
6100+/* Prepare some regions for recovery by starting to quiesce them. */
6101+int dm_rh_recovery_prepare(struct dm_rh_client *rh);
6102+/* Try fetching a quiesced region for recovery. */
6103+struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh);
6104+/* Report recovery end on a region. */
6105+void dm_rh_recovery_end(struct dm_rh_client *rh, struct dm_region *reg,
6106+ int error);
6107+/* Return the number of recoveries in flight. */
6108+int dm_rh_recovery_in_flight(struct dm_rh_client *rh);
6109+/* Start/stop recovery. */
6110+void dm_rh_stop_recovery(struct dm_rh_client *rh);
6111+void dm_rh_start_recovery(struct dm_rh_client *rh);
6112+
6113+#endif /* DM_REGION_HASH_H */