1 From: "Heinz Mauelshagen" <hjm@redhat.de>
2 Subject: DMRAID45 module
3 X-URL: http://people.redhat.com/~heinzm/sw/dm/dm-raid45/
7 This driver is used for "Fake RAID" devices.
9 Acked-by: Jeff Mahoney <jeffm@suse.com>
13 drivers/md/Kconfig | 8
14 drivers/md/Makefile | 4
15 drivers/md/dm-memcache.c | 301 ++
16 drivers/md/dm-memcache.h | 68
17 drivers/md/dm-message.c | 182 +
18 drivers/md/dm-message.h | 91
19 drivers/md/dm-raid45.c | 4516 +++++++++++++++++++++++++++++++++++++++++++++
20 drivers/md/dm-raid45.h | 28
21 drivers/md/dm-regions.c | 723 +++++++
23 include/linux/dm-regions.h | 115 +
24 11 files changed, 6036 insertions(+), 1 deletion(-)
28 @@ -1680,6 +1680,7 @@ struct gendisk *dm_disk(struct mapped_de
32 +EXPORT_SYMBOL_GPL(dm_disk);
34 int dm_suspended(struct mapped_device *md)
37 +++ b/drivers/md/dm-memcache.c
40 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
42 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
44 + * Device-mapper memory object handling:
46 + * o allocate/free total_pages in a per client page pool.
48 + * o allocate/free memory objects with chunks (1..n) of
49 + * pages_per_chunk pages hanging off.
51 + * This file is released under the GPL.
54 +#define DM_MEM_CACHE_VERSION "0.2"
57 +#include "dm-memcache.h"
58 +#include <linux/dm-io.h>
60 +struct dm_mem_cache_client {
62 + mempool_t *objs_pool;
63 + struct page_list *free_list;
66 + unsigned pages_per_chunk;
67 + unsigned free_pages;
68 + unsigned total_pages;
72 + * Free pages and page_list elements of client.
74 +static void free_cache_pages(struct page_list *list)
77 + struct page_list *pl = list;
81 + __free_page(pl->page);
87 + * Alloc number of pages and page_list elements as required by client.
89 +static struct page_list *alloc_cache_pages(unsigned pages)
91 + struct page_list *pl, *ret = NULL;
95 + page = alloc_page(GFP_NOIO);
99 + pl = kmalloc(sizeof(*pl), GFP_NOIO);
113 + free_cache_pages(ret);
118 + * Allocate page_list elements from the pool to chunks of the memory object.
120 +static void alloc_chunks(struct dm_mem_cache_client *cl,
121 + struct dm_mem_cache_object *obj)
123 + unsigned chunks = cl->chunks;
124 + unsigned long flags;
126 + local_irq_save(flags);
127 + local_irq_disable();
129 + unsigned p = cl->pages_per_chunk;
131 + obj[chunks].pl = NULL;
134 + struct page_list *pl;
136 + /* Take next element from free list */
137 + spin_lock(&cl->lock);
138 + pl = cl->free_list;
140 + cl->free_list = pl->next;
141 + spin_unlock(&cl->lock);
143 + pl->next = obj[chunks].pl;
144 + obj[chunks].pl = pl;
148 + local_irq_restore(flags);
152 + * Free page_list elements putting them back onto free list
154 +static void free_chunks(struct dm_mem_cache_client *cl,
155 + struct dm_mem_cache_object *obj)
157 + unsigned chunks = cl->chunks;
158 + unsigned long flags;
159 + struct page_list *next, *pl;
161 + local_irq_save(flags);
162 + local_irq_disable();
164 + for (pl = obj[chunks].pl; pl; pl = next) {
167 + spin_lock(&cl->lock);
168 + pl->next = cl->free_list;
169 + cl->free_list = pl;
171 + spin_unlock(&cl->lock);
175 + local_irq_restore(flags);
179 + * Create/destroy dm memory cache client resources.
181 +struct dm_mem_cache_client *
182 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
183 + unsigned pages_per_chunk)
185 + unsigned total_pages = objects * chunks * pages_per_chunk;
186 + struct dm_mem_cache_client *client;
188 + BUG_ON(!total_pages);
189 + client = kzalloc(sizeof(*client), GFP_KERNEL);
191 + return ERR_PTR(-ENOMEM);
193 + client->objs_pool = mempool_create_kmalloc_pool(objects,
194 + chunks * sizeof(struct dm_mem_cache_object));
195 + if (!client->objs_pool)
198 + client->free_list = alloc_cache_pages(total_pages);
199 + if (!client->free_list)
202 + spin_lock_init(&client->lock);
203 + client->objects = objects;
204 + client->chunks = chunks;
205 + client->pages_per_chunk = pages_per_chunk;
206 + client->free_pages = client->total_pages = total_pages;
210 + mempool_destroy(client->objs_pool);
213 + return ERR_PTR(-ENOMEM);
215 +EXPORT_SYMBOL(dm_mem_cache_client_create);
217 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
219 + BUG_ON(cl->free_pages != cl->total_pages);
220 + free_cache_pages(cl->free_list);
221 + mempool_destroy(cl->objs_pool);
224 +EXPORT_SYMBOL(dm_mem_cache_client_destroy);
227 + * Grow a client's cache by an amount of pages.
229 + * Don't call from interrupt context!
231 +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
233 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
234 + struct page_list *pl, *last;
237 + pl = alloc_cache_pages(pages);
245 + spin_lock_irq(&cl->lock);
246 + last->next = cl->free_list;
247 + cl->free_list = pl;
248 + cl->free_pages += pages;
249 + cl->total_pages += pages;
251 + spin_unlock_irq(&cl->lock);
253 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
256 +EXPORT_SYMBOL(dm_mem_cache_grow);
258 +/* Shrink a client's cache by an amount of pages */
259 +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
262 + unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
263 + unsigned long flags;
264 + struct page_list *last = NULL, *pl, *pos;
268 + spin_lock_irqsave(&cl->lock, flags);
269 + pl = pos = cl->free_list;
270 + while (p-- && pos->next) {
279 + cl->free_list = pos;
280 + cl->free_pages -= pages;
281 + cl->total_pages -= pages;
285 + spin_unlock_irqrestore(&cl->lock, flags);
288 + free_cache_pages(pl);
289 + mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
294 +EXPORT_SYMBOL(dm_mem_cache_shrink);
297 + * Allocate/free a memory object
299 + * Can be called from interrupt context
301 +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
304 + unsigned pages = cl->chunks * cl->pages_per_chunk;
305 + unsigned long flags;
306 + struct dm_mem_cache_object *obj;
308 + obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
310 + return ERR_PTR(-ENOMEM);
312 + spin_lock_irqsave(&cl->lock, flags);
313 + if (pages > cl->free_pages)
316 + cl->free_pages -= pages;
317 + spin_unlock_irqrestore(&cl->lock, flags);
320 + mempool_free(obj, cl->objs_pool);
324 + alloc_chunks(cl, obj);
327 +EXPORT_SYMBOL(dm_mem_cache_alloc);
329 +void dm_mem_cache_free(struct dm_mem_cache_client *cl,
330 + struct dm_mem_cache_object *obj)
332 + free_chunks(cl, obj);
333 + mempool_free(obj, cl->objs_pool);
335 +EXPORT_SYMBOL(dm_mem_cache_free);
337 +MODULE_DESCRIPTION(DM_NAME " dm memory cache");
338 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
339 +MODULE_LICENSE("GPL");
341 +++ b/drivers/md/dm-memcache.h
344 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
346 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
348 + * Device-mapper memory object handling:
350 + * o allocate/free total_pages in a per client page pool.
352 + * o allocate/free memory objects with chunks (1..n) of
353 + * pages_per_chunk pages hanging off.
355 + * This file is released under the GPL.
358 +#ifndef _DM_MEM_CACHE_H
359 +#define _DM_MEM_CACHE_H
361 +#define DM_MEM_CACHE_H_VERSION "0.1"
364 +#include <linux/dm-io.h>
366 +static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
374 +struct dm_mem_cache_object {
375 + struct page_list *pl; /* Dynamically allocated array */
376 + void *private; /* Caller context reference */
379 +struct dm_mem_cache_client;
382 + * Create/destroy dm memory cache client resources.
384 + * On creation, a number of @objects with @chunks of
385 + * @pages_per_chunk pages will be allocated.
387 +struct dm_mem_cache_client *
388 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
389 + unsigned pages_per_chunk);
390 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
393 + * Grow/shrink a dm memory cache client resources
394 + * by @objects amount of objects.
396 +int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
397 +int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
400 + * Allocate/free a memory object
402 + * On allocation one object with an amount of chunks and
403 + * an amount of pages per chunk will be returned on success.
405 +struct dm_mem_cache_object *
406 +dm_mem_cache_alloc(struct dm_mem_cache_client *client);
407 +void dm_mem_cache_free(struct dm_mem_cache_client *client,
408 + struct dm_mem_cache_object *object);
412 +++ b/drivers/md/dm-message.c
415 + * Copyright (C) 2007,2008 Red Hat Inc. All rights reserved.
417 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
419 + * General device-mapper message interface argument parser.
421 + * This file is released under the GPL.
423 + * device-mapper message parser.
428 +#include "dm-message.h"
429 +#include <linux/kernel.h>
431 +#define DM_MSG_PREFIX "dm_message"
433 +/* Basename of a path. */
434 +static inline char *
437 + char *p = strrchr(s, '/');
439 + return p ? p + 1 : s;
442 +/* Get an argument depending on type. */
444 +message_arguments(struct dm_msg *msg, int argc, char **argv)
449 + struct dm_message_argument *args = msg->spec->args;
451 + for (i = 0; i < args->num_args; i++) {
453 + unsigned long **ptr = args->ptr;
454 + enum dm_message_argument_type type = args->types[i];
457 + case dm_msg_base_t:
458 + ((char **) ptr)[i] = basename(argv[i]);
462 + ((char **) ptr)[i] = argv[i];
466 + r = sscanf(argv[i], "%d", ((int **) ptr)[i]);
469 + case dm_msg_uint_t:
470 + r = sscanf(argv[i], "%u",
471 + ((unsigned **) ptr)[i]);
474 + case dm_msg_uint64_t:
475 + r = sscanf(argv[i], "%llu",
476 + ((unsigned long long **) ptr)[i]);
480 + set_bit(dm_msg_ret_undef, &msg->ret);
481 + set_bit(dm_msg_ret_arg, &msg->ret);
488 +/* Parse message options. */
490 +message_options_parse(struct dm_msg *msg, int argc, char **argv)
493 + unsigned long *action;
494 + size_t l1 = strlen(*argv), l_hit = 0;
495 + struct dm_message_option *o = msg->spec->options;
496 + char **option, **option_end = o->options + o->num_options;
498 + for (option = o->options, action = o->actions;
499 + option < option_end; option++, action++) {
500 + size_t l2 = strlen(*option);
502 + if (!strnicmp(*argv, *option, min(l1, l2))) {
505 + set_bit(*action, &msg->action);
509 + /* Assume error. */
511 + set_bit(dm_msg_ret_option, &msg->ret);
512 + if (!hit || l1 > l_hit)
513 + set_bit(dm_msg_ret_undef, &msg->ret); /* Undefined option. */
515 + set_bit(dm_msg_ret_ambiguous, &msg->ret); /* Ambiguous option.*/
517 + clear_bit(dm_msg_ret_option, &msg->ret); /* Option OK. */
518 + message_arguments(msg, --argc, ++argv);
523 +print_ret(const char *caller, unsigned long ret)
527 + const char *err_str;
528 + } static err_msg[] = {
529 + { dm_msg_ret_ambiguous, "message ambiguous" },
530 + { dm_msg_ret_inval, "message invalid" },
531 + { dm_msg_ret_undef, "message undefined" },
532 + { dm_msg_ret_arg, "message argument" },
533 + { dm_msg_ret_argcount, "message argument count" },
534 + { dm_msg_ret_option, "option" },
535 + }, *e = ARRAY_END(err_msg);
537 + while (e-- > err_msg) {
538 + if (test_bit(e->err, &ret))
539 + DMERR("%s %s", caller, e->err_str);
543 +/* Parse a message action. */
545 +dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
546 + int argc, char **argv)
549 + size_t l1 = strlen(*argv), l_hit = 0;
550 + struct dm_msg_spec *s, *s_hit = NULL,
551 + *s_end = msg->specs + msg->num_specs;
556 + for (s = msg->specs; s < s_end; s++) {
557 + size_t l2 = strlen(s->cmd);
559 + if (!strnicmp(*argv, s->cmd, min(l1, l2))) {
567 + if (!hit || l1 > l_hit) /* No hit or message string too long. */
568 + set_bit(dm_msg_ret_undef, &msg->ret);
569 + else if (hit > 1) /* Ambiguous message. */
570 + set_bit(dm_msg_ret_ambiguous, &msg->ret);
571 + else if (argc - 2 != s_hit->args->num_args) {
572 + set_bit(dm_msg_ret_undef, &msg->ret);
573 + set_bit(dm_msg_ret_argcount, &msg->ret);
581 + set_bit(s_hit->action, &msg->action);
582 + message_options_parse(msg, --argc, ++argv);
585 + return msg->spec->f(msg, context);
588 + print_ret(caller, msg->ret);
591 +EXPORT_SYMBOL(dm_message_parse);
593 +MODULE_DESCRIPTION(DM_NAME " device-mapper target message parser");
594 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
595 +MODULE_LICENSE("GPL");
597 +++ b/drivers/md/dm-message.h
600 + * Copyright (C) 2007,2008 Red Hat, Inc. All rights reserved.
602 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.de>
604 + * General device-mapper message interface argument parser.
606 + * This file is released under the GPL.
610 +#ifndef DM_MESSAGE_H
611 +#define DM_MESSAGE_H
613 +/* Factor out to dm.h. */
614 +/* Reference to array end. */
615 +#define ARRAY_END(a) ((a) + ARRAY_SIZE(a))
617 +/* Message return bits. */
618 +enum dm_message_return {
619 + dm_msg_ret_ambiguous, /* Action ambiguous. */
620 + dm_msg_ret_inval, /* Action invalid. */
621 + dm_msg_ret_undef, /* Action undefined. */
623 + dm_msg_ret_option, /* Option error. */
624 + dm_msg_ret_arg, /* Argument error. */
625 + dm_msg_ret_argcount, /* Argument count error. */
628 +/* Message argument type conversions. */
629 +enum dm_message_argument_type {
630 + dm_msg_base_t, /* Basename string. */
631 + dm_msg_str_t, /* String. */
632 + dm_msg_int_t, /* Signed int. */
633 + dm_msg_uint_t, /* Unsigned int. */
634 + dm_msg_uint64_t, /* Unsigned int 64. */
637 +/* A message option. */
638 +struct dm_message_option {
639 + unsigned num_options;
641 + unsigned long *actions;
644 +/* Message arguments and types. */
645 +struct dm_message_argument {
647 + unsigned long **ptr;
648 + enum dm_message_argument_type types[];
651 +/* Client message. */
653 + unsigned long action; /* Identified action. */
654 + unsigned long ret; /* Return bits. */
655 +	unsigned num_specs;	/* # of specifications listed. */
656 + struct dm_msg_spec *specs; /* Specification list. */
657 + struct dm_msg_spec *spec; /* Specification selected. */
660 +/* Specification of the message. */
661 +struct dm_msg_spec {
662 + const char *cmd; /* Name of the command (i.e. 'bandwidth'). */
663 + unsigned long action;
664 + struct dm_message_option *options;
665 + struct dm_message_argument *args;
666 + unsigned long parm; /* Parameter to pass through to callback. */
667 + /* Function to process for action. */
668 + int (*f) (struct dm_msg *msg, void *context);
671 +/* Parameter access macros. */
672 +#define DM_MSG_PARM(msg) ((msg)->spec->parm)
674 +#define DM_MSG_STR_ARGS(msg, idx) ((char *) *(msg)->spec->args->ptr[idx])
675 +#define DM_MSG_INT_ARGS(msg, idx) ((int) *(msg)->spec->args->ptr[idx])
676 +#define DM_MSG_UINT_ARGS(msg, idx) ((unsigned) DM_MSG_INT_ARG(msg, idx))
677 +#define DM_MSG_UINT64_ARGS(msg, idx) ((uint64_t) *(msg)->spec->args->ptr[idx])
679 +#define DM_MSG_STR_ARG(msg) DM_MSG_STR_ARGS(msg, 0)
680 +#define DM_MSG_INT_ARG(msg) DM_MSG_INT_ARGS(msg, 0)
681 +#define DM_MSG_UINT_ARG(msg) DM_MSG_UINT_ARGS(msg, 0)
682 +#define DM_MSG_UINT64_ARG(msg) DM_MSG_UINT64_ARGS(msg, 0)
685 +/* Parse a message and its options and optionally call a function back. */
686 +int dm_message_parse(const char *caller, struct dm_msg *msg, void *context,
687 + int argc, char **argv);
691 +++ b/drivers/md/dm-raid45.c
694 + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
696 + * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
698 + * This file is released under the GPL.
701 + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
704 + * o RAID4 with dedicated and selectable parity device
705 + * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
706 + * o run time optimization of xor algorithm used to calculate parity
709 + * Thanks to MD for:
710 + * o the raid address calculation algorithm
711 + * o the base of the biovec <-> page list copier.
714 + * Uses region hash to keep track of how many writes are in flight to
715 + * regions in order to use dirty log to keep state of regions to recover:
717 + * o clean regions (those which are synchronized
718 + * and don't have write io in flight)
719 + * o dirty regions (those with write io in flight)
722 + * On startup, any dirty regions are migrated to the 'nosync' state
723 + * and are subject to recovery by the daemon.
725 + * See raid_ctr() for table definition.
729 + * o add virtual interface for locking
730 + * o remove instrumentation (REMOVEME:)
734 +static const char *version = "v0.2431";
737 +#include "dm-bio-list.h"
738 +#include "dm-memcache.h"
739 +#include "dm-message.h"
740 +#include "dm-raid45.h"
742 +#include <linux/kernel.h>
743 +#include <linux/vmalloc.h>
745 +#include <linux/dm-io.h>
746 +#include <linux/dm-dirty-log.h>
747 +#include <linux/dm-regions.h>
749 +/* # of parallel recovered regions */
750 +/* FIXME: cope with multiple recovery stripes in raid_set struct. */
751 +#define MAX_RECOVER 1 /* needs to be 1! */
754 + * Configurable parameters
758 +/* Default # of stripes if not set in constructor. */
761 +/* Minimum/maximum # of selectable stripes. */
762 +#define STRIPES_MIN 8
763 +#define STRIPES_MAX 16384
765 +/* Default chunk size in sectors if not set in constructor. */
766 +#define CHUNK_SIZE 64
768 +/* Default io size in sectors if not set in constructor. */
769 +#define IO_SIZE_MIN SECTORS_PER_PAGE
770 +#define IO_SIZE IO_SIZE_MIN
772 +/* Maximum setable chunk size in sectors. */
773 +#define CHUNK_SIZE_MAX 16384
775 +/* Recover io size default in sectors. */
776 +#define RECOVER_IO_SIZE_MIN 64
777 +#define RECOVER_IO_SIZE 256
779 +/* Default percentage recover io bandwidth. */
780 +#define BANDWIDTH 10
781 +#define BANDWIDTH_MIN 1
782 +#define BANDWIDTH_MAX 100
784 + * END Configurable parameters
787 +#define TARGET "dm-raid45"
788 +#define DAEMON "kraid45d"
789 +#define DM_MSG_PREFIX TARGET
791 +#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
793 +/* Amount/size for __xor(). */
794 +#define SECTORS_PER_XOR SECTORS_PER_PAGE
795 +#define XOR_SIZE PAGE_SIZE
797 +/* Derive raid_set from stripe_cache pointer. */
798 +#define RS(x) container_of(x, struct raid_set, sc)
800 +/* Check value in range. */
801 +#define range_ok(i, min, max) (i >= min && i <= max)
803 +/* Page reference. */
804 +#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
806 +/* Bio list reference. */
807 +#define BL(stripe, p, rw) (stripe->ss[p].bl + rw)
809 +/* Page list reference. */
810 +#define PL(stripe, p) (stripe->obj[p].pl)
812 +/* Check argument is power of 2. */
813 +#define POWER_OF_2(a) (!(a & (a - 1)))
815 +/* Factor out to dm-bio-list.h */
816 +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
818 + bio->bi_next = bl->head;
825 +/* Factor out to dm.h */
826 +#define TI_ERR_RET(str, ret) \
827 + do { ti->error = DM_MSG_PREFIX ": " str; return ret; } while (0);
828 +#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
830 +/*-----------------------------------------------------------------
833 + * Cache for all reads and writes to raid sets (operational or degraded)
835 + * We need to run all data to and from a RAID set through this cache,
836 + * because parity chunks need to get calculated from data chunks
837 + * or, in the degraded/resynchronization case, missing chunks need
838 + * to be reconstructed using the other chunks of the stripe.
839 + *---------------------------------------------------------------*/
840 +/* Protect kmem cache # counter. */
841 +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
843 +/* A stripe set (holds bios hanging off). */
845 + struct stripe *stripe; /* Backpointer to stripe for endio(). */
846 + struct bio_list bl[3]; /* Reads, writes, and writes merged. */
847 +#define WRITE_MERGED 2
850 +#if READ != 0 || WRITE != 1
851 +#error dm-raid45: READ/WRITE != 0/1 used as index!!!
855 + * Stripe linked list indexes. Keep order, because the stripe
856 + * and the stripe cache rely on the first 3!
859 + LIST_IO = 0, /* Stripes with io pending. */
860 + LIST_ENDIO, /* Stripes to endio. */
861 + LIST_LRU, /* Least recently used stripes. */
862 + LIST_HASH, /* Hashed stripes. */
863 + LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
864 + NR_LISTS, /* To size array in struct stripe. */
868 + LOCK_ENDIO = 0, /* Protect endio list. */
869 + LOCK_LRU, /* Protect lru list. */
870 + NR_LOCKS, /* To size array in struct stripe_cache. */
873 +/* A stripe: the io object to handle all reads and writes to a RAID set. */
875 + struct stripe_cache *sc; /* Backpointer to stripe cache. */
877 + sector_t key; /* Hash key. */
878 + sector_t region; /* Region stripe is mapped to. */
880 + /* Reference count. */
884 + unsigned long flags; /* flags (see below). */
887 + * Pending ios in flight:
889 + * used as a 'lock' to control move of stripe to endio list
891 + atomic_t pending; /* Pending ios in flight. */
893 + /* Sectors to read and write for multi page stripe sets. */
897 + /* Lock on stripe (for clustering). */
902 + * o io list to flush io
904 + * o LRU list to put stripes w/o reference count on
905 + * o stripe cache hash
907 + struct list_head lists[NR_LISTS];
910 + unsigned short parity; /* Parity chunk index. */
911 + short recover; /* Recovery chunk index. */
914 + /* This sets memory cache object (dm-mem-cache). */
915 + struct dm_mem_cache_object *obj;
917 + /* Array of stripe sets (dynamically allocated). */
918 + struct stripe_set ss[0];
921 +/* States stripes can be in (flags field). */
922 +enum stripe_states {
923 + STRIPE_ACTIVE, /* Active io on stripe. */
924 + STRIPE_ERROR, /* io error on stripe. */
925 + STRIPE_MERGED, /* Writes got merged. */
926 + STRIPE_READ, /* Read. */
927 + STRIPE_RBW, /* Read-before-write. */
928 + STRIPE_RECONSTRUCT, /* reconstruct of a missing chunk required. */
929 + STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
932 +/* ... and macros to access them. */
933 +#define BITOPS(name, what, var, flag) \
934 +static inline int TestClear ## name ## what(struct var *v) \
935 +{ return test_and_clear_bit(flag, &v->io.flags); } \
936 +static inline int TestSet ## name ## what(struct var *v) \
937 +{ return test_and_set_bit(flag, &v->io.flags); } \
938 +static inline void Clear ## name ## what(struct var *v) \
939 +{ clear_bit(flag, &v->io.flags); } \
940 +static inline void Set ## name ## what(struct var *v) \
941 +{ set_bit(flag, &v->io.flags); } \
942 +static inline int name ## what(struct var *v) \
943 +{ return test_bit(flag, &v->io.flags); }
946 +BITOPS(Stripe, Active, stripe, STRIPE_ACTIVE)
947 +BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
948 +BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
949 +BITOPS(Stripe, Read, stripe, STRIPE_READ)
950 +BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
951 +BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
952 +BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
954 +/* A stripe hash. */
955 +struct stripe_hash {
956 + struct list_head *hash;
963 +/* A stripe cache. */
964 +struct stripe_cache {
966 + struct stripe_hash hash;
968 + /* Stripes with io to flush, stripes to endio and LRU lists. */
969 + struct list_head lists[3];
971 + /* Locks to protect endio and lru lists. */
972 + spinlock_t locks[NR_LOCKS];
974 + /* Slab cache to allocate stripes from. */
976 + struct kmem_cache *cache; /* Cache itself. */
977 + char name[32]; /* Unique name. */
980 + struct dm_io_client *dm_io_client; /* dm-io client resource context. */
982 + /* dm-mem-cache client resource context. */
983 + struct dm_mem_cache_client *mem_cache_client;
985 + int stripes_parm; /* # stripes parameter from constructor. */
986 + atomic_t stripes; /* actual # of stripes in cache. */
987 + atomic_t stripes_to_shrink; /* # of stripes to shrink cache by. */
988 + atomic_t stripes_last; /* last # of stripes in cache. */
989 + atomic_t active_stripes; /* actual # of active stripes in cache. */
992 +	atomic_t max_active_stripes; /* maximum # of active stripes in cache. */
995 +/* Flag specs for raid_dev */ ;
996 +enum raid_dev_flags { DEVICE_FAILED, IO_QUEUED };
998 +/* The raid device in a set. */
1000 + struct dm_dev *dev;
1001 + unsigned long flags; /* raid_dev_flags. */
1002 + sector_t start; /* offset to map to. */
1005 +/* Flags spec for raid_set. */
1006 +enum raid_set_flags {
1007 + RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
1008 + RS_DEAD, /* RAID set inoperational. */
1009 + RS_DEVEL_STATS, /* REMOVEME: display status information. */
1010 + RS_IO_ERROR, /* io error on set. */
1011 + RS_RECOVER, /* Do recovery. */
1012 + RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
1013 + RS_REGION_GET, /* get a region to recover. */
1014 + RS_SC_BUSY, /* stripe cache busy -> send an event. */
1015 +	RS_SUSPENDED,		/* RAID set suspended. */
1018 +/* REMOVEME: devel stats counters. */
1021 + S_BIOS_ADDED_READ,
1022 + S_BIOS_ENDIO_READ,
1024 + S_BIOS_ADDED_WRITE,
1025 + S_BIOS_ENDIO_WRITE,
1034 + S_BIO_COPY_PL_NEXT,
1043 + S_MERGE_PAGE_LOCKED,
1050 + S_RECONSTRUCT_DEV,
1054 + S_SUM_DELAYED_BIOS,
1056 + S_NR_STATS, /* # of stats counters. */
1059 +/* Status type -> string mappings. */
1061 + const enum stats_types type;
1065 +static struct stats_map stats_map[] = {
1066 + { S_BIOS_READ, "r=" },
1067 + { S_BIOS_ADDED_READ, "/" },
1068 + { S_BIOS_ENDIO_READ, "/" },
1069 + { S_BIOS_WRITE, " w=" },
1070 + { S_BIOS_ADDED_WRITE, "/" },
1071 + { S_BIOS_ENDIO_WRITE, "/" },
1072 + { S_DM_IO_READ, " rc=" },
1073 + { S_DM_IO_WRITE, " wc=" },
1074 + { S_ACTIVE_READS, " active_reads=" },
1075 + { S_BANDWIDTH, " bandwidth=" },
1076 + { S_NO_BANDWIDTH, " no_bandwidth=" },
1077 + { S_BARRIER, " barrier=" },
1078 + { S_BIO_COPY_PL_NEXT, " bio_copy_pl_next=" },
1079 + { S_CAN_MERGE, " can_merge=" },
1080 + { S_MERGE_PAGE_LOCKED, "/page_locked=" },
1081 + { S_CANT_MERGE, "/cant_merge=" },
1082 + { S_CONGESTED, " congested=" },
1083 + { S_NOT_CONGESTED, "/not_congested=" },
1084 + { S_DEGRADED, " degraded=" },
1085 + { S_DELAYED_BIOS, " delayed_bios=" },
1086 + { S_SUM_DELAYED_BIOS, "/sum_delayed_bios=" },
1087 + { S_EVICT, " evict=" },
1088 + { S_FLUSHS, " flushs=" },
1089 + { S_HITS_1ST, " hits_1st=" },
1090 + { S_IOS_POST, " ios_post=" },
1091 + { S_INSCACHE, " inscache=" },
1092 + { S_MAX_LOOKUP, " max_lookup=" },
1093 + { S_NO_RW, " no_rw=" },
1094 + { S_NOSYNC, " nosync=" },
1095 + { S_PROHIBITPAGEIO, " ProhibitPageIO=" },
1096 + { S_RECONSTRUCT_EI, " reconstruct_ei=" },
1097 + { S_RECONSTRUCT_DEV, " reconstruct_dev=" },
1098 + { S_REDO, " redo=" },
1099 + { S_REQUEUE, " requeue=" },
1100 + { S_STRIPE_ERROR, " stripe_error=" },
1101 + { S_XORS, " xors=" },
1107 +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
1109 + struct dm_target *ti; /* Target pointer. */
1112 + unsigned long flags; /* State flags. */
1113 + spinlock_t in_lock; /* Protects central input list below. */
1114 + struct bio_list in; /* Pending ios (central input list). */
1115 + struct bio_list work; /* ios work set. */
1116 + wait_queue_head_t suspendq; /* suspend synchronization. */
1117 + atomic_t in_process; /* counter of queued bios (suspendq). */
1118 + atomic_t in_process_max;/* counter of queued bios max. */
1121 + struct workqueue_struct *wq;
1122 + struct delayed_work dws;
1125 + /* External locking. */
1126 + struct dm_raid45_locking_type *locking;
1128 + struct stripe_cache sc; /* Stripe cache for this set. */
1130 + /* Xor optimization. */
1132 + struct xor_func *f;
1137 + /* Recovery parameters. */
1139 + struct dm_dirty_log *dl; /* Dirty log. */
1140 + struct dm_rh_client *rh; /* Region hash. */
1142 + /* dm-mem-cache client resource context for recovery stripes. */
1143 + struct dm_mem_cache_client *mem_cache_client;
1145 + struct list_head stripes; /* List of recovery stripes. */
1147 + region_t nr_regions;
1148 + region_t nr_regions_to_recover;
1149 + region_t nr_regions_recovered;
1150 + unsigned long start_jiffies;
1151 + unsigned long end_jiffies;
1153 + unsigned bandwidth; /* Recovery bandwidth [%]. */
1154 + unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
1155 + unsigned bandwidth_parm; /* " constructor parm. */
1156 + unsigned io_size; /* io size <= chunk size. */
1157 + unsigned io_size_parm; /* io size ctr parameter. */
1159 + /* recovery io throttling. */
1160 + atomic_t io_count[2]; /* counter recover/regular io. */
1161 + unsigned long last_jiffies;
1163 + struct dm_region *reg; /* Actual region to recover. */
1164 + sector_t pos; /* Position within region to recover. */
1165 + sector_t end; /* End of region to recover. */
1168 + /* RAID set parameters. */
1170 + struct raid_type *raid_type; /* RAID type (eg, RAID4). */
1171 + unsigned raid_parms; /* # variable raid parameters. */
1173 + unsigned chunk_size; /* Sectors per chunk. */
1174 + unsigned chunk_size_parm;
1175 + unsigned chunk_mask; /* Mask for amount. */
1176 + unsigned chunk_shift; /* rsector chunk size shift. */
1178 + unsigned io_size; /* Sectors per io. */
1179 + unsigned io_size_parm;
1180 + unsigned io_mask; /* Mask for amount. */
1181 + unsigned io_shift_mask; /* Mask for raid_address(). */
1182 + unsigned io_shift; /* rsector io size shift. */
1183 + unsigned pages_per_io; /* Pages per io. */
1185 + sector_t sectors_per_dev; /* Sectors per device. */
1187 + atomic_t failed_devs; /* Amount of devices failed. */
1189 + /* Index of device to initialize. */
1191 + int dev_to_init_parm;
1193 + /* Raid devices dynamically allocated. */
1194 + unsigned raid_devs; /* # of RAID devices below. */
1195 + unsigned data_devs; /* # of RAID data devices. */
1197 + int ei; /* index of failed RAID device. */
1199 + /* index of dedicated parity device (i.e. RAID4). */
1201 + int pi_parm; /* constructor parm for status output. */
1204 + /* REMOVEME: devel stats counters. */
1205 + atomic_t stats[S_NR_STATS];
1207 + /* Dynamically allocated temporary pointers for xor(). */
1208 + unsigned long **data;
1210 + /* Dynamically allocated RAID devices. Alignment? */
1211 + struct raid_dev dev[0];
1215 +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
1216 +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
1217 +BITOPS(RS, Dead, raid_set, RS_DEAD)
1218 +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
1219 +BITOPS(RS, IoError, raid_set, RS_IO_ERROR)
1220 +BITOPS(RS, Recover, raid_set, RS_RECOVER)
1221 +BITOPS(RS, RegionGet, raid_set, RS_REGION_GET)
1222 +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
1223 +BITOPS(RS, Suspended, raid_set, RS_SUSPENDED)
1226 +#define PageIO(page) PageChecked(page)
1227 +#define AllowPageIO(page) SetPageChecked(page)
1228 +#define ProhibitPageIO(page) ClearPageChecked(page)
1230 +/*-----------------------------------------------------------------
1231 + * Raid-4/5 set structures.
1232 + *---------------------------------------------------------------*/
1233 +/* RAID level definitions. */
1239 +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
1240 +enum raid_algorithm {
1249 + const char *name; /* RAID algorithm. */
1250 + const char *descr; /* Descriptor text for logging. */
1251 + const unsigned parity_devs; /* # of parity devices. */
1252 + const unsigned minimal_devs; /* minimal # of devices in set. */
1253 + const enum raid_level level; /* RAID level. */
1254 + const enum raid_algorithm algorithm; /* RAID algorithm. */
1257 +/* Supported raid types and properties. */
1258 +static struct raid_type raid_types[] = {
1259 + {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
1260 + {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
1261 + {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
1262 + {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
1263 + {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
1266 +/* Address as calculated by raid_address(). */
1268 + sector_t key; /* Hash key (start address of stripe). */
1269 + unsigned di, pi; /* Data and parity disks index. */
1272 +/* REMOVEME: reset statistics counters. */
1273 +static void stats_reset(struct raid_set *rs)
1275 + unsigned s = S_NR_STATS;
1278 + atomic_set(rs->stats + s, 0);
1281 +/*----------------------------------------------------------------
1282 + * RAID set management routines.
1283 + *--------------------------------------------------------------*/
1285 + * Begin small helper functions.
1287 +/* Queue (optionally delayed) io work. */
1288 +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
1290 + struct delayed_work *dws = &rs->io.dws;
1292 + cancel_delayed_work(dws);
1293 + queue_delayed_work(rs->io.wq, dws, delay);
1296 +/* Queue io work immediately (called from region hash too). */
1297 +static INLINE void wake_do_raid(void *context)
1299 + wake_do_raid_delayed(context, 0);
1302 +/* Wait until all io has been processed. */
1303 +static INLINE void wait_ios(struct raid_set *rs)
1305 + wait_event(rs->io.suspendq, !atomic_read(&rs->io.in_process));
1308 +/* Declare io queued to device. */
1309 +static INLINE void io_dev_queued(struct raid_dev *dev)
1311 + set_bit(IO_QUEUED, &dev->flags);
1314 +/* Io on device and reset ? */
1315 +static inline int io_dev_clear(struct raid_dev *dev)
1317 + return test_and_clear_bit(IO_QUEUED, &dev->flags);
1320 +/* Get an io reference. */
1321 +static INLINE void io_get(struct raid_set *rs)
1323 + int p = atomic_inc_return(&rs->io.in_process);
1325 + if (p > atomic_read(&rs->io.in_process_max))
1326 + atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
1329 +/* Put the io reference and conditionally wake io waiters. */
1330 +static INLINE void io_put(struct raid_set *rs)
1332 + /* Intel: rebuild data corrupter? */
1333 + if (!atomic_read(&rs->io.in_process)) {
1334 + DMERR("%s would go negative!!!", __func__);
1338 + if (atomic_dec_and_test(&rs->io.in_process))
1339 + wake_up(&rs->io.suspendq);
1342 +/* Calculate device sector offset. */
1343 +static INLINE sector_t _sector(struct raid_set *rs, struct bio *bio)
1345 + sector_t sector = bio->bi_sector;
1347 + sector_div(sector, rs->set.data_devs);
1351 +/* Test device operational. */
1352 +static INLINE int dev_operational(struct raid_set *rs, unsigned p)
1354 + return !test_bit(DEVICE_FAILED, &rs->dev[p].flags);
1357 +/* Return # of active stripes in stripe cache. */
1358 +static INLINE int sc_active(struct stripe_cache *sc)
1360 + return atomic_read(&sc->active_stripes);
1363 +/* Test io pending on stripe. */
1364 +static INLINE int stripe_io(struct stripe *stripe)
1366 + return atomic_read(&stripe->io.pending);
1369 +static INLINE void stripe_io_inc(struct stripe *stripe)
1371 + atomic_inc(&stripe->io.pending);
1374 +static INLINE void stripe_io_dec(struct stripe *stripe)
1376 + atomic_dec(&stripe->io.pending);
1379 +/* Wrapper needed by for_each_io_dev(). */
1380 +static void _stripe_io_inc(struct stripe *stripe, unsigned p)
1382 + stripe_io_inc(stripe);
1385 +/* Error a stripe. */
1386 +static INLINE void stripe_error(struct stripe *stripe, struct page *page)
1388 + SetStripeError(stripe);
1389 + SetPageError(page);
1390 + atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
1393 +/* Page IOed ok. */
1394 +enum dirty_type { CLEAN, DIRTY };
1395 +static INLINE void page_set(struct page *page, enum dirty_type type)
1399 + SetPageDirty(page);
1400 + AllowPageIO(page);
1404 + ClearPageDirty(page);
1411 + SetPageUptodate(page);
1412 + ClearPageError(page);
1415 +/* Return region state for a sector. */
1417 +region_state(struct raid_set *rs, sector_t sector, unsigned long state)
1419 + struct dm_rh_client *rh = rs->recover.rh;
1421 + return RSRecover(rs) ?
1422 + (dm_rh_get_state(rh, dm_rh_sector_to_region(rh, sector), 1) &
1426 +/* Check maximum devices which may fail in a raid set. */
1427 +static inline int raid_set_degraded(struct raid_set *rs)
1429 + return RSIoError(rs);
1432 +/* Check # of devices which may fail in a raid set. */
1433 +static INLINE int raid_set_operational(struct raid_set *rs)
1435 + /* Too many failed devices -> BAD. */
1436 + return atomic_read(&rs->set.failed_devs) <=
1437 + rs->set.raid_type->parity_devs;
1441 + * Return true in case a page_list should be read/written
1443 + * Conditions to read/write:
1444 + * o 1st page in list not uptodate
1445 + * o 1st page in list dirty
1446 + * o if we optimized io away, we flag it using the pages checked bit.
1448 +static INLINE unsigned page_io(struct page *page)
1450 + /* Optimization: page was flagged to need io during first run. */
1451 + if (PagePrivate(page)) {
1452 + ClearPagePrivate(page);
1456 + /* Avoid io if prohibited or a locked page. */
1457 + if (!PageIO(page) || PageLocked(page))
1460 + if (!PageUptodate(page) || PageDirty(page)) {
1461 + /* Flag page needs io for second run optimization. */
1462 + SetPagePrivate(page);
1469 +/* Call a function on each page list needing io. */
1470 +static INLINE unsigned
1471 +for_each_io_dev(struct raid_set *rs, struct stripe *stripe,
1472 + void (*f_io)(struct stripe *stripe, unsigned p))
1474 + unsigned p = rs->set.raid_devs, r = 0;
1477 + if (page_io(PAGE(stripe, p))) {
 1486 +/* Reconstruct a particular device? */
1487 +static INLINE int dev_to_init(struct raid_set *rs)
1489 + return rs->set.dev_to_init > -1;
1493 + * Index of device to calculate parity on.
1494 + * Either the parity device index *or* the selected device to init
1495 + * after a spare replacement.
1497 +static INLINE unsigned dev_for_parity(struct stripe *stripe)
1499 + struct raid_set *rs = RS(stripe->sc);
1501 + return dev_to_init(rs) ? rs->set.dev_to_init : stripe->idx.parity;
1504 +/* Return the index of the device to be recovered. */
1505 +static int idx_get(struct raid_set *rs)
 1507 +	/* Avoid reading in the pages to be reconstructed anyway. */
1508 + if (dev_to_init(rs))
1509 + return rs->set.dev_to_init;
1510 + else if (rs->set.raid_type->level == raid4)
1511 + return rs->set.pi;
1516 +/* RAID set congested function. */
1517 +static int raid_set_congested(void *congested_data, int bdi_bits)
1519 + struct raid_set *rs = congested_data;
1520 + int r = 0; /* Assume uncongested. */
1521 + unsigned p = rs->set.raid_devs;
1523 + /* If any of our component devices are overloaded. */
1525 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
1527 + r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1530 + /* REMOVEME: statistics. */
1531 + atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
1535 +/* Display RAID set dead message once. */
1536 +static void raid_set_dead(struct raid_set *rs)
1538 + if (!TestSetRSDead(rs)) {
1540 + char buf[BDEVNAME_SIZE];
1542 + DMERR("FATAL: too many devices failed -> RAID set dead");
1544 + for (p = 0; p < rs->set.raid_devs; p++) {
1545 + if (!dev_operational(rs, p))
1546 + DMERR("device /dev/%s failed",
1547 + bdevname(rs->dev[p].dev->bdev, buf));
1552 +/* RAID set degrade check. */
1554 +raid_set_check_and_degrade(struct raid_set *rs,
1555 + struct stripe *stripe, unsigned p)
1557 + if (test_and_set_bit(DEVICE_FAILED, &rs->dev[p].flags))
 1560 +	 * Throw an event in case of member device errors.
1561 + dm_table_event(rs->ti->table);
1562 + atomic_inc(&rs->set.failed_devs);
1564 + /* Only log the first member error. */
1565 + if (!TestSetRSIoError(rs)) {
1566 + char buf[BDEVNAME_SIZE];
1568 + /* Store index for recovery. */
1573 + DMERR("CRITICAL: %sio error on device /dev/%s "
1574 + "in region=%llu; DEGRADING RAID set",
1575 + stripe ? "" : "FAKED ",
1576 + bdevname(rs->dev[p].dev->bdev, buf),
1577 + (unsigned long long) (stripe ? stripe->key : 0));
1578 + DMERR("further device error messages suppressed");
1585 +raid_set_check_degrade(struct raid_set *rs, struct stripe *stripe)
1587 + unsigned p = rs->set.raid_devs;
1590 + struct page *page = PAGE(stripe, p);
1592 + if (PageError(page)) {
1593 + ClearPageError(page);
1594 + raid_set_check_and_degrade(rs, stripe, p);
1599 +/* RAID set upgrade check. */
1600 +static int raid_set_check_and_upgrade(struct raid_set *rs, unsigned p)
1602 + if (!test_and_clear_bit(DEVICE_FAILED, &rs->dev[p].flags))
1605 + if (atomic_dec_and_test(&rs->set.failed_devs)) {
1606 + ClearRSIoError(rs);
1613 +/* Lookup a RAID device by name or by major:minor number. */
1615 + const char *dev_name;
1616 + struct raid_dev *dev;
1618 +enum lookup_type { byname, bymajmin, bynumber };
1619 +static int raid_dev_lookup(struct raid_set *rs, enum lookup_type by,
1620 + union dev_lookup *dl)
1625 + * Must be an incremental loop, because the device array
1626 + * can have empty slots still on calls from raid_ctr()
1628 + for (p = 0; p < rs->set.raid_devs; p++) {
1629 + char buf[BDEVNAME_SIZE];
1630 + struct raid_dev *dev = rs->dev + p;
1635 + /* Format dev string appropriately if necessary. */
1637 + bdevname(dev->dev->bdev, buf);
1638 + else if (by == bymajmin)
1639 + format_dev_t(buf, dev->dev->bdev->bd_dev);
1641 + /* Do the actual check. */
1642 + if (by == bynumber) {
1643 + if (dl->dev->dev->bdev->bd_dev ==
1644 + dev->dev->bdev->bd_dev)
1646 + } else if (!strcmp(dl->dev_name, buf))
1653 +/* End io wrapper. */
1655 +_bio_endio(struct raid_set *rs, struct bio *bio, int error)
1657 + /* REMOVEME: statistics. */
1658 + atomic_inc(rs->stats + (bio_data_dir(bio) == WRITE ?
1659 + S_BIOS_ENDIO_WRITE : S_BIOS_ENDIO_READ));
1660 + bio_endio(bio, error);
1661 + io_put(rs); /* Wake any suspend waiters. */
1665 + * End small helper functions.
1670 + * Stripe hash functions
1672 +/* Initialize/destroy stripe hash. */
1673 +static int hash_init(struct stripe_hash *hash, unsigned stripes)
1675 + unsigned buckets = 2, max_buckets = stripes / 4;
1676 + unsigned hash_primes[] = {
1677 + /* Table of primes for hash_fn/table size optimization. */
1678 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
1679 + 1543, 3079, 6151, 12289, 24593,
1682 + /* Calculate number of buckets (2^^n <= stripes / 4). */
1683 + while (buckets < max_buckets)
1686 + /* Allocate stripe hash. */
1687 + hash->hash = vmalloc(buckets * sizeof(*hash->hash));
1691 + hash->buckets = buckets;
1692 + hash->mask = buckets - 1;
1693 + hash->shift = ffs(buckets);
1694 + if (hash->shift > ARRAY_SIZE(hash_primes) + 1)
1695 + hash->shift = ARRAY_SIZE(hash_primes) + 1;
1697 + BUG_ON(hash->shift - 2 > ARRAY_SIZE(hash_primes) + 1);
1698 + hash->prime = hash_primes[hash->shift - 2];
1700 + /* Initialize buckets. */
1702 + INIT_LIST_HEAD(hash->hash + buckets);
1707 +static INLINE void hash_exit(struct stripe_hash *hash)
1710 + vfree(hash->hash);
1711 + hash->hash = NULL;
1715 +/* List add (head/tail/locked/unlocked) inlines. */
1716 +enum list_lock_type { LIST_LOCKED, LIST_UNLOCKED };
1717 +#define LIST_DEL(name, list) \
1718 +static void stripe_ ## name ## _del(struct stripe *stripe, \
1719 + enum list_lock_type lock) { \
1720 + struct list_head *lh = stripe->lists + (list); \
1721 + spinlock_t *l = NULL; \
1723 + if (lock == LIST_LOCKED) { \
1724 + l = stripe->sc->locks + LOCK_LRU; \
1725 + spin_lock_irq(l); \
1729 + if (!list_empty(lh)) \
1730 + list_del_init(lh); \
1732 + if (lock == LIST_LOCKED) \
1733 + spin_unlock_irq(l); \
1736 +LIST_DEL(hash, LIST_HASH)
1737 +LIST_DEL(lru, LIST_LRU)
1740 +enum list_pos_type { POS_HEAD, POS_TAIL };
1741 +#define LIST_ADD(name, list) \
1742 +static void stripe_ ## name ## _add(struct stripe *stripe, \
1743 + enum list_pos_type pos, \
1744 + enum list_lock_type lock) { \
1745 + struct list_head *lh = stripe->lists + (list); \
1746 + struct stripe_cache *sc = stripe->sc; \
1747 + spinlock_t *l = NULL; \
1749 + if (lock == LIST_LOCKED) { \
1750 + l = sc->locks + LOCK_LRU; \
1751 + spin_lock_irq(l); \
1754 + if (list_empty(lh)) { \
1755 + if (pos == POS_HEAD) \
1756 + list_add(lh, sc->lists + (list)); \
1758 + list_add_tail(lh, sc->lists + (list)); \
1761 + if (lock == LIST_LOCKED) \
1762 + spin_unlock_irq(l); \
1765 +LIST_ADD(endio, LIST_ENDIO)
1766 +LIST_ADD(io, LIST_IO)
1767 +LIST_ADD(lru, LIST_LRU)
1770 +#define POP(list) \
1772 + if (list_empty(sc->lists + list)) \
1775 + stripe = list_first_entry(&sc->lists[list], \
1778 + list_del_init(&stripe->lists[list]); \
1782 +/* Pop an available stripe off the lru list. */
1783 +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
1785 + struct stripe *stripe;
1786 + spinlock_t *lock = sc->locks + LOCK_LRU;
1788 + spin_lock_irq(lock);
1790 + spin_unlock_irq(lock);
1793 + /* Remove from hash before reuse. */
1794 + stripe_hash_del(stripe, LIST_UNLOCKED);
1799 +static inline unsigned hash_fn(struct stripe_hash *hash, sector_t key)
1801 + return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
1804 +static inline struct list_head *
1805 +hash_bucket(struct stripe_hash *hash, sector_t key)
1807 + return hash->hash + hash_fn(hash, key);
1810 +/* Insert an entry into a hash. */
1811 +static inline void hash_insert(struct stripe_hash *hash, struct stripe *stripe)
1813 + list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
1816 +/* Insert an entry into the stripe hash. */
1818 +sc_insert(struct stripe_cache *sc, struct stripe *stripe)
1820 + hash_insert(&sc->hash, stripe);
1823 +/* Lookup an entry in the stripe hash. */
1824 +static inline struct stripe *
1825 +stripe_lookup(struct stripe_cache *sc, sector_t key)
1828 + struct stripe *stripe;
1829 + struct list_head *bucket = hash_bucket(&sc->hash, key);
1831 + list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
 1832 +		/* REMOVEME: statistics. */
1833 + if (++c > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
1834 + atomic_set(RS(sc)->stats + S_MAX_LOOKUP, c);
1836 + if (stripe->key == key)
1843 +/* Resize the stripe cache hash on size changes. */
1844 +static int hash_resize(struct stripe_cache *sc)
1846 + /* Resize threshold reached? */
1847 + if (atomic_read(&sc->stripes) > 2 * atomic_read(&sc->stripes_last)
1848 + || atomic_read(&sc->stripes) < atomic_read(&sc->stripes_last) / 4) {
1850 + struct stripe_hash hash, hash_tmp;
1853 + r = hash_init(&hash, atomic_read(&sc->stripes));
1857 + lock = sc->locks + LOCK_LRU;
1858 + spin_lock_irq(lock);
1859 + if (sc->hash.hash) {
1860 + unsigned b = sc->hash.buckets;
1861 + struct list_head *pos, *tmp;
1863 + /* Walk old buckets and insert into new. */
1865 + list_for_each_safe(pos, tmp, sc->hash.hash + b)
1866 + hash_insert(&hash,
1867 + list_entry(pos, struct stripe,
1868 + lists[LIST_HASH]));
1873 + memcpy(&hash_tmp, &sc->hash, sizeof(hash_tmp));
1874 + memcpy(&sc->hash, &hash, sizeof(sc->hash));
1875 + atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
1876 + spin_unlock_irq(lock);
1878 + hash_exit(&hash_tmp);
1885 + * Stripe cache locking functions
1887 +/* Dummy lock function for local RAID4+5. */
1888 +static void *no_lock(sector_t key, enum dm_lock_type type)
1893 +/* Dummy unlock function for local RAID4+5. */
1894 +static void no_unlock(void *lock_handle)
1898 +/* No locking (for local RAID 4+5). */
1899 +static struct dm_raid45_locking_type locking_none = {
1901 + .unlock = no_unlock,
1904 +/* Clustered RAID 4+5. */
1905 +/* FIXME: code this. */
1906 +static struct dm_raid45_locking_type locking_cluster = {
1908 + .unlock = no_unlock,
1911 +/* Lock a stripe (for clustering). */
1913 +stripe_lock(struct raid_set *rs, struct stripe *stripe, int rw, sector_t key)
1915 + stripe->lock = rs->locking->lock(key, rw == READ ? DM_RAID45_SHARED :
1917 + return stripe->lock ? 0 : -EPERM;
1920 +/* Unlock a stripe (for clustering). */
1921 +static void stripe_unlock(struct raid_set *rs, struct stripe *stripe)
1923 + rs->locking->unlock(stripe->lock);
1924 + stripe->lock = NULL;
1928 + * Stripe cache functions.
1931 + * Invalidate all page lists pages of a stripe.
1933 + * I only keep state for the whole list in the first page.
1936 +stripe_pages_invalidate(struct stripe *stripe)
1938 + unsigned p = RS(stripe->sc)->set.raid_devs;
1941 + struct page *page = PAGE(stripe, p);
1943 + ProhibitPageIO(page);
1944 + ClearPageChecked(page);
1945 + ClearPageDirty(page);
1946 + ClearPageError(page);
1947 + clear_page_locked(page);
1948 + ClearPagePrivate(page);
1949 + ClearPageUptodate(page);
1953 +/* Prepare stripe for (re)use. */
1954 +static INLINE void stripe_invalidate(struct stripe *stripe)
1956 + stripe->io.flags = 0;
1957 + stripe_pages_invalidate(stripe);
1960 +/* Allow io on all chunks of a stripe. */
1961 +static INLINE void stripe_allow_io(struct stripe *stripe)
1963 + unsigned p = RS(stripe->sc)->set.raid_devs;
1966 + AllowPageIO(PAGE(stripe, p));
1969 +/* Initialize a stripe. */
1971 +stripe_init(struct stripe_cache *sc, struct stripe *stripe)
1973 + unsigned p = RS(sc)->set.raid_devs;
1976 + /* Work all io chunks. */
1978 + struct stripe_set *ss = stripe->ss + p;
1980 + stripe->obj[p].private = ss;
1981 + ss->stripe = stripe;
1983 + i = ARRAY_SIZE(ss->bl);
1985 + bio_list_init(ss->bl + i);
1990 + i = ARRAY_SIZE(stripe->lists);
1992 + INIT_LIST_HEAD(stripe->lists + i);
1994 + atomic_set(&stripe->cnt, 0);
1995 + atomic_set(&stripe->io.pending, 0);
1997 + stripe_invalidate(stripe);
2000 +/* Number of pages per chunk. */
2001 +static inline unsigned chunk_pages(unsigned io_size)
2003 + return dm_div_up(io_size, SECTORS_PER_PAGE);
2006 +/* Number of pages per stripe. */
2007 +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
2009 + return chunk_pages(io_size) * rs->set.raid_devs;
2012 +/* Initialize part of page_list (recovery). */
2013 +static INLINE void stripe_zero_pl_part(struct stripe *stripe, unsigned p,
2014 + unsigned start, unsigned count)
2016 + unsigned pages = chunk_pages(count);
2017 + /* Get offset into the page_list. */
2018 + struct page_list *pl = pl_elem(PL(stripe, p), start / SECTORS_PER_PAGE);
2021 + while (pl && pages--) {
2022 + BUG_ON(!pl->page);
2023 + memset(page_address(pl->page), 0, PAGE_SIZE);
2028 +/* Initialize parity chunk of stripe. */
2029 +static INLINE void stripe_zero_chunk(struct stripe *stripe, unsigned p)
2031 + stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
2034 +/* Return dynamic stripe structure size. */
2035 +static INLINE size_t stripe_size(struct raid_set *rs)
2037 + return sizeof(struct stripe) +
2038 + rs->set.raid_devs * sizeof(struct stripe_set);
2041 +/* Allocate a stripe and its memory object. */
2042 +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
2043 +enum grow { SC_GROW, SC_KEEP };
2044 +static struct stripe *stripe_alloc(struct stripe_cache *sc,
2045 + struct dm_mem_cache_client *mc,
2049 + struct stripe *stripe;
2051 + stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
2053 + /* Grow the dm-mem-cache by one object. */
2054 + if (grow == SC_GROW) {
2055 + r = dm_mem_cache_grow(mc, 1);
2060 + stripe->obj = dm_mem_cache_alloc(mc);
2064 + stripe_init(sc, stripe);
2070 + if (grow == SC_GROW)
2071 + dm_mem_cache_shrink(mc, 1);
2073 + kmem_cache_free(sc->kc.cache, stripe);
2078 + * Free a stripes memory object, shrink the
2079 + * memory cache and free the stripe itself
2081 +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
2083 + dm_mem_cache_free(mc, stripe->obj);
2084 + dm_mem_cache_shrink(mc, 1);
2085 + kmem_cache_free(stripe->sc->kc.cache, stripe);
2088 +/* Free the recovery stripe. */
2089 +static void stripe_recover_free(struct raid_set *rs)
2091 + struct recover *rec = &rs->recover;
2092 + struct list_head *stripes = &rec->stripes;
2094 + while (!list_empty(stripes)) {
2095 + struct stripe *stripe = list_first_entry(stripes, struct stripe,
2096 + lists[LIST_RECOVER]);
2097 + list_del(stripe->lists + LIST_RECOVER);
2098 + stripe_free(stripe, rec->mem_cache_client);
2102 +/* Push a stripe safely onto the endio list to be handled by do_endios(). */
2103 +static INLINE void stripe_endio_push(struct stripe *stripe)
2106 + unsigned long flags;
2107 + struct stripe_cache *sc = stripe->sc;
2108 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2110 + spin_lock_irqsave(lock, flags);
2111 + wake = list_empty(sc->lists + LIST_ENDIO);
2112 + stripe_endio_add(stripe, POS_HEAD, LIST_UNLOCKED);
2113 + spin_unlock_irqrestore(lock, flags);
2116 + wake_do_raid(RS(sc));
2119 +/* Protected check for stripe cache endio list empty. */
2120 +static INLINE int stripe_endio_empty(struct stripe_cache *sc)
2123 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2125 + spin_lock_irq(lock);
2126 + r = list_empty(sc->lists + LIST_ENDIO);
2127 + spin_unlock_irq(lock);
 2132 +/* Pop a stripe safely off the endio list. */
2133 +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
2135 + struct stripe *stripe;
2136 + spinlock_t *lock = sc->locks + LOCK_ENDIO;
2138 + /* This runs in parallel with endio(). */
2139 + spin_lock_irq(lock);
2141 + spin_unlock_irq(lock);
2147 +/* Evict stripe from cache. */
2148 +static void stripe_evict(struct stripe *stripe)
2150 + struct raid_set *rs = RS(stripe->sc);
2151 + stripe_hash_del(stripe, LIST_UNLOCKED); /* Take off hash. */
2153 + if (list_empty(stripe->lists + LIST_LRU)) {
2154 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2155 + atomic_inc(rs->stats + S_EVICT); /* REMOVEME: statistics. */
2159 +/* Grow stripe cache. */
2161 +sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
2164 + struct raid_set *rs = RS(sc);
2166 + /* Try to allocate this many (additional) stripes. */
2167 + while (stripes--) {
2168 + struct stripe *stripe =
2169 + stripe_alloc(sc, sc->mem_cache_client, grow);
2171 + if (likely(stripe)) {
2172 + stripe->io.size = rs->set.io_size;
2173 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2174 + atomic_inc(&sc->stripes);
2181 + ClearRSScBusy(rs);
2182 + return r ? r : hash_resize(sc);
2185 +/* Shrink stripe cache. */
2186 +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
2190 + /* Try to get unused stripe from LRU list. */
2191 + while (stripes--) {
2192 + struct stripe *stripe;
2194 + stripe = stripe_lru_pop(sc);
2196 + /* An lru stripe may never have ios pending! */
2197 + BUG_ON(stripe_io(stripe));
2198 + stripe_free(stripe, sc->mem_cache_client);
2199 + atomic_dec(&sc->stripes);
2206 + /* Check if stats are still sane. */
2207 + if (atomic_read(&sc->max_active_stripes) >
2208 + atomic_read(&sc->stripes))
2209 + atomic_set(&sc->max_active_stripes, 0);
2214 + ClearRSScBusy(RS(sc));
2215 + return hash_resize(sc);
2218 +/* Create stripe cache. */
2219 +static int sc_init(struct raid_set *rs, unsigned stripes)
2222 + struct stripe_cache *sc = &rs->sc;
2223 + struct stripe *stripe;
2224 + struct recover *rec = &rs->recover;
2226 + /* Initialize lists and locks. */
2227 + i = ARRAY_SIZE(sc->lists);
2229 + INIT_LIST_HEAD(sc->lists + i);
2233 + spin_lock_init(sc->locks + i);
2235 + /* Initialize atomic variables. */
2236 + atomic_set(&sc->stripes, 0);
2237 + atomic_set(&sc->stripes_last, 0);
2238 + atomic_set(&sc->stripes_to_shrink, 0);
2239 + atomic_set(&sc->active_stripes, 0);
2240 + atomic_set(&sc->max_active_stripes, 0); /* REMOVEME: statistics. */
2243 + * We need a runtime unique # to suffix the kmem cache name
2244 + * because we'll have one for each active RAID set.
2246 + nr = atomic_inc_return(&_stripe_sc_nr);
2247 + sprintf(sc->kc.name, "%s_%d", TARGET, nr);
2248 + sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
2250 + if (!sc->kc.cache)
2253 + /* Create memory cache client context for RAID stripe cache. */
2254 + sc->mem_cache_client =
2255 + dm_mem_cache_client_create(stripes, rs->set.raid_devs,
2256 + chunk_pages(rs->set.io_size));
2257 + if (IS_ERR(sc->mem_cache_client))
2258 + return PTR_ERR(sc->mem_cache_client);
2260 + /* Create memory cache client context for RAID recovery stripe(s). */
2261 + rec->mem_cache_client =
2262 + dm_mem_cache_client_create(MAX_RECOVER, rs->set.raid_devs,
2263 + chunk_pages(rec->io_size));
2264 + if (IS_ERR(rec->mem_cache_client))
2265 + return PTR_ERR(rec->mem_cache_client);
2267 + /* Allocate stripe for set recovery. */
 2268 +	/* XXX: cope with MAX_RECOVER. */
2269 + INIT_LIST_HEAD(&rec->stripes);
2270 + for (i = 0; i < MAX_RECOVER; i++) {
2271 + stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
2275 + SetStripeRecover(stripe);
2276 + stripe->io.size = rec->io_size;
2277 + list_add(stripe->lists + LIST_RECOVER, &rec->stripes);
 2281 +	 * Allocate the stripe objects from the
2282 + * cache and add them to the LRU list.
2284 + return sc_grow(sc, stripes, SC_KEEP);
2287 +/* Destroy the stripe cache. */
2288 +static void sc_exit(struct stripe_cache *sc)
2290 + if (sc->kc.cache) {
2291 + BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
2292 + kmem_cache_destroy(sc->kc.cache);
2295 + if (sc->mem_cache_client)
2296 + dm_mem_cache_client_destroy(sc->mem_cache_client);
2298 + ClearRSRecover(RS(sc));
2299 + stripe_recover_free(RS(sc));
2300 + if (RS(sc)->recover.mem_cache_client)
2301 + dm_mem_cache_client_destroy(RS(sc)->recover.mem_cache_client);
2303 + hash_exit(&sc->hash);
2307 + * Calculate RAID address
2309 + * Delivers tuple with the index of the data disk holding the chunk
2310 + * in the set, the parity disks index and the start of the stripe
2311 + * within the address space of the set (used as the stripe cache hash key).
2314 +static struct address *
2315 +raid_address(struct raid_set *rs, sector_t sector, struct address *addr)
2317 + unsigned data_devs = rs->set.data_devs, di, pi,
2318 + raid_devs = rs->set.raid_devs;
2319 + sector_t stripe, tmp;
2322 + * chunk_number = sector / chunk_size
2323 + * stripe = chunk_number / data_devs
2324 + * di = stripe % data_devs;
2326 + stripe = sector >> rs->set.chunk_shift;
2327 + di = sector_div(stripe, data_devs);
2329 + switch (rs->set.raid_type->level) {
2332 + pi = sector_div(tmp, raid_devs);
2334 + switch (rs->set.raid_type->algorithm) {
2335 + case left_asym: /* Left asymmetric. */
2336 + pi = data_devs - pi;
2337 + case right_asym: /* Right asymmetric. */
2342 + case left_sym: /* Left symmetric. */
2343 + pi = data_devs - pi;
2344 + case right_sym: /* Right symmetric. */
2345 + di = (pi + di + 1) % raid_devs;
2349 + DMERR("Unknown RAID algorithm %d",
2350 + rs->set.raid_type->algorithm);
2363 + DMERR("Unknown RAID level %d", rs->set.raid_type->level);
2368 + * Hash key = start offset on any single device of the RAID set;
2369 + * adjusted in case io size differs from chunk size.
2371 + addr->key = (stripe << rs->set.chunk_shift) +
2372 + (sector & rs->set.io_shift_mask);
2381 + * Copy data across between stripe pages and bio vectors.
2383 + * Pay attention to data alignment in stripe and bio pages.
2386 +bio_copy_page_list(int rw, struct stripe *stripe,
2387 + struct page_list *pl, struct bio *bio)
2389 + unsigned i, page_offset;
2391 + struct raid_set *rs = RS(stripe->sc);
2392 + struct bio_vec *bv;
2394 + /* Get start page in page list for this sector. */
2395 + i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
2396 + pl = pl_elem(pl, i);
2398 + page_addr = page_address(pl->page);
2399 + page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
2401 + /* Walk all segments and copy data across between bio_vecs and pages. */
2402 + bio_for_each_segment(bv, bio, i) {
2403 + int len = bv->bv_len, size;
2404 + unsigned bio_offset = 0;
2405 + void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
2407 + size = (page_offset + len > PAGE_SIZE) ?
2408 + PAGE_SIZE - page_offset : len;
2411 + memcpy(bio_addr + bio_offset,
2412 + page_addr + page_offset, size);
2414 + memcpy(page_addr + page_offset,
2415 + bio_addr + bio_offset, size);
2417 + page_offset += size;
2418 + if (page_offset == PAGE_SIZE) {
2420 + * We reached the end of the chunk page ->
2421 + * need refer to the next one to copy more data.
2425 + /* Get next page. */
2428 + page_addr = page_address(pl->page);
2430 + bio_offset += size;
2431 + /* REMOVEME: statistics. */
2432 + atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
2437 + __bio_kunmap_atomic(bio_addr, KM_USER0);
2442 + * Xor optimization macros.
2444 +/* Xor data pointer declaration and initialization macros. */
2445 +#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
2446 +#define DECLARE_3 DECLARE_2, *d2 = data[2]
2447 +#define DECLARE_4 DECLARE_3, *d3 = data[3]
2448 +#define DECLARE_5 DECLARE_4, *d4 = data[4]
2449 +#define DECLARE_6 DECLARE_5, *d5 = data[5]
2450 +#define DECLARE_7 DECLARE_6, *d6 = data[6]
2451 +#define DECLARE_8 DECLARE_7, *d7 = data[7]
 2453 +/* Xor unroll macros. */
2454 +#define D2(n) d0[n] = d0[n] ^ d1[n]
2455 +#define D3(n) D2(n) ^ d2[n]
2456 +#define D4(n) D3(n) ^ d3[n]
2457 +#define D5(n) D4(n) ^ d4[n]
2458 +#define D6(n) D5(n) ^ d5[n]
2459 +#define D7(n) D6(n) ^ d6[n]
2460 +#define D8(n) D7(n) ^ d7[n]
2462 +#define X_2(macro, offset) macro(offset); macro(offset + 1);
2463 +#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
2464 +#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
2465 +#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
2466 +#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
2467 +#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
2469 +/* Define a _xor_#chunks_#xors_per_run() function. */
2470 +#define _XOR(chunks, xors_per_run) \
2471 +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
2473 + unsigned end = XOR_SIZE / sizeof(data[0]), i; \
2474 + DECLARE_ ## chunks; \
2476 + for (i = 0; i < end; i += xors_per_run) { \
2477 + X_ ## xors_per_run(D ## chunks, i); \
2481 +/* Define xor functions for 2 - 8 chunks. */
2482 +#define MAKE_XOR_PER_RUN(xors_per_run) \
2483 + _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
2484 + _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
2485 + _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
2486 + _XOR(8, xors_per_run);
2488 +MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
2489 +MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
2490 +MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
2491 +MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
2493 +#define MAKE_XOR(xors_per_run) \
2495 + void (*f)(unsigned long **); \
2496 +} static xor_funcs ## xors_per_run[] = { \
2499 + { _xor2_ ## xors_per_run }, \
2500 + { _xor3_ ## xors_per_run }, \
2501 + { _xor4_ ## xors_per_run }, \
2502 + { _xor5_ ## xors_per_run }, \
2503 + { _xor6_ ## xors_per_run }, \
2504 + { _xor7_ ## xors_per_run }, \
2505 + { _xor8_ ## xors_per_run }, \
2508 +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
2510 + /* Call respective function for amount of chunks. */ \
2511 + xor_funcs ## xors_per_run[n].f(data); \
2514 +/* Define xor_8() - xor_64 functions. */
2520 +/* Maximum number of chunks, which can be xor'ed in one go. */
2521 +#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
2526 +} static xor_funcs[] = {
2528 + {xor_16, "xor_16"},
2529 + {xor_32, "xor_32"},
2530 + {xor_64, "xor_64"},
2536 + * This indexes into the page list of the stripe.
2538 + * All chunks will be xored into the parity chunk
2539 + * in maximum groups of xor.chunks.
2541 + * FIXME: try mapping the pages on discontiguous memory.
2543 +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
2545 + struct raid_set *rs = RS(stripe->sc);
2546 + unsigned max_chunks = rs->xor.chunks, n, p;
2547 + unsigned o = sector / SECTORS_PER_PAGE; /* Offset into the page_list. */
2548 + unsigned long **d = rs->data;
2549 + xor_function_t xor_f = rs->xor.f->f;
2551 + /* Address of parity page to xor into. */
2552 + d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
2554 + /* Preset pointers to data pages. */
2555 + for (n = 1, p = rs->set.raid_devs; p--; ) {
2556 + if (p != pi && PageIO(PAGE(stripe, p)))
2557 + d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
 2559 +		/* If max chunks -> xor. */
2560 + if (n == max_chunks) {
2566 + /* If chunks -> xor. */
2570 + /* Set parity page uptodate and clean. */
2571 + page_set(PAGE(stripe, pi), CLEAN);
2574 +/* Common xor loop through all stripe page lists. */
2575 +static void common_xor(struct stripe *stripe, sector_t count,
2576 + unsigned off, unsigned p)
2580 + for (sector = off; sector < count; sector += SECTORS_PER_XOR)
2581 + xor(stripe, p, sector);
2583 + atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
2587 + * Calculate parity sectors on intact stripes.
2589 + * Need to calculate raid address for recover stripe, because its
2590 + * chunk sizes differs and is typically larger than io chunk size.
2592 +static void parity_xor(struct stripe *stripe)
2594 + struct raid_set *rs = RS(stripe->sc);
2595 + unsigned chunk_size = rs->set.chunk_size,
2596 + io_size = stripe->io.size,
2597 + xor_size = chunk_size > io_size ? io_size : chunk_size;
2600 + /* This can be the recover stripe with a larger io size. */
2601 + for (off = 0; off < io_size; off += xor_size) {
2605 + * Recover stripe likely is bigger than regular io
2606 + * ones and has no precalculated parity disk index ->
2607 + * need to calculate RAID address.
2609 + if (unlikely(StripeRecover(stripe))) {
2610 + struct address addr;
2613 + (stripe->key + off) * rs->set.data_devs,
2616 + stripe_zero_pl_part(stripe, pi, off,
2617 + rs->set.chunk_size);
2619 + pi = stripe->idx.parity;
2621 + common_xor(stripe, xor_size, off, pi);
2622 + page_set(PAGE(stripe, pi), DIRTY);
2626 +/* Reconstruct missing chunk. */
2627 +static void reconstruct_xor(struct stripe *stripe)
2629 + struct raid_set *rs = RS(stripe->sc);
2630 + int p = stripe->idx.recover;
2634 + /* REMOVEME: statistics. */
2635 + atomic_inc(rs->stats + (raid_set_degraded(rs) ?
2636 + S_RECONSTRUCT_EI : S_RECONSTRUCT_DEV));
2638 + /* Zero chunk to be reconstructed. */
2639 + stripe_zero_chunk(stripe, p);
2640 + common_xor(stripe, stripe->io.size, 0, p);
2644 + * Try getting a stripe either from the hash or from the lru list
2646 +static inline void _stripe_get(struct stripe *stripe)
2648 + atomic_inc(&stripe->cnt);
2651 +static struct stripe *stripe_get(struct raid_set *rs, struct address *addr)
2653 + struct stripe_cache *sc = &rs->sc;
2654 + struct stripe *stripe;
2656 + stripe = stripe_lookup(sc, addr->key);
2658 + _stripe_get(stripe);
2659 + /* Remove from the lru list if on. */
2660 + stripe_lru_del(stripe, LIST_LOCKED);
2661 + atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
2663 + /* Second try to get an LRU stripe. */
2664 + stripe = stripe_lru_pop(sc);
2666 + _stripe_get(stripe);
2667 + /* Invalidate before reinserting with changed key. */
2668 + stripe_invalidate(stripe);
2669 + stripe->key = addr->key;
2670 + stripe->region = dm_rh_sector_to_region(rs->recover.rh,
2672 + stripe->idx.parity = addr->pi;
2673 + sc_insert(sc, stripe);
2674 + /* REMOVEME: statistics. */
2675 + atomic_inc(rs->stats + S_INSCACHE);
2683 + * Decrement reference count on a stripe.
2685 + * Move it to list of LRU stripes if zero.
2687 +static void stripe_put(struct stripe *stripe)
2689 + if (atomic_dec_and_test(&stripe->cnt)) {
2690 + if (TestClearStripeActive(stripe))
2691 + atomic_dec(&stripe->sc->active_stripes);
2693 + /* Put stripe onto the LRU list. */
2694 + stripe_lru_add(stripe, POS_TAIL, LIST_LOCKED);
2697 + BUG_ON(atomic_read(&stripe->cnt) < 0);
2703 + * I need to do it here because I can't in interrupt
2705 + * Read and write functions are split in order to avoid
2706 + * conditionals in the main loop for performance reasons.
2709 +/* Helper read bios on a page list. */
2710 +static void _bio_copy_page_list(struct stripe *stripe, struct page_list *pl,
2713 + bio_copy_page_list(READ, stripe, pl, bio);
2716 +/* Helper write bios on a page list. */
2717 +static void _rh_dec(struct stripe *stripe, struct page_list *pl,
2720 + dm_rh_dec(RS(stripe->sc)->recover.rh, stripe->region);
2723 +/* End io all bios on a page list. */
2725 +page_list_endio(int rw, struct stripe *stripe, unsigned p, unsigned *count)
2728 + struct bio_list *bl = BL(stripe, p, rw);
2730 + if (!bio_list_empty(bl)) {
2731 + struct page_list *pl = PL(stripe, p);
2732 + struct page *page = pl->page;
2734 + if (PageLocked(page))
2737 + * FIXME: PageUptodate() not cleared
2738 + * properly for missing chunks ?
2740 + else if (PageUptodate(page)) {
2742 + struct raid_set *rs = RS(stripe->sc);
2743 + void (*h_f)(struct stripe *, struct page_list *,
2745 + (rw == READ) ? _bio_copy_page_list : _rh_dec;
2747 + while ((bio = bio_list_pop(bl))) {
2748 + h_f(stripe, pl, bio);
2749 + _bio_endio(rs, bio, 0);
2750 + stripe_put(stripe);
2762 + * End io all reads/writes on a stripe copying
2763 + * read data across from stripe to bios.
2765 +static int stripe_endio(int rw, struct stripe *stripe, unsigned *count)
2768 + unsigned p = RS(stripe->sc)->set.raid_devs;
2771 + int rr = page_list_endio(rw, stripe, p, count);
2773 + if (rr && r != -EIO)
2780 +/* Fail all ios on a bio list and return # of bios. */
2782 +bio_list_fail(struct raid_set *rs, struct stripe *stripe, struct bio_list *bl)
2787 + raid_set_dead(rs);
2789 + /* Update region counters. */
2791 + struct dm_rh_client *rh = rs->recover.rh;
2793 + bio_list_for_each(bio, bl) {
2794 + if (bio_data_dir(bio) == WRITE)
2795 + dm_rh_dec(rh, stripe->region);
2799 + /* Error end io all bios. */
2800 + for (r = 0; (bio = bio_list_pop(bl)); r++)
2801 + _bio_endio(rs, bio, -EIO);
2806 +/* Fail all ios of a bio list of a stripe and drop io pending count. */
2808 +stripe_bio_list_fail(struct raid_set *rs, struct stripe *stripe,
2809 + struct bio_list *bl)
2811 + unsigned put = bio_list_fail(rs, stripe, bl);
2814 + stripe_put(stripe);
2817 +/* Fail all ios hanging off all bio lists of a stripe. */
2818 +static void stripe_fail_io(struct stripe *stripe)
2820 + struct raid_set *rs = RS(stripe->sc);
2821 + unsigned p = rs->set.raid_devs;
2823 + stripe_evict(stripe);
2826 + struct stripe_set *ss = stripe->ss + p;
2827 + int i = ARRAY_SIZE(ss->bl);
2830 + stripe_bio_list_fail(rs, stripe, ss->bl + i);
2835 + * Handle all stripes by handing them to the daemon, because we can't
2836 + * map their pages to copy the data in interrupt context.
2838 + * We don't want to handle them here either, while interrupts are disabled.
2841 +/* Read/write endio function for dm-io (interrupt context). */
2842 +static void endio(unsigned long error, void *context)
2844 + struct dm_mem_cache_object *obj = context;
2845 + struct stripe_set *ss = obj->private;
2846 + struct stripe *stripe = ss->stripe;
2847 + struct page *page = obj->pl->page;
2849 + if (unlikely(error))
2850 + stripe_error(stripe, page);
2852 + page_set(page, CLEAN);
2854 + clear_page_locked(page);
2855 + stripe_io_dec(stripe);
2857 + /* Add stripe to endio list and wake daemon. */
2858 + stripe_endio_push(stripe);
2862 + * Recovery io throttling
2864 +/* Conditionally reset io counters. */
2865 +enum count_type { IO_WORK = 0, IO_RECOVER };
2866 +static int recover_io_reset(struct raid_set *rs)
2868 + unsigned long j = jiffies;
2870 + /* Pay attention to jiffies overflows. */
2871 + if (j > rs->recover.last_jiffies + HZ
2872 + || j < rs->recover.last_jiffies) {
2873 + rs->recover.last_jiffies = j;
2874 + atomic_set(rs->recover.io_count + IO_WORK, 0);
2875 + atomic_set(rs->recover.io_count + IO_RECOVER, 0);
2884 +recover_io_count(struct raid_set *rs, struct stripe *stripe)
2886 + if (RSRecover(rs)) {
2887 + recover_io_reset(rs);
2888 + atomic_inc(rs->recover.io_count +
2889 + (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
2893 +/* Read/Write a page_list asynchronously. */
2894 +static void page_list_rw(struct stripe *stripe, unsigned p)
2896 + struct stripe_cache *sc = stripe->sc;
2897 + struct raid_set *rs = RS(sc);
2898 + struct dm_mem_cache_object *obj = stripe->obj + p;
2899 + struct page_list *pl = obj->pl;
2900 + struct page *page = pl->page;
2901 + struct raid_dev *dev = rs->dev + p;
2902 + struct dm_io_region io = {
2903 + .bdev = dev->dev->bdev,
2904 + .sector = stripe->key,
2905 + .count = stripe->io.size,
2907 + struct dm_io_request control = {
2908 + .bi_rw = PageDirty(page) ? WRITE : READ,
2909 + .mem.type = DM_IO_PAGE_LIST,
2912 + .notify.fn = endio,
2913 + .notify.context = obj,
2914 + .client = sc->dm_io_client,
2917 + BUG_ON(PageLocked(page));
2920 + * Don't rw past end of device, which can happen, because
2921 + * typically sectors_per_dev isn't divisible by io_size.
2923 + if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2924 + io.count = rs->set.sectors_per_dev - io.sector;
2926 + io.sector += dev->start; /* Add <offset>. */
2927 + recover_io_count(rs, stripe); /* Recovery io accounting. */
2929 + /* REMOVEME: statistics. */
2930 + atomic_inc(rs->stats +
2931 + (PageDirty(page) ? S_DM_IO_WRITE : S_DM_IO_READ));
2933 + ClearPageError(page);
2934 + set_page_locked(page);
2935 + io_dev_queued(dev);
2936 + BUG_ON(dm_io(&control, 1, &io, NULL));
2940 + * Write dirty / read not uptodate page lists of a stripe.
2942 +static unsigned stripe_page_lists_rw(struct raid_set *rs, struct stripe *stripe)
2947 + * Increment the pending count on the stripe
2948 + * first, so that we don't race in endio().
2950 + * An inc (IO) is needed for any page:
2953 + * o dirtied by writes merged
2954 + * o dirtied by parity calculations
2956 + r = for_each_io_dev(rs, stripe, _stripe_io_inc);
2958 + /* io needed: chunks are not uptodate/dirty. */
2959 + int max; /* REMOVEME: */
2960 + struct stripe_cache *sc = &rs->sc;
2962 + if (!TestSetStripeActive(stripe))
2963 + atomic_inc(&sc->active_stripes);
2965 + /* Take off the lru list in case it got added there. */
2966 + stripe_lru_del(stripe, LIST_LOCKED);
2968 + /* Submit actual io. */
2969 + for_each_io_dev(rs, stripe, page_list_rw);
2971 + /* REMOVEME: statistics */
2972 + max = sc_active(sc);
2973 + if (atomic_read(&sc->max_active_stripes) < max)
2974 + atomic_set(&sc->max_active_stripes, max);
2976 + atomic_inc(rs->stats + S_FLUSHS);
2977 + /* END REMOVEME: statistics */
2983 +/* Work in all pending writes. */
2984 +static INLINE void _writes_merge(struct stripe *stripe, unsigned p)
2986 + struct bio_list *write = BL(stripe, p, WRITE);
2988 + if (!bio_list_empty(write)) {
2989 + struct page_list *pl = stripe->obj[p].pl;
2991 + struct bio_list *write_merged = BL(stripe, p, WRITE_MERGED);
2994 + * We can play with the lists without holding a lock,
2995 + * because it is just us accessing them anyway.
2997 + bio_list_for_each(bio, write)
2998 + bio_copy_page_list(WRITE, stripe, pl, bio);
3000 + bio_list_merge(write_merged, write);
3001 + bio_list_init(write);
3002 + page_set(pl->page, DIRTY);
3006 +/* Merge in all writes hence dirtying respective pages. */
3007 +static INLINE void writes_merge(struct stripe *stripe)
3009 + unsigned p = RS(stripe->sc)->set.raid_devs;
3012 + _writes_merge(stripe, p);
3015 +/* Check, if a chunk gets completely overwritten. */
3016 +static INLINE int stripe_check_overwrite(struct stripe *stripe, unsigned p)
3018 + unsigned sectors = 0;
3020 + struct bio_list *bl = BL(stripe, p, WRITE);
3022 + bio_list_for_each(bio, bl)
3023 + sectors += bio_sectors(bio);
3025 + return sectors == RS(stripe->sc)->set.io_size;
3029 + * Prepare stripe to avoid io on broken/reconstructed
3030 + * drive in order to reconstruct data on endio.
3032 +enum prepare_type { IO_ALLOW, IO_PROHIBIT };
3033 +static void stripe_prepare(struct stripe *stripe, unsigned p,
3034 + enum prepare_type type)
3036 + struct page *page = PAGE(stripe, p);
3041 + * In case we prohibit, we gotta make sure, that
3042 + * io on all other chunks than the one which failed
3043 + * or is being reconstructed is allowed and that it
3044 + * doesn't have state uptodate.
3046 + stripe_allow_io(stripe);
3047 + ClearPageUptodate(page);
3048 + ProhibitPageIO(page);
3050 + /* REMOVEME: statistics. */
3051 + atomic_inc(RS(stripe->sc)->stats + S_PROHIBITPAGEIO);
3052 + stripe->idx.recover = p;
3053 + SetStripeReconstruct(stripe);
3057 + AllowPageIO(page);
3058 + stripe->idx.recover = -1;
3059 + ClearStripeReconstruct(stripe);
3068 + * Degraded/reconstruction mode.
3070 + * Check stripe state to figure which chunks don't need IO.
3072 +static INLINE void stripe_check_reconstruct(struct stripe *stripe,
3075 + struct raid_set *rs = RS(stripe->sc);
3078 + * Degraded mode (device(s) failed) ->
3079 + * avoid io on the failed device.
3081 + if (unlikely(raid_set_degraded(rs))) {
3082 + /* REMOVEME: statistics. */
3083 + atomic_inc(rs->stats + S_DEGRADED);
3084 + stripe_prepare(stripe, rs->set.ei, IO_PROHIBIT);
3088 + * Reconstruction mode (ie. a particular device or
3089 + * some (rotating) parity chunk is being resynchronized) ->
3090 + * o make sure all needed pages are read in
3091 + * o writes are allowed to go through
3093 + int r = region_state(rs, stripe->key, DM_RH_NOSYNC);
3096 + /* REMOVEME: statistics. */
3097 + atomic_inc(rs->stats + S_NOSYNC);
3098 + stripe_prepare(stripe, dev_for_parity(stripe),
3105 + * All disks good. Avoid reading parity chunk and reconstruct it
3106 + * unless we have prohibited io to chunk(s).
3108 + if (!prohibited) {
3109 + if (StripeMerged(stripe))
3110 + stripe_prepare(stripe, stripe->idx.parity, IO_ALLOW);
3112 + stripe_prepare(stripe, stripe->idx.parity, IO_PROHIBIT);
3115 + * Overrule stripe_prepare to reconstruct the
3116 + * parity chunk, because it'll be created new anyway.
3118 + ClearStripeReconstruct(stripe);
3123 +/* Check, if stripe is ready to merge writes. */
3124 +static INLINE int stripe_check_merge(struct stripe *stripe)
3126 + struct raid_set *rs = RS(stripe->sc);
3127 + int prohibited = 0;
3128 + unsigned chunks = 0, p = rs->set.raid_devs;
3130 + /* Walk all chunks. */
3132 + struct page *page = PAGE(stripe, p);
3134 + /* Can't merge active chunks. */
3135 + if (PageLocked(page)) {
3136 + /* REMOVEME: statistics. */
3137 + atomic_inc(rs->stats + S_MERGE_PAGE_LOCKED);
3141 + /* Can merge uptodate chunks and have to count parity chunk. */
3142 + if (PageUptodate(page) || p == stripe->idx.parity) {
3147 + /* Read before write ordering. */
3148 + if (RSCheckOverwrite(rs) &&
3149 + bio_list_empty(BL(stripe, p, READ))) {
3150 + int r = stripe_check_overwrite(stripe, p);
3154 + /* REMOVEME: statistics. */
3155 + atomic_inc(RS(stripe->sc)->stats +
3156 + S_PROHIBITPAGEIO);
3157 + ProhibitPageIO(page);
3163 + if (chunks == rs->set.raid_devs) {
3164 + /* All pages are uptodate or get written over or mixture. */
3165 + /* REMOVEME: statistics. */
3166 + atomic_inc(rs->stats + S_CAN_MERGE);
3169 + /* REMOVEME: statistics.*/
3170 + atomic_inc(rs->stats + S_CANT_MERGE);
3172 + return prohibited ? 1 : -EPERM;
3175 +/* Check, if stripe is ready to read. */
3176 +static INLINE int stripe_check_read(struct stripe *stripe)
3179 + unsigned p = RS(stripe->sc)->set.raid_devs;
3181 + /* Walk all chunks. */
3183 + struct page *page = PAGE(stripe, p);
3185 + if (!PageLocked(page) &&
3186 + bio_list_empty(BL(stripe, p, READ))) {
3187 + ProhibitPageIO(page);
3196 + * Read/write a stripe.
3198 + * All stripe read/write activity goes through this function.
3200 + * States to cover:
3201 + * o stripe to read and/or write
3202 + * o stripe with error to reconstruct
3204 +static int stripe_rw(struct stripe *stripe)
3206 + struct raid_set *rs = RS(stripe->sc);
3207 + int prohibited = 0, r;
3210 + * Check the state of the RAID set and if degraded (or
3211 + * resynchronizing for reads), read in all other chunks but
3212 + * the one on the dead/resynchronizing device in order to be
3213 + * able to reconstruct the missing one.
3215 + * Merge all writes hanging off uptodate pages of the stripe.
3218 + /* Initially allow io on all chunks and prohibit below, if necessary. */
3219 + stripe_allow_io(stripe);
3221 + if (StripeRBW(stripe)) {
3222 + r = stripe_check_merge(stripe);
3225 + * If I could rely on valid parity (which would only
3226 + * be sure in case of a full synchronization),
3227 + * I could xor a fraction of chunks out of
3228 + * parity and back in.
3230 + * For the time being, I got to redo parity...
3232 + /* parity_xor(stripe); */ /* Xor chunks out. */
3233 + stripe_zero_chunk(stripe, stripe->idx.parity);
3234 + writes_merge(stripe); /* Merge writes in. */
3235 + parity_xor(stripe); /* Update parity. */
3236 + ClearStripeRBW(stripe); /* Disable RBW. */
3237 + SetStripeMerged(stripe); /* Writes merged. */
3242 + } else if (!raid_set_degraded(rs))
3243 + /* Only allow for read avoidance if not degraded. */
3244 + prohibited = stripe_check_read(stripe);
3247 + * Check, if io needs to be allowed/prohibited on certain chunks
3248 + * because of a degraded set or reconstruction on a region.
3250 + stripe_check_reconstruct(stripe, prohibited);
3252 + /* Now submit any reads/writes. */
3253 + r = stripe_page_lists_rw(rs, stripe);
3256 + * No io submitted because of chunk io prohibited or
3257 + * locked pages -> push to end io list for processing.
3259 + atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
3260 + stripe_endio_push(stripe);
3261 + wake_do_raid(rs); /* Wake myself. */
3267 +/* Flush stripe either via flush list or immediately. */
3268 +enum flush_type { FLUSH_DELAY, FLUSH_NOW };
3269 +static int stripe_flush(struct stripe *stripe, enum flush_type type)
3273 + stripe_lru_del(stripe, LIST_LOCKED);
3275 + /* Immediately flush. */
3276 + if (type == FLUSH_NOW) {
3277 + if (likely(raid_set_operational(RS(stripe->sc))))
3278 + r = stripe_rw(stripe); /* Read/write stripe. */
3280 + /* Optimization: Fail early on failed sets. */
3281 + stripe_fail_io(stripe);
3282 + /* Delay flush by putting it on io list for later processing. */
3283 + } else if (type == FLUSH_DELAY)
3284 + stripe_io_add(stripe, POS_TAIL, LIST_UNLOCKED);
3292 + * Queue reads and writes to a stripe by hanging
3293 + * their bios off the stripe sets' read/write lists.
3295 + * Endio reads on uptodate chunks.
3297 +static INLINE int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
3298 + struct bio_list *reject)
3301 + struct address addr;
3302 + struct stripe *stripe =
3303 + stripe_get(rs, raid_address(rs, bio->bi_sector, &addr));
3306 + int rr, rw = bio_data_dir(bio);
3308 + rr = stripe_lock(rs, stripe, rw, addr.key); /* Lock stripe */
3310 + stripe_put(stripe);
3314 + /* Distinguish read and write cases. */
3315 + bio_list_add(BL(stripe, addr.di, rw), bio);
3317 + /* REMOVEME: statistics */
3318 + atomic_inc(rs->stats + (rw == WRITE ?
3319 + S_BIOS_ADDED_WRITE : S_BIOS_ADDED_READ));
3322 + SetStripeRead(stripe);
3324 + SetStripeRBW(stripe);
3326 + /* Increment pending write count on region. */
3327 + dm_rh_inc(rs->recover.rh, stripe->region);
3328 + r = 1; /* Region hash needs a flush. */
3332 + * Optimize stripe flushing:
3334 + * o directly start io for read stripes.
3336 + * o put stripe onto stripe caches io_list for RBW,
3337 + * so that do_flush() can belabour it after we put
3338 + * more bios to the stripe for overwrite optimization.
3340 + stripe_flush(stripe,
3341 + StripeRead(stripe) ? FLUSH_NOW : FLUSH_DELAY);
3343 + /* Got no stripe from cache -> reject bio. */
3346 + bio_list_add(reject, bio);
3347 + /* REMOVEME: statistics. */
3348 + atomic_inc(rs->stats + S_IOS_POST);
3355 + * Recovery functions
3357 +/* Read a stripe off a raid set for recovery. */
3358 +static int recover_read(struct raid_set *rs, struct stripe *stripe, int idx)
3360 + /* Invalidate all pages so that they get read in. */
3361 + stripe_pages_invalidate(stripe);
3363 + /* Allow io on all recovery chunks. */
3364 + stripe_allow_io(stripe);
3367 + ProhibitPageIO(PAGE(stripe, idx));
3369 + stripe->key = rs->recover.pos;
3370 + return stripe_page_lists_rw(rs, stripe);
3373 +/* Write a stripe to a raid set for recovery. */
3374 +static int recover_write(struct raid_set *rs, struct stripe *stripe, int idx)
3377 + * If this is a reconstruct of a particular device, then
3378 + * reconstruct the respective page(s), else create parity page(s).
3381 + struct page *page = PAGE(stripe, idx);
3383 + AllowPageIO(page);
3384 + stripe_zero_chunk(stripe, idx);
3385 + common_xor(stripe, stripe->io.size, 0, idx);
3386 + page_set(page, DIRTY);
3388 + parity_xor(stripe);
3390 + return stripe_page_lists_rw(rs, stripe);
3393 +/* Recover bandwidth available ?. */
3394 +static int recover_bandwidth(struct raid_set *rs)
3398 + /* On reset -> allow recovery. */
3399 + r = recover_io_reset(rs);
3400 + if (r || RSBandwidth(rs))
3403 + work = atomic_read(rs->recover.io_count + IO_WORK);
3405 + /* Pay attention to larger recover stripe size. */
3407 + atomic_read(rs->recover.io_count + IO_RECOVER) *
3408 + rs->recover.io_size /
3412 + * Don't use more than given bandwidth of
3413 + * the work io for recovery.
3415 + if (recover > work / rs->recover.bandwidth_work) {
3416 + /* REMOVEME: statistics. */
3417 + atomic_inc(rs->stats + S_NO_BANDWIDTH);
3423 + atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
3427 +/* Try to get a region to recover. */
3428 +static int recover_get_region(struct raid_set *rs)
3430 + struct recover *rec = &rs->recover;
3431 + struct dm_rh_client *rh = rec->rh;
3433 + /* Start quiescing some regions. */
3434 + if (!RSRegionGet(rs)) {
3435 + int r = recover_bandwidth(rs); /* Enough bandwidth ?. */
3438 + r = dm_rh_recovery_prepare(rh);
3440 + DMINFO("No %sregions to recover",
3441 + rec->nr_regions_to_recover ?
3448 + SetRSRegionGet(rs);
3452 + rec->reg = dm_rh_recovery_start(rh);
3455 + * A reference for the region I'll
3456 + * keep till I've completely synced it.
3459 + rec->pos = dm_rh_region_to_sector(rh,
3460 + dm_rh_get_region_key(rec->reg));
3461 + rec->end = rec->pos + dm_rh_get_region_size(rh);
3470 +/* Read/write a recovery stripe. */
3471 +static INLINE int recover_stripe_rw(struct raid_set *rs, struct stripe *stripe)
3473 + /* Read/write flip-flop. */
3474 + if (TestClearStripeRBW(stripe)) {
3475 + SetStripeRead(stripe);
3476 + return recover_read(rs, stripe, idx_get(rs));
3477 + } else if (TestClearStripeRead(stripe))
3478 + return recover_write(rs, stripe, idx_get(rs));
3483 +/* Reset recovery variables. */
3484 +static void recovery_region_reset(struct raid_set *rs)
3486 + rs->recover.reg = NULL;
3487 + ClearRSRegionGet(rs);
3490 +/* Update region hash state. */
3491 +static void recover_rh_update(struct raid_set *rs, int error)
3493 + struct recover *rec = &rs->recover;
3494 + struct dm_rh_client *rh = rec->rh;
3495 + struct dm_region *reg = rec->reg;
3498 + dm_rh_recovery_end(rh, reg, error);
3500 + rec->nr_regions_recovered++;
3502 + recovery_region_reset(rs);
3505 + dm_rh_update_states(rh, 1);
3507 + io_put(rs); /* Release the io reference for the region. */
3510 +/* Called by main io daemon to recover regions. */
3511 +/* FIXME: cope with MAX_RECOVER > 1. */
3512 +static INLINE void _do_recovery(struct raid_set *rs, struct stripe *stripe)
3515 + struct recover *rec = &rs->recover;
3517 + /* If recovery is active -> return. */
3518 + if (StripeActive(stripe))
3521 + /* io error is fatal for recovery -> stop it. */
3522 + if (unlikely(StripeError(stripe)))
3525 + /* Get a region to recover. */
3526 + r = recover_get_region(rs);
3528 + case 1: /* Got a new region. */
3529 + /* Flag read before write. */
3530 + ClearStripeRead(stripe);
3531 + SetStripeRBW(stripe);
3535 + /* Got a region in the works. */
3536 + r = recover_bandwidth(rs);
3537 + if (r) /* Got enough bandwidth. */
3541 + /* No bandwidth/quiesced region yet, try later. */
3542 + wake_do_raid_delayed(rs, HZ / 10);
3545 + case -ENOENT: /* No more regions. */
3546 + dm_table_event(rs->ti->table);
3550 + /* Read/write a recover stripe. */
3551 + r = recover_stripe_rw(rs, stripe);
3553 + /* IO initiated, get another reference for the IO. */
3558 + /* Update recovery position within region. */
3559 + rec->pos += stripe->io.size;
3561 + /* If we're at end of region, update region hash. */
3562 + if (rec->pos >= rec->end ||
3563 + rec->pos >= rs->set.sectors_per_dev)
3564 + recover_rh_update(rs, 0);
3566 + SetStripeRBW(stripe);
3568 + /* Schedule myself for another round... */
3573 + raid_set_check_degrade(rs, stripe);
3576 + char buf[BDEVNAME_SIZE];
3578 + DMERR("stopping recovery due to "
3579 + "ERROR on /dev/%s, stripe at offset %llu",
3580 + bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3581 + (unsigned long long) stripe->key);
3585 + /* Make sure, that all quiesced regions get released. */
3588 + dm_rh_recovery_end(rec->rh, rec->reg, -EIO);
3590 + rec->reg = dm_rh_recovery_start(rec->rh);
3591 + } while (rec->reg);
3593 + recover_rh_update(rs, -EIO);
3595 + rs->set.dev_to_init = -1;
3597 + /* Check for jiffies overrun. */
3598 + rs->recover.end_jiffies = jiffies;
3599 + if (rs->recover.end_jiffies < rs->recover.start_jiffies)
3600 + rs->recover.end_jiffies = ~0;
3602 + ClearRSRecover(rs);
3605 +static INLINE void do_recovery(struct raid_set *rs)
3607 + struct stripe *stripe;
3609 + list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
3610 + _do_recovery(rs, stripe);
3612 + if (!RSRecover(rs))
3613 + stripe_recover_free(rs);
3617 + * END recovery functions
3620 +/* End io process all stripes handed in by endio() callback. */
3621 +static void do_endios(struct raid_set *rs)
3623 + struct stripe_cache *sc = &rs->sc;
3624 + struct stripe *stripe;
3626 + while ((stripe = stripe_endio_pop(sc))) {
3629 + /* Recovery stripe special case. */
3630 + if (unlikely(StripeRecover(stripe))) {
3631 + if (stripe_io(stripe))
3634 + io_put(rs); /* Release region io reference. */
3635 + ClearStripeActive(stripe);
3637 + /* REMOVEME: statistics*/
3638 + atomic_dec(&sc->active_stripes);
3642 + /* Early end io all reads on any uptodate chunks. */
3643 + stripe_endio(READ, stripe, (count = 0, &count));
3644 + if (stripe_io(stripe)) {
3645 + if (count) /* REMOVEME: statistics. */
3646 + atomic_inc(rs->stats + S_ACTIVE_READS);
3651 + /* Set stripe inactive after all io got processed. */
3652 + if (TestClearStripeActive(stripe))
3653 + atomic_dec(&sc->active_stripes);
3655 + /* Unlock stripe (for clustering). */
3656 + stripe_unlock(rs, stripe);
3659 + * If an io error on a stripe occurred and the RAID set
3660 + * is still operational, requeue the stripe for io.
3662 + if (TestClearStripeError(stripe)) {
3663 + raid_set_check_degrade(rs, stripe);
3664 + ClearStripeReconstruct(stripe);
3666 + if (!StripeMerged(stripe) &&
3667 + raid_set_operational(rs)) {
3668 + stripe_pages_invalidate(stripe);
3669 + stripe_flush(stripe, FLUSH_DELAY);
3670 + /* REMOVEME: statistics. */
3671 + atomic_inc(rs->stats + S_REQUEUE);
3676 + /* Check if the RAID set is inoperational to error ios. */
3677 + if (!raid_set_operational(rs)) {
3678 + ClearStripeReconstruct(stripe);
3679 + stripe_fail_io(stripe);
3680 + BUG_ON(atomic_read(&stripe->cnt));
3684 + /* Got to reconstruct a missing chunk. */
3685 + if (TestClearStripeReconstruct(stripe))
3686 + reconstruct_xor(stripe);
3689 + * Now that we've got a complete stripe, we can
3690 + * process the rest of the end ios on reads.
3692 + BUG_ON(stripe_endio(READ, stripe, NULL));
3693 + ClearStripeRead(stripe);
3696 + * Read-before-write stripes need to be flushed again in
3697 + * order to work the write data into the pages *after*
3698 + * they were read in.
3700 + if (TestClearStripeMerged(stripe))
3701 + /* End io all bios which got merged already. */
3702 + BUG_ON(stripe_endio(WRITE_MERGED, stripe, NULL));
3704 + /* Got to put on flush list because of new writes. */
3705 + if (StripeRBW(stripe))
3706 + stripe_flush(stripe, FLUSH_DELAY);
3711 + * Stripe cache shrinking.
3713 +static INLINE void do_sc_shrink(struct raid_set *rs)
3715 + unsigned shrink = atomic_read(&rs->sc.stripes_to_shrink);
3718 + unsigned cur = atomic_read(&rs->sc.stripes);
3720 + sc_shrink(&rs->sc, shrink);
3721 + shrink -= cur - atomic_read(&rs->sc.stripes);
3722 + atomic_set(&rs->sc.stripes_to_shrink, shrink);
3725 + * Wake myself up in case we failed to shrink the
3726 + * requested amount in order to try again later.
3737 + * We do different things with the io depending on the
3738 + * state of the region that it's in:
3740 + * o reads: hang off stripe cache or postpone if full
3744 + * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3745 + * In case stripe cache is full or busy, postpone the io.
3747 + * RECOVERING: delay the io until recovery of the region completes.
3750 +static INLINE void do_ios(struct raid_set *rs, struct bio_list *ios)
3753 + unsigned flush = 0;
3754 + struct dm_rh_client *rh = rs->recover.rh;
3756 + struct bio_list delay, reject;
3758 + bio_list_init(&delay);
3759 + bio_list_init(&reject);
3762 + * Classify each io:
3763 + * o delay to recovering regions
3764 + * o queue to all other regions
3766 + while ((bio = bio_list_pop(ios))) {
3768 + * In case we get a barrier bio, push it back onto
3769 + * the input queue unless all work queues are empty
3770 + * and the stripe cache is inactive.
3772 + if (unlikely(bio_barrier(bio))) {
3773 + /* REMOVEME: statistics. */
3774 + atomic_inc(rs->stats + S_BARRIER);
3775 + if (!list_empty(rs->sc.lists + LIST_IO) ||
3776 + !bio_list_empty(&delay) ||
3777 + !bio_list_empty(&reject) ||
3778 + sc_active(&rs->sc)) {
3779 + bio_list_push(ios, bio);
3784 + r = region_state(rs, _sector(rs, bio), DM_RH_RECOVERING);
3785 + if (unlikely(r)) {
3786 + /* Got to wait for recovering regions. */
3787 + bio_list_add(&delay, bio);
3788 + SetRSBandwidth(rs);
3791 + * Process ios to non-recovering regions by queueing
3792 + * them to stripes (does rh_inc()) for writes).
3794 + flush += stripe_queue_bio(rs, bio, &reject);
3799 + r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3801 + DMERR("dirty log flush");
3804 + /* Delay ios to regions which are recovering. */
3805 + while ((bio = bio_list_pop(&delay))) {
3806 + /* REMOVEME: statistics.*/
3807 + atomic_inc(rs->stats + S_DELAYED_BIOS);
3808 + atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3809 + dm_rh_delay_by_region(rh, bio,
3810 + dm_rh_sector_to_region(rh, _sector(rs, bio)));
3814 + /* Merge any rejected bios back to the head of the input list. */
3815 + bio_list_merge_head(ios, &reject);
3818 +/* Flush any stripes on the io list. */
3819 +static INLINE void do_flush(struct raid_set *rs)
3821 + struct list_head *list = rs->sc.lists + LIST_IO, *pos, *tmp;
3823 + list_for_each_safe(pos, tmp, list) {
3824 + int r = stripe_flush(list_entry(pos, struct stripe,
3825 + lists[LIST_IO]), FLUSH_NOW);
3827 + /* Remove from the list only if the stripe got processed. */
3829 + list_del_init(pos);
3833 +/* Send an event in case we're getting too busy. */
3834 +static INLINE void do_busy_event(struct raid_set *rs)
3836 + if ((sc_active(&rs->sc) > atomic_read(&rs->sc.stripes) * 4 / 5)) {
3837 + if (!TestSetRSScBusy(rs))
3838 + dm_table_event(rs->ti->table);
3840 + ClearRSScBusy(rs);
3843 +/* Unplug: let the io roll on the set's devices. */
3844 +static INLINE void do_unplug(struct raid_set *rs)
3846 + struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3848 + while (dev-- > rs->dev) {
3849 + /* Only call any device unplug function, if io got queued. */
3850 + if (io_dev_clear(dev))
3851 + blk_unplug(bdev_get_queue(dev->dev->bdev));
3855 +/*-----------------------------------------------------------------
3857 + *---------------------------------------------------------------*/
3859 + * o belabour all end ios
3860 + * o optionally shrink the stripe cache
3861 + * o update the region hash states
3862 + * o optionally do recovery
3863 + * o grab the input queue
3864 + * o work on all requeued or new ios and perform stripe cache flushes
3865 + * unless the RAID set is inoperational (when we error ios)
3866 + * o check, if the stripe cache gets too busy and throw an event if so
3867 + * o unplug any component raid devices with queued bios
3869 +static void do_raid(struct work_struct *ws)
3871 + struct raid_set *rs = container_of(ws, struct raid_set, io.dws.work);
3872 + struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3873 + spinlock_t *lock = &rs->io.in_lock;
3876 + * We always need to end io, so that ios
3877 + * can get errored in case the set failed
3878 + * and the region counters get decremented
3879 + * before we update the region hash states.
3885 + * Now that we've end io'd, which may have put stripes on
3886 + * the LRU list, we shrink the stripe cache if requested.
3890 + /* Update region hash states before we go any further. */
3891 + dm_rh_update_states(rs->recover.rh, 1);
3893 + /* Try to recover regions. */
3894 + if (RSRecover(rs))
3897 + /* More endios -> process. */
3898 + if (!stripe_endio_empty(&rs->sc)) {
3899 + atomic_inc(rs->stats + S_REDO);
3903 + /* Quickly grab all new ios queued and add them to the work list. */
3904 + spin_lock_irq(lock);
3905 + bio_list_merge(ios, ios_in);
3906 + bio_list_init(ios_in);
3907 + spin_unlock_irq(lock);
3909 + /* Let's assume we're operational most of the time ;-). */
3910 + if (likely(raid_set_operational(rs))) {
3911 + /* If we got ios, work them into the cache. */
3912 + if (!bio_list_empty(ios)) {
3914 + do_unplug(rs); /* Unplug the sets device queues. */
3917 + do_flush(rs); /* Flush any stripes on io list. */
3918 + do_unplug(rs); /* Unplug the sets device queues. */
3919 + do_busy_event(rs); /* Check if we got too busy. */
3921 + /* More endios -> process. */
3922 + if (!stripe_endio_empty(&rs->sc)) {
3923 + atomic_inc(rs->stats + S_REDO);
3927 + /* No way to reconstruct data with too many devices failed. */
3928 + bio_list_fail(rs, NULL, ios);
3932 + * Callback for region hash to dispatch
3933 + * delayed bios queued to recovered regions
3934 + * (Gets called via rh_update_states()).
3936 +static void dispatch_delayed_bios(void *context, struct bio_list *bl, int dummy)
3938 + struct raid_set *rs = context;
3941 + /* REMOVEME: decrement pending delayed bios counter. */
3942 + bio_list_for_each(bio, bl)
3943 + atomic_dec(rs->stats + S_DELAYED_BIOS);
3945 + /* Merge region hash private list to work list. */
3946 + bio_list_merge_head(&rs->io.work, bl);
3947 + bio_list_init(bl);
3948 + ClearRSBandwidth(rs);
3951 +/*************************************************************
3952 + * Constructor helpers
3953 + *************************************************************/
3954 +/* Calculate MB/sec. */
3955 +static INLINE unsigned mbpers(struct raid_set *rs, unsigned speed)
3957 + return to_bytes(speed * rs->set.data_devs *
3958 + rs->recover.io_size * HZ >> 10) >> 10;
3962 + * Discover fastest xor algorithm and # of chunks combination.
3964 +/* Calculate speed for algorithm and # of chunks. */
3965 +static INLINE unsigned xor_speed(struct stripe *stripe)
3970 + /* Wait for next tick. */
3971 + for (j = jiffies; j == jiffies;)
3974 + /* Do xors for a full tick. */
3975 + for (j = jiffies; j == jiffies;) {
3977 + common_xor(stripe, stripe->io.size, 0, 0);
3986 +/* Optimize xor algorithm for this RAID set. */
3987 +static unsigned xor_optimize(struct raid_set *rs)
3989 + unsigned chunks_max = 2, speed_max = 0;
3990 + struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3991 + struct stripe *stripe;
3993 + BUG_ON(list_empty(&rs->recover.stripes));
3994 + stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3995 + lists[LIST_RECOVER]);
3998 + * Got to allow io on all chunks, so that
3999 + * xor() will actually work on them.
4001 + stripe_allow_io(stripe);
4003 + /* Try all xor functions. */
4004 + while (f-- > xor_funcs) {
4007 + /* Set actual xor function for common_xor(). */
4009 + rs->xor.chunks = XOR_CHUNKS_MAX + 1;
4011 + while (rs->xor.chunks-- > 2) {
4012 + speed = xor_speed(stripe);
4013 + if (speed > speed_max) {
4014 + speed_max = speed;
4015 + chunks_max = rs->xor.chunks;
4021 + /* Memorize optimum parameters. */
4022 + rs->xor.f = f_max;
4023 + rs->xor.chunks = chunks_max;
4028 + * Allocate a RAID context (a RAID set)
4031 +context_alloc(struct raid_set **raid_set, struct raid_type *raid_type,
4032 + unsigned stripes, unsigned chunk_size, unsigned io_size,
4033 + unsigned recover_io_size, unsigned raid_devs,
4034 + sector_t sectors_per_dev,
4035 + struct dm_target *ti, unsigned dl_parms, char **argv)
4040 + sector_t region_size, ti_len;
4041 + struct raid_set *rs = NULL;
4042 + struct dm_dirty_log *dl;
4043 + struct recover *rec;
4046 + * Create the dirty log
4048 + * We need to change length for the dirty log constructor,
4049 + * because we want an amount of regions for all stripes derived
4050 + * from the single device size, so that we can keep region
4051 + * size = 2^^n independent of the number of devices
4054 + ti->len = sectors_per_dev;
4055 + dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
4058 + goto bad_dirty_log;
4060 + /* Chunk size *must* be smaller than region size. */
4061 + region_size = dl->type->get_region_size(dl);
4062 + if (chunk_size > region_size)
4063 + goto bad_chunk_size;
4065 + /* Recover io size *must* be smaller than region size as well. */
4066 + if (recover_io_size > region_size)
4067 + goto bad_recover_io_size;
4069 + /* Size and allocate the RAID set structure. */
4070 + len = sizeof(*rs->data) + sizeof(*rs->dev);
4071 + if (array_too_big(sizeof(*rs), len, raid_devs))
4074 + len = sizeof(*rs) + raid_devs * len;
4075 + rs = kzalloc(len, GFP_KERNEL);
4079 + rec = &rs->recover;
4080 + atomic_set(&rs->io.in_process, 0);
4081 + atomic_set(&rs->io.in_process_max, 0);
4082 + rec->io_size = recover_io_size;
4084 + /* Pointer to data array. */
4085 + rs->data = (unsigned long **)
4086 + ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
4088 + rs->set.raid_devs = p = raid_devs;
4089 + rs->set.data_devs = raid_devs - raid_type->parity_devs;
4090 + rs->set.raid_type = raid_type;
4093 + * Set chunk and io size and respective shifts
4094 + * (used to avoid divisions)
4096 + rs->set.chunk_size = chunk_size;
4097 + rs->set.chunk_mask = chunk_size - 1;
4098 + rs->set.chunk_shift = ffs(chunk_size) - 1;
4100 + rs->set.io_size = io_size;
4101 + rs->set.io_mask = io_size - 1;
4102 + rs->set.io_shift = ffs(io_size) - 1;
4103 + rs->set.io_shift_mask = rs->set.chunk_mask & ~rs->set.io_mask;
4105 + rs->set.pages_per_io = chunk_pages(io_size);
4106 + rs->set.sectors_per_dev = sectors_per_dev;
4108 + rs->set.ei = -1; /* Indicate no failed device. */
4109 + atomic_set(&rs->set.failed_devs, 0);
4113 + atomic_set(rec->io_count + IO_WORK, 0);
4114 + atomic_set(rec->io_count + IO_RECOVER, 0);
4116 + /* Initialize io lock and queues. */
4117 + spin_lock_init(&rs->io.in_lock);
4118 + bio_list_init(&rs->io.in);
4119 + bio_list_init(&rs->io.work);
4121 + init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
4123 + rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
4124 + rec->rh = dm_rh_client_create(MAX_RECOVER, dispatch_delayed_bios, rs,
4125 + wake_do_raid, rs, dl, region_size,
4126 + rs->recover.nr_regions);
4127 + if (IS_ERR(rec->rh))
4130 + /* Initialize stripe cache. */
4131 + r = sc_init(rs, stripes);
4135 + /* Create dm-io client context. */
4136 + rs->sc.dm_io_client = dm_io_client_create(rs->set.raid_devs *
4137 + rs->set.pages_per_io);
4138 + if (IS_ERR(rs->sc.dm_io_client))
4139 + goto bad_dm_io_client;
4141 + /* REMOVEME: statistics. */
4143 + ClearRSDevelStats(rs); /* Disable development status. */
4149 + TI_ERR_RET("Error creating dirty log", -ENOMEM);
4153 + dm_dirty_log_destroy(dl);
4154 + TI_ERR("Chunk size larger than region size");
4156 +bad_recover_io_size:
4157 + dm_dirty_log_destroy(dl);
4158 + TI_ERR("Recover stripe io size larger than region size");
4161 + dm_dirty_log_destroy(dl);
4162 + TI_ERR("Arry too big");
4165 + dm_dirty_log_destroy(dl);
4166 + TI_ERR_RET("Cannot allocate raid context", -ENOMEM);
4169 + dm_dirty_log_destroy(dl);
4170 + ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
4174 + ti->error = DM_MSG_PREFIX "Error creating stripe cache";
4178 + ti->error = DM_MSG_PREFIX "Error allocating dm-io resources";
4180 + dm_rh_client_destroy(rec->rh);
4182 + dm_rh_client_destroy(rec->rh); /* Destroys dirty log as well. */
4188 +/* Free a RAID context (a RAID set). */
4190 +context_free(struct raid_set *rs, struct dm_target *ti, unsigned r)
4193 + dm_put_device(ti, rs->dev[r].dev);
4195 + dm_io_client_destroy(rs->sc.dm_io_client);
4197 + dm_rh_client_destroy(rs->recover.rh);
4198 + dm_dirty_log_destroy(rs->recover.dl);
4202 +/* Create work queue and initialize work. */
4203 +static int rs_workqueue_init(struct raid_set *rs)
4205 + struct dm_target *ti = rs->ti;
4207 + rs->io.wq = create_singlethread_workqueue(DAEMON);
4209 + TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
4211 + INIT_DELAYED_WORK(&rs->io.dws, do_raid);
4215 +/* Return pointer to raid_type structure for raid name. */
4216 +static struct raid_type *get_raid_type(char *name)
4218 + struct raid_type *r = ARRAY_END(raid_types);
4220 + while (r-- > raid_types) {
4221 + if (!strnicmp(STR_LEN(r->name, name)))
4228 +/* FIXME: factor out to dm core. */
4229 +static int multiple(sector_t a, sector_t b, sector_t *n)
4235 + return a == r * b;
4238 +/* Log RAID set information to kernel log. */
4239 +static void raid_set_log(struct raid_set *rs, unsigned speed)
4242 + char buf[BDEVNAME_SIZE];
4244 + for (p = 0; p < rs->set.raid_devs; p++)
4245 + DMINFO("/dev/%s is raid disk %u",
4246 + bdevname(rs->dev[p].dev->bdev, buf), p);
4248 + DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes",
4249 + rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
4250 + atomic_read(&rs->sc.stripes));
4251 + DMINFO("algorithm \"%s\", %u chunks with %uMB/s", rs->xor.f->name,
4252 + rs->xor.chunks, mbpers(rs, speed));
4253 + DMINFO("%s set with net %u/%u devices", rs->set.raid_type->descr,
4254 + rs->set.data_devs, rs->set.raid_devs);
4257 +/* Get all devices and offsets. */
4259 +dev_parms(struct dm_target *ti, struct raid_set *rs,
4260 + char **argv, int *p)
4262 + for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
4264 + unsigned long long tmp;
4265 + struct raid_dev *dev = rs->dev + *p;
4266 + union dev_lookup dl = {.dev = dev };
4268 + /* Get offset and device. */
4269 + r = sscanf(argv[1], "%llu", &tmp);
4271 + TI_ERR("Invalid RAID device offset parameter");
4274 + r = dm_get_device(ti, argv[0], dev->start,
4275 + rs->set.sectors_per_dev,
4276 + dm_table_get_mode(ti->table), &dev->dev);
4278 + TI_ERR_RET("RAID device lookup failure", r);
4280 + r = raid_dev_lookup(rs, bynumber, &dl);
4281 + if (r != -ENODEV && r < *p) {
4282 + (*p)++; /* Ensure dm_put_device() on actual device. */
4283 + TI_ERR_RET("Duplicate RAID device", -ENXIO);
4290 +/* Set recovery bandwidth. */
4292 +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
4294 + rs->recover.bandwidth = bandwidth;
4295 + rs->recover.bandwidth_work = 100 / bandwidth;
4298 +/* Handle variable number of RAID parameters. */
4300 +raid_variable_parms(struct dm_target *ti, char **argv,
4301 + unsigned i, int *raid_parms,
4302 + int *chunk_size, int *chunk_size_parm,
4303 + int *stripes, int *stripes_parm,
4304 + int *io_size, int *io_size_parm,
4305 + int *recover_io_size, int *recover_io_size_parm,
4306 + int *bandwidth, int *bandwidth_parm)
4308 + /* Fetch # of variable raid parameters. */
4309 + if (sscanf(argv[i++], "%d", raid_parms) != 1 ||
4310 + !range_ok(*raid_parms, 0, 5))
4311 + TI_ERR("Bad variable raid parameters number");
4313 + if (*raid_parms) {
4315 + * If we've got variable RAID parameters,
4316 + * chunk size is the first one
4318 + if (sscanf(argv[i++], "%d", chunk_size) != 1 ||
4319 + (*chunk_size != -1 &&
4320 + (!POWER_OF_2(*chunk_size) ||
4321 + !range_ok(*chunk_size, IO_SIZE_MIN, CHUNK_SIZE_MAX))))
4322 + TI_ERR("Invalid chunk size; must be 2^^n and <= 16384");
4324 + *chunk_size_parm = *chunk_size;
4325 + if (*chunk_size == -1)
4326 + *chunk_size = CHUNK_SIZE;
4329 + * In case we've got 2 or more variable raid
4330 + * parameters, the number of stripes is the second one
4332 + if (*raid_parms > 1) {
4333 + if (sscanf(argv[i++], "%d", stripes) != 1 ||
4334 + (*stripes != -1 &&
4335 + !range_ok(*stripes, STRIPES_MIN,
4337 + TI_ERR("Invalid number of stripes: must "
4338 + "be >= 8 and <= 8192");
4341 + *stripes_parm = *stripes;
4342 + if (*stripes == -1)
4343 + *stripes = STRIPES;
4346 + * In case we've got 3 or more variable raid
4347 + * parameters, the io size is the third one.
4349 + if (*raid_parms > 2) {
4350 + if (sscanf(argv[i++], "%d", io_size) != 1 ||
4351 + (*io_size != -1 &&
4352 + (!POWER_OF_2(*io_size) ||
4353 + !range_ok(*io_size, IO_SIZE_MIN,
4354 + min(BIO_MAX_SECTORS / 2,
4356 + TI_ERR("Invalid io size; must "
4357 + "be 2^^n and less equal "
4358 + "min(BIO_MAX_SECTORS/2, chunk size)");
4360 + *io_size = *chunk_size;
4362 + *io_size_parm = *io_size;
4363 + if (*io_size == -1)
4364 + *io_size = *chunk_size;
4367 + * In case we've got 4 variable raid parameters,
4368 + * the recovery stripe io_size is the fourth one
4370 + if (*raid_parms > 3) {
4371 + if (sscanf(argv[i++], "%d", recover_io_size) != 1 ||
4372 + (*recover_io_size != -1 &&
4373 + (!POWER_OF_2(*recover_io_size) ||
4374 + !range_ok(*recover_io_size, RECOVER_IO_SIZE_MIN,
4375 + BIO_MAX_SECTORS / 2))))
4376 + TI_ERR("Invalid recovery io size; must be "
4377 + "2^^n and less equal BIO_MAX_SECTORS/2");
4380 + *recover_io_size_parm = *recover_io_size;
4381 + if (*recover_io_size == -1)
4382 + *recover_io_size = RECOVER_IO_SIZE;
4385 + * In case we've got 5 variable raid parameters,
4386 + * the recovery io bandwidth is the fifth one
4388 + if (*raid_parms > 4) {
4389 + if (sscanf(argv[i++], "%d", bandwidth) != 1 ||
4390 + (*bandwidth != -1 &&
4391 + !range_ok(*bandwidth, BANDWIDTH_MIN,
4393 + TI_ERR("Invalid recovery bandwidth "
4394 + "percentage; must be > 0 and <= 100");
4397 + *bandwidth_parm = *bandwidth;
4398 + if (*bandwidth == -1)
4399 + *bandwidth = BANDWIDTH;
4405 +/* Parse optional locking parameters. */
4407 +raid_locking_parms(struct dm_target *ti, char **argv,
4408 + unsigned i, int *locking_parms,
4409 + struct dm_raid45_locking_type **locking_type)
4411 + *locking_parms = 0;
4412 + *locking_type = &locking_none;
4414 + if (!strnicmp(argv[i], "none", strlen(argv[i])))
4415 + *locking_parms = 1;
4416 + else if (!strnicmp(argv[i + 1], "locking", strlen(argv[i + 1]))) {
4417 + *locking_type = &locking_none;
4418 + *locking_parms = 2;
4419 + } else if (!strnicmp(argv[i + 1], "cluster", strlen(argv[i + 1]))) {
4420 + *locking_type = &locking_cluster;
4421 + /* FIXME: namespace. */
4422 + *locking_parms = 3;
4425 + return *locking_parms == 1 ? -EINVAL : 0;
4428 +/* Set backing device information properties of RAID set. */
4429 +static void rs_set_bdi(struct raid_set *rs, unsigned stripes, unsigned chunks)
4431 + unsigned p, ra_pages;
4432 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4433 + struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
4435 + /* Set read-ahead for the RAID set and the component devices. */
4436 + bdi->ra_pages = stripes * stripe_pages(rs, rs->set.io_size);
4437 + ra_pages = chunks * chunk_pages(rs->set.io_size);
4438 + for (p = rs->set.raid_devs; p--; ) {
4439 + struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
4441 + q->backing_dev_info.ra_pages = ra_pages;
4444 + /* Set congested function and data. */
4445 + bdi->congested_fn = raid_set_congested;
4446 + bdi->congested_data = rs;
4451 +/* Get backing device information properties of RAID set. */
4452 +static void rs_get_ra(struct raid_set *rs, unsigned *stripes, unsigned *chunks)
4454 + struct mapped_device *md = dm_table_get_md(rs->ti->table);
4456 + *stripes = dm_disk(md)->queue->backing_dev_info.ra_pages
4457 + / stripe_pages(rs, rs->set.io_size);
4458 + *chunks = bdev_get_queue(rs->dev->dev->bdev)->backing_dev_info.ra_pages
4459 + / chunk_pages(rs->set.io_size);
4465 + * Construct a RAID4/5 mapping:
4467 + * log_type #log_params <log_params> \
4468 + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
4469 + * [locking "none"/"cluster"]
4470 + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
4472 + * log_type = "core"/"disk",
4473 + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
4474 + * log_params = [dirty_log_path] region_size [[no]sync])
4476 + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
4478 + * #parity_dev = N if raid_type = "raid4"
4479 + * o N = -1: pick default = last device
4480 + * o N >= 0 and < #raid_devs: parity device index
4482 + * #raid_variable_params = 0-5; raid_params (-1 = default):
4483 + * [chunk_size [#stripes [io_size [recover_io_size [%recovery_bandwidth]]]]]
4484 + * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
4485 + * and <= CHUNK_SIZE_MAX)
4486 + * o #stripes is number of stripes allocated to stripe cache
4487 + * (must be > 1 and < STRIPES_MAX)
4488 + * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
4489 + * o recover_io_size (io unit size per device for recovery in sectors;
4490 + must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
4491 + * o %recovery_bandwidth is the maximum amount spent for recovery during
4492 + * application io (1-100%)
4493 + * If raid_variable_params = 0, defaults will be used.
4494 + * Any raid_variable_param can be set to -1 to apply a default
4496 + * #raid_devs = N (N >= 3)
4498 + * #dev_to_initialize = N
4499 + * -1: initialize parity on all devices
4500 + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
4501 + * of a failed devices content after replacement
4503 + * <dev_path> = device_path (eg, /dev/sdd1)
4504 + * <offset> = begin at offset on <dev_path>
4507 +#define MIN_PARMS 13
4508 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
4510 + int bandwidth = BANDWIDTH, bandwidth_parm = -1,
4511 + chunk_size = CHUNK_SIZE, chunk_size_parm = -1,
4512 + dev_to_init, dl_parms, locking_parms, parity_parm, pi = -1,
4513 + i, io_size = IO_SIZE, io_size_parm = -1,
4514 + r, raid_devs, raid_parms,
4515 + recover_io_size = RECOVER_IO_SIZE, recover_io_size_parm = -1,
4516 + stripes = STRIPES, stripes_parm = -1;
4518 + sector_t tmp, sectors_per_dev;
4519 + struct dm_raid45_locking_type *locking;
4520 + struct raid_set *rs;
4521 + struct raid_type *raid_type;
4523 + /* Ensure minimum number of parameters. */
4524 + if (argc < MIN_PARMS)
4525 + TI_ERR("Not enough parameters");
4527 + /* Fetch # of dirty log parameters. */
4528 + if (sscanf(argv[1], "%d", &dl_parms) != 1
4529 + || !range_ok(dl_parms, 1, 4711))
4530 + TI_ERR("Bad dirty log parameters number");
4532 + /* Check raid_type. */
4533 + raid_type = get_raid_type(argv[dl_parms + 2]);
4535 + TI_ERR("Bad raid type");
4537 + /* In case of RAID4, parity drive is selectable. */
4538 + parity_parm = !!(raid_type->level == raid4);
4540 + /* Handle variable number of RAID parameters. */
4541 + r = raid_variable_parms(ti, argv, dl_parms + parity_parm + 3,
4543 + &chunk_size, &chunk_size_parm,
4544 + &stripes, &stripes_parm,
4545 + &io_size, &io_size_parm,
4546 + &recover_io_size, &recover_io_size_parm,
4547 + &bandwidth, &bandwidth_parm);
4551 + r = raid_locking_parms(ti, argv,
4552 + dl_parms + parity_parm + raid_parms + 4,
4553 + &locking_parms, &locking);
4557 + /* # of raid devices. */
4558 + i = dl_parms + parity_parm + raid_parms + locking_parms + 4;
4559 + if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
4560 + raid_devs < raid_type->minimal_devs)
4561 + TI_ERR("Invalid number of raid devices");
4563 + /* In case of RAID4, check parity drive index is in limits. */
4564 + if (raid_type->level == raid4) {
4565 + /* Fetch index of parity device. */
4566 + if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
4567 + !range_ok(pi, 0, raid_devs - 1))
4568 + TI_ERR("Invalid RAID4 parity device index");
4572 + * Index of device to initialize starts at 0
4574 + * o -1 -> don't initialize a particular device,
4575 + * o 0..raid_devs-1 -> initialize respective device
4576 + * (used for reconstruction of a replaced device)
4579 + (argv[dl_parms + parity_parm + raid_parms + locking_parms + 5],
4580 + "%d", &dev_to_init) != 1
4581 + || !range_ok(dev_to_init, -1, raid_devs - 1))
4582 + TI_ERR("Invalid number for raid device to initialize");
4584 + /* Check # of raid device arguments. */
4585 + if (argc - dl_parms - parity_parm - raid_parms - 6 !=
4587 + TI_ERR("Wrong number of raid device/offset arguments");
4590 + * Check that the table length is divisible
4591 + * w/o rest by (raid_devs - parity_devs)
4593 + if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4594 + §ors_per_dev))
4596 + ("Target length not divisable by number of data devices");
4599 + * Check that the device size is
4600 + * divisible w/o rest by chunk size
4602 + if (!multiple(sectors_per_dev, chunk_size, &tmp))
4603 + TI_ERR("Device length not divisable by chunk_size");
4605 + /****************************************************************
4606 + * Now that we checked the constructor arguments ->
4607 + * let's allocate the RAID set
4608 + ****************************************************************/
4609 + r = context_alloc(&rs, raid_type, stripes, chunk_size, io_size,
4610 + recover_io_size, raid_devs, sectors_per_dev,
4611 + ti, dl_parms, argv);
4616 + * Set these here in order to avoid passing
4617 + * too many arguments to context_alloc()
4619 + rs->set.dev_to_init_parm = dev_to_init;
4620 + rs->set.dev_to_init = dev_to_init;
4621 + rs->set.pi_parm = pi;
4622 + rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4623 + rs->set.raid_parms = raid_parms;
4624 + rs->set.chunk_size_parm = chunk_size_parm;
4625 + rs->set.io_size_parm = io_size_parm;
4626 + rs->sc.stripes_parm = stripes_parm;
4627 + rs->recover.io_size_parm = recover_io_size_parm;
4628 + rs->recover.bandwidth_parm = bandwidth_parm;
4629 + recover_set_bandwidth(rs, bandwidth);
4631 + /* Use locking type to lock stripe access. */
4632 + rs->locking = locking;
4634 + /* Get the device/offset tupels. */
4635 + argv += dl_parms + 6 + parity_parm + raid_parms;
4636 + r = dev_parms(ti, rs, argv, &i);
4640 + /* Initialize recovery. */
4641 + rs->recover.start_jiffies = jiffies;
4642 + rs->recover.end_jiffies = 0;
4643 + recovery_region_reset(rs);
4645 + /* Allow for recovery of any nosync regions. */
4648 + /* Set backing device information (eg. read ahead). */
4649 + rs_set_bdi(rs, chunk_size * 2, io_size * 4);
4650 + SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4652 + speed = xor_optimize(rs); /* Select best xor algorithm. */
4654 + /* Initialize work queue to handle this RAID set's io. */
4655 + r = rs_workqueue_init(rs);
4659 + raid_set_log(rs, speed); /* Log information about RAID set. */
4662 + * Make sure that dm core only hands maximum io size
4663 + * length down and pays attention to io boundaries.
4665 + ti->split_io = rs->set.io_size;
4670 + context_free(rs, ti, i);
4675 + * Destruct a raid mapping
4677 +static void raid_dtr(struct dm_target *ti)
4679 + struct raid_set *rs = ti->private;
4681 + /* Indicate recovery end so that ios in flight drain. */
4682 + ClearRSRecover(rs);
4684 + wake_do_raid(rs); /* Wake daemon. */
4685 + wait_ios(rs); /* Wait for any io still being processed. */
4686 + destroy_workqueue(rs->io.wq);
4687 + context_free(rs, ti, rs->set.raid_devs);
4690 +/* Queues ios to RAID sets. */
4691 +static inline void queue_bio(struct raid_set *rs, struct bio *bio)
4694 + struct bio_list *in = &rs->io.in;
4695 + spinlock_t *in_lock = &rs->io.in_lock;
4697 + spin_lock_irq(in_lock);
4698 + wake = bio_list_empty(in);
4699 + bio_list_add(in, bio);
4700 + spin_unlock_irq(in_lock);
4702 + /* Wake daemon if input list was empty. */
4707 +/* Raid mapping function. */
4708 +static int raid_map(struct dm_target *ti, struct bio *bio,
4709 + union map_info *map_context)
4711 + /* I don't want to waste stripe cache capacity. */
4712 + if (bio_rw(bio) == READA)
4715 + struct raid_set *rs = ti->private;
4717 + /* REMOVEME: statistics. */
4718 + atomic_inc(rs->stats +
4719 + (bio_data_dir(bio) == WRITE ?
4720 + S_BIOS_WRITE : S_BIOS_READ));
4723 + * Get io reference to be waiting for to drop
4724 + * to zero on device suspension/destruction.
4727 + bio->bi_sector -= ti->begin; /* Remap sector. */
4728 + queue_bio(rs, bio); /* Queue to the daemon. */
4729 + return DM_MAPIO_SUBMITTED; /* Handle later. */
4733 +/* Device suspend. */
4734 +static void raid_postsuspend(struct dm_target *ti)
4736 + struct raid_set *rs = ti->private;
4737 + struct dm_dirty_log *dl = rs->recover.dl;
4739 + SetRSSuspended(rs);
4741 + if (RSRecover(rs))
4742 + dm_rh_stop_recovery(rs->recover.rh); /* Wakes do_raid(). */
4746 + wait_ios(rs); /* Wait for completion of all ios being processed. */
4747 + if (dl->type->postsuspend && dl->type->postsuspend(dl))
4748 + /* Suspend dirty log. */
4749 + /* FIXME: need better error handling. */
4750 + DMWARN("log suspend failed");
4753 +/* Device resume. */
4754 +static void raid_resume(struct dm_target *ti)
4756 + struct raid_set *rs = ti->private;
4757 + struct recover *rec = &rs->recover;
4758 + struct dm_dirty_log *dl = rec->dl;
4760 + if (dl->type->resume && dl->type->resume(dl))
4761 + /* Resume dirty log. */
4762 + /* FIXME: need better error handling. */
4763 + DMWARN("log resume failed");
4765 + rec->nr_regions_to_recover =
4766 + rec->nr_regions - dl->type->get_sync_count(dl);
4768 + ClearRSSuspended(rs);
4770 + /* Reset any unfinished recovery. */
4771 + if (RSRecover(rs)) {
4772 + recovery_region_reset(rs);
4773 + dm_rh_start_recovery(rec->rh);/* Calls wake_do_raid(). */
4778 +static INLINE unsigned sc_size(struct raid_set *rs)
4780 + return to_sector(atomic_read(&rs->sc.stripes) *
4781 + (sizeof(struct stripe) +
4782 + (sizeof(struct stripe_set) +
4783 + (sizeof(struct page_list) +
4784 + to_bytes(rs->set.io_size) *
4785 + rs->set.raid_devs)) +
4787 + end_jiffies ? 0 : to_bytes(rs->set.raid_devs *
4792 +/* REMOVEME: status output for development. */
4794 +raid_devel_stats(struct dm_target *ti, char *result,
4795 + unsigned *size, unsigned maxlen)
4797 + unsigned chunks, stripes, sz = *size;
4799 + char buf[BDEVNAME_SIZE], *p;
4800 + struct stats_map *sm, *sm_end = ARRAY_END(stats_map);
4801 + struct raid_set *rs = ti->private;
4802 + struct recover *rec = &rs->recover;
4803 + struct timespec ts;
4805 + DMEMIT("%s ", version);
4806 + DMEMIT("io_inprocess=%d ", atomic_read(&rs->io.in_process));
4807 + DMEMIT("io_inprocess_max=%d ", atomic_read(&rs->io.in_process_max));
4809 + for (sm = stats_map; sm < sm_end; sm++)
4810 + DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4812 + DMEMIT(" overwrite=%s ", RSCheckOverwrite(rs) ? "on" : "off");
4813 + DMEMIT("sc=%u/%u/%u/%u/%u ", rs->set.chunk_size, rs->set.io_size,
4814 + atomic_read(&rs->sc.stripes), rs->sc.hash.buckets,
4817 + j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4818 + rec->start_jiffies;
4819 + jiffies_to_timespec(j, &ts);
4820 + sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4821 + p = strchr(buf, '.');
4824 + DMEMIT("rg=%llu%s/%llu/%llu/%u %s ",
4825 + (unsigned long long) rec->nr_regions_recovered,
4826 + RSRegionGet(rs) ? "+" : "",
4827 + (unsigned long long) rec->nr_regions_to_recover,
4828 + (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4830 + rs_get_ra(rs, &stripes, &chunks);
4831 + DMEMIT("ra=%u/%u ", stripes, chunks);
4837 +raid_status(struct dm_target *ti, status_type_t type,
4838 + char *result, unsigned maxlen)
4840 + unsigned i, sz = 0;
4841 + char buf[BDEVNAME_SIZE];
4842 + struct raid_set *rs = ti->private;
4845 + case STATUSTYPE_INFO:
4846 + /* REMOVEME: statistics. */
4847 + if (RSDevelStats(rs))
4848 + raid_devel_stats(ti, result, &sz, maxlen);
4850 + DMEMIT("%u ", rs->set.raid_devs);
4852 + for (i = 0; i < rs->set.raid_devs; i++)
4854 + format_dev_t(buf, rs->dev[i].dev->bdev->bd_dev));
4857 + for (i = 0; i < rs->set.raid_devs; i++) {
4858 + DMEMIT("%c", dev_operational(rs, i) ? 'A' : 'D');
4860 + if (rs->set.raid_type->level == raid4 &&
4864 + if (rs->set.dev_to_init == i)
4870 + case STATUSTYPE_TABLE:
4871 + sz = rs->recover.dl->type->status(rs->recover.dl, type,
4873 + DMEMIT("%s %u ", rs->set.raid_type->name,
4874 + rs->set.raid_parms);
4876 + if (rs->set.raid_type->level == raid4)
4877 + DMEMIT("%d ", rs->set.pi_parm);
4879 + if (rs->set.raid_parms)
4880 + DMEMIT("%d ", rs->set.chunk_size_parm);
4882 + if (rs->set.raid_parms > 1)
4883 + DMEMIT("%d ", rs->sc.stripes_parm);
4885 + if (rs->set.raid_parms > 2)
4886 + DMEMIT("%d ", rs->set.io_size_parm);
4888 + if (rs->set.raid_parms > 3)
4889 + DMEMIT("%d ", rs->recover.io_size_parm);
4891 + if (rs->set.raid_parms > 4)
4892 + DMEMIT("%d ", rs->recover.bandwidth_parm);
4894 + DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4896 + for (i = 0; i < rs->set.raid_devs; i++)
4897 + DMEMIT("%s %llu ",
4899 + rs->dev[i].dev->bdev->bd_dev),
4900 + (unsigned long long) rs->dev[i].start);
4907 + * Message interface
4909 +enum raid_msg_actions {
4910 + act_bw, /* Recovery bandwidth switch. */
4911 + act_dev, /* Device failure switch. */
4912 + act_overwrite, /* Stripe overwrite check. */
4913 + act_read_ahead, /* Set read ahead. */
4914 + act_stats, /* Development statistics switch. */
4915 + act_sc, /* Stripe cache switch. */
4917 + act_on, /* Set entity on. */
4918 + act_off, /* Set entity off. */
4919 + act_reset, /* Reset entity. */
4921 + act_set = act_on, /* Set # absolute. */
4922 + act_grow = act_off, /* Grow # by an amount. */
4923 + act_shrink = act_reset, /* Shrink # by an amount. */
4926 +/* Turn a delta to absolute. */
4927 +static int _absolute(unsigned long action, int act, int r)
4929 + /* Make delta absolute. */
4930 + if (test_bit(act_set, &action))
4932 + else if (test_bit(act_grow, &action))
4934 + else if (test_bit(act_shrink, &action))
4942 + /* Change recovery io bandwidth. */
4943 +static int bandwidth_change(struct dm_msg *msg, void *context)
4945 + struct raid_set *rs = context;
4946 + int act = rs->recover.bandwidth;
4947 + int bandwidth = DM_MSG_INT_ARG(msg);
4949 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4950 + /* Make delta bandwidth absolute. */
4951 + bandwidth = _absolute(msg->action, act, bandwidth);
4953 + /* Check range. */
4954 + if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4955 + recover_set_bandwidth(rs, bandwidth);
4960 + set_bit(dm_msg_ret_arg, &msg->ret);
4961 + set_bit(dm_msg_ret_inval, &msg->ret);
4965 +/* Change state of a device (running/offline). */
4966 +/* FIXME: this only works while recovering!. */
4967 +static int device_state(struct dm_msg *msg, void *context)
4970 + const char *str = "is already ";
4971 + union dev_lookup dl = { .dev_name = DM_MSG_STR_ARG(msg) };
4972 + struct raid_set *rs = context;
4974 + r = raid_dev_lookup(rs, strchr(dl.dev_name, ':') ?
4975 + bymajmin : byname, &dl);
4976 + if (r == -ENODEV) {
4977 + DMERR("device %s is no member of this set", dl.dev_name);
4981 + if (test_bit(act_off, &msg->action)) {
4982 + if (dev_operational(rs, r))
4984 + } else if (!dev_operational(rs, r))
4987 + DMINFO("/dev/%s %s%s", dl.dev_name, str,
4988 + test_bit(act_off, &msg->action) ? "offline" : "running");
4990 + return test_bit(act_off, &msg->action) ?
4991 + raid_set_check_and_degrade(rs, NULL, r) :
4992 + raid_set_check_and_upgrade(rs, r);
4995 +/* Set/reset development feature flags. */
4996 +static int devel_flags(struct dm_msg *msg, void *context)
4998 + struct raid_set *rs = context;
5000 + if (test_bit(act_on, &msg->action))
5001 + return test_and_set_bit(msg->spec->parm,
5002 + &rs->io.flags) ? -EPERM : 0;
5003 + else if (test_bit(act_off, &msg->action))
5004 + return test_and_clear_bit(msg->spec->parm,
5005 + &rs->io.flags) ? 0 : -EPERM;
5006 + else if (test_bit(act_reset, &msg->action)) {
5007 + if (test_bit(act_stats, &msg->action)) {
5010 + } else if (test_bit(act_overwrite, &msg->action)) {
5012 + set_bit(msg->spec->parm, &rs->io.flags);
5020 + /* Set stripe and chunk read ahead pages. */
5021 +static int read_ahead_set(struct dm_msg *msg, void *context)
5023 + int stripes = DM_MSG_INT_ARGS(msg, 0);
5024 + int chunks = DM_MSG_INT_ARGS(msg, 1);
5026 + if (range_ok(stripes, 1, 512) &&
5027 + range_ok(chunks, 1, 512)) {
5028 + rs_set_bdi(context, stripes, chunks);
5032 + set_bit(dm_msg_ret_arg, &msg->ret);
5033 + set_bit(dm_msg_ret_inval, &msg->ret);
5037 +/* Resize the stripe cache. */
5038 +static int stripecache_resize(struct dm_msg *msg, void *context)
5041 + struct raid_set *rs = context;
5043 + /* Deny permission in case the daemon is still shrinking!. */
5044 + if (atomic_read(&rs->sc.stripes_to_shrink))
5047 + stripes = DM_MSG_INT_ARG(msg);
5048 + if (stripes > 0) {
5049 + act = atomic_read(&rs->sc.stripes);
5051 + /* Make delta stripes absolute. */
5052 + stripes = _absolute(msg->action, act, stripes);
5055 + * Check range and that the # of stripes changes.
5056 + * We can grow from here but need to leave any
5057 + * shrinking to the worker for synchronization.
5059 + if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX)) {
5060 + if (stripes > act)
5061 + return sc_grow(&rs->sc, stripes - act, SC_GROW);
5062 + else if (stripes < act) {
5063 + atomic_set(&rs->sc.stripes_to_shrink,
5072 + set_bit(dm_msg_ret_arg, &msg->ret);
5073 + set_bit(dm_msg_ret_inval, &msg->ret);
5077 +/* Parse the RAID message action. */
5079 + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
5080 + * 'de[vice] o[ffline]/r[unning] DevName/maj:min' # e.g. 'device o /dev/sda'
5081 + * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
5082 + * 'r[ead_ahead] set #stripes #chunks' # e.g. 'r se 3 2'
5083 + * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
5084 + * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
5088 +raid_message(struct dm_target *ti, unsigned argc, char **argv)
5090 + /* Variables to store the parsed parameters in. */
5092 + static unsigned long *i_arg[] = {
5093 + (unsigned long *) i + 0,
5094 + (unsigned long *) i + 1,
5097 + static unsigned long *p_arg[] = { (unsigned long *) &p };
5099 + /* Declare all message option strings. */
5100 + static char *str_sgs[] = { "set", "grow", "shrink" };
5101 + static char *str_dev[] = { "running", "offline" };
5102 + static char *str_oor[] = { "on", "off", "reset" };
5104 + /* Declare all actions. */
5105 + static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
5106 + static unsigned long act_oor[] = { act_on, act_off, act_reset };
5108 + /* Bandwidth option. */
5109 + static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
5110 + static struct dm_message_argument bw_args = {
5111 + 1, i_arg, { dm_msg_int_t }
5114 + /* Device option. */
5115 + static struct dm_message_option dev_opt = { 2, str_dev, act_oor };
5116 + static struct dm_message_argument dev_args = {
5117 + 1, p_arg, { dm_msg_base_t }
5120 + /* Read ahead option. */
5121 + static struct dm_message_option ra_opt = { 1, str_sgs, act_sgs };
5122 + static struct dm_message_argument ra_args = {
5123 + 2, i_arg, { dm_msg_int_t, dm_msg_int_t }
5126 + static struct dm_message_argument null_args = {
5127 + 0, NULL, { dm_msg_int_t }
5130 + /* Overwrite and statistics option. */
5131 + static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
5133 + /* Stripecache option. */
5134 + static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
5136 + /* Declare messages. */
5137 + static struct dm_msg_spec specs[] = {
5138 + { "bandwidth", act_bw, &bw_opt, &bw_args,
5139 + 0, bandwidth_change },
5140 + { "device", act_dev, &dev_opt, &dev_args,
5141 + 0, device_state },
5142 + { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
5143 + RS_CHECK_OVERWRITE, devel_flags },
5144 + { "read_ahead", act_read_ahead, &ra_opt, &ra_args,
5145 + 0, read_ahead_set },
5146 + { "statistics", act_stats, &ovr_stats_opt, &null_args,
5147 + RS_DEVEL_STATS, devel_flags },
5148 + { "stripecache", act_sc, &stripe_opt, &bw_args,
5149 + 0, stripecache_resize },
5152 + /* The message for the parser. */
5153 + struct dm_msg msg = {
5154 + .num_specs = ARRAY_SIZE(specs),
5158 + return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
5161 + * END message interface
5164 +static struct target_type raid_target = {
5166 + .version = {1, 0, 0},
5167 + .module = THIS_MODULE,
5171 + .postsuspend = raid_postsuspend,
5172 + .resume = raid_resume,
5173 + .status = raid_status,
5174 + .message = raid_message,
5177 +static void init_exit(const char *bad_msg, const char *good_msg, int r)
5180 + DMERR("Failed to %sregister target [%d]", bad_msg, r);
5182 + DMINFO("%s %s", good_msg, version);
5185 +static int __init dm_raid_init(void)
5189 + r = dm_register_target(&raid_target);
5190 + init_exit("", "initialized", r);
5194 +static void __exit dm_raid_exit(void)
5198 + r = dm_unregister_target(&raid_target);
5199 + init_exit("un", "exit", r);
5202 +/* Module hooks. */
5203 +module_init(dm_raid_init);
5204 +module_exit(dm_raid_exit);
5206 +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
5207 +MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
5208 +MODULE_LICENSE("GPL");
5210 +++ b/drivers/md/dm-raid45.h
5213 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
5215 + * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
5217 + * Locking definitions for the device-mapper RAID45 target.
5219 + * This file is released under the GPL.
5223 +#ifndef _DM_RAID45_H
5224 +#define _DM_RAID45_H
5226 +/* Factor out to dm.h! */
5227 +#define STR_LEN(ptr, str) (ptr), (str), strlen((ptr))
5229 +enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
5231 +struct dm_raid45_locking_type {
5232 + /* Request a lock on a stripe. */
5233 + void* (*lock)(sector_t key, enum dm_lock_type type);
5235 + /* Release a lock on a stripe. */
5236 + void (*unlock)(void *lock_handle);
5241 +++ b/drivers/md/dm-regions.c
5244 + * Copyright (C) 2003 Sistina Software Limited.
5245 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
5247 + * This file is released under the GPL.
5250 +#include <linux/dm-dirty-log.h>
5251 +#include <linux/dm-regions.h>
5253 +#include <linux/ctype.h>
5254 +#include <linux/init.h>
5255 +#include <linux/module.h>
5256 +#include <linux/vmalloc.h>
5259 +#include "dm-bio-list.h"
5261 +#define DM_MSG_PREFIX "region hash"
5263 +/*-----------------------------------------------------------------
5266 + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions.
5267 + * Each region can be in one of three states:
5273 + * There is no need to put clean regions in the hash.
5276 + * In addition to being present in the hash table a region _may_
5277 + * be present on one of three lists.
5279 + * clean_regions: Regions on this list have no io pending to
5280 + * them, they are in sync, we are no longer interested in them,
5281 + * they are dull. dm_rh_update_states() will remove them from the
5284 + * quiesced_regions: These regions have been spun down, ready
5285 + * for recovery. dm_rh_recovery_start() will remove regions from
5286 + * this list and hand them to the caller, which will schedule the
5289 + * recovered_regions: Regions that the caller has successfully
5290 + * recovered. dm_rh_update_states() will now schedule any delayed
5291 + * io, up the recovery_count, and remove the region from the hash.
5293 + * There are 2 locks:
5294 + * A rw spin lock 'hash_lock' protects just the hash table,
5295 + * this is never held in write mode from interrupt context,
5296 + * which I believe means that we only have to disable irqs when
5297 + * doing a write lock.
5299 + * An ordinary spin lock 'region_lock' that protects the three
5300 + * lists in the region_hash, with the 'state', 'list' and
5301 + * 'delayed_bios' fields of the regions. This is used from irq
5302 + * context, so all other uses will have to suspend local irqs.
5303 + *---------------------------------------------------------------*/
5304 +struct region_hash {
5305 + unsigned max_recovery; /* Max # of regions to recover in parallel */
5307 + /* Callback function to dispatch queued writes on recovered regions. */
5308 + void (*dispatch)(void *context, struct bio_list *bios, int error);
5309 + void *dispatch_context;
5311 + /* Callback function to wakeup callers worker thread. */
5312 + void (*wake)(void *context);
5313 + void *wake_context;
5315 + uint32_t region_size;
5316 + unsigned region_shift;
5318 + /* holds persistent region state */
5319 + struct dm_dirty_log *log;
5322 + rwlock_t hash_lock;
5323 + mempool_t *region_pool;
5325 + unsigned nr_buckets;
5328 + struct list_head *buckets;
5330 + spinlock_t region_lock;
5331 + atomic_t recovery_in_flight;
5332 + struct semaphore recovery_count;
5333 + struct list_head clean_regions;
5334 + struct list_head quiesced_regions;
5335 + struct list_head recovered_regions;
5336 + struct list_head failed_recovered_regions;
5341 + enum dm_rh_region_states state;
5342 + void *context; /* Caller context. */
5344 + struct list_head hash_list;
5345 + struct list_head list;
5348 + struct bio_list delayed_bios;
5354 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector)
5356 + return sector >> ((struct region_hash *) rh)->region_shift;
5358 +EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
5360 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio)
5362 + return dm_rh_sector_to_region(rh, bio->bi_sector);
5364 +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
5366 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region)
5368 + return region << ((struct region_hash *) rh)->region_shift;
5370 +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
5375 +region_t dm_rh_get_region_key(struct dm_region *reg)
5377 + return ((struct region *) reg)->key;
5379 +EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
5381 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh)
5383 + return ((struct region_hash *) rh)->region_size;
5385 +EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
5387 +/* Squirrel a context with a region. */
5388 +void *dm_rh_reg_get_context(struct dm_region *reg)
5390 + return ((struct region *) reg)->context;
5392 +EXPORT_SYMBOL_GPL(dm_rh_reg_get_context);
5394 +void dm_rh_reg_set_context(struct dm_region *reg, void *context)
5396 + ((struct region *) reg)->context = context;
5398 +EXPORT_SYMBOL_GPL(dm_rh_reg_set_context);
5401 + * Create region hash client.
5403 +#define MIN_REGIONS 64
5404 +struct dm_rh_client *dm_rh_client_create(
5405 + unsigned max_recovery,
5406 + void (*dispatch)(void *dispatch_context,
5407 + struct bio_list *bios, int error),
5408 + void *dispatch_context,
5409 + void (*wake)(void *wake_context), void *wake_context,
5410 + struct dm_dirty_log *log, uint32_t region_size,
5411 + region_t nr_regions)
5414 + unsigned nr_buckets, max_buckets;
5415 + unsigned hash_primes[] = {
5416 + /* Table of primes for rh_hash/table size optimization. */
5417 + 3, 7, 13, 27, 53, 97, 193, 389, 769,
5418 + 1543, 3079, 6151, 12289, 24593,
5420 + struct region_hash *rh;
5422 + if (region_size & (region_size - 1)) {
5423 + DMERR("region size must be 2^^n");
5424 + return ERR_PTR(-EINVAL);
5427 + /* Calculate a suitable number of buckets for our hash table. */
5428 + max_buckets = nr_regions >> 6;
5429 + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
5433 + rh = kmalloc(sizeof(*rh), GFP_KERNEL);
5435 + DMERR("unable to allocate region hash memory");
5436 + return ERR_PTR(-ENOMEM);
5439 + rh->max_recovery = max_recovery;
5440 + rh->dispatch = dispatch;
5441 + rh->dispatch_context = dispatch_context;
5443 + rh->wake_context = wake_context;
5445 + rh->region_size = region_size;
5446 + rh->region_shift = ffs(region_size) - 1;
5447 + rwlock_init(&rh->hash_lock);
5448 + rh->mask = nr_buckets - 1;
5449 + rh->nr_buckets = nr_buckets;
5450 + rh->shift = ffs(nr_buckets);
5452 + /* Check prime array limits. */
5453 + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ?
5454 + ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2;
5455 + rh->prime = hash_primes[i];
5457 + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
5458 + if (!rh->buckets) {
5459 + DMERR("unable to allocate region hash bucket memory");
5461 + return ERR_PTR(-ENOMEM);
5464 + for (i = 0; i < nr_buckets; i++)
5465 + INIT_LIST_HEAD(rh->buckets + i);
5467 + spin_lock_init(&rh->region_lock);
5468 + sema_init(&rh->recovery_count, 0);
5469 + atomic_set(&rh->recovery_in_flight, 0);
5470 + INIT_LIST_HEAD(&rh->clean_regions);
5471 + INIT_LIST_HEAD(&rh->quiesced_regions);
5472 + INIT_LIST_HEAD(&rh->recovered_regions);
5473 + INIT_LIST_HEAD(&rh->failed_recovered_regions);
5475 + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
5476 + sizeof(struct region));
5477 + if (!rh->region_pool) {
5478 + vfree(rh->buckets);
5480 + rh = ERR_PTR(-ENOMEM);
5483 + return (struct dm_rh_client *) rh;
5485 +EXPORT_SYMBOL_GPL(dm_rh_client_create);
5487 +void dm_rh_client_destroy(struct dm_rh_client *rh_in)
5490 + struct region_hash *rh = (struct region_hash *) rh_in;
5491 + struct region *reg, *tmp;
5493 + BUG_ON(!list_empty(&rh->quiesced_regions));
5495 + for (h = 0; h < rh->nr_buckets; h++) {
5496 + list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
5497 + BUG_ON(atomic_read(&reg->pending));
5498 + mempool_free(reg, rh->region_pool);
5502 + if (rh->region_pool)
5503 + mempool_destroy(rh->region_pool);
5505 + vfree(rh->buckets);
5508 +EXPORT_SYMBOL_GPL(dm_rh_client_destroy);
5510 +static inline unsigned rh_hash(struct region_hash *rh, region_t region)
5512 + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
5515 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
5517 + struct region *reg;
5518 + struct list_head *bucket = rh->buckets + rh_hash(rh, region);
5520 + list_for_each_entry(reg, bucket, hash_list) {
5521 + if (reg->key == region)
5528 +static void __rh_insert(struct region_hash *rh, struct region *reg)
5530 + list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
5533 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
5535 + struct region *reg, *nreg;
5537 + read_unlock(&rh->hash_lock);
5538 + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
5539 + if (unlikely(!nreg))
5540 + nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
5542 + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
5543 + DM_RH_CLEAN : DM_RH_NOSYNC;
5544 + nreg->key = region;
5545 + INIT_LIST_HEAD(&nreg->list);
5546 + atomic_set(&nreg->pending, 0);
5547 + bio_list_init(&nreg->delayed_bios);
5549 + write_lock_irq(&rh->hash_lock);
5550 + reg = __rh_lookup(rh, region);
5552 + /* We lost the race. */
5553 + mempool_free(nreg, rh->region_pool);
5555 + __rh_insert(rh, nreg);
5556 + if (nreg->state == DM_RH_CLEAN) {
5557 + spin_lock(&rh->region_lock);
5558 + list_add(&nreg->list, &rh->clean_regions);
5559 + spin_unlock(&rh->region_lock);
5565 + write_unlock_irq(&rh->hash_lock);
5566 + read_lock(&rh->hash_lock);
5570 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
5572 + struct region *reg;
5574 + reg = __rh_lookup(rh, region);
5575 + return reg ? reg : __rh_alloc(rh, region);
5578 +int dm_rh_get_state(struct dm_rh_client *rh_in, region_t region, int may_block)
5581 + struct region_hash *rh = (struct region_hash *) rh_in;
5582 + struct region *reg;
5584 + read_lock(&rh->hash_lock);
5585 + reg = __rh_lookup(rh, region);
5586 + read_unlock(&rh->hash_lock);
5589 + return reg->state;
5592 + * The region wasn't in the hash, so we fall back to the dirty log.
5594 + r = rh->log->type->in_sync(rh->log, region, may_block);
5597 + * Any error from the dirty log (eg. -EWOULDBLOCK)
5598 + * gets taken as a DM_RH_NOSYNC
5600 + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
5602 +EXPORT_SYMBOL_GPL(dm_rh_get_state);
5604 +void dm_rh_set_state(struct dm_rh_client *rh_in, region_t region,
5605 + enum dm_rh_region_states state, int may_block)
5607 + struct region_hash *rh = (struct region_hash *) rh_in;
5608 + struct region *reg;
5609 + struct dm_dirty_log *log = rh->log;
5611 + if (state == DM_RH_NOSYNC)
5612 + log->type->set_region_sync(log, region, 0);
5613 + else if (state == DM_RH_CLEAN)
5614 + log->type->clear_region(log, region);
5615 + else if (state == DM_RH_DIRTY)
5616 + log->type->mark_region(log, region);
5618 + read_lock(&rh->hash_lock);
5619 + reg = __rh_find(rh, region);
5620 + reg->state = state;
5621 + read_unlock(&rh->hash_lock);
5623 +EXPORT_SYMBOL_GPL(dm_rh_set_state);
5625 +void dm_rh_update_states(struct dm_rh_client *rh_in, int errors_handled)
5627 + struct region_hash *rh = (struct region_hash *) rh_in;
5628 + struct region *reg, *next;
5630 + LIST_HEAD(recovered);
5631 + LIST_HEAD(failed_recovered);
5634 + * Quickly grab the lists and remove any regions from hash.
5636 + write_lock_irq(&rh->hash_lock);
5637 + spin_lock(&rh->region_lock);
5638 + if (!list_empty(&rh->clean_regions)) {
5639 + list_splice_init(&rh->clean_regions, &clean);
5641 + list_for_each_entry(reg, &clean, list)
5642 + list_del(&reg->hash_list);
5645 + if (!list_empty(&rh->recovered_regions)) {
5646 + list_splice_init(&rh->recovered_regions, &recovered);
5648 + list_for_each_entry(reg, &recovered, list)
5649 + list_del(&reg->hash_list);
5652 + if (!list_empty(&rh->failed_recovered_regions)) {
5653 + list_splice_init(&rh->failed_recovered_regions,
5654 + &failed_recovered);
5656 + list_for_each_entry(reg, &failed_recovered, list)
5657 + list_del(&reg->hash_list);
5660 + spin_unlock(&rh->region_lock);
5661 + write_unlock_irq(&rh->hash_lock);
5664 + * All the regions on the recovered and clean lists have
5665 + * now been pulled out of the system, so no need to do
5666 + * any more locking.
5668 + list_for_each_entry_safe(reg, next, &recovered, list) {
5669 + rh->log->type->clear_region(rh->log, reg->key);
5670 + rh->log->type->set_region_sync(rh->log, reg->key, 1);
5672 + if (reg->delayed_bios.head)
5673 + rh->dispatch(rh->dispatch_context,
5674 + &reg->delayed_bios, 0);
5676 + up(&rh->recovery_count);
5677 + mempool_free(reg, rh->region_pool);
5680 + list_for_each_entry_safe(reg, next, &failed_recovered, list) {
5681 + rh->log->type->set_region_sync(rh->log, reg->key,
5682 + errors_handled ? 0 : 1);
5683 + if (reg->delayed_bios.head)
5684 + rh->dispatch(rh->dispatch_context,
5685 + &reg->delayed_bios, -EIO);
5687 + up(&rh->recovery_count);
5688 + mempool_free(reg, rh->region_pool);
5691 + list_for_each_entry_safe(reg, next, &clean, list) {
5692 + rh->log->type->clear_region(rh->log, reg->key);
5693 + mempool_free(reg, rh->region_pool);
5696 + dm_rh_flush(rh_in);
5698 +EXPORT_SYMBOL_GPL(dm_rh_update_states);
5700 +void dm_rh_inc(struct dm_rh_client *rh_in, region_t region)
5702 + struct region_hash *rh = (struct region_hash *) rh_in;
5703 + struct region *reg;
5705 + read_lock(&rh->hash_lock);
5706 + reg = __rh_find(rh, region);
5707 + if (reg->state == DM_RH_CLEAN) {
5708 + rh->log->type->mark_region(rh->log, reg->key);
5710 + spin_lock_irq(&rh->region_lock);
5711 + reg->state = DM_RH_DIRTY;
5712 + list_del_init(&reg->list); /* Take off the clean list. */
5713 + spin_unlock_irq(&rh->region_lock);
5716 + atomic_inc(&reg->pending);
5717 + read_unlock(&rh->hash_lock);
5719 +EXPORT_SYMBOL_GPL(dm_rh_inc);
5721 +void dm_rh_inc_pending(struct dm_rh_client *rh_in, struct bio_list *bios)
5725 + for (bio = bios->head; bio; bio = bio->bi_next)
5726 + dm_rh_inc(rh_in, dm_rh_bio_to_region(rh_in, bio));
5728 +EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
5730 +int dm_rh_dec(struct dm_rh_client *rh_in, region_t region)
5733 + struct region_hash *rh = (struct region_hash *) rh_in;
5734 + struct region *reg;
5736 + read_lock(&rh->hash_lock);
5737 + reg = __rh_lookup(rh, region);
5738 + read_unlock(&rh->hash_lock);
5742 + if (atomic_dec_and_test(&reg->pending)) {
5743 + unsigned long flags;
5746 + * There is no pending I/O for this region.
5747 + * We can move the region to corresponding list for next action.
5748 + * At this point, the region is not yet connected to any list.
5750 + * If the state is DM_RH_NOSYNC, the region should be kept off
5751 + * from clean list.
5752 + * The hash entry for DM_RH_NOSYNC will remain in memory
5753 + * until the region is recovered or the map is reloaded.
5756 + spin_lock_irqsave(&rh->region_lock, flags);
5757 + if (reg->state == DM_RH_RECOVERING)
5758 + list_add_tail(&reg->list, &rh->quiesced_regions);
5760 + reg->state = DM_RH_CLEAN;
5761 + list_add(&reg->list, &rh->clean_regions);
5763 + spin_unlock_irqrestore(&rh->region_lock, flags);
5770 +EXPORT_SYMBOL_GPL(dm_rh_dec);
5773 + * Starts quiescing a region in preparation for recovery.
5775 +static int __rh_recovery_prepare(struct region_hash *rh)
5779 + struct region *reg;
5782 + * Ask the dirty log what's next.
5784 + r = rh->log->type->get_resync_work(rh->log, &region);
5789 + * Get this region, and start it quiescing
5790 + * by setting the recovering flag.
5792 + read_lock(&rh->hash_lock);
5793 + reg = __rh_find(rh, region);
5794 + read_unlock(&rh->hash_lock);
5796 + spin_lock_irq(&rh->region_lock);
5798 + reg->state = DM_RH_RECOVERING;
5800 + /* Already quiesced ? */
5801 + list_del_init(&reg->list);
5802 + if (!atomic_read(&reg->pending))
5803 + list_add(&reg->list, &rh->quiesced_regions);
5805 + spin_unlock_irq(&rh->region_lock);
5809 +int dm_rh_recovery_prepare(struct dm_rh_client *rh_in)
5812 + struct region_hash *rh = (struct region_hash *) rh_in;
5814 + /* Extra reference to avoid race with rh_stop_recovery */
5815 + atomic_inc(&rh->recovery_in_flight);
5817 + while (!down_trylock(&rh->recovery_count)) {
5818 + atomic_inc(&rh->recovery_in_flight);
5820 + if (__rh_recovery_prepare(rh) <= 0) {
5821 + atomic_dec(&rh->recovery_in_flight);
5822 + up(&rh->recovery_count);
5828 + /* Drop the extra reference. */
5829 + if (atomic_dec_and_test(&rh->recovery_in_flight))
5834 +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
5837 + * Returns any quiesced regions.
5839 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh_in)
5841 + struct region_hash *rh = (struct region_hash *) rh_in;
5842 + struct region *reg = NULL;
5844 + spin_lock_irq(&rh->region_lock);
5845 + if (!list_empty(&rh->quiesced_regions)) {
5846 + reg = list_entry(rh->quiesced_regions.next,
5847 + struct region, list);
5848 + list_del_init(&reg->list); /* Remove from the quiesced list. */
5851 + spin_unlock_irq(&rh->region_lock);
5852 + return (struct dm_region *) reg;
5854 +EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
5857 + * Put region on list of recovered ones.
5859 +void dm_rh_recovery_end(struct dm_rh_client *rh_in, struct dm_region *reg_in,
5862 + struct region_hash *rh = (struct region_hash *) rh_in;
5863 + struct region *reg = (struct region *) reg_in;
5865 + spin_lock_irq(&rh->region_lock);
5867 + reg->state = DM_RH_NOSYNC;
5868 + list_add(&reg->list, &rh->failed_recovered_regions);
5870 + list_add(&reg->list, &rh->recovered_regions);
5872 + atomic_dec(&rh->recovery_in_flight);
5873 + spin_unlock_irq(&rh->region_lock);
5875 + rh->wake(rh->wake_context);
5876 + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0);
5878 +EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
5880 +/* Return recovery in flight count. */
5881 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh_in)
5883 + return atomic_read(&((struct region_hash *) rh_in)->recovery_in_flight);
5885 +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
5887 +int dm_rh_flush(struct dm_rh_client *rh_in)
5889 + struct region_hash *rh = (struct region_hash *) rh_in;
5891 + return rh->log->type->flush(rh->log);
5893 +EXPORT_SYMBOL_GPL(dm_rh_flush);
5895 +void dm_rh_delay_by_region(struct dm_rh_client *rh_in,
5896 + struct bio *bio, region_t region)
5898 + struct region_hash *rh = (struct region_hash *) rh_in;
5899 + struct region *reg;
5901 + /* FIXME: locking. */
5902 + read_lock(&rh->hash_lock);
5903 + reg = __rh_find(rh, region);
5904 + bio_list_add(&reg->delayed_bios, bio);
5905 + read_unlock(&rh->hash_lock);
5907 +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
5909 +void dm_rh_delay(struct dm_rh_client *rh_in, struct bio *bio)
5911 + return dm_rh_delay_by_region(rh_in, bio,
5912 + dm_rh_bio_to_region(rh_in, bio));
5914 +EXPORT_SYMBOL_GPL(dm_rh_delay);
5916 +void dm_rh_dispatch_bios(struct dm_rh_client *rh_in,
5917 + region_t region, int error)
5919 + struct region_hash *rh = (struct region_hash *) rh_in;
5920 + struct region *reg;
5921 + struct bio_list delayed_bios;
5923 + /* FIXME: locking. */
5924 + read_lock(&rh->hash_lock);
5925 + reg = __rh_find(rh, region);
5927 + delayed_bios = reg->delayed_bios;
5928 + bio_list_init(&reg->delayed_bios);
5929 + read_unlock(&rh->hash_lock);
5931 + if (delayed_bios.head)
5932 + rh->dispatch(rh->dispatch_context, &delayed_bios, error);
5934 + up(&rh->recovery_count);
5936 +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios);
5938 +void dm_rh_stop_recovery(struct dm_rh_client *rh_in)
5941 + struct region_hash *rh = (struct region_hash *) rh_in;
5943 + rh->wake(rh->wake_context);
5945 + /* wait for any recovering regions */
5946 + for (i = 0; i < rh->max_recovery; i++)
5947 + down(&rh->recovery_count);
5949 +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
5951 +void dm_rh_start_recovery(struct dm_rh_client *rh_in)
5954 + struct region_hash *rh = (struct region_hash *) rh_in;
5956 + for (i = 0; i < rh->max_recovery; i++)
5957 + up(&rh->recovery_count);
5959 + rh->wake(rh->wake_context);
5961 +EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
5963 +MODULE_DESCRIPTION(DM_NAME " region hash");
5964 +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <hjm@redhat.com>");
5965 +MODULE_LICENSE("GPL");
5966 --- a/drivers/md/Kconfig
5967 +++ b/drivers/md/Kconfig
5968 @@ -269,6 +269,14 @@ config DM_DELAY
5973 + tristate "RAID 4/5 target (EXPERIMENTAL)"
5974 + depends on BLK_DEV_DM && EXPERIMENTAL
5976 + A target that supports RAID4 and RAID5 mappings.
5981 bool "DM uevents (EXPERIMENTAL)"
5982 depends on BLK_DEV_DM && EXPERIMENTAL
5983 --- a/drivers/md/Makefile
5984 +++ b/drivers/md/Makefile
5985 @@ -34,7 +34,9 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
5986 obj-$(CONFIG_DM_DELAY) += dm-delay.o
5987 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
5988 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
5989 -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
5990 +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-regions.o dm-log.o
5991 +obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
5992 + dm-regions.o dm-message.o
5993 obj-$(CONFIG_DM_ZERO) += dm-zero.o
5995 quiet_cmd_unroll = UNROLL $@
5997 +++ b/include/linux/dm-regions.h
6000 + * Copyright (C) 2003 Sistina Software Limited.
6001 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6003 + * Device-Mapper dirty region hash interface.
6005 + * This file is released under the GPL.
6008 +#ifndef DM_REGION_HASH_H
6009 +#define DM_REGION_HASH_H
6011 +#include <linux/dm-dirty-log.h>
6013 +/*-----------------------------------------------------------------
6015 + *----------------------------------------------------------------*/
6016 +struct dm_rh_client;
6020 + * States a region can have.
6022 +enum dm_rh_region_states {
6023 + DM_RH_CLEAN = 0x01, /* No writes in flight. */
6024 + DM_RH_DIRTY = 0x02, /* Writes in flight. */
6025 + DM_RH_NOSYNC = 0x04, /* Out of sync. */
6026 + DM_RH_RECOVERING = 0x08, /* Under resynchronization. */
6030 + * Region hash create/destroy.
6033 +struct dm_rh_client *dm_rh_client_create(
6034 + unsigned max_recovery,
6035 + void (*dispatch)(void *dispatch_context,
6036 + struct bio_list *bios, int error),
6037 + void *dispatch_context,
6038 + void (*wake)(void *wake_context), void *wake_context,
6039 + struct dm_dirty_log *log, uint32_t region_size,
6040 + region_t nr_regions);
6041 +void dm_rh_client_destroy(struct dm_rh_client *rh);
6047 + * sector -> region
6048 + * region -> sector
6050 +region_t dm_rh_bio_to_region(struct dm_rh_client *rh, struct bio *bio);
6051 +region_t dm_rh_sector_to_region(struct dm_rh_client *rh, sector_t sector);
6052 +sector_t dm_rh_region_to_sector(struct dm_rh_client *rh, region_t region);
6055 + * Functions to set a caller context in a region.
6057 +void *dm_rh_reg_get_context(struct dm_region *reg);
6058 +void dm_rh_reg_set_context(struct dm_region *reg, void *context);
6061 + * Get region size and key (ie. number of the region).
6063 +sector_t dm_rh_get_region_size(struct dm_rh_client *rh);
6064 +sector_t dm_rh_get_region_key(struct dm_region *reg);
6067 + * Get/set/update region state (and dirty log).
6069 + * dm_rh_update_states
6070 + * @errors_handled != 0 influences
6071 + * that the state of the region will be kept NOSYNC
6073 +int dm_rh_get_state(struct dm_rh_client *rh, region_t region, int may_block);
6074 +void dm_rh_set_state(struct dm_rh_client *rh, region_t region,
6075 + enum dm_rh_region_states state, int may_block);
6076 +void dm_rh_update_states(struct dm_rh_client *rh, int errors_handled);
6078 +/* Flush the region hash and dirty log. */
6079 +int dm_rh_flush(struct dm_rh_client *rh);
6081 +/* Inc/dec pending count on regions. */
6082 +void dm_rh_inc(struct dm_rh_client *rh, region_t region);
6083 +void dm_rh_inc_pending(struct dm_rh_client *rh, struct bio_list *bios);
6084 +int dm_rh_dec(struct dm_rh_client *rh, region_t region);
6086 +/* Delay bios on regions. */
6087 +void dm_rh_delay(struct dm_rh_client *rh, struct bio *bio);
6088 +void dm_rh_delay_by_region(struct dm_rh_client *rh,
6089 + struct bio *bio, region_t region);
6092 + * Normally, the region hash will automatically call the dispatch function.
6093 + * dm_rh_dispatch_bios() is for intentional dispatching of bios.
6095 +void dm_rh_dispatch_bios(struct dm_rh_client *rh, region_t region, int error);
6098 + * Region recovery control.
6100 +/* Prepare some regions for recovery by starting to quiesce them. */
6101 +int dm_rh_recovery_prepare(struct dm_rh_client *rh);
6102 +/* Try fetching a quiesced region for recovery. */
6103 +struct dm_region *dm_rh_recovery_start(struct dm_rh_client *rh);
6104 +/* Report recovery end on a region. */
6105 +void dm_rh_recovery_end(struct dm_rh_client *rh, struct dm_region *reg,
6107 +/* Check for amount of recoveries in flight. */
6108 +int dm_rh_recovery_in_flight(struct dm_rh_client *rh);
6109 +/* Start/stop recovery. */
6110 +void dm_rh_stop_recovery(struct dm_rh_client *rh);
6111 +void dm_rh_start_recovery(struct dm_rh_client *rh);
6113 +#endif /* DM_REGION_HASH_H */