]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Merge branch 'master' into devel-3.2
authorNeilBrown <neilb@suse.de>
Mon, 14 Mar 2011 07:49:57 +0000 (18:49 +1100)
committerNeilBrown <neilb@suse.de>
Mon, 14 Mar 2011 07:49:57 +0000 (18:49 +1100)
99 files changed:
ANNOUNCE-3.2 [new file with mode: 0644]
Assemble.c
Build.c
Create.c
Detail.c
Examine.c
Grow.c
Incremental.c
Kill.c
Makefile
Manage.c
Monitor.c
Query.c
ReadMe.c
config.c
external-reshape-design.txt [new file with mode: 0644]
inventory
managemon.c
mapfile.c
md.4
md_p.h
mdadm.8.in
mdadm.c
mdadm.conf.5
mdadm.h
mdadm.spec
mdassemble.8
mdassemble.c
mdmon-design.txt [new file with mode: 0644]
mdmon.8
mdmon.c
mdmon.h
mdstat.c
monitor.c
msg.c
msg.h
part.h [new file with mode: 0644]
platform-intel.c
platform-intel.h
policy.c [new file with mode: 0644]
probe_roms.c
probe_roms.h
restripe.c
super-ddf.c
super-gpt.c [new file with mode: 0644]
super-intel.c
super-mbr.c [new file with mode: 0644]
super0.c
super1.c
sysfs.c
test [changed mode: 0644->0755]
tests/08imsm-overlap
tests/09imsm-assemble
tests/09imsm-create-fail-rebuild
tests/11spare-migration [new file with mode: 0644]
tests/12imsm-r0_2d-grow-r0_3d [new file with mode: 0644]
tests/12imsm-r0_2d-grow-r0_4d [new file with mode: 0644]
tests/12imsm-r0_2d-grow-r0_5d [new file with mode: 0644]
tests/12imsm-r0_3d-grow-r0_4d [new file with mode: 0644]
tests/12imsm-r5_3d-grow-r5_4d [new file with mode: 0644]
tests/12imsm-r5_3d-grow-r5_5d [new file with mode: 0644]
tests/13imsm-r0_r0_2d-grow-r0_r0_4d [new file with mode: 0644]
tests/13imsm-r0_r0_2d-grow-r0_r0_5d [new file with mode: 0644]
tests/13imsm-r0_r0_3d-grow-r0_r0_4d [new file with mode: 0644]
tests/13imsm-r0_r5_3d-grow-r0_r5_4d [new file with mode: 0644]
tests/13imsm-r0_r5_3d-grow-r0_r5_5d [new file with mode: 0644]
tests/13imsm-r5_r0_3d-grow-r5_r0_4d [new file with mode: 0644]
tests/13imsm-r5_r0_3d-grow-r5_r0_5d [new file with mode: 0644]
tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d [new file with mode: 0644]
tests/14imsm-r0_3d_no_spares-migrate-r5_3d [new file with mode: 0644]
tests/14imsm-r0_r0_2d-takeover-r10_4d [new file with mode: 0644]
tests/14imsm-r10_4d-grow-r10_5d [new file with mode: 0644]
tests/14imsm-r10_r5_4d-takeover-r0_2d [new file with mode: 0644]
tests/14imsm-r1_2d-grow-r1_3d [new file with mode: 0644]
tests/14imsm-r1_2d-takeover-r0_2d [new file with mode: 0644]
tests/14imsm-r5_3d-grow-r5_5d-no-spares [new file with mode: 0644]
tests/14imsm-r5_3d-migrate-r4_3d [new file with mode: 0644]
tests/15imsm-r0_3d_64k-migrate-r0_3d_256k [new file with mode: 0644]
tests/15imsm-r5_3d_4k-migrate-r5_3d_256k [new file with mode: 0644]
tests/15imsm-r5_3d_64k-migrate-r5_3d_256k [new file with mode: 0644]
tests/15imsm-r5_6d_4k-migrate-r5_6d_256k [new file with mode: 0644]
tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k [new file with mode: 0644]
tests/16imsm-r0_3d-migrate-r5_4d [new file with mode: 0644]
tests/16imsm-r0_5d-migrate-r5_6d [new file with mode: 0644]
tests/16imsm-r5_3d-migrate-r0_3d [new file with mode: 0644]
tests/16imsm-r5_5d-migrate-r0_5d [new file with mode: 0644]
tests/18imsm-1d-takeover-r0_1d [new file with mode: 0644]
tests/18imsm-1d-takeover-r1_2d [new file with mode: 0644]
tests/18imsm-r0_2d-takeover-r10_4d [new file with mode: 0644]
tests/18imsm-r10_4d-takeover-r0_2d [new file with mode: 0644]
tests/18imsm-r1_2d-takeover-r0_1d [new file with mode: 0644]
tests/env-08imsm-overlap [deleted file]
tests/env-09imsm-assemble [deleted file]
tests/env-09imsm-create-fail-rebuild [deleted file]
tests/env-imsm-template [new file with mode: 0644]
tests/imsm-grow-template [new file with mode: 0644]
tests/utils [new file with mode: 0644]
udev-md-raid.rules
util.c

diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2
new file mode 100644 (file)
index 0000000..9e282bc
--- /dev/null
@@ -0,0 +1,77 @@
+Subject:  ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY)
+
+I am pleased to announce the availability of
+   mdadm version 3.2
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm devel-3.2
+   http://neil.brown.name/git?p=mdadm
+
+This is a "Developers only" release.  Please don't consider using it
+or making it available to others without reading the following.
+
+
+By far the most significant change in this release relates to the
+management of reshaping arrays.  This code has been substantially
+re-written so that it can work with 'externally managed metadata' -
+Intel's IMSM in particular.  We now support level migration and
+OnLine Capacity Expansion on these arrays.
+
+However, while the code largely works it has not been tested
+exhaustively so there are likely to be problems.  As the reshape code
+for native metadata arrays was changed as part of this rewrite these
+problems could also result in regressions for reshape of native
+metadata.
+
+It is partly to encourage greater testing that this release is being
+made.  Any reports of problems - particularly reproducible recipes for
+triggering the problems - will be gratefully received.
+
+It is hoped that a "3.2.1" release will be available in early March
+which will be a bugfix release over this and can be considered
+suitable for general use.
+
+Other changes of note:
+
+ - Policy framework.
+   Various policy statements can be made in the mdadm.conf to guide
+   the behaviour of mdadm, particularly with regard to how new devices
+   are treated by "mdadm -I".
+   Depending on the 'action' associated with a device (identified by
+   its 'path') such new devices can be automatically re-added to an
+   existing array that they previously fell out of, or automatically
+   added as a spare if they appear to contain no data.
+
+ - mdadm now has a limited understanding of partition tables.  This
+   allows the policy framework to make decisions about partitioned
+   devices as well.
+
+ - --incremental --remove can be told what --path the device was on,
+   and this info will be recorded so that another device appearing at
+   the same physical location can be preferentially added to the same
+   array (provided the spare-same-slot action policy is applied to the
+   path).
+
+ - A new "--invalid-backup" flag is available in --assemble
+   mode.  This can be used to re-assemble an array which was stopped
+   in the middle of a reshape, and for which the 'backup file' is no
+   longer available or is corrupted.  The array may have some
+   corruption in it at the point where reshape was up to, but at least
+   the rest of the array will become available.
+   
+
+ - Various internal restructuring - more is needed.
+
+
+Any feedback and bug reports are always welcomed at:
+    linux-raid@vger.kernel.org
+
+And please:  don't use this in production - particularly not the
+--grow functionality.
+
+NeilBrown 1st February 2011
+
+
index ea3a6484d9ad78ac8977a0efa02e686c9662324b..bfc879c7f4433550aa25e80dc1fd39156544582f 100644 (file)
@@ -70,9 +70,72 @@ static int is_member_busy(char *metadata_version)
        return busy;
 }
 
+static int ident_matches(struct mddev_ident *ident,
+                        struct mdinfo *content,
+                        struct supertype *tst,
+                        char *homehost,
+                        char *update, char *devname)
+{
+       /* Decide whether the array described by 'content' satisfies every
+        * constraint that is set in 'ident'.
+        *
+        * Returns 1 when all set constraints match, 0 on the first mismatch.
+        * When 'devname' is non-NULL the reason for a mismatch is printed to
+        * stderr using that name; passing NULL suppresses the messages.
+        *
+        * Checks, in order:
+        *  - uuid: only when ident->uuid_set, and skipped when 'update' is
+        *    "uuid".  A content uuid equal to uuid_zero is never rejected
+        *    by this test.
+        *  - name: only when ident->name is non-empty, and skipped when
+        *    'update' is "name".  'homehost' is passed to name_matches().
+        *  - super_minor, level, raid_disks: each only when not UnSet.
+        *  - member: only when ident->member is a non-empty string.
+        */
+       if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
+           same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 &&
+           memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong uuid.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
+           name_matches(content->name, ident->name, homehost)==0) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong name.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->super_minor != UnSet &&
+           ident->super_minor != content->array.md_minor) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong super-minor.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->level != UnSet &&
+           ident->level != content->array.level) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong raid level.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->raid_disks != UnSet &&
+           ident->raid_disks!= content->array.raid_disks) {
+               if (devname)
+                       fprintf(stderr, Name ": %s requires wrong number of drives.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->member && ident->member[0]) {
+               /* content->text_version must match: the member name compared is the text following the first '/' found from offset 1 */
+               char *s = strchr(content->text_version+1, '/');
+               if (s == NULL) {
+                       if (devname)
+                               fprintf(stderr, Name ": %s is not a container and one is required.\n",
+                                       devname);
+                       return 0;
+               } else if (strcmp(ident->member, s+1) != 0) {
+                       if (devname)
+                               fprintf(stderr, Name ": skipping wrong member %s is %s\n",
+                                       content->text_version, devname);
+                       return 0;
+               }
+       }
+       return 1;
+}
+                        
+
 int Assemble(struct supertype *st, char *mddev,
-            mddev_ident_t ident,
-            mddev_dev_t devlist, char *backup_file,
+            struct mddev_ident *ident,
+            struct mddev_dev *devlist,
+            char *backup_file, int invalid_backup,
             int readonly, int runstop,
             char *update, char *homehost, int require_homehost,
             int verbose, int force)
@@ -145,6 +208,7 @@ int Assemble(struct supertype *st, char *mddev,
                               */
                struct mdinfo i;
        } *devices;
+       char *devmap;
        int *best = NULL; /* indexed by raid_disk */
        int bestcnt = 0;
        int devcnt = 0;
@@ -160,7 +224,7 @@ int Assemble(struct supertype *st, char *mddev,
        int start_partial_ok = (runstop >= 0) && 
                (force || devlist==NULL || auto_assem);
        unsigned int num_devs;
-       mddev_dev_t tmpdev;
+       struct mddev_dev *tmpdev;
        struct mdinfo info;
        struct mdinfo *content = NULL;
        char *avail;
@@ -168,6 +232,7 @@ int Assemble(struct supertype *st, char *mddev,
        char *name = NULL;
        int trustworthy;
        char chosen_name[1024];
+       struct domainlist *domains = NULL;
 
        if (get_linux_version() < 2004000)
                old_linux = 1;
@@ -211,7 +276,6 @@ int Assemble(struct supertype *st, char *mddev,
                        num_devs++;
                tmpdev = tmpdev->next;
        }
-       devices = malloc(num_devs * sizeof(*devices));
 
        if (!st && ident->st) st = ident->st;
 
@@ -225,11 +289,13 @@ int Assemble(struct supertype *st, char *mddev,
         */
        for (tmpdev = devlist;
             tmpdev;
-            tmpdev = tmpdev->next) {
+            tmpdev = tmpdev ? tmpdev->next : NULL) {
                char *devname = tmpdev->devname;
                int dfd;
                struct stat stb;
                struct supertype *tst = dup_super(st);
+               struct dev_policy *pol = NULL;
+               int found_container = 0;
 
                if (tmpdev->used > 1) continue;
 
@@ -255,36 +321,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": %s is not a block device.\n",
                                devname);
                        tmpdev->used = 2;
-               } else if (!tst && (tst = guess_super(dfd)) == NULL) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": no recogniseable superblock on %s\n",
-                                       devname);
-                       tmpdev->used = 2;
-               } else if (tst->ss->load_super(tst,dfd, NULL)) {
-                       if (report_missmatch)
-                               fprintf( stderr, Name ": no RAID superblock on %s\n",
-                                        devname);
-               } else if (auto_assem && st == NULL &&
-                          !conf_test_metadata(tst->ss->name,
-                                              tst->ss->match_home(tst, homehost) == 1)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has metadata type %s for which "
-                                       "auto-assembly is disabled\n",
-                                       devname, tst->ss->name);
-                       tst->ss->free_super(tst);
-                       tmpdev->used = 2;
-               } else {
-                       content = &info;
-                       memset(content, 0, sizeof(*content));
-                       tst->ss->getinfo_super(tst, content);
-               }
-               if (dfd >= 0) close(dfd);
-
-               if (tst && tst->sb && tst->ss->container_content
-                   && tst->loaded_container) {
-                       /* tmpdev is a container.  We need to be either
-                        * looking for a member, or auto-assembling
-                        */
+               } else if (must_be_container(dfd)) {
                        if (st) {
                                /* already found some components, this cannot
                                 * be another one.
@@ -292,8 +329,81 @@ int Assemble(struct supertype *st, char *mddev,
                                if (report_missmatch)
                                        fprintf(stderr, Name ": %s is a container, but we are looking for components\n",
                                                devname);
-                               goto loop;
+                               tmpdev->used = 2;
+#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+                       } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": not a recognisable container: %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+#endif
+                       } else if (!tst->ss->load_container
+                                  || tst->ss->load_container(tst, dfd, NULL)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no correct container type: %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (auto_assem &&
+                                  !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)),
+                                                      tst->ss->match_home(tst, homehost) == 1)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": %s has metadata type %s for which "
+                                               "auto-assembly is disabled\n",
+                                               devname, tst->ss->name);
+                               tmpdev->used = 2;
+                       } else
+                               found_container = 1;
+               } else {
+                       if (!tst && (tst = guess_super(dfd)) == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no recogniseable superblock on %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (tst->ss->load_super(tst,dfd, NULL)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no RAID superblock on %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (tst->ss->compare_super == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": Cannot assemble %s metadata on %s\n",
+                                               tst->ss->name, devname);
+                               tmpdev->used = 2;
+                       } else if (auto_assem && st == NULL &&
+                                  !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)),
+                                                      tst->ss->match_home(tst, homehost) == 1)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": %s has metadata type %s for which "
+                                               "auto-assembly is disabled\n",
+                                               devname, tst->ss->name);
+                               tmpdev->used = 2;
                        }
+               }
+               if (dfd >= 0) close(dfd);
+               if (tmpdev->used == 2) {
+                       if (auto_assem || !inargv)
+                               /* Ignore unrecognised devices during auto-assembly */
+                               goto loop;
+                       if (ident->uuid_set || ident->name[0] ||
+                           ident->super_minor != UnSet)
+                               /* Ignore unrecognised device if looking for
+                                * specific array */
+                               goto loop;
+                           
+
+                       fprintf(stderr, Name ": %s has no superblock - assembly aborted\n",
+                               devname);
+                       if (st)
+                               st->ss->free_super(st);
+                       dev_policy_free(pol);
+                       domain_free(domains);
+                       return 1;
+               }
+
+               if (found_container) {
+                       /* tmpdev is a container.  We need to be either
+                        * looking for a member, or auto-assembling
+                        */
 
                        if (ident->container) {
                                if (ident->container[0] == '/' &&
@@ -306,6 +416,11 @@ int Assemble(struct supertype *st, char *mddev,
                                if (ident->container[0] != '/') {
                                        /* we have a uuid */
                                        int uuid[4];
+
+                                       content = &info;
+                                       memset(content, 0, sizeof(*content));
+                                       tst->ss->getinfo_super(tst, content, NULL);
+
                                        if (!parse_uuid(ident->container, uuid) ||
                                            !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
                                                if (report_missmatch)
@@ -320,194 +435,192 @@ int Assemble(struct supertype *st, char *mddev,
                        if (verbose > 0)
                                fprintf(stderr, Name ": looking in container %s\n",
                                        devname);
-               next_member:
-                       if (tmpdev->content)
-                               content = tmpdev->content;
-                       else
-                               content = tst->ss->container_content(tst);
-                       if (!content)
-                               goto loop; /* empty container */
 
-                       tmpdev->content = content->next;
-                       if (tmpdev->content == NULL)
-                               tmpdev->used = 2;
+                       for (content = tst->ss->container_content(tst, NULL);
+                            content;
+                            content = content->next) {
 
-               } else if (ident->container || ident->member) {
-                       /* No chance of this matching if we don't have
-                        * a container */
-                       if (report_missmatch)
-                               fprintf(stderr, Name "%s is not a container, and one is required.\n",
-                                       devname);
-                       goto loop;
-               }
-
-               if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
-                   (!tst || !tst->sb ||
-                    same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong uuid.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
-                   (!tst || !tst->sb ||
-                    name_matches(content->name, ident->name, homehost)==0)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong name.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->super_minor != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->super_minor != content->array.md_minor)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong super-minor.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->level != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->level != content->array.level)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong raid level.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->raid_disks != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->raid_disks!= content->array.raid_disks)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s requires wrong number of drives.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (auto_assem) {
-                       if (tst == NULL || tst->sb == NULL)
-                               continue;
-               }
-               /* If we are this far, then we are nearly commited to this device.
-                * If the super_block doesn't exist, or doesn't match others,
-                * then we probably cannot continue
-                * However if one of the arrays is for the homehost, and
-                * the other isn't that can disambiguate.
-                */
-
-               if (!tst || !tst->sb) {
-                       fprintf(stderr, Name ": %s has no superblock - assembly aborted\n",
-                               devname);
-                       if (st)
-                               st->ss->free_super(st);
-                       return 1;
-               }
-
-               if (tst && tst->sb && tst->ss->container_content
-                   && tst->loaded_container) {
-                       /* we have the one container we need, don't keep
-                        * looking.  If the chosen member is active, skip.
-                        */
-                       if (is_member_busy(content->text_version)) {
-                               if (report_missmatch)
-                                       fprintf(stderr, Name ": member %s in %s is already assembled\n",
-                                               content->text_version,
-                                               devname);
-                       skip:
-                               if (tmpdev->content)
-                                       goto next_member;
-                               tst->ss->free_super(tst);
-                               tst = NULL;
-                               content = NULL;
-                               if (auto_assem)
+                               /* do not assemble arrays that might have bad blocks */
+                               if (content->array.state & (1<<MD_SB_BBM_ERRORS)) {
+                                       fprintf(stderr, Name ": BBM log found in metadata. "
+                                                               "Cannot activate array(s).\n");
+                                       tmpdev->used = 2;
                                        goto loop;
-                               return 1;
-                       }
-                       if (ident->member && ident->member[0]) {
-                               char *s = strchr(content->text_version+1, '/');
-                               if (s == NULL) {
-                                       fprintf(stderr, Name ": badly formatted version: %s\n",
-                                               content->text_version);
-                                       goto skip;
                                }
-                               if (strcmp(ident->member, s+1) != 0) {
+                               if (!ident_matches(ident, content, tst,
+                                                  homehost, update,
+                                                  report_missmatch ? devname : NULL))
+                                       /* message already printed */;
+                               else if (is_member_busy(content->text_version)) {
                                        if (report_missmatch)
-                                               fprintf(stderr,
-                                                       Name ": skipping wrong member %s\n",
-                                                       content->text_version);
-                                       goto skip;
-                               }
+                                               fprintf(stderr, Name ": member %s in %s is already assembled\n",
+                                                       content->text_version,
+                                                       devname);
+                               } else
+                                       break;
+                       }
+                       if (!content) {
+                               tmpdev->used = 2;
+                               goto loop; /* empty container */
                        }
+
                        st = tst; tst = NULL;
                        if (!auto_assem && inargv && tmpdev->next != NULL) {
                                fprintf(stderr, Name ": %s is a container, but is not "
                                        "only device given: confused and aborting\n",
                                        devname);
                                st->ss->free_super(st);
+                               dev_policy_free(pol);
+                               domain_free(domains);
                                return 1;
                        }
                        if (verbose > 0)
                                fprintf(stderr, Name ": found match on member %s in %s\n",
                                        content->text_version, devname);
-                       break;
-               }
-               if (st == NULL)
-                       st = dup_super(tst);
-               if (st->minor_version == -1)
-                       st->minor_version = tst->minor_version;
-               if (st->ss != tst->ss ||
-                   st->minor_version != tst->minor_version ||
-                   st->ss->compare_super(st, tst) != 0) {
-                       /* Some mismatch. If exactly one array matches this host,
-                        * we can resolve on that one.
-                        * Or, if we are auto assembling, we just ignore the second
-                        * for now.
-                        */
-                       if (auto_assem)
+
+                       /* make sure we finished the loop */
+                       tmpdev = NULL;
+                       goto loop;
+               } else {
+
+                       content = &info;
+                       memset(content, 0, sizeof(*content));
+                       tst->ss->getinfo_super(tst, content, NULL);
+
+                       if (!ident_matches(ident, content, tst,
+                                          homehost, update,
+                                          report_missmatch ? devname : NULL))
                                goto loop;
-                       if (homehost) {
-                               int first = st->ss->match_home(st, homehost);
-                               int last = tst->ss->match_home(tst, homehost);
-                               if (first != last &&
-                                   (first == 1 || last == 1)) {
-                                       /* We can do something */
-                                       if (first) {/* just ignore this one */
-                                               if (report_missmatch)
-                                                       fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
-                                                               devname);
-                                               goto loop;
-                                       } else { /* reject all those sofar */
-                                               mddev_dev_t td;
-                                               if (report_missmatch)
-                                                       fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
-                                                               devname);
-                                               for (td=devlist; td != tmpdev; td=td->next)
-                                                       if (td->used == 1)
-                                                               td->used = 0;
-                                               tmpdev->used = 1;
-                                               goto loop;
+                               
+                       if (st == NULL)
+                               st = dup_super(tst);
+                       if (st->minor_version == -1)
+                               st->minor_version = tst->minor_version;
+
+                       if (memcmp(content->uuid, uuid_zero,
+                                  sizeof(int[4])) == 0) {
+                               /* this is a floating spare.  It cannot define
+                                * an array unless there are no more arrays of
+                                * this type to be found.  It can be included
+                                * in an array of this type though.
+                                */
+                               tmpdev->used = 3;
+                               goto loop;
+                       }
+
+                       if (st->ss != tst->ss ||
+                           st->minor_version != tst->minor_version ||
+                           st->ss->compare_super(st, tst) != 0) {
+                               /* Some mismatch. If exactly one array matches this host,
+                                * we can resolve on that one.
+                                * Or, if we are auto assembling, we just ignore the second
+                                * for now.
+                                */
+                               if (auto_assem)
+                                       goto loop;
+                               if (homehost) {
+                                       int first = st->ss->match_home(st, homehost);
+                                       int last = tst->ss->match_home(tst, homehost);
+                                       if (first != last &&
+                                           (first == 1 || last == 1)) {
+                                               /* We can do something */
+                                               if (first) {/* just ignore this one */
+                                                       if (report_missmatch)
+                                                               fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
+                                                                       devname);
+                                                       goto loop;
+                                               } else { /* reject all those sofar */
+                                                       struct mddev_dev *td;
+                                                       if (report_missmatch)
+                                                               fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
+                                                                       devname);
+                                                       for (td=devlist; td != tmpdev; td=td->next)
+                                                               if (td->used == 1)
+                                                                       td->used = 0;
+                                                       tmpdev->used = 1;
+                                                       goto loop;
+                                               }
                                        }
                                }
+                               fprintf(stderr, Name ": superblock on %s doesn't match others - assembly aborted\n",
+                                       devname);
+                               tst->ss->free_super(tst);
+                               st->ss->free_super(st);
+                               dev_policy_free(pol);
+                               domain_free(domains);
+                               return 1;
                        }
-                       fprintf(stderr, Name ": superblock on %s doesn't match others - assembly aborted\n",
-                               devname);
-                       tst->ss->free_super(tst);
-                       st->ss->free_super(st);
-                       return 1;
+                       tmpdev->used = 1;
                }
-
-               tmpdev->used = 1;
-
        loop:
-               if (tmpdev->content)
-                       goto next_member;
+               /* Collect domain information from members only */
+               if (tmpdev && tmpdev->used == 1) {
+                       if (!pol)
+                               pol = devnum_policy(stb.st_rdev);
+                       domain_merge(&domains, pol, tst?tst->ss->name:NULL);
+               }
+               dev_policy_free(pol);
+               pol = NULL;
                if (tst)
                        tst->ss->free_super(tst);
        }
 
+       /* Check if we found some imsm spares but no members */
+       if ((auto_assem ||
+            (ident->uuid_set &&
+             memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) &&
+           (!st || !st->sb))
+               for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+                       if (tmpdev->used != 3)
+                               continue;
+                       tmpdev->used = 1;
+                       content = &info;
+
+                       if (!st->sb) {
+                               /* we need sb from one of the spares */
+                               int dfd = dev_open(tmpdev->devname, O_RDONLY);
+                               if (dfd < 0 ||
+                                   st->ss->load_super(st, dfd, NULL))
+                                       tmpdev->used = 2;
+                               if (dfd > 0)
+                                       close(dfd);
+                       }
+               }
+
+       /* Now reject spares that don't match domains of identified members */
+       for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+               struct stat stb;
+               if (tmpdev->used != 3)
+                       continue;
+               if (stat(tmpdev->devname, &stb)< 0) {
+                       fprintf(stderr, Name ": fstat failed for %s: %s\n",
+                               tmpdev->devname, strerror(errno));
+                       tmpdev->used = 2;
+               } else {
+                       struct dev_policy *pol = devnum_policy(stb.st_rdev);
+                       int dt = domain_test(domains, pol, NULL);
+                       if (inargv && dt != 0)
+                               /* take this spare as domains match
+                                * if there are any */
+                               tmpdev->used = 1;
+                       else if (!inargv && dt == 1)
+                               /* device wasn't explicitly listed, so need
+                                * explicit domain match - which we have */
+                               tmpdev->used = 1;
+                       else
+                               /* if domains don't match mark as unused */
+                               tmpdev->used = 0;
+                       dev_policy_free(pol);
+               }
+       }
+       domain_free(domains);
+       
        if (!st || !st->sb || !content)
                return 2;
 
        /* Now need to open the array device.  Use create_mddev */
        if (content == &info)
-               st->ss->getinfo_super(st, content);
+               st->ss->getinfo_super(st, content, NULL);
 
        trustworthy = FOREIGN;
        name = content->name;
@@ -549,7 +662,6 @@ int Assemble(struct supertype *st, char *mddev,
                            chosen_name);
        if (mdfd < 0) {
                st->ss->free_super(st);
-               free(devices);
                if (auto_assem)
                        goto try_again;
                return 1;
@@ -575,7 +687,6 @@ int Assemble(struct supertype *st, char *mddev,
                close(mdfd);
                mdfd = -3;
                st->ss->free_super(st);
-               free(devices);
                if (auto_assem)
                        goto try_again;
                return 1;
@@ -585,13 +696,19 @@ int Assemble(struct supertype *st, char *mddev,
 #ifndef MDASSEMBLE
        if (content != &info) {
                /* This is a member of a container.  Try starting the array. */
-               return assemble_container_content(st, mdfd, content, runstop,
-                                          chosen_name, verbose);
+               int err;
+               err = assemble_container_content(st, mdfd, content, runstop,
+                                                chosen_name, verbose,
+                                                backup_file);
+               close(mdfd);
+               return err;
        }
 #endif
        /* Ok, no bad inconsistancy, we can try updating etc */
        bitmap_done = 0;
        content->update_private = NULL;
+       devices = malloc(num_devs * sizeof(*devices));
+       devmap = calloc(num_devs * content->array.raid_disks, 1);
        for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) {
                char *devname = tmpdev->devname;
                struct stat stb;
@@ -602,6 +719,7 @@ int Assemble(struct supertype *st, char *mddev,
                        /* prepare useful information in info structures */
                        struct stat stb2;
                        struct supertype *tst;
+                       int err;
                        fstat(mdfd, &stb2);
 
                        if (strcmp(update, "uuid")==0 &&
@@ -625,30 +743,45 @@ int Assemble(struct supertype *st, char *mddev,
                                if (dfd >= 0)
                                        close(dfd);
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
-                       tst->ss->getinfo_super(tst, content);
+                       tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
 
                        memcpy(content->uuid, ident->uuid, 16);
                        strcpy(content->name, ident->name);
                        content->array.md_minor = minor(stb2.st_rdev);
 
-                       tst->ss->update_super(tst, content, update,
-                                             devname, verbose,
-                                             ident->uuid_set, homehost);
+                       if (strcmp(update, "byteorder") == 0)
+                               err = 0;
+                       else
+                               err = tst->ss->update_super(tst, content, update,
+                                                           devname, verbose,
+                                                           ident->uuid_set,
+                                                           homehost);
+                       if (err < 0) {
+                               fprintf(stderr,
+                                       Name ": --update=%s not understood"
+                                       " for %s metadata\n",
+                                       update, tst->ss->name);
+                               tst->ss->free_super(tst);
+                               free(tst);
+                               close(mdfd);
+                               close(dfd);
+                               free(devices);
+                               free(devmap);
+                               return 1;
+                       }
                        if (strcmp(update, "uuid")==0 &&
                            !ident->uuid_set) {
                                ident->uuid_set = 1;
                                memcpy(ident->uuid, content->uuid, 16);
                        }
-                       if (dfd < 0)
-                               fprintf(stderr, Name ": Cannot open %s for superblock update\n",
-                                       devname);
-                       else if (tst->ss->store_super(tst, dfd))
+                       if (tst->ss->store_super(tst, dfd))
                                fprintf(stderr, Name ": Could not re-write superblock on %s.\n",
                                        devname);
-                       if (dfd >= 0)
-                               close(dfd);
+                       close(dfd);
 
                        if (strcmp(update, "uuid")==0 &&
                            ident->bitmap_fd >= 0 && !bitmap_done) {
@@ -673,9 +806,11 @@ int Assemble(struct supertype *st, char *mddev,
                                if (dfd >= 0)
                                        close(dfd);
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
-                       tst->ss->getinfo_super(tst, content);
+                       tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
                        tst->ss->free_super(tst);
                        close(dfd);
                }
@@ -745,6 +880,8 @@ int Assemble(struct supertype *st, char *mddev,
                                           "the\n      DEVICE list in mdadm.conf"
                                        );
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
                        if (best[i] == -1
@@ -763,13 +900,15 @@ int Assemble(struct supertype *st, char *mddev,
                if (st)
                        st->ss->free_super(st);
                close(mdfd);
+               free(devices);
+               free(devmap);
                return 1;
        }
 
        if (update && strcmp(update, "byteorder")==0)
                st->minor_version = 90;
 
-       st->ss->getinfo_super(st, content);
+       st->ss->getinfo_super(st, content, NULL);
        clean = content->array.state & 1;
 
        /* now we have some devices that might be suitable.
@@ -792,15 +931,32 @@ int Assemble(struct supertype *st, char *mddev,
                if (content->array.level != LEVEL_MULTIPATH)
                        if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
                                if (!(devices[j].i.disk.state
-                                     & (1<<MD_DISK_FAULTY)))
+                                     & (1<<MD_DISK_FAULTY))) {
+                                       devices[j].uptodate = 1;
                                        sparecnt++;
+                               }
                                continue;
                        }
+               /* If this devices thinks that 'most_recent' has failed, then
+                * we must reject this device.
+                */
+               if (j != most_recent &&
+                   content->array.raid_disks > 0 &&
+                   devices[most_recent].i.disk.raid_disk >= 0 &&
+                   devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) {
+                       if (verbose > -1)
+                               fprintf(stderr, Name ": ignoring %s as it reports %s as failed\n",
+                                       devices[j].devname, devices[most_recent].devname);
+                       best[i] = -1;
+                       continue;
+               }
                if (devices[j].i.events+event_margin >=
                    devices[most_recent].i.events) {
                        devices[j].uptodate = 1;
                        if (i < content->array.raid_disks) {
-                               if (devices[j].i.recovery_start == MaxSector) {
+                               if (devices[j].i.recovery_start == MaxSector ||
+                                   (content->reshape_active &&
+                                    j >= content->array.raid_disks - content->delta_disks)) {
                                        okcnt++;
                                        avail[i]=1;
                                } else
@@ -809,6 +965,7 @@ int Assemble(struct supertype *st, char *mddev,
                                sparecnt++;
                }
        }
+       free(devmap);
        while (force && !enough(content->array.level, content->array.raid_disks,
                                content->array.layout, 1,
                                avail, okcnt)) {
@@ -910,6 +1067,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Cannot open %s: %s\n",
                                devices[j].devname, strerror(errno));
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (st->ss->load_super(st,fd, NULL)) {
@@ -917,6 +1075,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": RAID superblock has disappeared from %s\n",
                                devices[j].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                close(fd);
@@ -924,9 +1083,10 @@ int Assemble(struct supertype *st, char *mddev,
        if (st->sb == NULL) {
                fprintf(stderr, Name ": No suitable drives found for %s\n", mddev);
                close(mdfd);
+               free(devices);
                return 1;
        }
-       st->ss->getinfo_super(st, content);
+       st->ss->getinfo_super(st, content, NULL);
 #ifndef MDASSEMBLE
        sysfs_init(content, mdfd, 0);
 #endif
@@ -987,6 +1147,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n",
                                devices[chosen_drive].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (st->ss->store_super(st, fd)) {
@@ -994,6 +1155,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Could not re-write superblock on %s\n",
                                devices[chosen_drive].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                close(fd);
@@ -1024,8 +1186,16 @@ int Assemble(struct supertype *st, char *mddev,
                        } else
                                fdlist[i] = -1;
                }
-               if (!err)
-                       err = Grow_restart(st, content, fdlist, bestcnt, backup_file, verbose > 0);
+               if (!err) {
+                       err = Grow_restart(st, content, fdlist, bestcnt,
+                                          backup_file, verbose > 0);
+                       if (err && invalid_backup) {
+                               if (verbose > 0)
+                                       fprintf(stderr, Name ": continuing"
+                                               " without restoring backup\n");
+                               err = 0;
+                       }
+               }
                while (i>0) {
                        i--;
                        if (fdlist[i]>=0) close(fdlist[i]);
@@ -1035,6 +1205,7 @@ int Assemble(struct supertype *st, char *mddev,
                        if (backup_file == NULL)
                                fprintf(stderr,"      Possibly you needed to specify the --backup-file\n");
                        close(mdfd);
+                       free(devices);
                        return err;
                }
        }
@@ -1060,6 +1231,7 @@ int Assemble(struct supertype *st, char *mddev,
                                mddev, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, NULL);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (ident->bitmap_fd >= 0) {
@@ -1067,6 +1239,7 @@ int Assemble(struct supertype *st, char *mddev,
                                fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n");
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                } else if (ident->bitmap_file) {
@@ -1077,6 +1250,7 @@ int Assemble(struct supertype *st, char *mddev,
                                        ident->bitmap_file);
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                        if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
@@ -1084,6 +1258,7 @@ int Assemble(struct supertype *st, char *mddev,
                                close(bmfd);
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                        close(bmfd);
@@ -1140,9 +1315,11 @@ int Assemble(struct supertype *st, char *mddev,
                                                content->array.raid_disks);
                                fprintf(stderr, "\n");
                        }
+                       st->ss->free_super(st);
                        sysfs_uevent(content, "change");
                        wait_for(chosen_name, mdfd);
                        close(mdfd);
+                       free(devices);
                        return 0;
                }
 
@@ -1191,6 +1368,29 @@ int Assemble(struct supertype *st, char *mddev,
                                                                      (4 * content->array.chunk_size / 4096) + 1);
                                        }
                                }
+                               if (okcnt < (unsigned)content->array.raid_disks) {
+                                       /* If any devices did not get added
+                                        * because the kernel rejected them based
+                                        * on event count, try adding them
+                                        * again providing the action policy is
+                                        * 're-add' or greater.  The bitmap
+                                        * might allow them to be included, or
+                                        * they will become spares.
+                                        */
+                                       for (i = 0; i <= bestcnt; i++) {
+                                               int j = best[i];
+                                               if (j >= 0 && !devices[j].uptodate) {
+                                                       if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add))
+                                                               continue;
+                                                       rv = add_disk(mdfd, st, content,
+                                                                     &devices[j].i);
+                                                       if (rv == 0 && verbose >= 0)
+                                                               fprintf(stderr,
+                                                                       Name ": %s has been re-added.\n",
+                                                                       devices[j].devname);
+                                               }
+                                       }
+                               }
                                wait_for(mddev, mdfd);
                                close(mdfd);
                                if (auto_assem) {
@@ -1219,6 +1419,7 @@ int Assemble(struct supertype *st, char *mddev,
                                                usecs <<= 1;
                                        }
                                }
+                               free(devices);
                                return 0;
                        }
                        fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n",
@@ -1239,6 +1440,7 @@ int Assemble(struct supertype *st, char *mddev,
                        if (auto_assem)
                                ioctl(mdfd, STOP_ARRAY, NULL);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (runstop == -1) {
@@ -1248,6 +1450,7 @@ int Assemble(struct supertype *st, char *mddev,
                                fprintf(stderr, " (out of %d)", content->array.raid_disks);
                        fprintf(stderr, ", but not started.\n");
                        close(mdfd);
+                       free(devices);
                        return 0;
                }
                if (verbose >= -1) {
@@ -1277,6 +1480,7 @@ int Assemble(struct supertype *st, char *mddev,
                if (auto_assem)
                        ioctl(mdfd, STOP_ARRAY, NULL);
                close(mdfd);
+               free(devices);
                return 1;
        } else {
                /* The "chosen_drive" is a good choice, and if necessary, the superblock has
@@ -1293,48 +1497,94 @@ int Assemble(struct supertype *st, char *mddev,
 
        }
        close(mdfd);
+       free(devices);
        return 0;
 }
 
 #ifndef MDASSEMBLE
 int assemble_container_content(struct supertype *st, int mdfd,
                               struct mdinfo *content, int runstop,
-                              char *chosen_name, int verbose)
+                              char *chosen_name, int verbose,
+                              char *backup_file)
 {
        struct mdinfo *dev, *sra;
        int working = 0, preexist = 0;
+       int expansion = 0;
        struct map_ent *map = NULL;
 
        sysfs_init(content, mdfd, 0);
 
        sra = sysfs_read(mdfd, 0, GET_VERSION);
        if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0)
-               if (sysfs_set_array(content, md_get_version(mdfd)) != 0) {
-                       close(mdfd);
+               if (sysfs_set_array(content, md_get_version(mdfd)) != 0)
                        return 1;
-               }
+
+       if (content->reshape_active)
+               block_subarray(content);
+
        if (sra)
                sysfs_free(sra);
 
        for (dev = content->devs; dev; dev = dev->next)
-               if (sysfs_add_disk(content, dev, 1) == 0)
-                       working++;
-               else if (errno == EEXIST)
+               if (sysfs_add_disk(content, dev, 1) == 0) {
+                       if (dev->disk.raid_disk >= content->array.raid_disks &&
+                           content->reshape_active)
+                               expansion++;
+                       else
+                               working++;
+               } else if (errno == EEXIST)
                        preexist++;
-       if (working == 0) {
-               close(mdfd);
+       if (working == 0)
                return 1;/* Nothing new, don't try to start */
-       }
-       
+
        map_update(&map, fd2devnum(mdfd),
                   content->text_version,
                   content->uuid, chosen_name);
 
        if (runstop > 0 ||
-                (working + preexist) >= content->array.working_disks) {
+                (working + preexist + expansion) >=
+                       content->array.working_disks) {
                int err;
 
-               switch(content->array.level) {
+               if (content->reshape_active) {
+                       int spare = content->array.raid_disks + expansion;
+                       int i;
+                       int *fdlist = malloc(sizeof(int) *
+                                            (working + expansion
+                                             + content->array.raid_disks));
+                       for (i=0; i<spare; i++)
+                               fdlist[i] = -1;
+                       for (dev = content->devs; dev; dev = dev->next) {
+                               char buf[20];
+                               int fd;
+                               sprintf(buf, "%d:%d",
+                                       dev->disk.major,
+                                       dev->disk.minor);
+                               fd = dev_open(buf, O_RDWR);
+
+                               if (dev->disk.raid_disk >= 0)
+                                       fdlist[dev->disk.raid_disk] = fd;
+                               else
+                                       fdlist[spare++] = fd;
+                       }
+                       err = Grow_restart(st, content, fdlist, spare,
+                                          backup_file, verbose > 0);
+                       while (spare > 0) {
+                               spare--;
+                               if (fdlist[spare] >= 0)
+                                       close(fdlist[spare]);
+                       }
+                       if (err) {
+                               fprintf(stderr, Name ": Failed to restore critical"
+                                       " section for reshape - sorry.\n");
+                               if (!backup_file)
+                                       fprintf(stderr, Name ":  Possibly you need"
+                                               " to specify a --backup-file\n");
+                               return 1;
+                       }
+
+                       err = Grow_continue(mdfd, st, content, backup_file);
+               } else switch(content->array.level) {
                case LEVEL_LINEAR:
                case LEVEL_MULTIPATH:
                case 0:
@@ -1365,12 +1615,14 @@ int assemble_container_content(struct supertype *st, int mdfd,
                                        chosen_name, working + preexist);
                        if (preexist)
                                fprintf(stderr, " (%d new)", working);
+                       if (expansion)
+                               fprintf(stderr, " ( + %d for expansion)",
+                                       expansion);
                        fprintf(stderr, "\n");
                }
                if (!err)
                        wait_for(chosen_name, mdfd);
-               close(mdfd);
-               return 0;
+               return err;
                /* FIXME should have an O_EXCL and wait for read-auto */
        } else {
                if (verbose >= 0)
@@ -1378,7 +1630,6 @@ int assemble_container_content(struct supertype *st, int mdfd,
                                ": %s assembled with %d devices but "
                                "not started\n",
                                chosen_name, working);
-               close(mdfd);
                return 1;
        }
 }
diff --git a/Build.c b/Build.c
index 7f3925864731bf63e0c1c1a10357a3476a95a2fe..cb9f01e33264376e63be43c2efd8da1672c96461 100644 (file)
--- a/Build.c
+++ b/Build.c
@@ -29,7 +29,7 @@
 #define STOP_MD                _IO (MD_MAJOR, 3)
 
 int Build(char *mddev, int chunk, int level, int layout,
-         int raiddisks, mddev_dev_t devlist, int assume_clean,
+         int raiddisks, struct mddev_dev *devlist, int assume_clean,
          char *bitmap_file, int bitmap_chunk, int write_behind,
          int delay, int verbose, int autof, unsigned long long size)
 {
@@ -50,7 +50,7 @@ int Build(char *mddev, int chunk, int level, int layout,
        int vers;
        struct stat stb;
        int subdevs = 0, missing_disks = 0;
-       mddev_dev_t dv;
+       struct mddev_dev *dv;
        int bitmap_fd;
        unsigned long long bitmapsize;
        int mdfd;
index ba3b99b6087bac6c1a813631f0dc81888779a8a5..6349f86538fd4cf4f6c59d10b49a70c572ece2e6 100644 (file)
--- a/Create.c
+++ b/Create.c
@@ -31,8 +31,8 @@ static int default_layout(struct supertype *st, int level, int verbose)
 {
        int layout = UnSet;
 
-       if (st && st->ss->default_layout)
-               layout = st->ss->default_layout(level);
+       if (st && st->ss->default_geometry)
+               st->ss->default_geometry(st, &level, &layout, NULL);
 
        if (layout == UnSet)
                switch(level) {
@@ -69,7 +69,7 @@ int Create(struct supertype *st, char *mddev,
           int chunk, int level, int layout, unsigned long long size,
           int raiddisks, int sparedisks,
           char *name, char *homehost, int *uuid,
-          int subdevs, mddev_dev_t devlist,
+          int subdevs, struct mddev_dev *devlist,
           int runstop, int verbose, int force, int assume_clean,
           char *bitmap_file, int bitmap_chunk, int write_behind,
           int delay, int autof)
@@ -95,7 +95,7 @@ int Create(struct supertype *st, char *mddev,
        char *mindisc = NULL;
        char *maxdisc = NULL;
        int dnum;
-       mddev_dev_t dv;
+       struct mddev_dev *dv;
        int fail=0, warn=0;
        struct stat stb;
        int first_missing = subdevs * 2;
@@ -114,6 +114,7 @@ int Create(struct supertype *st, char *mddev,
        struct mdinfo info, *infos;
        int did_default = 0;
        int do_default_layout = 0;
+       int do_default_chunk = 0;
        unsigned long safe_mode_delay = 0;
        char chosen_name[1024];
        struct map_ent *map = NULL;
@@ -122,15 +123,8 @@ int Create(struct supertype *st, char *mddev,
        int major_num = BITMAP_MAJOR_HI;
 
        memset(&info, 0, sizeof(info));
-
-       if (level == UnSet) {
-               /* "ddf" and "imsm" metadata only supports one level - should possibly
-                * push this into metadata handler??
-                */
-               if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
-                       level = LEVEL_CONTAINER;
-       }
-
+       if (level == UnSet && st && st->ss->default_geometry)
+               st->ss->default_geometry(st, &level, NULL, NULL);
        if (level == UnSet) {
                fprintf(stderr,
                        Name ": a RAID level is needed to create an array.\n");
@@ -171,15 +165,15 @@ int Create(struct supertype *st, char *mddev,
                    inf.raid_disks == 0) {
                        /* yep, looks like a container */
                        if (st) {
-                               rv = st->ss->load_super(st, fd,
-                                                       devlist->devname);
+                               rv = st->ss->load_container(st, fd,
+                                                           devlist->devname);
                                if (rv == 0)
                                        have_container = 1;
                        } else {
-                               st = guess_super(fd);
+                               st = super_by_fd(fd, NULL);
                                if (st && !(rv = st->ss->
-                                           load_super(st, fd,
-                                                      devlist->devname)))
+                                           load_container(st, fd,
+                                                          devlist->devname)))
                                        have_container = 1;
                                else
                                        st = NULL;
@@ -236,14 +230,10 @@ int Create(struct supertype *st, char *mddev,
        case 10:
        case 6:
        case 0:
-               if (chunk == 0) {
-                       if (st && st->ss->default_chunk)
-                               chunk = st->ss->default_chunk(st);
-
-                       chunk = chunk ? : 512;
-
-                       if (verbose > 0)
-                               fprintf(stderr, Name ": chunk size defaults to %dK\n", chunk);
+               if (chunk == 0 || chunk == UnSet) {
+                       chunk = UnSet;
+                       do_default_chunk = 1;
+                       /* chunk will be set later */
                }
                break;
        case LEVEL_LINEAR:
@@ -269,12 +259,17 @@ int Create(struct supertype *st, char *mddev,
                return 1;
        }
        
-       if (size && chunk)
+       if (size && chunk && chunk != UnSet)
                size &= ~(unsigned long long)(chunk - 1);
        newsize = size * 2;
        if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
-                                             chunk, size*2, NULL, &newsize, verbose>=0))
+                                             &chunk, size*2, NULL, &newsize, verbose>=0))
                return 1;
+
+       if (chunk) {
+               newsize &= ~(unsigned long long)(chunk*2 - 1);
+               size &= ~(unsigned long long)(chunk - 1);
+       }
        if (size == 0) {
                size = newsize / 2;
                if (size && verbose > 0)
@@ -317,9 +312,12 @@ int Create(struct supertype *st, char *mddev,
                                        layout = default_layout(st, level, verbose);
                                if (st && !st->ss->validate_geometry
                                                (st, level, layout, raiddisks,
-                                                chunk, size*2, dname, &freesize,
-                                                verbose > 0))
+                                                &chunk, size*2, dname, &freesize,
+                                                verbose > 0)) {
+                                       free(st);
                                        st = NULL;
+                                       chunk = do_default_chunk ? 0 : chunk;
+                               }
                        }
 
                        if (!st) {
@@ -336,7 +334,7 @@ int Create(struct supertype *st, char *mddev,
                                layout = default_layout(st, level, verbose);
                        if (!st->ss->validate_geometry(st, level, layout,
                                                       raiddisks,
-                                                      chunk, size*2, dname,
+                                                      &chunk, size*2, dname,
                                                       &freesize,
                                                       verbose >= 0)) {
 
@@ -348,6 +346,11 @@ int Create(struct supertype *st, char *mddev,
                                continue;
                        }
                }
+               if (verbose > 0 && do_default_chunk) {
+                       do_default_chunk = 0;
+                       fprintf(stderr, Name ": chunk size "
+                               "defaults to %dK\n", chunk);
+               }
 
                freesize /= 2; /* convert to K */
                if (chunk) {
@@ -384,13 +387,14 @@ int Create(struct supertype *st, char *mddev,
                        if (strcmp(st->ss->name, "1.x") == 0 &&
                            st->minor_version >= 1)
                                /* metadata at front */
-                               warn |= check_partitions(fd, dname, 0);
-                       else if (level == 1 || level == LEVEL_CONTAINER)
+                               warn |= check_partitions(fd, dname, 0, 0);
+                       else if (level == 1 || level == LEVEL_CONTAINER
+                                   || (level == 0 && raiddisks == 1))
                                /* partitions could be meaningful */
-                               warn |= check_partitions(fd, dname, freesize*2);
+                               warn |= check_partitions(fd, dname, freesize*2, size*2);
                        else
                                /* partitions cannot be meaningful */
-                               warn |= check_partitions(fd, dname, 0);
+                               warn |= check_partitions(fd, dname, 0, 0);
                        if (strcmp(st->ss->name, "1.x") == 0 &&
                            st->minor_version >= 1 &&
                            did_default &&
@@ -429,7 +433,7 @@ int Create(struct supertype *st, char *mddev,
                        /* size is meaningful */
                        if (!st->ss->validate_geometry(st, level, layout,
                                                       raiddisks,
-                                                      chunk, minsize*2,
+                                                      &chunk, minsize*2,
                                                       NULL, NULL, 0)) {
                                fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
                                return 1;
@@ -622,7 +626,7 @@ int Create(struct supertype *st, char *mddev,
 
        total_slots = info.array.nr_disks;
        sysfs_init(&info, mdfd, 0);
-       st->ss->getinfo_super(st, &info);
+       st->ss->getinfo_super(st, &info, NULL);
 
        if (did_default && verbose >= 0) {
                if (is_subarray(info.text_version)) {
@@ -680,7 +684,7 @@ int Create(struct supertype *st, char *mddev,
 
        sysfs_init(&info, mdfd, 0);
 
-       if (st->ss->external && st->subarray[0]) {
+       if (st->ss->external && st->container_dev != NoMdDev) {
                /* member */
 
                /* When creating a member, we need to be careful
@@ -743,7 +747,7 @@ int Create(struct supertype *st, char *mddev,
        infos = malloc(sizeof(*infos) * total_slots);
 
        for (pass=1; pass <=2 ; pass++) {
-               mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
+               struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
 
                for (dnum=0, dv = devlist ; dv ;
                     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
@@ -783,7 +787,8 @@ int Create(struct supertype *st, char *mddev,
                                if (have_container)
                                        fd = -1;
                                else {
-                                       if (st->ss->external && st->subarray[0])
+                                       if (st->ss->external &&
+                                           st->container_dev != NoMdDev)
                                                fd = open(dv->devname, O_RDWR);
                                        else
                                                fd = open(dv->devname, O_RDWR|O_EXCL);
@@ -805,7 +810,7 @@ int Create(struct supertype *st, char *mddev,
                                        ioctl(mdfd, STOP_ARRAY, NULL);
                                        goto abort;
                                }
-                               st->ss->getinfo_super(st, inf);
+                               st->ss->getinfo_super(st, inf, NULL);
                                safe_mode_delay = inf->safe_mode_delay;
 
                                if (have_container && verbose > 0)
@@ -831,7 +836,6 @@ int Create(struct supertype *st, char *mddev,
                                                Name ": ADD_NEW_DISK for %s "
                                                "failed: %s\n",
                                                dv->devname, strerror(errno));
-                                       st->ss->free_super(st);
                                        goto abort;
                                }
                                break;
@@ -850,7 +854,7 @@ int Create(struct supertype *st, char *mddev,
                         * again returns container info.
                         */
                        map_lock(&map);
-                       st->ss->getinfo_super(st, &info_new);
+                       st->ss->getinfo_super(st, &info_new, NULL);
                        if (st->ss->external && level != LEVEL_CONTAINER &&
                            !same_uuid(info_new.uuid, info.uuid, 0)) {
                                map_update(&map, fd2devnum(mdfd),
@@ -865,7 +869,7 @@ int Create(struct supertype *st, char *mddev,
                        if (me) {
                                char *path = strdup(me->path);
 
-                               st->ss->getinfo_super(st, &info_new);
+                               st->ss->getinfo_super(st, &info_new, NULL);
                                map_update(&map, st->container_dev,
                                           info_new.text_version,
                                           info_new.uuid, path);
@@ -874,10 +878,10 @@ int Create(struct supertype *st, char *mddev,
                        map_unlock(&map);
 
                        flush_metadata_updates(st);
+                       st->ss->free_super(st);
                }
        }
        free(infos);
-       st->ss->free_super(st);
 
        if (level == LEVEL_CONTAINER) {
                /* No need to start.  But we should signal udev to
@@ -921,7 +925,7 @@ int Create(struct supertype *st, char *mddev,
                }
                if (verbose >= 0)
                        fprintf(stderr, Name ": array %s started.\n", mddev);
-               if (st->ss->external && st->subarray[0]) {
+               if (st->ss->external && st->container_dev != NoMdDev) {
                        if (need_mdmon)
                                start_mdmon(st->container_dev);
 
index b3511f39b5bb4f318ca9c47fb437343b145e145a..375189d0b6332884320018939c54f467b6d1c8ea 100644 (file)
--- a/Detail.c
+++ b/Detail.c
@@ -49,8 +49,9 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        int is_rebuilding = 0;
        int failed = 0;
        struct supertype *st;
+       char *subarray = NULL;
        int max_disks = MD_SB_DISKS; /* just a default */
-       struct mdinfo info;
+       struct mdinfo *info = NULL;
        struct mdinfo *sra;
        char *member = NULL;
        char *container = NULL;
@@ -88,7 +89,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                return rv;
        }
        sra = sysfs_read(fd, 0, GET_VERSION);
-       st = super_by_fd(fd);
+       st = super_by_fd(fd, &subarray);
 
        if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode))
                stb.st_rdev = 0;
@@ -97,16 +98,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        if (st)
                max_disks = st->max_devs;
 
-       if (sra && is_subarray(sra->text_version) &&
-               strchr(sra->text_version+1, '/')) {
+       if (subarray) {
                /* This is a subarray of some container.
                 * We want the name of the container, and the member
                 */
-               char *s = strchr(sra->text_version+1, '/');
-               int dn;
-               *s++ = '\0';
-               member = s;
-               dn = devname2devnum(sra->text_version+1);
+               int dn = st->container_dev;
+
+               member = subarray;
                container = map_dev(dev2major(dn), dev2minor(dn), 1);
        }
 
@@ -143,25 +141,34 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                close(fd2);
                if (err)
                        continue;
-               st->ss->getinfo_super(st, &info);
+               if (info)
+                       free(info);
+               if (subarray)
+                       info = st->ss->container_content(st, subarray);
+               else {
+                       info = malloc(sizeof(*info));
+                       st->ss->getinfo_super(st, info, NULL);
+               }
+               if (!info)
+                       continue;
 
                if (array.raid_disks != 0 && /* container */
-                   (info.array.ctime != array.ctime ||
-                    info.array.level != array.level)) {
+                   (info->array.ctime != array.ctime ||
+                    info->array.level != array.level)) {
                        st->ss->free_super(st);
                        continue;
                }
                /* some formats (imsm) have free-floating-spares
-                * with a uuid of uuid_match_any, they don't
+                * with a uuid of uuid_zero, they don't
                 * have very good info about the rest of the
                 * container, so keep searching when
                 * encountering such a device.  Otherwise, stop
                 * after the first successful call to
                 * ->load_super.
                 */
-               if (memcmp(uuid_match_any,
-                          info.uuid,
-                          sizeof(uuid_match_any)) == 0) {
+               if (memcmp(uuid_zero,
+                          info->uuid,
+                          sizeof(uuid_zero)) == 0) {
                        st->ss->free_super(st);
                        continue;
                }
@@ -191,13 +198,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                       array.major_version, array.minor_version);
                }
                
-               if (st && st->sb) {
+               if (st && st->sb && info) {
                        char nbuf[64];
                        struct map_ent *mp, *map = NULL;
 
-                       fname_from_uuid(st, &info, nbuf, ':');
+                       fname_from_uuid(st, info, nbuf, ':');
                        printf("MD_UUID=%s\n", nbuf+5);
-                       mp = map_by_uuid(&map, info.uuid);
+                       mp = map_by_uuid(&map, info->uuid);
                        if (mp && mp->path &&
                            strncmp(mp->path, "/dev/md/", 8) == 0)
                                printf("MD_DEVNAME=%s\n", mp->path+8);
@@ -355,6 +362,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                if (atime)
                        printf("    Update Time : %.24s\n", ctime(&atime));
                if (array.raid_disks) {
+                       static char *sync_action[] = {", recovering",", resyncing",", reshaping",", checking"};
                        char *st;
                        if (avail_disks == array.raid_disks)
                                st = "";
@@ -367,8 +375,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                        printf("          State : %s%s%s%s\n",
                               (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
                               st,
-                              (!e || e->percent < 0) ? "" :
-                              (e->resync) ? ", resyncing": ", recovering",
+                              (!e || e->percent < 0) ? "" : sync_action[e->resync],
                               larray_size ? "": ", Not Started");
                }
                if (array.raid_disks)
@@ -410,50 +417,50 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
 
                if (e && e->percent >= 0) {
                        printf(" Re%s Status : %d%% complete\n",
-                              (st && st->sb && info.reshape_active)?
+                              (st && st->sb && info->reshape_active)?
                                  "shape":"build",
                               e->percent);
                        is_rebuilding = 1;
                }
                free_mdstat(ms);
 
-               if (st->sb && info.reshape_active) {
+               if (st->sb && info->reshape_active) {
 #if 0
 This is pretty boring
-                       printf("  Reshape pos'n : %llu%s\n", (unsigned long long) info.reshape_progress<<9,
-                              human_size((unsigned long long)info.reshape_progress<<9));
+                       printf("  Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9,
+                              human_size((unsigned long long)info->reshape_progress<<9));
 #endif
-                       if (info.delta_disks > 0)
+                       if (info->delta_disks > 0)
                                printf("  Delta Devices : %d, (%d->%d)\n",
-                                      info.delta_disks, array.raid_disks - info.delta_disks, array.raid_disks);
-                       if (info.delta_disks < 0)
+                                      info->delta_disks, array.raid_disks - info->delta_disks, array.raid_disks);
+                       if (info->delta_disks < 0)
                                printf("  Delta Devices : %d, (%d->%d)\n",
-                                      info.delta_disks, array.raid_disks, array.raid_disks + info.delta_disks);
-                       if (info.new_level != array.level) {
-                               char *c = map_num(pers, info.new_level);
+                                      info->delta_disks, array.raid_disks, array.raid_disks + info->delta_disks);
+                       if (info->new_level != array.level) {
+                               char *c = map_num(pers, info->new_level);
                                printf("      New Level : %s\n", c?c:"-unknown-");
                        }
-                       if (info.new_level != array.level ||
-                           info.new_layout != array.layout) {
-                               if (info.new_level == 5) {
-                                       char *c = map_num(r5layout, info.new_layout);
+                       if (info->new_level != array.level ||
+                           info->new_layout != array.layout) {
+                               if (info->new_level == 5) {
+                                       char *c = map_num(r5layout, info->new_layout);
                                        printf("     New Layout : %s\n",
                                               c?c:"-unknown-");
                                }
-                               if (info.new_level == 6) {
-                                       char *c = map_num(r6layout, info.new_layout);
+                               if (info->new_level == 6) {
+                                       char *c = map_num(r6layout, info->new_layout);
                                        printf("     New Layout : %s\n",
                                               c?c:"-unknown-");
                                }
-                               if (info.new_level == 10) {
+                               if (info->new_level == 10) {
                                        printf("     New Layout : near=%d, %s=%d\n",
-                                              info.new_layout&255,
-                                              (info.new_layout&0x10000)?"offset":"far",
-                                              (info.new_layout>>8)&255);
+                                              info->new_layout&255,
+                                              (info->new_layout&0x10000)?"offset":"far",
+                                              (info->new_layout>>8)&255);
                                }
                        }
-                       if (info.new_chunk != array.chunk_size)
-                               printf("  New Chunksize : %dK\n", info.new_chunk/1024);
+                       if (info->new_chunk != array.chunk_size)
+                               printf("  New Chunksize : %dK\n", info->new_chunk/1024);
                        printf("\n");
                } else if (e && e->percent >= 0)
                        printf("\n");
@@ -500,6 +507,7 @@ This is pretty boring
                else
                        printf("    Number   Major   Minor   RaidDevice\n");
        }
+       free(info);
 
        for (d= 0; d < max_disks; d++) {
                char *dv;
@@ -581,6 +589,7 @@ This is pretty boring
        free(disks);
 out:
        close(fd);
+       free(subarray);
        return rv;
 }
 
index 7fbd4ae2cf932e5decd61b5a32c97177d1fe8dbb..f949646f48aea561e8b68545b5e0f9d1dd13af67 100644 (file)
--- a/Examine.c
+++ b/Examine.c
@@ -30,7 +30,7 @@
 #endif
 #include       "md_u.h"
 #include       "md_p.h"
-int Examine(mddev_dev_t devlist, int brief, int export, int scan,
+int Examine(struct mddev_dev *devlist, int brief, int export, int scan,
            int SparcAdjust, struct supertype *forcest,
            char *homehost)
 {
@@ -64,6 +64,7 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
 
        for (; devlist ; devlist=devlist->next) {
                struct supertype *st;
+               int have_container = 0;
 
                fd = dev_open(devlist->devname, O_RDONLY);
                if (fd < 0) {
@@ -75,15 +76,29 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                        err = 1;
                }
                else {
+                       int container = 0;
                        if (forcest)
                                st = dup_super(forcest);
-                       else
+                       else if (must_be_container(fd)) {
+                               /* might be a container */
+                               st = super_by_fd(fd, NULL);
+                               container = 1;
+                       } else
                                st = guess_super(fd);
-                       if (st)
-                               err = st->ss->load_super(st, fd,
-                                                        (brief||scan) ? NULL
-                                                          :devlist->devname);
-                       else {
+                       if (st) {
+                               err = 1;
+                               if (!container)
+                                       err = st->ss->load_super(st, fd,
+                                                                (brief||scan) ? NULL
+                                                                :devlist->devname);
+                               if (err && st->ss->load_container) {
+                                       err = st->ss->load_container(st, fd,
+                                                                (brief||scan) ? NULL
+                                                                :devlist->devname);
+                                       if (!err)
+                                               have_container = 1;
+                               }
+                       } else {
                                if (!brief) {
                                        fprintf(stderr, Name ": No md superblock detected on %s.\n", devlist->devname);
                                        rv = 1;
@@ -100,7 +115,11 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                             devlist->devname, 0, 0, NULL);
                /* Ok, its good enough to try, though the checksum could be wrong */
 
-               if (brief) {
+               if (brief && st->ss->brief_examine_super == NULL) {
+                       if (!scan)
+                               fprintf(stderr, Name ": No brief listing for %s on %s\n",
+                                       st->ss->name, devlist->devname);
+               } else if (brief) {
                        struct array *ap;
                        char *d;
                        for (ap=arrays; ap; ap=ap->next) {
@@ -115,10 +134,10 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                ap->spares = 0;
                                ap->st = st;
                                arrays = ap;
-                               st->ss->getinfo_super(st, &ap->info);
+                               st->ss->getinfo_super(st, &ap->info, NULL);
                        } else
-                               st->ss->getinfo_super(st, &ap->info);
-                       if (!st->loaded_container &&
+                               st->ss->getinfo_super(st, &ap->info, NULL);
+                       if (!have_container &&
                            !(ap->info.disk.state & (1<<MD_DISK_SYNC)))
                                ap->spares++;
                        d = dl_strdup(devlist->devname);
@@ -126,6 +145,7 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                } else if (export) {
                        if (st->ss->export_examine_super)
                                st->ss->export_examine_super(st);
+                       st->ss->free_super(st);
                } else {
                        printf("%s:\n",devlist->devname);
                        st->ss->examine_super(st, homehost);
diff --git a/Grow.c b/Grow.c
index 087b4efeb195b494e1f8d922323242b38e103676..40e693edfbf3c6c39e655be17d674b4900bf9728 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -51,33 +51,41 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
        int nfd, fd2;
        int d, nd;
        struct supertype *st = NULL;
-
+       char *subarray = NULL;
 
        if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
                fprintf(stderr, Name ": cannot get array info for %s\n", devname);
                return 1;
        }
 
-       st = super_by_fd(fd);
+       if (info.array.level != -1) {
+               fprintf(stderr, Name ": can only add devices to linear arrays\n");
+               return 1;
+       }
+
+       st = super_by_fd(fd, &subarray);
        if (!st) {
                fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
                return 1;
        }
 
-       if (info.array.level != -1) {
-               fprintf(stderr, Name ": can only add devices to linear arrays\n");
-               return 1;
+       if (subarray) {
+               fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n");
+               free(subarray);
+               free(st);
        }
 
        nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
        if (nfd < 0) {
                fprintf(stderr, Name ": cannot open %s\n", newdev);
+               free(st);
                return 1;
        }
        fstat(nfd, &stb);
        if ((stb.st_mode & S_IFMT) != S_IFBLK) {
                fprintf(stderr, Name ": %s is not a block device!\n", newdev);
                close(nfd);
+               free(st);
                return 1;
        }
        /* now check out all the devices and make sure we can read the superblock */
@@ -85,28 +93,37 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
                mdu_disk_info_t disk;
                char *dv;
 
+               st->ss->free_super(st);
+
                disk.number = d;
                if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
                        fprintf(stderr, Name ": cannot get device detail for device %d\n",
                                d);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
                dv = map_dev(disk.major, disk.minor, 1);
                if (!dv) {
                        fprintf(stderr, Name ": cannot find device file for device %d\n",
                                d);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
                fd2 = dev_open(dv, O_RDWR);
                if (!fd2) {
                        fprintf(stderr, Name ": cannot open device file %s\n", dv);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
-               st->ss->free_super(st);
 
                if (st->ss->load_super(st, fd2, NULL)) {
                        fprintf(stderr, Name ": cannot find super block on %s\n", dv);
+                       close(nfd);
                        close(fd2);
+                       free(st);
                        return 1;
                }
                close(fd2);
@@ -204,6 +221,7 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
        mdu_bitmap_file_t bmf;
        mdu_array_info_t array;
        struct supertype *st;
+       char *subarray = NULL;
        int major = BITMAP_MAJOR_HI;
        int vers = md_get_version(fd);
        unsigned long long bitmapsize, array_size;
@@ -253,6 +271,11 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                        devname);
                return 1;
        }
+
+       if (strcmp(file, "none") == 0) {
+               fprintf(stderr, Name ": no bitmap found on %s\n", devname);
+               return 1;
+       }
        if (array.level <= 0) {
                fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
                        map_num(pers, array.level)?:"of this array");
@@ -277,16 +300,19 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                bitmapsize = bitmapsize * array.raid_disks / ncopies;
        }
 
-       st = super_by_fd(fd);
+       st = super_by_fd(fd, &subarray);
        if (!st) {
                fprintf(stderr, Name ": Cannot understand version %d.%d\n",
                        array.major_version, array.minor_version);
                return 1;
        }
-       if (strcmp(file, "none") == 0) {
-               fprintf(stderr, Name ": no bitmap found on %s\n", devname);
+       if (subarray) {
+               fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n");
+               free(subarray);
+               free(st);
                return 1;
-       } else if (strcmp(file, "internal") == 0) {
+       }
+       if (strcmp(file, "internal") == 0) {
                int d;
                if (st->ss->add_internal_bitmap == NULL) {
                        fprintf(stderr, Name ": Internal bitmaps not supported "
@@ -418,7 +444,7 @@ static struct mdp_backup_super {
        __u8 pad[512-68-32];
 } __attribute__((aligned(512))) bsb, bsb2;
 
-__u32 bsb_csum(char *buf, int len)
+static __u32 bsb_csum(char *buf, int len)
 {
        int i;
        int csum = 0;
@@ -427,852 +453,1849 @@ __u32 bsb_csum(char *buf, int len)
        return __cpu_to_le32(csum);
 }
 
-static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks,
-                     int *fds, unsigned long long *offsets,
-                     int disks, int chunk, int level, int layout, int data,
-                     int dests, int *destfd, unsigned long long *destoffsets);
-static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
-                       int *fds, unsigned long long *offsets,
-                       int disks, int chunk, int level, int layout, int data,
-                       int dests, int *destfd, unsigned long long *destoffsets);
-static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
-                          int *fds, unsigned long long *offsets,
-                          unsigned long long start,
-                          int disks, int chunk, int level, int layout, int data,
-                          int dests, int *destfd, unsigned long long *destoffsets);
-
-int freeze_array(struct mdinfo *sra)
+static int check_idle(struct supertype *st)
+{
+       /* Check that all member arrays for this container, or the
+        * container of this array, are idle.
+        *
+        * The container is st->container_dev, or st->devnum when
+        * st->container_dev is NoMdDev.
+        * Returns 1 when no member entry in the list from mdstat_read()
+        * has percent >= 0 (an empty list also gives 1), otherwise 0.
+        */
+       int container_dev = (st->container_dev != NoMdDev
+                            ? st->container_dev : st->devnum);
+       char container[40];
+       struct mdstat_ent *ent, *e;
+       int is_idle = 1;
+       
+       fmt_devname(container, container_dev);
+       ent = mdstat_read(0, 0);
+       for (e = ent ; e; e = e->next) {
+               if (!is_container_member(e, container))
+                       continue;       /* entry belongs to some other container */
+               if (e->percent >= 0) {  /* presumably a sync action is in progress - confirm in mdstat.c */
+                       is_idle = 0;
+                       break;
+               }
+       }
+       free_mdstat(ent);
+       return is_idle;
+}
+
+static int freeze_container(struct supertype *st)
+{      /* Freeze the container of 'st' (st->container_dev, or
+        * st->devnum when that is NoMdDev) via block_monitor().
+        * Returns -1 if check_idle() reports activity,
+        * -2 (after printing an error) if block_monitor() fails,
+        * and 1 on success.
+        */
+       int container_dev = (st->container_dev != NoMdDev
+                            ? st->container_dev : st->devnum);
+       char container[40];
+
+       if (!check_idle(st))
+               return -1;
+       
+       fmt_devname(container, container_dev);
+
+       if (block_monitor(container, 1)) {
+               fprintf(stderr, Name ": failed to freeze container\n");
+               return -2;
+       }
+
+       return 1;
+}
+
+static void unfreeze_container(struct supertype *st)
+{      /* Counterpart of freeze_container(): calls unblock_monitor()
+        * on the same container name (st->container_dev, or
+        * st->devnum when that is NoMdDev).  No status is returned.
+        */
+       int container_dev = (st->container_dev != NoMdDev
+                            ? st->container_dev : st->devnum);
+       char container[40];
+       
+       fmt_devname(container, container_dev);
+
+       unblock_monitor(container, 1);
+}
+
+static int freeze(struct supertype *st)
 {
-       /* Try to freeze resync on this array.
+       /* Try to freeze resync/rebuild on this array/container.
         * Return -1 if the array is busy,
+        * return -2 if the container cannot be frozen,
         * return 0 if this kernel doesn't support 'frozen'
         * return 1 if it worked.
+        *
+        * External metadata is delegated to freeze_container();
+        * otherwise the result of sysfs_freeze_array() is returned,
+        * or -1 if sysfs_read() fails.
         */
-       char buf[20];
-       if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0)
-               return 0;
-       if (strcmp(buf, "idle\n") != 0 &&
-           strcmp(buf, "frozen\n") != 0)
-               return -1;
-       if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0)
-               return 0;
-       return 1;
+       if (st->ss->external)
+               return freeze_container(st);
+       else {
+               struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
+               int err;
+
+               if (!sra)
+                       return -1;      /* reported to the caller the same way as "busy" */
+               err = sysfs_freeze_array(sra);
+               sysfs_free(sra);
+               return err;
+       }
 }
 
-void unfreeze_array(struct mdinfo *sra, int frozen)
+static void unfreeze(struct supertype *st)
 {
-       /* If 'frozen' is 1, unfreeze the array */
-       if (frozen > 0)
-               sysfs_set_str(sra, NULL, "sync_action", "idle");
+       if (st->ss->external)   /* external metadata: undo freeze_container() */
+               return unfreeze_container(st);  /* NOTE(review): returning a void expression is not ISO C (accepted by gcc) */
+       else {
+               struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
+
+               if (sra)        /* native metadata: write "idle" to sync_action */
+                       sysfs_set_str(sra, NULL, "sync_action", "idle");
+               else
+                       fprintf(stderr, Name ": failed to unfreeze array\n");
+               sysfs_free(sra);        /* also reached with sra == NULL; presumably tolerated - confirm in sysfs.c */
+       }
 }
 
-void wait_reshape(struct mdinfo *sra)
+static void wait_reshape(struct mdinfo *sra)
 {
        int fd = sysfs_get_fd(sra, NULL, "sync_action");
        char action[20];
 
-       do {
+       if (fd < 0)     /* sync_action could not be opened: nothing to wait on */
+               return;
+
+       while  (sysfs_fd_get_str(fd, action, 20) > 0 &&
+               strncmp(action, "reshape", 7) == 0) {   /* keep waiting only while "reshape" is read back */
                fd_set rfds;
                FD_ZERO(&rfds);
                FD_SET(fd, &rfds);
                select(fd+1, NULL, NULL, &rfds, NULL);
-               
-               if (sysfs_fd_get_str(fd, action, 20) < 0) {
-                       close(fd);
-                       return;
-               }
-       } while  (strncmp(action, "reshape", 7) == 0);
-}
-                       
-               
-int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
-                long long size,
-                int level, char *layout_str, int chunksize, int raid_disks)
-{
-       /* Make some changes in the shape of an array.
-        * The kernel must support the change.
-        *
-        * There are three different changes.  Each can trigger
-        * a resync or recovery so we freeze that until we have
-        * requested everything (if kernel supports freezing - 2.6.30).
-        * The steps are:
-        *  - change size (i.e. component_size)
-        *  - change level
-        *  - change layout/chunksize/ndisks
-        *
-        * The last can require a reshape.  It is different on different
-        * levels so we need to check the level before actioning it.
-        * Some times the level change needs to be requested after the
-        * reshape (e.g. raid6->raid5, raid5->raid0)
-        *
-        */
-       struct mdu_array_info_s array, orig;
-       char *c;
-       int rv = 0;
-       struct supertype *st;
-
-       int nchunk, ochunk;
-       int nlayout, olayout;
-       int ndisks, odisks;
-       unsigned int ndata, odata;
-       int orig_level = UnSet;
-       char alt_layout[40];
-       int *fdlist;
-       unsigned long long *offsets;
-       int d, i;
-       int nrdisks;
-       int err;
-       int frozen;
-       unsigned long a,b, blocks, stripes;
-       unsigned long cache;
-       unsigned long long array_size;
-       int changed = 0;
-       int done;
-
-       struct mdinfo *sra;
-       struct mdinfo *sd;
-
-       if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
-               fprintf(stderr, Name ": %s is not an active md array - aborting\n",
-                       devname);
-               return 1;
        }
+       close(fd);      /* single close for every path that opened fd */
+}
 
-       if (size >= 0 &&
-           (chunksize || level!= UnSet || layout_str || raid_disks)) {
-               fprintf(stderr, Name ": cannot change component size at the same time "
-                       "as other changes.\n"
-                       "   Change size first, then check data is intact before "
-                       "making other changes.\n");
+static int reshape_super(struct supertype *st, long long size, int level,
+                        int layout, int chunksize, int raid_disks,
+                        int delta_disks, char *backup_file, char *dev,
+                        int verbose)
+{
+       /* nothing extra to check in the native case: return 0.
+        * For external metadata, both the reshape_super and
+        * manage_reshape handlers must be present (else print an
+        * error and return 1); the request is then passed unchanged
+        * to st->ss->reshape_super() and its result is returned.
+        */
+       if (!st->ss->external)
+               return 0;
+       if (!st->ss->reshape_super ||
+           !st->ss->manage_reshape) {
+               fprintf(stderr, Name ": %s metadata does not support reshape\n",
+                       st->ss->name);
                return 1;
        }
 
-       if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
-           get_linux_version() < 2006032 &&
-           !check_env("MDADM_FORCE_FEWER")) {
-               fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
-                       "       Please use a newer kernel\n");
-               return 1;
-       }
-       sra = sysfs_read(fd, 0, GET_LEVEL);
-       if (sra)
-               frozen = freeze_array(sra);
-       else {
-               fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
-                       devname);
-               return 1;
-       }
-       if (frozen < 0) {
-               fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
-                       " be reshaped\n", devname);
-               return 1;
-       }
+       return st->ss->reshape_super(st, size, level, layout, chunksize,
+                                    raid_disks, delta_disks, backup_file, dev,
+                                    verbose);
+}
 
-       /* ========= set size =============== */
-       if (size >= 0 && (size == 0 || size != array.size)) {
-               array.size = size;
-               if (array.size != size) {
-                       /* got truncated to 32bit, write to
-                        * component_size instead
-                        */
-                       if (sra)
-                               rv = sysfs_set_num(sra, NULL,
-                                                  "component_size", size);
-                       else
-                               rv = -1;
+static void sync_metadata(struct supertype *st)
+{      /* Only acts on external metadata.
+        * When st->update_tail is set, flush_metadata_updates() is
+        * called and update_tail is pointed back at st->updates;
+        * otherwise st->ss->sync_metadata() is called directly.
+        */
+       if (st->ss->external) {
+               if (st->update_tail) {
+                       flush_metadata_updates(st);
+                       st->update_tail = &st->updates;
                } else
-                       rv = ioctl(fd, SET_ARRAY_INFO, &array);
-               if (rv != 0) {
-                       int err = errno;
-                       fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
-                               devname, strerror(err));
-                       if (err == EBUSY && 
-                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
-                               fprintf(stderr, "       Bitmap must be removed before size can be changed\n");
-                       rv = 1;
-                       goto release;
-               }
-               ioctl(fd, GET_ARRAY_INFO, &array);
-               size = get_component_size(fd)/2;
-               if (size == 0)
-                       size = array.size;
-               if (!quiet)
-                       fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
-                               devname, size);
-               changed = 1;
-       } else {
-               size = get_component_size(fd)/2;
-               if (size == 0)
-                       size = array.size;
+                       st->ss->sync_metadata(st);
        }
+}
 
-       /* ======= set level =========== */
-       if (level != UnSet && level != array.level) {
-               /* Trying to change the level.
-                * We might need to change layout first and schedule a
-                * level change for later.
-                * Level changes that can happen immediately are:
-                * 0->4,5,6  1->5  4->5,6  5->1,6
-                * Level changes that need a layout change first are:
-                * 6->5,4,0 : need a -6 layout, or parity-last
-                * 5->4,0   : need parity-last
-                */
-               if ((array.level == 6 || array.level == 5) &&
-                   (level == 5 || level == 4 || level == 0)) {
-                       /* Don't change level yet, but choose intermediate
-                        * layout
-                        */
-                       if (level == 5) {
-                               if (layout_str == NULL)
-                                       switch (array.layout) {
-                                       case ALGORITHM_LEFT_ASYMMETRIC:
-                                       case ALGORITHM_LEFT_ASYMMETRIC_6:
-                                       case ALGORITHM_ROTATING_N_RESTART:
-                                               layout_str = "left-asymmetric-6";
-                                               break;
-                                       case ALGORITHM_LEFT_SYMMETRIC:
-                                       case ALGORITHM_LEFT_SYMMETRIC_6:
-                                       case ALGORITHM_ROTATING_N_CONTINUE:
-                                               layout_str = "left-symmetric-6";
-                                               break;
-                                       case ALGORITHM_RIGHT_ASYMMETRIC:
-                                       case ALGORITHM_RIGHT_ASYMMETRIC_6:
-                                       case ALGORITHM_ROTATING_ZERO_RESTART:
-                                               layout_str = "right-asymmetric-6";
-                                               break;
-                                       case ALGORITHM_RIGHT_SYMMETRIC:
-                                       case ALGORITHM_RIGHT_SYMMETRIC_6:
-                                               layout_str = "right-symmetric-6";
-                                               break;
-                                       case ALGORITHM_PARITY_0:
-                                       case ALGORITHM_PARITY_0_6:
-                                               layout_str = "parity-first-6";
-                                               break;
-                                       case ALGORITHM_PARITY_N:
-                                               layout_str = "parity-last";
-                                               break;
-                                       default:
-                                               fprintf(stderr, Name ": %s: cannot"
-                                                       "convert layout to RAID5 equivalent\n",
-                                                       devname);
-                                               rv = 1;
-                                               goto release;
-                                       }
-                               else {
-                                       int l = map_name(r5layout, layout_str);
-                                       if (l == UnSet) {
-                                               fprintf(stderr, Name ": %s: layout '%s' not recognised\n",
-                                                       devname, layout_str);
-                                               rv = 1;
-                                               goto release;
-                                       }
-                                       if (l != ALGORITHM_PARITY_N) {
-                                               /* need the -6 version */
-                                               char *ls = map_num(r5layout, l);
-                                               strcat(strcpy(alt_layout, ls),
-                                                      "-6");
-                                               layout_str = alt_layout;
-                                       }
-                               }
-                               if (raid_disks)
-                                       /* The final raid6->raid5 conversion
-                                        * will reduce the number of disks,
-                                        * so now we need to aim higher
-                                        */
-                                       raid_disks++;
-                       } else
-                               layout_str = "parity-last";
-               } else {
-                       c = map_num(pers, level);
-                       if (c == NULL) {
-                               rv = 1;/* not possible */
-                               goto release;
-                       }
-                       err = sysfs_set_str(sra, NULL, "level", c);
-                       if (err) {
-                               err = errno;
-                               fprintf(stderr, Name ": %s: could not set level to %s\n",
-                                       devname, c);
-                               if (err == EBUSY && 
-                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
-                                       fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
-                               rv = 1;
-                               goto release;
-                       }
-                       orig = array;
-                       orig_level = orig.level;
-                       ioctl(fd, GET_ARRAY_INFO, &array);
-                       if (layout_str == NULL &&
-                           orig.level == 5 && level == 6 &&
-                           array.layout != orig.layout)
-                               layout_str = map_num(r5layout, orig.layout);
-                       if (!quiet)
-                               fprintf(stderr, Name " level of %s changed to %s\n",
-                                       devname, c);
-                       changed = 1;
-               }
-       }
-
-       /* ========= set shape (chunk_size / layout / ndisks)  ============== */
-       /* Check if layout change is a no-op */
-       if (layout_str) switch(array.level) {
-       case 5:
-               if (array.layout == map_name(r5layout, layout_str))
-                       layout_str = NULL;
-               break;
-       case 6:
-               if (layout_str == NULL &&
-                   ((chunksize && chunksize * 1024 != array.chunk_size) ||
-                    (raid_disks && raid_disks != array.raid_disks)) &&
-                   array.layout >= 16) {
-                       fprintf(stderr, Name
-                               ": %s has a non-standard layout.  If you wish to preserve this\n"
-                               "      during the reshape, please specify --layout=preserve\n"
-                               "      If you want to change it, specify a layout or use --layout=normalise\n",
-                               devname);
-                       rv = 1;
-                       goto release;
-               }
-               if (strcmp(layout_str, "normalise") == 0 ||
-                   strcmp(layout_str, "normalize") == 0) {
-                       char *hyphen;
-                       strcpy(alt_layout, map_num(r6layout, array.layout));
-                       hyphen = strrchr(alt_layout, '-');
-                       if (hyphen && strcmp(hyphen, "-6") == 0) {
-                               *hyphen = 0;
-                               layout_str = alt_layout;
-                       }
-               }
+static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
+{
+       /* when dealing with external metadata subarrays we need to be
+        * prepared to handle EAGAIN.  The kernel may need to wait for
+        * mdmon to mark the array active so the kernel can handle
+        * allocations/writeback when preparing the reshape action
+        * (md_allow_write()).  We temporarily disable safe_mode_delay
+        * to close a race with the array_state going clean before the
+        * next write to raid_disks / stripe_cache_size
+        *
+        * With no container, or for any other attribute, this is a
+        * plain sysfs_set_num().  Returns -1 if safe_mode_delay cannot
+        * be read, otherwise the result of the last sysfs_set_num()
+        * on 'name' (retried once after ping_monitor() on EAGAIN).
+        */
+       char safe[50];
+       int rc;
 
-               if (array.layout == map_name(r6layout, layout_str))
-                       layout_str = NULL;
-               if (layout_str && strcmp(layout_str, "preserve") == 0)
-                       layout_str = NULL;
-               break;
-       }
-       if (layout_str == NULL
-           && (chunksize == 0 || chunksize*1024 == array.chunk_size)
-           && (raid_disks == 0 || raid_disks == array.raid_disks)) {
-               rv = 0;
-               if (level != UnSet && level != array.level) {
-                       /* Looks like this level change doesn't need
-                        * a reshape after all.
-                        */
-                       c = map_num(pers, level);
-                       if (c) {
-                               rv = sysfs_set_str(sra, NULL, "level", c);
-                               if (rv) {
-                                       int err = errno;
-                                       fprintf(stderr, Name ": %s: could not set level to %s\n",
-                                               devname, c);
-                                       if (err == EBUSY && 
-                                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
-                                               fprintf(stderr, "       Bitmap must be removed before level can be changed\n");
-                                       rv = 1;
-                               }
-                       }
-               } else if (!changed && !quiet)
-                       fprintf(stderr, Name ": %s: no change requested\n",
-                               devname);
-               goto release;
+       /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
+       if (!container ||
+           (strcmp(name, "raid_disks") != 0 &&
+            strcmp(name, "stripe_cache_size") != 0))
+               return sysfs_set_num(sra, NULL, name, n);
+
+       rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe));
+       if (rc <= 0)
+               return -1;
+       sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
+       rc = sysfs_set_num(sra, NULL, name, n);
+       if (rc < 0 && errno == EAGAIN) {
+               ping_monitor(container);
+               /* if we get EAGAIN here then the monitor is not active
+                * so stop trying
+                */
+               rc = sysfs_set_num(sra, NULL, name, n);
        }
+       sysfs_set_str(sra, NULL, "safe_mode_delay", safe);      /* put back the value read above */
+       return rc;
+}
 
-       c = map_num(pers, array.level);
-       if (c == NULL) c = "-unknown-";
-       switch(array.level) {
-       default: /* raid0, linear, multipath cannot be reconfigured */
-               fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
-                       c, devname);
-               rv = 1;
-               break;
+int start_reshape(struct mdinfo *sra, int already_running)
+{      /* Write the sysfs attributes used to start a reshape on 'sra'.
+        * suspend_lo is first set to its maximum, then suspend_hi and
+        * suspend_lo are set to 0 and sync_max is set to 0.  Unless
+        * 'already_running', sync_min is also zeroed and "reshape" is
+        * written to sync_action.
+        * Returns the first non-zero result of the checked writes, or
+        * 0; once one has failed the remaining checked writes are
+        * skipped ('?:' is the GNU conditional with omitted operand).
+        * The first suspend_lo write and the sync_min write are not
+        * checked.
+        */
+       int err;
+       sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+       err = sysfs_set_num(sra, NULL, "suspend_hi", 0);
+       err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", 0);
+       if (!already_running)
+               sysfs_set_num(sra, NULL, "sync_min", 0);
+       err = err ?: sysfs_set_num(sra, NULL, "sync_max", 0);
+       if (!already_running)
+               err = err ?: sysfs_set_str(sra, NULL, "sync_action", "reshape");
+
+       return err;
+}
 
-       case LEVEL_FAULTY: /* only 'layout' change is permitted */
+void abort_reshape(struct mdinfo *sra)
+{      /* Write "idle" to sync_action, then reset the suspend window
+        * (suspend_lo to its maximum, then suspend_hi and suspend_lo
+        * to 0), sync_min to 0 and sync_max to "max".
+        * None of the writes is checked and nothing is returned.
+        */
+       sysfs_set_str(sra, NULL, "sync_action", "idle");
+       sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+       sysfs_set_num(sra, NULL, "suspend_hi", 0);
+       sysfs_set_num(sra, NULL, "suspend_lo", 0);
+       sysfs_set_num(sra, NULL, "sync_min", 0);
+       sysfs_set_str(sra, NULL, "sync_max", "max");
+}
 
-               if (chunksize  || raid_disks) {
-                       fprintf(stderr, Name ": %s: Cannot change chunksize or disks of a 'faulty' array\n",
-                               devname);
-                       rv = 1;
-                       break;
-               }
-               if (layout_str == NULL)
-                       break; /* nothing to do.... */
+int remove_disks_for_takeover(struct supertype *st,
+                             struct mdinfo *sra,
+                             int layout)
+{      /* Keep one in-sync device per group of copies and remove the
+        * rest from the array.  Only level 10 (copies = layout & 0xff)
+        * and level 1 (copies = raid_disks) are handled; any other
+        * level returns 1.
+        * Returns 1, with sra->devs holding all devices again, if some
+        * group has no usable device.  Returns 0 after the surplus
+        * devices were marked through sysfs; they stay on sra->devs
+        * with MD_DISK_REMOVED set and MD_DISK_SYNC cleared.
+        */
+       int nr_of_copies;
+       struct mdinfo *remaining;
+       int slot;
+
+       if (sra->array.level == 10)
+               nr_of_copies = layout & 0xff;
+       else if (sra->array.level == 1)
+               nr_of_copies = sra->array.raid_disks;
+       else
+               return 1;
 
-               array.layout = parse_layout_faulty(layout_str);
-               if (array.layout < 0) {
-                       fprintf(stderr, Name ": %s: layout %s not understood for 'faulty' array\n",
-                               devname, layout_str);
-                       rv = 1;
-                       break;
-               }
-               if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
-                       fprintf(stderr, Name ": Cannot set layout for %s: %s\n",
-                               devname, strerror(errno));
-                       rv = 1;
-               } else if (!quiet)
-                       printf("layout for %s set to %d\n", devname, array.layout);
-               break;
+       remaining = sra->devs;
+       sra->devs = NULL;
+       /* for each group of 'nr_of_copies' slots, pick one device to keep:
+        * unlink it from 'remaining' and push it onto sra->devs.
+        */
+       for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
+               struct mdinfo **diskp;
+               int found = 0;
 
-       case 1: /* only raid_disks can each be changed. */
+               /* Find a working device to keep */
+               for (diskp =  &remaining; *diskp ; diskp = &(*diskp)->next) {
+                       struct mdinfo *disk = *diskp;
 
-               if (chunksize || layout_str != NULL) {
-                       fprintf(stderr, Name ": %s: Cannot change chunk size or layout for a RAID1 array.\n",
-                               devname);
-                       rv = 1;
+                       if (disk->disk.raid_disk < slot)
+                               continue;
+                       if (disk->disk.raid_disk >= slot + nr_of_copies)
+                               continue;
+                       if (disk->disk.state & (1<<MD_DISK_REMOVED))
+                               continue;
+                       if (disk->disk.state & (1<<MD_DISK_FAULTY))
+                               continue;
+                       if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
+                               continue;
+
+                       /* We have found a good disk to use! */
+                       *diskp = disk->next;
+                       disk->next = sra->devs;
+                       sra->devs = disk;
+                       found = 1;
                        break;
                }
-               if (raid_disks > 0) {
-                       array.raid_disks = raid_disks;
-                       if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
-                               fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
-                                       devname, strerror(errno));
-                               rv = 1;
-                       }
-               }
-               break;
+               if (!found)
+                       break;
+       }
 
-       case 4:
-       case 5:
-       case 6:
+       if (slot < sra->array.raid_disks) {
+               /* didn't find all slots: append the kept devices to the
+                * end of 'remaining' and hand the full list back.
+                */
+               struct mdinfo **e;
+               e = &remaining;
+               while (*e)
+                       e = &(*e)->next;
+               *e = sra->devs;
+               sra->devs = remaining;
+               return 1;
+       }
 
-               /*
-                * layout/chunksize/raid_disks can be changed
-                * though the kernel may not support it all.
-                */
-               st = super_by_fd(fd);
+       /* Remove all 'remaining' devices from the array */
+       while (remaining) {
+               struct mdinfo *sd = remaining;
+               remaining = sd->next;
+
+               sysfs_set_str(sra, sd, "state", "faulty");
+               sysfs_set_str(sra, sd, "slot", "none");
+               /* for external metadata, disks should be removed by mdmon */
+               if (!st->ss->external)
+                       sysfs_set_str(sra, sd, "state", "remove");
+               sd->disk.state |= (1<<MD_DISK_REMOVED);
+               sd->disk.state &= ~(1<<MD_DISK_SYNC);
+               sd->next = sra->devs;
+               sra->devs = sd;
+       }
+       return 0;
+}
 
-               if (raid_disks > st->max_devs) {
-                       fprintf(stderr, Name ": Cannot increase raid-disks on "
-                               "this array beyond %d\n", st->max_devs);
-                       rv = 1;
-                       break;
-               }
+void reshape_free_fdlist(int *fdlist,
+                        unsigned long long *offsets,
+                        int size)
+{      /* Close every non-negative descriptor in fdlist[0..size-1],
+        * then free() both the 'fdlist' and 'offsets' arrays.
+        */
+       int i;
 
-               /*
-                * There are three possibilities.
-                * 1/ The array will shrink.
-                *    We need to ensure the reshape will pause before reaching
-                *    the 'critical section'.  We also need to fork and wait for
-                *    that to happen.  When it does we 
-                *       suspend/backup/complete/unfreeze
-                *
-                * 2/ The array will not change size.
-                *    This requires that we keep a backup of a sliding window
-                *    so that we can restore data after a crash.  So we need
-                *    to fork and monitor progress.
-                *
-                * 3/ The array will grow. This is relatively easy.
-                *    However the kernel's restripe routines will cheerfully
-                *    overwrite some early data before it is safe.  So we
-                *    need to make a backup of the early parts of the array
-                *    and be ready to restore it if rebuild aborts very early.
-                *
-                *    We backup data by writing it to one spare, or to a
-                *    file which was given on command line.
-                *
-                *    [FOLLOWING IS OLD AND PARTLY WRONG]
-                *    So: we enumerate the devices in the array and
-                *    make sure we can open all of them.
-                *    Then we freeze the early part of the array and
-                *    backup to the various spares.
-                *    Then we request changes and start the reshape.
-                *    Monitor progress until it has passed the danger zone.
-                *    and finally invalidate the copied data and unfreeze the
-                *    start of the array.
-                *
-                * In each case, we first make sure that storage is available
-                * for the required backup.
-                * Then we:
-                *   -  request the shape change.
-                *   -  for to handle backup etc.
-                */
-               nchunk = ochunk = array.chunk_size;
-               nlayout = olayout = array.layout;
-               ndisks = odisks = array.raid_disks;
-
-               if (chunksize) {
-                       nchunk = chunksize * 1024;
-                       if (size % chunksize) {
-                               fprintf(stderr, Name ": component size %lluK is not"
-                                       " a multiple of chunksize %dK\n",
-                                       size, chunksize);
-                               break;
-                       }
-               }
-               if (layout_str != NULL)
-                       switch(array.level) {
-                       case 4: /* ignore layout */
-                               break;
-                       case 5:
-                               nlayout = map_name(r5layout, layout_str);
-                               if (nlayout == UnSet) {
-                                       fprintf(stderr, Name ": layout %s not understood for raid5.\n",
-                                               layout_str);
-                                       rv = 1;
-                                       goto release;
-                               }
-                               break;
+       for (i = 0; i < size; i++)
+               if (fdlist[i] >= 0)     /* negative entries are treated as "not open" */
+                       close(fdlist[i]);
+
+       free(fdlist);
+       free(offsets);
+}
 
-                       case 6:
-                               nlayout = map_name(r6layout, layout_str);
-                               if (nlayout == UnSet) {
-                                       fprintf(stderr, Name ": layout %s not understood for raid6.\n",
-                                               layout_str);
-                                       rv = 1;
+int reshape_prepare_fdlist(char *devname,
+                          struct mdinfo *sra,
+                          int raid_disks,
+                          int nrdisks,
+                          unsigned long blocks,
+                          char *backup_file,
+                          int *fdlist,
+                          unsigned long long *offsets)
+{
+       int d = 0;
+       struct mdinfo *sd;
+
+       for (d = 0; d <= nrdisks; d++)
+               fdlist[d] = -1;
+       d = raid_disks;
+       for (sd = sra->devs; sd; sd = sd->next) {
+               if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                       continue;
+               if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+                       char *dn = map_dev(sd->disk.major,
+                                          sd->disk.minor, 1);
+                       fdlist[sd->disk.raid_disk]
+                               = dev_open(dn, O_RDONLY);
+                       offsets[sd->disk.raid_disk] = sd->data_offset*512;
+                       if (fdlist[sd->disk.raid_disk] < 0) {
+                               fprintf(stderr,
+                                       Name ": %s: cannot open component %s\n",
+                                       devname, dn ? dn : "-unknown-");
+                               d = -1;
+                               goto release;
+                       }
+               } else if (backup_file == NULL) {
+                       /* spare */
+                       char *dn = map_dev(sd->disk.major,
+                                          sd->disk.minor, 1);
+                               fdlist[d] = dev_open(dn, O_RDWR);
+                               offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
+                               if (fdlist[d] < 0) {
+                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
+                                               devname, dn ? dn : "-unknown-");
+                                       d = -1;
                                        goto release;
                                }
-                               break;
+                               d++;
                        }
-               if (raid_disks) ndisks = raid_disks;
-
-               odata = odisks-1;
-               ndata = ndisks-1;
-               if (array.level == 6) {
-                       odata--; /* number of data disks */
-                       ndata--;
                }
+release:
+       return d;
+}
 
-               if (odata == ndata &&
-                   get_linux_version() < 2006032) {
-                       fprintf(stderr, Name ": in-place reshape is not safe before 2.6.32, sorry.\n");
-                       break;
-               }
+int reshape_open_backup_file(char *backup_file,
+                            int fd,
+                            char *devname,
+                            long blocks,
+                            int *fdlist,
+                            unsigned long long *offsets,
+                            int restart)
+{
+       /* Return 1 on success, 0 on any form of failure */
+       /* need to check backup file is large enough */
+       char buf[512];
+       struct stat stb;
+       unsigned int dev;
+       int i;
 
-               /* Check that we can hold all the data */
-               get_dev_size(fd, NULL, &array_size);
-               if (ndata * (unsigned long long)size < (array_size/1024)) {
-                       fprintf(stderr, Name ": this change will reduce the size of the array.\n"
-                               "       use --grow --array-size first to truncate array.\n"
-                               "       e.g. mdadm --grow %s --array-size %llu\n",
-                               devname, ndata * size);
-                       rv = 1;
-                       break;
+       *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL),
+                      S_IRUSR | S_IWUSR);
+       *offsets = 8 * 512;
+       if (*fdlist < 0) {
+               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+                       devname, backup_file, strerror(errno));
+               return 0;
+       }
+       /* Guard against backup file being on array device.
+        * If array is partitioned or if LVM etc is in the
+        * way this will not notice, but it is better than
+        * nothing.
+        */
+       fstat(*fdlist, &stb);
+       dev = stb.st_dev;
+       fstat(fd, &stb);
+       if (stb.st_rdev == dev) {
+               fprintf(stderr, Name ": backup file must NOT be"
+                       " on the array being reshaped.\n");
+               close(*fdlist);
+               return 0;
+       }
+
+       memset(buf, 0, 512);
+       for (i=0; i < blocks + 8 ; i++) {
+               if (write(*fdlist, buf, 512) != 512) {
+                       fprintf(stderr, Name ": %s: cannot create"
+                               " backup file %s: %s\n",
+                               devname, backup_file, strerror(errno));
+                       return 0;
                }
+       }
+       if (fsync(*fdlist) != 0) {
+               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+                       devname, backup_file, strerror(errno));
+               return 0;
+       }
 
-               /* So how much do we need to backup.
-                * We need an amount of data which is both a whole number of
-                * old stripes and a whole number of new stripes.
-                * So LCM for (chunksize*datadisks).
-                */
-               a = (ochunk/512) * odata;
-               b = (nchunk/512) * ndata;
-               /* Find GCD */
-               while (a != b) {
-                       if (a < b)
-                               b -= a;
-                       if (b < a)
-                               a -= b;
-               }
-               /* LCM == product / GCD */
-               blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
+       return 1;
+}
 
-               sysfs_free(sra);
-               sra = sysfs_read(fd, 0,
-                                GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
-                                GET_CACHE);
+unsigned long compute_backup_blocks(int nchunk, int ochunk,
+                                   unsigned int ndata, unsigned int odata)
+{
+       unsigned long a, b, blocks;
+       /* So how much do we need to backup.
+        * We need an amount of data which is both a whole number of
+        * old stripes and a whole number of new stripes.
+        * So LCM for (chunksize*datadisks).
+        */
+       a = (ochunk/512) * odata;
+       b = (nchunk/512) * ndata;
+       /* Find GCD */
+       while (a != b) {
+               if (a < b)
+                       b -= a;
+               if (b < a)
+                       a -= b;
+       }
+       /* LCM == product / GCD */
+       blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
 
-               if (!sra) {
-                       fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
-                               devname);
-                       rv = 1;
-                       break;
-               }
+       return blocks;
+}
 
-               if (ndata == odata) {
-                       /* Make 'blocks' bigger for better throughput, but
-                        * not so big that we reject it below.
-                        * Try for 16 megabytes
-                        */
-                       while (blocks * 32 < sra->component_size &&
-                              blocks < 16*1024*2)
-                              blocks *= 2;
-               } else
-                       fprintf(stderr, Name ": Need to backup %luK of critical "
-                               "section..\n", blocks/2);
+char *analyse_change(struct mdinfo *info, struct reshape *re)
+{
+       /* Based on the current array state in info->array and
+        * the changes in info->new_* etc, determine:
+        *  - whether the change is possible
+        *  - Intermediate level/raid_disks/layout
+        *  - whether a restriping reshape is needed
+        *  - number of sectors in minimum change unit.  This
+        *    will cover a whole number of stripes in 'before' and
+        *    'after'.
+        *
+        * Return message if the change should be rejected
+        *        NULL if the change can be achieved
+        *
+        * This can be called as part of starting a reshape, or
+        * when assembling an array that is undergoing reshape.
+        */
+       int new_disks;
+       /* delta_parity records change in number of devices
+        * caused by level change
+        */
+       int delta_parity = 0;
 
-               if (blocks >= sra->component_size/2) {
-                       fprintf(stderr, Name ": %s: Something wrong - reshape aborted\n",
-                               devname);
-                       rv = 1;
+       /* If a new level not explicitly given, we assume no-change */
+       if (info->new_level == UnSet)
+               info->new_level = info->array.level;
+
+       if (info->new_chunk)
+               switch (info->new_level) {
+               case 0:
+               case 4:
+               case 5:
+               case 6:
+               case 10:
+                       /* chunk size is meaningful, must divide component_size
+                        * evenly
+                        */
+                       if (info->component_size % (info->new_chunk/512))
+                               return "New chunk size does not"
+                                       " divide component size";
                        break;
+               default:
+                       return "chunk size not meaningful for this level";
                }
-               nrdisks = array.raid_disks + sra->array.spare_disks;
-               /* Now we need to open all these devices so we can read/write.
+       else
+               info->new_chunk = info->array.chunk_size;
+
+       switch (info->array.level) {
+       case 1:
+               /* RAID1 can convert to RAID1 with different disks, or
+                * raid5 with 2 disks, or
+                * raid0 with 1 disk
                 */
-               fdlist = malloc((1+nrdisks) * sizeof(int));
-               offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
-               if (!fdlist || !offsets) {
-                       fprintf(stderr, Name ": malloc failed: grow aborted\n");
-                       rv = 1;
+               if (info->new_level == 0) {
+                       if (info->delta_disks != UnSet &&
+                           info->delta_disks != 0)
+                               return "Cannot change number of disks "
+                                       "with RAID1->RAID0 conversion";
+                       re->level = 0;
+                       re->before.data_disks = 1;
+                       re->after.data_disks = 1;
+                       re->before.layout = 0;
+                       re->backup_blocks = 0;
+                       re->parity = 0;
+                       return NULL;
+               }
+               if (info->new_level == 1) {
+                       if (info->delta_disks == UnSet)
+                               /* Don't know what to do */
+                               return "no change requested for Growing RAID1";
+                       re->level = 1;
+                       re->backup_blocks = 0;
+                       re->parity = 0;
+                       return NULL;
+               }
+               if (info->array.raid_disks == 2 &&
+                   info->new_level == 5) {
+
+                       re->level = 5;
+                       re->before.data_disks = 1;
+                       if (info->delta_disks != UnSet &&
+                           info->delta_disks != 0)
+                               re->after.data_disks = 1 + info->delta_disks;
+                       else
+                               re->after.data_disks = 1;
+                       if (re->after.data_disks < 1)
+                               return "Number of disks too small for RAID5";
+
+                       re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
+                       info->array.chunk_size = 65536;
                        break;
                }
-               for (d=0; d <= nrdisks; d++)
-                       fdlist[d] = -1;
-               d = array.raid_disks;
-               for (sd = sra->devs; sd; sd=sd->next) {
-                       if (sd->disk.state & (1<<MD_DISK_FAULTY))
-                               continue;
-                       if (sd->disk.state & (1<<MD_DISK_SYNC)) {
-                               char *dn = map_dev(sd->disk.major,
-                                                  sd->disk.minor, 1);
-                               fdlist[sd->disk.raid_disk]
-                                       = dev_open(dn, O_RDONLY);
-                               offsets[sd->disk.raid_disk] = sd->data_offset*512;
-                               if (fdlist[sd->disk.raid_disk] < 0) {
-                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
-                                               devname, dn?dn:"-unknown-");
-                                       rv = 1;
-                                       goto release;
-                               }
-                       } else if (backup_file == NULL) {
-                               /* spare */
-                               char *dn = map_dev(sd->disk.major,
-                                                  sd->disk.minor, 1);
-                               fdlist[d] = dev_open(dn, O_RDWR);
-                               offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
-                               if (fdlist[d]<0) {
-                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
-                                               devname, dn?dn:"-unknown");
-                                       rv = 1;
-                                       goto release;
-                               }
-                               d++;
+               /* Could do some multi-stage conversions, but leave that to
+                * later.
+                */
+               return "Impossibly level change request for RAID1";
+
+       case 10:
+               /* RAID10 can only be converted from near mode to
+                * RAID0 by removing some devices
+                */
+               if ((info->array.layout & ~0xff) != 0x100)
+                       return "Cannot Grow RAID10 with far/offset layout";
+               /* number of devices must be multiple of number of copies */
+               if (info->array.raid_disks % (info->array.layout & 0xff))
+                       return "RAID10 layout too complex for Grow operation";
+
+               if (info->new_level != 0)
+                       return "RAID10 can only be changed to RAID0";
+               new_disks = (info->array.raid_disks
+                            / (info->array.layout & 0xff));
+               if (info->delta_disks == UnSet)
+                       info->delta_disks = (new_disks
+                                            - info->array.raid_disks);
+
+               if (info->delta_disks != new_disks - info->array.raid_disks)
+                       return "New number of raid-devices impossible for RAID10";
+               if (info->new_chunk &&
+                   info->new_chunk != info->array.chunk_size)
+                       return "Cannot change chunk-size with RAID10 Grow";
+
+               /* looks good */
+               re->level = 0;
+               re->parity = 0;
+               re->before.data_disks = new_disks;
+               re->after.data_disks = re->before.data_disks;
+               re->before.layout = 0;
+               re->backup_blocks = 0;
+               return NULL;
+
+       case 0:
+               /* RAID0 can be converted to RAID10, or to RAID456 */
+               if (info->new_level == 10) {
+                       if (info->new_layout == UnSet && info->delta_disks == UnSet) {
+                               /* Assume near=2 layout */
+                               info->new_layout = 0x102;
+                               info->delta_disks = info->array.raid_disks;
+                       }
+                       if (info->new_layout == UnSet) {
+                               int copies = 1 + (info->delta_disks
+                                                 / info->array.raid_disks);
+                               if (info->array.raid_disks * (copies-1)
+                                   != info->delta_disks)
+                                       return "Impossible number of devices"
+                                               " for RAID0->RAID10";
+                               info->new_layout = 0x100 + copies;
                        }
+                       if (info->delta_disks == UnSet) {
+                               int copies = info->new_layout & 0xff;
+                               if (info->new_layout != 0x100 + copies)
+                                       return "New layout impossible"
+                                               " for RAID0->RAID10";;
+                               info->delta_disks = (copies - 1) *
+                                       info->array.raid_disks;
+                       }
+                       if (info->new_chunk &&
+                           info->new_chunk != info->array.chunk_size)
+                               return "Cannot change chunk-size with RAID0->RAID10";
+                       /* looks good */
+                       re->level = 10;
+                       re->parity = 0;
+                       re->before.data_disks = (info->array.raid_disks +
+                                                info->delta_disks);
+                       re->after.data_disks = re->before.data_disks;
+                       re->before.layout = info->new_layout;
+                       re->backup_blocks = 0;
+                       return NULL;
+               }
+
+               /* RAID0 can also convert to RAID0/4/5/6 by first converting to
+                * a raid4 style layout of the final level.
+                */
+               switch (info->new_level) {
+               case 4:
+                       delta_parity = 1;
+               case 0:
+                       re->level = 4;
+                       re->before.layout = 0;
+                       break;
+               case 5:
+                       delta_parity = 1;
+                       re->level = 5;
+                       re->before.layout = ALGORITHM_PARITY_N;
+                       break;
+               case 6:
+                       delta_parity = 2;
+                       re->level = 6;
+                       re->before.layout = ALGORITHM_PARITY_N;
+                       break;
+               default:
+                       return "Impossible level change requested";
                }
-               if (backup_file == NULL) {
-                       if (ndata <= odata) {
-                               fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
-                                       devname);
-                               rv = 1;
+               re->before.data_disks = info->array.raid_disks;
+               /* determining 'after' layout happens outside this 'switch' */
+               break;
+
+       case 4:
+               info->array.layout = ALGORITHM_PARITY_N;
+       case 5:
+               switch (info->new_level) {
+               case 0:
+                       delta_parity = -1;
+               case 4:
+                       re->level = info->array.level;
+                       re->before.data_disks = info->array.raid_disks - 1;
+                       re->before.layout = info->array.layout;
+                       break;
+               case 5:
+                       re->level = 5;
+                       re->before.data_disks = info->array.raid_disks - 1;
+                       re->before.layout = info->array.layout;
+                       break;
+               case 6:
+                       delta_parity = 1;
+                       re->level = 6;
+                       re->before.data_disks = info->array.raid_disks - 1;
+                       switch (info->array.layout) {
+                       case ALGORITHM_LEFT_ASYMMETRIC:
+                               re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6;
                                break;
-                       } else if (sra->array.spare_disks == 0) {
-                               fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
-                                       "backup-file to backup critical section\n",
-                                       devname);
-                               rv = 1;
+                       case ALGORITHM_RIGHT_ASYMMETRIC:
+                               re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
                                break;
-                       }
-                       if (d == array.raid_disks) {
-                               fprintf(stderr, Name ": %s: No spare device for backup\n",
-                                       devname);
-                               rv = 1;
+                       case ALGORITHM_LEFT_SYMMETRIC:
+                               re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6;
                                break;
-                       }
-               } else {
-                       /* need to check backup file is large enough */
-                       char buf[512];
-                       struct stat stb;
-                       unsigned int dev;
-                       fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
-                                    S_IRUSR | S_IWUSR);
-                       offsets[d] = 8 * 512;
-                       if (fdlist[d] < 0) {
-                               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                       devname, backup_file, strerror(errno));
-                               rv = 1;
+                       case ALGORITHM_RIGHT_SYMMETRIC:
+                               re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6;
                                break;
-                       }
-                       /* Guard against backup file being on array device.
-                        * If array is partitioned or if LVM etc is in the
-                        * way this will not notice, but it is better than
-                        * nothing.
-                        */
-                       fstat(fdlist[d], &stb);
-                       dev = stb.st_dev;
-                       fstat(fd, &stb);
-                       if (stb.st_rdev == dev) {
-                               fprintf(stderr, Name ": backup file must NOT be"
-                                       " on the array being reshaped.\n");
-                               rv = 1;
-                               close(fdlist[d]);
+                       case ALGORITHM_PARITY_0:
+                               re->before.layout = ALGORITHM_PARITY_0_6;
                                break;
-                       }
-
-                       memset(buf, 0, 512);
-                       for (i=0; i < (signed)blocks + 8 ; i++) {
-                               if (write(fdlist[d], buf, 512) != 512) {
-                                       fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                               devname, backup_file, strerror(errno));
-                                       rv = 1;
-                                       break;
-                               }
-                       }
-                       if (fsync(fdlist[d]) != 0) {
-                               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                       devname, backup_file, strerror(errno));
-                               rv = 1;
+                       case ALGORITHM_PARITY_N:
+                               re->before.layout = ALGORITHM_PARITY_N_6;
                                break;
+                       default:
+                               return "Cannot convert an array with this layout";
                        }
-                       d++;
+                       break;
+               case 1:
+                       if (info->array.raid_disks != 2)
+                               return "Can only convert a 2-device array to RAID1";
+                       if (info->delta_disks != UnSet &&
+                           info->delta_disks != 0)
+                               return "Cannot set raid_disk when "
+                                       "converting RAID5->RAID1";
+                       re->level = 1;
+                       break;
+               default:
+                       return "Impossible level change requested";
+               }
+               break;
+       case 6:
+               switch (info->new_level) {
+               case 4:
+               case 5:
+                       delta_parity = -1;
+               case 6:
+                       re->level = 6;
+                       re->before.data_disks = info->array.raid_disks - 2;
+                       re->before.layout = info->array.layout;
+                       break;
+               default:
+                       return "Impossible level change requested";
                }
+               break;
+       }
 
-               /* lastly, check that the internal stripe cache is
-                * large enough, or it won't work.
-                */
-               
-               cache = (nchunk < ochunk) ? ochunk : nchunk;
-               cache = cache * 4 / 4096;
-               if (cache < blocks / 8 / odisks + 16)
-                       /* Make it big enough to hold 'blocks' */
-                       cache = blocks / 8 / odisks + 16;
-               if (sra->cache_size < cache)
-                       sysfs_set_num(sra, NULL, "stripe_cache_size",
-                                     cache+1);
-               /* Right, everything seems fine. Let's kick things off.
-                * If only changing raid_disks, use ioctl, else use
-                * sysfs.
+       /* If we reached here then it looks like a re-stripe is
+        * happening.  We have determined the intermediate level
+        * and initial raid_disks/layout and stored these in 're'.
+        *
+        * We need to deduce the final layout that can be atomically
+        * converted to the end state.
+        */
+       switch (info->new_level) {
+       case 0:
+               /* We can only get to RAID0 from RAID4 or RAID5
+                * with appropriate layout and one extra device
                 */
-               if (ochunk == nchunk && olayout == nlayout) {
-                       array.raid_disks = ndisks;
-                       if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
-                               int err = errno;
-                               rv = 1;
-                               fprintf(stderr, Name ": Cannot set device shape for %s: %s\n",
-                                       devname, strerror(errno));
-                               if (ndisks < odisks &&
-                                   get_linux_version() < 2006030)
-                                       fprintf(stderr, Name ": linux 2.6.30 or later required\n");
-                               if (err == EBUSY && 
-                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
-                                       fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
+               if (re->level != 4 && re->level != 5)
+                       return "Cannot covert to RAID0 from this level";
 
-                               break;
-                       }
-               } else {
-                       /* set them all just in case some old 'new_*' value
-                        * persists from some earlier problem
-                        */
-                       int err = err; /* only used if rv==1, and always set if
-                                       * rv==1, so initialisation not needed,
-                                       * despite gcc warning
-                                       */
-                       if (sysfs_set_num(sra, NULL, "chunk_size", nchunk) < 0)
-                               rv = 1, err = errno;
-                       if (!rv && sysfs_set_num(sra, NULL, "layout", nlayout) < 0)
-                               rv = 1, err = errno;
-                       if (!rv && sysfs_set_num(sra, NULL, "raid_disks", ndisks) < 0)
-                               rv = 1, err = errno;
-                       if (rv) {
-                               fprintf(stderr, Name ": Cannot set device shape for %s\n",
-                                       devname);
-                               if (get_linux_version() < 2006030)
-                                       fprintf(stderr, Name ": linux 2.6.30 or later required\n");
-                               if (err == EBUSY && 
-                                   (array.state & (1<<MD_SB_BITMAP_PRESENT)))
-                                       fprintf(stderr, "       Bitmap must be removed before shape can be changed\n");
-                               break;
-                       }
+               switch (re->level) {
+               case 4:
+                       re->after.layout = 0 ; break;
+               case 5:
+                       re->after.layout = ALGORITHM_PARITY_N; break;
                }
+               break;
+
+       case 4:
+               /* We can only get to RAID4 from RAID4 or RAID5 */
+               if (re->level != 4 && re->level != 5)
+                       return "Cannot convert to RAID4 from this level";
+
+               switch (re->level) {
+               case 4:
+                       re->after.layout = 0 ; break;
+               case 5:
+                       re->after.layout = ALGORITHM_PARITY_N; break;
+               }
+               break;
 
-               if (ndisks == 2 && odisks == 2) {
-                       /* No reshape is needed in this trivial case */
-                       rv = 0;
+       case 5:
+               /* We get to RAID5 for RAID5 or RAID6 */
+               if (re->level != 5 && re->level != 6)
+                       return "Cannot convert to RAID5 from this level";
+
+               switch (re->level) {
+               case 5:
+                       if (info->new_layout == UnSet)
+                               re->after.layout = re->before.layout;
+                       else
+                               re->after.layout = info->new_layout;
                        break;
+               case 6:
+                       if (info->new_layout == UnSet)
+                               info->new_layout = re->before.layout;
+
+                       /* after.layout needs to be raid6 version of new_layout */
+                       if (info->new_layout == ALGORITHM_PARITY_N)
+                               re->after.layout = ALGORITHM_PARITY_N;
+                       else {
+                               char layout[40];
+                               char *ls = map_num(r5layout, info->new_layout);
+                               int l;
+                               strcat(strcpy(layout, ls), "-6");
+                               l = map_name(r6layout, layout);
+                               if (l == UnSet)
+                                       return "Cannot find RAID6 layout"
+                                               " to convert to";
+                               re->after.layout = l;
+                       }
+               }
+               break;
+
+       case 6:
+               /* We must already be at level 6 */
+               if (re->level != 6)
+                       return "Impossible level change";
+               if (info->new_layout == UnSet)
+                       re->after.layout = info->array.layout;
+               else
+                       re->after.layout = info->new_layout;
+               break;
+       default:
+               return "Impossible level change requested";
+       }
+       if (info->delta_disks == UnSet)
+               info->delta_disks = delta_parity;
+
+       re->after.data_disks = (re->before.data_disks
+                               + info->delta_disks
+                               - delta_parity);
+       switch (re->level) {
+       case 6: re->parity = 2; break;
+       case 4:
+       case 5: re->parity = 1; break;
+       default: re->parity = 0; break;
+       }
+       /* So we have a restripe operation, we need to calculate the number
+        * of blocks per reshape operation.
+        */
+       if (info->new_chunk == 0)
+               info->new_chunk = info->array.chunk_size;
+       if (re->after.data_disks == re->before.data_disks &&
+           re->after.layout == re->before.layout &&
+           info->new_chunk == info->array.chunk_size) {
+               /* Nothing to change */
+               re->backup_blocks = 0;
+               return NULL;
+       }
+       if (re->after.data_disks == 1 && re->before.data_disks == 1) {
+               /* chunk and layout changes make no difference */
+               re->backup_blocks = 0;
+               return NULL;
+       }
+
+       if (re->after.data_disks == re->before.data_disks &&
+           get_linux_version() < 2006032)
+               return "in-place reshape is not safe before 2.6.32 - sorry.";
+
+       if (re->after.data_disks < re->before.data_disks &&
+           get_linux_version() < 2006030)
+               return "reshape to fewer devices is not supported before 2.6.32 - sorry.";
+
+       re->backup_blocks = compute_backup_blocks(
+               info->new_chunk, info->array.chunk_size,
+               re->after.data_disks,
+               re->before.data_disks);
+
+       re->new_size = info->component_size * re->after.data_disks;
+       return NULL;
+}
+
+static int reshape_array(char *container, int fd, char *devname,
+                        struct supertype *st, struct mdinfo *info,
+                        int force, struct mddev_dev *devlist,
+                        char *backup_file, int quiet, int forked,
+                        int restart); /* forward declaration: Grow_reshape() calls this before its definition */
+static int reshape_container(char *container, char *devname,
+                            struct supertype *st, 
+                            struct mdinfo *info,
+                            int force,
+                            char *backup_file,
+                            int quiet, int restart); /* forward declaration: used by Grow_reshape() for LEVEL_CONTAINER arrays */
+
+int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
+                long long size,
+                int level, char *layout_str, int chunksize, int raid_disks,
+                struct mddev_dev *devlist,
+                int force)
+{
+       /* Make some changes in the shape of an array.
+        * The kernel must support the change.
+        *
+        * There are three different changes.  Each can trigger
+        * a resync or recovery so we freeze that until we have
+        * requested everything (if kernel supports freezing - 2.6.30).
+        * The steps are:
+        *  - change size (i.e. component_size)
+        *  - change level
+        *  - change layout/chunksize/ndisks
+        *
+        * The last can require a reshape.  It is different on different
+        * levels so we need to check the level before actioning it.
+        * Sometimes the level change needs to be requested after the
+        * reshape (e.g. raid6->raid5, raid5->raid0)
+        *
+        * Parameters:
+        *  devname     - array name, used in diagnostics and handed on to
+        *                the helper routines.
+        *  fd          - open descriptor for the array.  For external
+        *                metadata with no subarray it is closed here and
+        *                replaced by an exclusively opened container
+        *                descriptor.
+        *  quiet       - non-zero suppresses the informational messages
+        *                printed here; also forwarded to the helpers.
+        *  backup_file - not interpreted here, only forwarded to
+        *                reshape_super/reshape_container/reshape_array.
+        *  size        - negative means no component size change.  A value
+        *                >= 0 may not be combined with chunksize, level,
+        *                layout_str or raid_disks.
+        *  level       - requested level, or UnSet.
+        *  layout_str  - NULL, "normalise"/"normalize", "preserve", or a
+        *                layout name parsed according to the target level
+        *                (5, 6, 10 or faulty).
+        *  chunksize   - 0 when not requested; multiplied by 1024 to form
+        *                info.new_chunk.
+        *  raid_disks  - 0 when not requested, otherwise the wanted number
+        *                of devices.
+        *  devlist     - devices counted as additional spares for the
+        *                spare check and passed on to reshape_array().
+        *  force       - non-zero skips the "Need N spares" check.
+        *
+        * Returns 1 for every failure detected in this function, otherwise
+        * the result of reshape_container() or reshape_array().
+        *
+        */
+       struct mdu_array_info_s array;
+       int rv = 0;
+       struct supertype *st;
+       char *subarray = NULL;
+
+       int frozen;
+       int changed = 0;
+       char *container = NULL;
+       char container_buf[20];
+       int cfd = -1;
+
+       struct mddev_dev *dv;
+       int added_disks;
+
+       struct mdinfo info;
+       struct mdinfo *sra;
+
+       if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
+               fprintf(stderr, Name ": %s is not an active md array - aborting\n",
+                       devname);
+               return 1;
+       }
+
+       if (size >= 0 &&
+           (chunksize || level!= UnSet || layout_str || raid_disks)) {
+               fprintf(stderr, Name ": cannot change component size at the same time "
+                       "as other changes.\n"
+                       "   Change size first, then check data is intact before "
+                       "making other changes.\n");
+               return 1;
+       }
+
+       if (raid_disks && raid_disks < array.raid_disks && array.level > 1 &&
+           get_linux_version() < 2006032 &&
+           !check_env("MDADM_FORCE_FEWER")) {
+               fprintf(stderr, Name ": reducing the number of devices is not safe before Linux 2.6.32\n"
+                       "       Please use a newer kernel\n");
+               return 1;
+       }
+
+       st = super_by_fd(fd, &subarray);
+       if (!st) {
+               fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname);
+               return 1;
+       }
+       if (raid_disks > st->max_devs) {
+               fprintf(stderr, Name ": Cannot increase raid-disks on this array"
+                       " beyond %d\n", st->max_devs);
+               return 1; /* NOTE(review): subarray is not freed on this path, unlike the container failures below - confirm */
+       }
+
+       /* in the external case we need to check that the requested reshape is
+        * supported, and perform an initial check that the container holds the
+        * pre-requisite spare devices (mdmon owns final validation)
+        */
+       if (st->ss->external) {
+               int container_dev;
+               int rv; /* shadows the function-level rv; only holds the load_container() result */
+
+               if (subarray) {
+                       container_dev = st->container_dev;
+                       cfd = open_dev_excl(st->container_dev);
+               } else {
+                       container_dev = st->devnum;
+                       close(fd); /* whole-container case: fd is swapped for the exclusive container fd */
+                       cfd = open_dev_excl(st->devnum);
+                       fd = cfd;
                }
+               if (cfd < 0) {
+                       fprintf(stderr, Name ": Unable to open container for %s\n",
+                               devname);
+                       free(subarray);
+                       return 1;
+               }
+
+               fmt_devname(container_buf, container_dev);
+               container = container_buf;
+
+               rv = st->ss->load_container(st, cfd, NULL);
+
+               if (rv) {
+                       fprintf(stderr, Name ": Cannot read superblock for %s\n",
+                               devname);
+                       free(subarray);
+                       return 1;
+               }
+
+               if (mdmon_running(container_dev))
+                       st->update_tail = &st->updates; /* presumably queues metadata updates for mdmon - confirm */
+       }
+
+       added_disks = 0;
+       for (dv = devlist; dv; dv = dv->next)
+               added_disks++;
+       if (raid_disks > array.raid_disks &&
+           array.spare_disks +added_disks < (raid_disks - array.raid_disks) &&
+           !force) {
+               fprintf(stderr,
+                       Name ": Need %d spare%s to avoid degraded array,"
+                       " and only have %d.\n"
+                       "       Use --force to over-ride this check.\n",
+                       raid_disks - array.raid_disks, 
+                       raid_disks - array.raid_disks == 1 ? "" : "s", 
+                       array.spare_disks + added_disks);
+               return 1;
+       }
+
+       sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS
+                        | GET_STATE | GET_VERSION);
+       if (sra) {
+               if (st->ss->external && subarray == NULL) {
+                       array.level = LEVEL_CONTAINER;
+                       sra->array.level = LEVEL_CONTAINER;
+               }
+       } else {
+               fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
+                       devname);
+               return 1;
+       }
+       frozen = freeze(st);
+       if (frozen < -1) {
+               /* freeze() already spewed the reason */
+               return 1;
+       } else if (frozen < 0) {
+               fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
+                       " be reshaped\n", devname);
+               return 1;
+       }
+
+       /* ========= set size =============== */
+       if (size >= 0 && (size == 0 || size != array.size)) { /* a size of 0 always takes this path */
+               long long orig_size = array.size;
+
+               if (reshape_super(st, size, UnSet, UnSet, 0, 0, UnSet, NULL,
+                                 devname, !quiet)) {
+                       rv = 1;
+                       goto release;
+               }
+               sync_metadata(st);
+               array.size = size;
+               if (array.size != size) {
+                       /* got truncated to 32bit, write to
+                        * component_size instead
+                        */
+                       if (sra) /* always true here: a NULL sra already returned above */
+                               rv = sysfs_set_num(sra, NULL,
+                                                  "component_size", size);
+                       else
+                               rv = -1;
+               } else
+                       rv = ioctl(fd, SET_ARRAY_INFO, &array);
+               if (rv != 0) {
+                       int err = errno;
+
+                       /* restore metadata */
+                       if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
+                                         UnSet, NULL, devname, !quiet) == 0)
+                               sync_metadata(st);
+                       fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
+                               devname, strerror(err));
+                       if (err == EBUSY && 
+                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                               fprintf(stderr, "       Bitmap must be removed before size can be changed\n");
+                       rv = 1;
+                       goto release;
+               }
+               ioctl(fd, GET_ARRAY_INFO, &array);
+               size = get_component_size(fd)/2; /* re-read the component size after the change */
+               if (size == 0)
+                       size = array.size;
+               if (!quiet)
+                       fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
+                               devname, size);
+               changed = 1;
+       } else if (array.level != LEVEL_CONTAINER) { /* nothing to resize: just look up the current size */
+               size = get_component_size(fd)/2;
+               if (size == 0)
+                       size = array.size;
+       }
+
+       /* ========= check for Raid10/Raid1 -> Raid0 conversion ===============
+        * current implementation assumes that following conditions must be met:
+        * - RAID10:
+        *      - far_copies == 1
+        *      - near_copies == 2
+        *      - an even number of raid_disks
+        * - RAID1:
+        *      - no extra condition is checked here
+        * In both cases sysfs information (sra) must be available.
+        */
+       if ((level == 0 && array.level == 10 && sra &&
+           array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) ||
+           (level == 0 && array.level == 1 && sra)) {
+               int err;
+               err = remove_disks_for_takeover(st, sra, array.layout);
+               if (err) {
+                       dprintf(Name": Array cannot be reshaped\n");
+                       if (cfd > -1)
+                               close(cfd);
+                       rv = 1;
+                       goto release;
+               }
+               /* FIXME this is added with no justification - why is it here */
+               ping_monitor(container);
+       }
+
+       memset(&info, 0, sizeof(info));
+       info.array = array;
+       sysfs_init(&info, fd, NoMdDev);
+       strcpy(info.text_version, sra->text_version);
+       info.component_size = size*2; /* size is in K (see message above); stored doubled, presumably as 512-byte sectors - confirm */
+       info.new_level = level;
+       info.new_chunk = chunksize * 1024; /* presumably chunksize is in KiB and new_chunk in bytes - confirm */
+       if (info.array.level == LEVEL_CONTAINER) {
+               info.delta_disks = UnSet;
+               info.array.raid_disks = raid_disks;
+       } else if (raid_disks)
+               info.delta_disks = raid_disks - info.array.raid_disks;
+       else
+               info.delta_disks = UnSet;
+       if (layout_str == NULL) {
+               info.new_layout = UnSet;
+               if (info.array.level == 6 &&
+                   (info.new_level == 6 || info.new_level == UnSet) &&
+                   info.array.layout >= 16) { /* layouts >= 16 are reported as non-standard below */
+                       fprintf(stderr, Name
+                               ": %s has a non-standard layout.  If you"
+                               " wish to preserve this\n"
+                               "      during the reshape, please specify"
+                               " --layout=preserve\n"
+                               "      If you want to change it, specify a"
+                               " layout or use --layout=normalise\n",
+                               devname);
+                       rv = 1;
+                       goto release;
+               }
+       } else if (strcmp(layout_str, "normalise") == 0 ||
+                strcmp(layout_str, "normalize") == 0) {
+               /* If we have a -6 RAID6 layout, remove the '-6'.
+                * In every other case new_layout simply stays UnSet.
+                */
+               info.new_layout = UnSet;
+               if (info.array.level == 6 && info.new_level == UnSet) {
+                       char l[40], *h;
+                       strcpy(l, map_num(r6layout, info.array.layout));
+                       h = strrchr(l, '-');
+                       if (h && strcmp(h, "-6") == 0) {
+                               *h = 0;
+                               info.new_layout = map_name(r6layout, l);
+                       }
+               }
+       } else if (strcmp(layout_str, "preserve") == 0) {
+               info.new_layout = UnSet;
+       } else {
+               int l = info.new_level; /* level used to interpret layout_str: the new level if given, else the current one */
+               if (l == UnSet)
+                       l = info.array.level;
+               switch (l) {
+               case 5:
+                       info.new_layout = map_name(r5layout, layout_str);
+                       break;
+               case 6:
+                       info.new_layout = map_name(r6layout, layout_str);
+                       break;
+               case 10:
+                       info.new_layout = parse_layout_10(layout_str);
+                       break;
+               case LEVEL_FAULTY:
+                       info.new_layout = parse_layout_faulty(layout_str);
+                       break;
+               default:
+                       fprintf(stderr, Name ": layout not meaningful"
+                               " with this level\n");
+                       rv = 1;
+                       goto release;
+               }
+               if (info.new_layout == UnSet) {
+                       fprintf(stderr, Name ": layout %s not understood"
+                               " for this level\n",
+                               layout_str);
+                       rv = 1;
+                       goto release;
+               }
+       }
+
+       if (array.level == LEVEL_CONTAINER) {
+               /* This change is to be applied to every array in the
+                * container.  This is only needed when the metadata imposes
+                * constraints on the various arrays in the container.
+                * Currently we only know that IMSM requires all arrays
+                * to have the same number of devices so changing the
+                * number of devices (On-Line Capacity Expansion) must be
+                * performed at the level of the container
+                */
+               rv = reshape_container(container, devname, st, &info,
+                                      force, backup_file, quiet, 0);
+               frozen = 0; /* skips the unfreeze() at release: */
+       } else {
+               /* get spare devices from external metadata
+                */
+               if (st->ss->external) {
+                       struct mdinfo *info2;
+
+                       info2 = st->ss->container_content(st, subarray);
+                       if (info2) {
+                               info.array.spare_disks =
+                                       info2->array.spare_disks;
+                               sysfs_free(info2);
+                       }
+               }
+
+               /* Impose these changes on a single array.  First
+                * check that the metadata is OK with the change. */
+
+               if (reshape_super(st, info.component_size, info.new_level,
+                                 info.new_layout, info.new_chunk,
+                                 info.array.raid_disks, info.delta_disks,
+                                 backup_file, devname, quiet)) { /* NOTE(review): the other reshape_super() calls here pass !quiet in this position - confirm which is intended */
+                       rv = 1;
+                       goto release;
+               }
+               sync_metadata(st);
+               rv = reshape_array(container, fd, devname, st, &info, force,
+                                  devlist, backup_file, quiet, 0, 0);
+               frozen = 0; /* skips the unfreeze() at release: */
+       }
+release:
+       if (frozen > 0)
+               unfreeze(st);
+       return rv;
+}
+
+static int reshape_array(char *container, int fd, char *devname,
+                        struct supertype *st, struct mdinfo *info,
+                        int force, struct mddev_dev *devlist,
+                        char *backup_file, int quiet, int forked,
+                        int restart)
+{
+       struct reshape reshape;
+       int spares_needed;
+       char *msg;
+       int orig_level = UnSet;
+       int disks, odisks;
+
+       struct mdu_array_info_s array;
+       char *c;
+
+       struct mddev_dev *dv;
+       int added_disks;
+
+       int *fdlist;
+       unsigned long long *offsets;
+       int d;
+       int nrdisks;
+       int err;
+       unsigned long blocks;
+       unsigned long cache;
+       unsigned long long array_size;
+       int done;
+       struct mdinfo *sra = NULL;
+
+       /* when reshaping a RAID0, the component_size might be zero.
+        * So try to fix that up.
+        */
+       if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+               dprintf("Cannot get array information.\n");
+               goto release;
+       }
+       if (array.level == 0 && info->component_size == 0) {
+               get_dev_size(fd, NULL, &array_size);
+               info->component_size = array_size / array.raid_disks;
+       }
+
+       if (info->reshape_active) {
+               int new_level = info->new_level;
+               info->new_level = UnSet;
+               msg = analyse_change(info, &reshape);
+               info->new_level = new_level;
+               if (!restart)
+                       /* Make sure the array isn't read-only */
+                       ioctl(fd, RESTART_ARRAY_RW, 0);
+       } else
+               msg = analyse_change(info, &reshape);
+       if (msg) {
+               fprintf(stderr, Name ": %s\n", msg);
+               goto release;
+       }
+       if (restart &&
+           (reshape.level != info->array.level ||
+            reshape.before.layout != info->array.layout ||
+            reshape.before.data_disks + reshape.parity != info->array.raid_disks)) {
+               fprintf(stderr, Name ": reshape info is not in native format -"
+                       " cannot continue.\n");
+               goto release;
+       }
+
+       if (restart) {
+               /* reshape already started. just skip to monitoring the reshape */
+               if (reshape.backup_blocks == 0)
+                       return 0;
+               goto started;
+       }
+       /* The container is frozen but the array may not be.
+        * So freeze the array so spares don't get put to the wrong use
+        * FIXME there should probably be a cleaner separation between
+        * freeze_array and freeze_container.
+        */
+       sysfs_freeze_array(info);
+       /* Check we have enough spares to not be degraded */
+       added_disks = 0;
+       for (dv = devlist; dv ; dv=dv->next)
+               added_disks++;
+       spares_needed = max(reshape.before.data_disks,
+                           reshape.after.data_disks)
+               + reshape.parity - array.raid_disks;
+
+       if (!force &&
+           info->new_level > 1 && info->array.level > 1 &&
+           spares_needed > info->array.spare_disks + added_disks) {
+               fprintf(stderr,
+                       Name ": Need %d spare%s to avoid degraded array,"
+                       " and only have %d.\n"
+                       "       Use --force to over-ride this check.\n",
+                       spares_needed,
+                       spares_needed == 1 ? "" : "s", 
+                       info->array.spare_disks + added_disks);
+               goto release;
+       }
+       /* Check we have enough spares to not fail */
+       spares_needed = max(reshape.before.data_disks,
+                           reshape.after.data_disks)
+               - array.raid_disks;
+       if ((info->new_level > 1 || info->new_level == 0) &&
+           spares_needed > info->array.spare_disks +added_disks) {
+               fprintf(stderr,
+                       Name ": Need %d spare%s to create working array,"
+                       " and only have %d.\n",
+                       spares_needed,
+                       spares_needed == 1 ? "" : "s", 
+                       info->array.spare_disks + added_disks);
+               goto release;
+       }
+
+       if (reshape.level != array.level) {
+               char *c = map_num(pers, reshape.level);
+               int err;
+               if (c == NULL)
+                       goto release;
+
+               err = sysfs_set_str(info, NULL, "level", c);
+               if (err) {
+                       err = errno;
+                       fprintf(stderr, Name ": %s: could not set level to %s\n",
+                               devname, c);
+                       if (err == EBUSY && 
+                           (info->array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                               fprintf(stderr, "       Bitmap must be removed"
+                                       " before level can be changed\n");
+                       goto release;
+               }
+               if (!quiet)
+                       fprintf(stderr, Name ": level of %s changed to %s\n",
+                               devname, c);    
+               orig_level = array.level;
+               sysfs_freeze_array(info);
+
+               if (reshape.level > 0 && st->ss->external) {
+                       /* make sure mdmon is aware of the new level */
+                       if (!mdmon_running(st->container_dev))
+                               start_mdmon(st->container_dev);
+                       ping_monitor(container);
+               }
+       }
+       /* ->reshape_super might have chosen some spares from the
+        * container that it wants to be part of the new array.
+        * We can collect them with ->container_content and give
+        * them to the kernel.
+        */
+       if (st->ss->reshape_super && st->ss->container_content) {
+               char *subarray = strchr(info->text_version+1, '/')+1;
+               struct mdinfo *info2 =
+                       st->ss->container_content(st, subarray);
+               struct mdinfo *d;
+
+               if (info2) {
+                       sysfs_init(info2, fd, st->devnum);
+                       for (d = info2->devs; d; d = d->next) {
+                               if (d->disk.state == 0 &&
+                                   d->disk.raid_disk >= 0) {
+                                       /* This is a spare that wants to
+                                        * be part of the array.
+                                        */
+                                       add_disk(fd, st, info2, d);
+                               }
+                       }
+                       sysfs_free(info2);
+               }
+       }
+       /* We might have been given some devices to add to the
+        * array.  Now that the array has been changed to the right
+        * level and frozen, we can safely add them.
+        */
+       if (devlist)
+               Manage_subdevs(devname, fd, devlist, !quiet,
+                              0,NULL);
+
+       if (reshape.backup_blocks == 0) {
+               /* No restriping needed, but we might need to impose
+                * some more changes: layout, raid_disks, chunk_size
+                */
+               /* read current array info */
+               if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+                       dprintf("Cannot get array information.\n");
+                       goto release;
+               }
+               /* compare current array info with new values and if
+                * it is different update them to new */
+               if (info->new_layout != UnSet &&
+                   info->new_layout != array.layout) {
+                       array.layout = info->new_layout;
+                       if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                               fprintf(stderr, Name ": failed to set new layout\n");
+                               goto release;
+                       } else if (!quiet)
+                               printf("layout for %s set to %d\n",
+                                      devname, array.layout);
+               }
+               if (info->delta_disks != UnSet &&
+                   info->delta_disks != 0 &&
+                   array.raid_disks != (info->array.raid_disks + info->delta_disks)) {
+                       array.raid_disks += info->delta_disks;
+                       if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                               fprintf(stderr, Name ": failed to set raid disks\n");
+                               goto release;
+                       } else if (!quiet) {
+                               printf("raid_disks for %s set to %d\n",
+                                      devname, array.raid_disks);
+                       }
+               }
+               if (info->new_chunk != 0 &&
+                   info->new_chunk != array.chunk_size) {
+                       if (sysfs_set_num(info, NULL,
+                                         "chunk_size", info->new_chunk) != 0) {
+                               fprintf(stderr, Name ": failed to set chunk size\n");
+                               goto release;
+                       } else if (!quiet)
+                               printf("chunk size for %s set to %d\n",
+                                      devname, array.chunk_size);
+               }
+               unfreeze(st);
+               return 0;
+       }
+
+       /*
+        * There are three possibilities.
+        * 1/ The array will shrink.
+        *    We need to ensure the reshape will pause before reaching
+        *    the 'critical section'.  We also need to fork and wait for
+        *    that to happen.  When it does we 
+        *       suspend/backup/complete/unfreeze
+        *
+        * 2/ The array will not change size.
+        *    This requires that we keep a backup of a sliding window
+        *    so that we can restore data after a crash.  So we need
+        *    to fork and monitor progress.
+        *    In future we will allow the data_offset to change, so
+        *    a sliding backup becomes unnecessary.
+        *
+        * 3/ The array will grow. This is relatively easy.
+        *    However the kernel's restripe routines will cheerfully
+        *    overwrite some early data before it is safe.  So we
+        *    need to make a backup of the early parts of the array
+        *    and be ready to restore it if rebuild aborts very early.
+        *    For externally managed metadata, we still need a forked
+        *    child to monitor the reshape and suspend IO over the region
+        *    that is being reshaped.
+        *
+        *    We backup data by writing it to one spare, or to a
+        *    file which was given on command line.
+        *
+        * In each case, we first make sure that storage is available
+        * for the required backup.
+        * Then we:
+        *   -  request the shape change.
+        *   -  fork to handle backup etc.
+        */
+started:
+       /* Check that we can hold all the data */
+       get_dev_size(fd, NULL, &array_size);
+       if (reshape.new_size < (array_size/512)) {
+               fprintf(stderr,
+                       Name ": this change will reduce the size of the array.\n"
+                       "       use --grow --array-size first to truncate array.\n"
+                       "       e.g. mdadm --grow %s --array-size %llu\n",
+                       devname, reshape.new_size/2);
+               goto release;
+       }
+
+       sra = sysfs_read(fd, 0,
+                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
+                        GET_CACHE);
+       if (!sra) {
+               fprintf(stderr, Name ": %s: Cannot get array details from sysfs\n",
+                       devname);
+               goto release;
+       }
+
+       /* Decide how many blocks (sectors) for a reshape
+        * unit.  The number we have so far is just a minimum
+        */
+       blocks = reshape.backup_blocks;
+       if (reshape.before.data_disks == 
+           reshape.after.data_disks) {
+               /* Make 'blocks' bigger for better throughput, but
+                * not so big that we reject it below.
+                * Try for 16 megabytes
+                */
+               while (blocks * 32 < sra->component_size &&
+                      blocks < 16*1024*2)
+                       blocks *= 2;
+       } else
+               fprintf(stderr, Name ": Need to backup %luK of critical "
+                       "section..\n", blocks/2);
+
+       if (blocks >= sra->component_size/2) {
+               fprintf(stderr, Name ": %s: Something wrong"
+                       " - reshape aborted\n",
+                       devname);
+               goto release;
+       }
+
+       /* Now we need to open all these devices so we can read/write.
+        */
+       nrdisks = max(reshape.before.data_disks,
+                     reshape.after.data_disks) + reshape.parity
+               + sra->array.spare_disks;
+       fdlist = malloc((1+nrdisks) * sizeof(int));
+       offsets = malloc((1+nrdisks) * sizeof(offsets[0]));
+       if (!fdlist || !offsets) {
+               fprintf(stderr, Name ": malloc failed: grow aborted\n");
+               goto release;
+       }
+
+       odisks = reshape.before.data_disks + reshape.parity;
+       d = reshape_prepare_fdlist(devname, sra, odisks,
+                                  nrdisks, blocks, backup_file,
+                                  fdlist, offsets);
+       if (d < 0) {
+               goto release;
+       }
+       if (backup_file == NULL) {
+               if (reshape.after.data_disks <= reshape.before.data_disks) {
+                       fprintf(stderr,
+                               Name ": %s: Cannot grow - need backup-file\n", 
+                               devname);
+                       goto release;
+               } else if (sra->array.spare_disks == 0) {
+                       fprintf(stderr, Name ": %s: Cannot grow - need a spare or "
+                               "backup-file to backup critical section\n",
+                               devname);
+                       goto release;
+               }
+       } else {
+               if (!reshape_open_backup_file(backup_file, fd, devname,
+                                             (signed)blocks,
+                                             fdlist+d, offsets+d, restart)) {
+                       goto release;
+               }
+               d++;
+       }
+
+       /* lastly, check that the internal stripe cache is
+        * large enough, or it won't work.
+        * It must hold at least 4 stripes of the larger
+        * chunk size
+        */
+       cache = max(info->array.chunk_size, info->new_chunk);
+       cache *= 4; /* 4 stripes minimum */
+       cache /= 512; /* convert to sectors */
+       disks = min(reshape.before.data_disks, reshape.after.data_disks);
+       /* make sure there is room for 'blocks' with a bit to spare */
+       if (cache < 16 + blocks / disks)
+               cache = 16 + blocks / disks;
+       cache /= (4096/512); /* Convert from sectors to pages */
+
+       if (sra->cache_size < cache)
+               subarray_set_num(container, sra, "stripe_cache_size",
+                                cache+1);
+
+       /* Right, everything seems fine. Let's kick things off.
+        * If only changing raid_disks, use ioctl, else use
+        * sysfs.
+        */
+       sync_metadata(st);
+
+       sra->new_chunk = info->new_chunk;
+
+       if (restart)
+               sra->reshape_progress = info->reshape_progress;
+       else {
+               sra->reshape_progress = 0;
+               if (reshape.after.data_disks < reshape.before.data_disks)
+                       /* start from the end of the new array */
+                       sra->reshape_progress = (sra->component_size
+                                                * reshape.after.data_disks);
+       }
+
+       if (info->array.chunk_size == info->new_chunk &&
+           reshape.before.layout == reshape.after.layout &&
+           st->ss->external == 0) {
+               /* use SET_ARRAY_INFO but only if reshape hasn't started */
+               ioctl(fd, GET_ARRAY_INFO, &array);
+               array.raid_disks = reshape.after.data_disks + reshape.parity;
+               if (!restart &&
+                   ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+                       int err = errno;
+
+                       fprintf(stderr,
+                               Name ": Cannot set device shape for %s: %s\n",
+                               devname, strerror(errno));
+
+                       if (err == EBUSY && 
+                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                               fprintf(stderr,
+                                       "       Bitmap must be removed before"
+                                       " shape can be changed\n");
+
+                       goto release;
+               }
+       } else if (!restart) {
+               /* set them all just in case some old 'new_*' value
+                * persists from some earlier problem.
+                */
+               int err = 0;
+               if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
+                       err = errno;
+               if (!err && sysfs_set_num(sra, NULL, "layout", 
+                                        reshape.after.layout) < 0)
+                       err = errno;
+               if (!err && subarray_set_num(container, sra, "raid_disks",
+                                           reshape.after.data_disks +
+                                           reshape.parity) < 0)
+                       err = errno;
+               if (err) {
+                       fprintf(stderr, Name ": Cannot set device shape for %s\n",
+                               devname);
+
+                       if (err == EBUSY && 
+                           (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+                               fprintf(stderr,
+                                       "       Bitmap must be removed before"
+                                       " shape can be changed\n");
+                       goto release;
+               }
+       }
+
+       err = start_reshape(sra, restart);
+       if (err) {
+               fprintf(stderr, 
+                       Name ": Cannot %s reshape for %s\n",
+                       restart ? "continue" : "start",
+                       devname);
+               goto release;
+       }
+       if (restart)
+               sysfs_set_str(sra, NULL, "array_state", "active");
+
+       /* Now we just need to kick off the reshape and watch, while
+        * handling backups of the data...
+        * This is all done by a forked background process.
+        */
+       switch(forked ? 0 : fork()) {
+       case -1:
+               fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
+                       strerror(errno));
+               abort_reshape(sra);
+               goto release;
+       default:
+               return 0;
+       case 0:
+               break;
+       }
+
+       close(fd);
+       if (check_env("MDADM_GROW_VERIFY"))
+               fd = open(devname, O_RDONLY | O_DIRECT);
+       else
+               fd = -1;
+       mlockall(MCL_FUTURE);
+
+       if (st->ss->external) {
+               /* metadata handler takes it from here */
+               done = st->ss->manage_reshape(
+                       fd, sra, &reshape, st, blocks,
+                       fdlist, offsets,
+                       d - odisks, fdlist+odisks,
+                       offsets+odisks);
+       } else
+               done = child_monitor(
+                       fd, sra, &reshape, st, blocks,
+                       fdlist, offsets,
+                       d - odisks, fdlist+odisks,
+                       offsets+odisks);
+
+       if (backup_file && done)
+               unlink(backup_file);
+       if (!done) {
+               abort_reshape(sra);
+               goto out;
+       }
+
+       if (!st->ss->external &&
+           !(reshape.before.data_disks != reshape.after.data_disks
+             && info->custom_array_size) &&
+           info->new_level == reshape.level &&
+           !forked) {
+               /* no need to wait for the reshape to finish as
+                * there is nothing more to do.
+                */
+               exit(0);
+       }
+       wait_reshape(sra);
+
+       if (st->ss->external) {
+               /* Re-load the metadata as much could have changed */
+               int cfd = open_dev(st->container_dev);
+               if (cfd >= 0) {
+                       ping_monitor(container);
+                       st->ss->free_super(st);
+                       st->ss->load_container(st, cfd, container);
+                       close(cfd);
+               }
+       }
+
+       /* Set the new array size if required, i.e. when custom_array_size
+        * is used by this metadata.
+        */
+       if (reshape.before.data_disks !=
+           reshape.after.data_disks &&
+           info->custom_array_size) {
+               struct mdinfo *info2;
+               char *subarray = strchr(info->text_version+1, '/')+1;
+
+               info2 = st->ss->container_content(st, subarray);
+               if (info2) {
+                       unsigned long long current_size = 0;
+                       unsigned long long new_size =
+                               info2->custom_array_size/2;
+
+                       if (sysfs_get_ll(sra,
+                                        NULL,
+                                        "array_size",
+                                        &current_size) == 0 &&
+                           new_size > current_size) {
+                               if (sysfs_set_num(sra, NULL,
+                                                 "array_size", new_size)
+                                   < 0)
+                                       dprintf("Error: Cannot"
+                                               " set array size");
+                               else
+                                       dprintf("Array size "
+                                               "changed");
+                               dprintf(" from %llu to %llu.\n",
+                                       current_size, new_size);
+                       }
+                       sysfs_free(info2);
+               }
+       }
+
+       if (info->new_level != reshape.level) {
+
+               c = map_num(pers, info->new_level);
+               if (c) {
+                       err = sysfs_set_str(sra, NULL, "level", c);
+                       if (err)
+                               fprintf(stderr, Name\
+                                       ": %s: could not set level "
+                                       "to %s\n", devname, c);
+               }
+       }
+out:
+       if (forked)
+               return 0;
+       unfreeze(st);
+       exit(0);
+
+release:
+       if (orig_level != UnSet && sra) {
+               c = map_num(pers, orig_level);
+               if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
+                       fprintf(stderr, Name ": aborting level change\n");
+       }
+       if (!forked)
+               unfreeze(st);
+       return 1;
+}
+
+int reshape_container(char *container, char *devname,
+                     struct supertype *st, 
+                     struct mdinfo *info,
+                     int force,
+                     char *backup_file,
+                     int quiet, int restart)
+{
+       struct mdinfo *cc = NULL;
+       int rv = restart;
+
+       /* component_size is not meaningful for a container,
+        * so pass '-1' meaning 'no change'
+        */
+       if (!restart &&
+           reshape_super(st, -1, info->new_level,
+                         info->new_layout, info->new_chunk,
+                         info->array.raid_disks, info->delta_disks,
+                         backup_file, devname, quiet)) {
+               unfreeze(st);
+               return 1;
+       }
+
+       sync_metadata(st);
+
+       /* ping monitor to be sure that update is on disk
+        */
+       ping_monitor(container);
+
+       switch (fork()) {
+       case -1: /* error */
+               perror("Cannot fork to complete reshape\n");
+               unfreeze(st);
+               return 1;
+       default: /* parent */
+               printf(Name ": multi-array reshape continues in background\n");
+               return 0;
+       case 0: /* child */
+               break;
+       }
 
-               /* set up the backup-super-block.  This requires the
-                * uuid from the array.
+       while(1) {
+               /* For each member array with reshape_active,
+                * we need to perform the reshape.
+                * We pick the first array that needs reshaping and
+                * reshape it.  reshape_array() will re-read the metadata
+                * so the next time through a different array should be
+                * ready for reshape.
+                * It is possible that the 'different' array will not
+                * be assembled yet.  In that case we simply exit.
+                * When it is assembled, the mdadm which assembles it
+                * will take over the reshape.
                 */
-               /* Find a superblock */
-               for (sd = sra->devs; sd; sd = sd->next) {
-                       char *dn;
-                       int devfd;
-                       int ok;
-                       if (sd->disk.state & (1<<MD_DISK_FAULTY))
+               struct mdinfo *content;
+               int fd;
+               struct mdstat_ent *mdstat;
+               char *adev;
+
+               sysfs_free(cc);
+
+               cc = st->ss->container_content(st, NULL);
+
+               for (content = cc; content ; content = content->next) {
+                       char *subarray;
+                       if (!content->reshape_active)
                                continue;
-                       dn = map_dev(sd->disk.major, sd->disk.minor, 1);
-                       devfd = dev_open(dn, O_RDONLY);
-                       if (devfd < 0)
+
+                       subarray = strchr(content->text_version+1, '/')+1;
+                       mdstat = mdstat_by_subdev(subarray,
+                                                 devname2devnum(container));
+                       if (!mdstat)
                                continue;
-                       ok = st->ss->load_super(st, devfd, NULL);
-                       close(devfd);
-                       if (ok >= 0)
-                               break;
-               }
-               if (!sd) {
-                       fprintf(stderr, Name ": %s: Cannot find a superblock\n",
-                               devname);
-                       rv = 1;
                        break;
                }
+               if (!content)
+                       break;
 
-               memset(&bsb, 0, 512);
-               memcpy(bsb.magic, "md_backup_data-1", 16);
-               st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
-               bsb.mtime = __cpu_to_le64(time(0));
-               bsb.devstart2 = blocks;
-               stripes = blocks / (ochunk/512) / odata;
-               /* Now we just need to kick off the reshape and watch, while
-                * handling backups of the data...
-                * This is all done by a forked background process.
-                */
-               switch(fork()) {
-               case 0:
-                       close(fd);
-                       if (check_env("MDADM_GROW_VERIFY"))
-                               fd = open(devname, O_RDONLY | O_DIRECT);
-                       else
-                               fd = -1;
-                       mlockall(MCL_FUTURE);
-
-                       if (odata < ndata)
-                               done = child_grow(fd, sra, stripes,
-                                                 fdlist, offsets,
-                                                 odisks, ochunk, array.level, olayout, odata,
-                                                 d - odisks, fdlist+odisks, offsets+odisks);
-                       else if (odata > ndata)
-                               done = child_shrink(fd, sra, stripes,
-                                                   fdlist, offsets,
-                                                   odisks, ochunk, array.level, olayout, odata,
-                                                   d - odisks, fdlist+odisks, offsets+odisks);
-                       else
-                               done = child_same_size(fd, sra, stripes,
-                                                      fdlist, offsets,
-                                                      0,
-                                                      odisks, ochunk, array.level, olayout, odata,
-                                                      d - odisks, fdlist+odisks, offsets+odisks);
-                       if (backup_file && done)
-                               unlink(backup_file);
-                       if (level != UnSet && level != array.level) {
-                               /* We need to wait for the reshape to finish
-                                * (which will have happened unless odata < ndata)
-                                * and then set the level
-                                */
-
-                               c = map_num(pers, level);
-                               if (c == NULL)
-                                       exit(0);/* not possible */
-
-                               if (odata < ndata)
-                                       wait_reshape(sra);
-                               err = sysfs_set_str(sra, NULL, "level", c);
-                               if (err)
-                                       fprintf(stderr, Name ": %s: could not set level to %s\n",
-                                               devname, c);
-                       }
-                       exit(0);
-               case -1:
-                       fprintf(stderr, Name ": Cannot run child to monitor reshape: %s\n",
-                               strerror(errno));
-                       rv = 1;
+               fd = open_dev(mdstat->devnum);
+               if (fd < 0)
                        break;
-               default:
-                       /* The child will take care of unfreezing the array */
-                       frozen = 0;
+               adev = map_dev(dev2major(mdstat->devnum),
+                              dev2minor(mdstat->devnum),
+                              0);
+               if (!adev)
+                       adev = content->text_version;
+
+               sysfs_init(content, fd, mdstat->devnum);
+
+               rv = reshape_array(container, fd, adev, st,
+                                  content, force, NULL,
+                                  backup_file, quiet, 1, restart);
+               close(fd);
+               restart = 0;
+               if (rv)
                        break;
-               }
-               break;
-
-       }
-
- release:
-       if (rv && orig_level != UnSet && sra) {
-               c = map_num(pers, orig_level);
-               if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
-                       fprintf(stderr, Name ": aborting level change\n");
        }
-       if (sra)
-               unfreeze_array(sra, frozen);
-       return rv;
+       if (!rv)
+               unfreeze(st);
+       sysfs_free(cc);
+       exit(0);
 }
 
 /*
@@ -1299,10 +2322,323 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
  * 
  */
 
+int progress_reshape(struct mdinfo *info, struct reshape *reshape,
+                    unsigned long long backup_point,
+                    unsigned long long wait_point,
+                    unsigned long long *suspend_point,
+                    unsigned long long *reshape_completed)
+{
+       /* This function is called repeatedly by the reshape manager.
+        * It determines how much progress can safely be made and allows
+        * that progress.
+        * - 'info' identifies the array and particularly records in
+        *    ->reshape_progress the metadata's knowledge of progress
+        *      This is a sector offset from the start of the array
+        *      of the next array block to be relocated.  This number
+        *      may increase from 0 or decrease from array_size, depending
+        *      on the type of reshape that is happening.
+        *    Note that in contrast, 'sync_completed' is a block count of the
+        *    reshape so far.  It gives the distance between the start point
+        *    (head or tail of device) and the next place that data will be
+        *    written.  It always increases.
+        * - 'reshape' is the structure created by analyse_change
+        * - 'backup_point' shows how much the metadata manager has backed-up
+        *   data.  For reshapes with increasing progress, it is the next address
+        *   to be backed up, previous addresses have been backed-up.  For
+        *   decreasing progress, it is the earliest address that has been
+        *   backed up - later addresses are also backed up.
+        *   So addresses between reshape_progress and backup_point are
+        *   backed up providing those are in the 'correct' order.
+        * - 'wait_point' is an array address.  When reshape_completed
+        *   passes this point, progress_reshape should return.  It might
+        *   return earlier if it determines that ->reshape_progress needs
+        *   to be updated or further backup is needed.
+        * - suspend_point is maintained by progress_reshape and the caller
+        *   should not touch it except to initialise to zero.
+        *   It is an array address and it only increases in 2.6.37 and earlier.
+        *   This makes it difficult to handle reducing reshapes with
+        *   external metadata.
+        *   However:  it is similar to backup_point in that it records the
+        *     other end of a suspended region from  reshape_progress.
+        *     it is moved to extend the region that is safe to backup and/or
+        *     reshape
+        * - reshape_completed is read from sysfs and returned.  The caller
+        *   should copy this into ->reshape_progress when it has reason to
+        *   believe that the metadata knows this, and any backup outside this
+        *   has been erased.
+        *   It is only written on the normal return path; when -1 or -2
+        *   is returned *reshape_completed is left untouched.
+        *
+        * Return value is:
+        *   1 if more data from backup_point - but only as far as suspend_point,
+        *     should be backed up
+        *   0 if things are progressing smoothly
+        *  -1 if the reshape is finished because it is all done,
+        *  -2 if the reshape is finished due to an error.
+        */
+
+       /* 'advancing' is true when the number of data disks grows or
+        * stays the same, i.e. progress moves up from the start of the array.
+        */
+       int advancing = (reshape->after.data_disks
+                        >= reshape->before.data_disks);
+       unsigned long long need_backup; /* All data between start of array and
+                                        * here will at some point need to
+                                        * be backed up.
+                                        */
+       unsigned long long read_offset, write_offset;
+       unsigned long long write_range;
+       unsigned long long max_progress, target, completed;
+       unsigned long long array_size = (info->component_size
+                                        * reshape->before.data_disks);
+       int fd;
+       char buf[20];
+
+       /* First, we unsuspend any region that is now known to be safe.
+        * If suspend_point is on the 'wrong' side of reshape_progress, then
+        * we don't have or need suspension at the moment.  This is true for
+        * native metadata when we don't need to back-up.
+        */
+       if (advancing) {
+               if (info->reshape_progress <= *suspend_point)
+                       sysfs_set_num(info, NULL, "suspend_lo",
+                                     info->reshape_progress);
+       } else {
+               /* Note: this won't work in 2.6.37 and before.
+                * Something somewhere should make sure we don't need it!
+                */
+               if (info->reshape_progress >= *suspend_point)
+                       sysfs_set_num(info, NULL, "suspend_hi",
+                                     info->reshape_progress);
+       }
+
+       /* Now work out how far it is safe to progress.
+        * If the read_offset for ->reshape_progress is less than
+        * 'blocks' beyond the write_offset, we can only progress as far
+        * as a backup.
+        * Otherwise we can progress until the write_offset for the new location
+        * reaches (within 'blocks' of) the read_offset at the current location.
+        * However that region must be suspended unless we are using native
+        * metadata.
+        * If we need to suspend more, we limit it to 128M per device, which is
+        * rather arbitrary and should be some time-based calculation.
+        */
+       read_offset = info->reshape_progress / reshape->before.data_disks;
+       write_offset = info->reshape_progress / reshape->after.data_disks;
+       write_range = info->new_chunk/512;
+       if (reshape->before.data_disks == reshape->after.data_disks)
+               need_backup = array_size;
+       else
+               need_backup = reshape->backup_blocks;
+       if (advancing) {
+               if (read_offset < write_offset + write_range)
+                       max_progress = backup_point;
+               else
+                       max_progress =
+                               read_offset *
+                               reshape->after.data_disks;
+       } else {
+               if (read_offset > write_offset - write_range)
+                       /* Can only progress as far as has been backed up,
+                        * which must be suspended */
+                       max_progress = backup_point;
+               else if (info->reshape_progress <= need_backup)
+                       max_progress = backup_point;
+               else {
+                       if (info->array.major_version >= 0)
+                               /* Can progress until backup is needed */
+                               max_progress = need_backup;
+                       else {
+                               /* Can progress until metadata update is required */
+                               max_progress =
+                                       read_offset *
+                                       reshape->after.data_disks;
+                               /* but data must be suspended */
+                               if (max_progress < *suspend_point)
+                                       max_progress = *suspend_point;
+                       }
+               }
+       }
+
+       /* We know it is safe to progress to 'max_progress' providing
+        * it is suspended or we are using native metadata.
+        * Consider extending suspend_point 128M per device if it
+        * is less than 64M per device beyond reshape_progress.
+        * But always do a multiple of 'blocks'
+        * FIXME this is too big - it takes too long to complete
+        * this much.
+        */
+       target = 64*1024*2 * min(reshape->before.data_disks,
+                                 reshape->after.data_disks);
+       /* NOTE(review): divides by ->backup_blocks; assumes the caller
+        * never supplies a reshape with backup_blocks == 0 - confirm.
+        */
+       target /= reshape->backup_blocks;
+       if (target < 2)
+               target = 2;
+       target *= reshape->backup_blocks;
+
+       /* For externally managed metadata we always need to suspend IO to
+        * the area being reshaped so we regularly push suspend_point forward.
+        * For native metadata we only need the suspend if we are going to do
+        * a backup.
+        */
+       if (advancing) {
+               if ((need_backup > info->reshape_progress
+                    || info->array.major_version < 0) &&
+                   *suspend_point < info->reshape_progress + target) {
+                       if (need_backup < *suspend_point + 2 * target)
+                               *suspend_point = need_backup;
+                       else if (*suspend_point + 2 * target < array_size)
+                               *suspend_point += 2 * target;
+                       else
+                               *suspend_point = array_size;
+                       sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
+                       if (max_progress > *suspend_point)
+                               max_progress = *suspend_point;
+               }
+       } else {
+               if (info->array.major_version >= 0) {
+                       /* Only need to suspend when about to backup */
+                       if (info->reshape_progress < need_backup * 2 &&
+                           *suspend_point > 0) {
+                               *suspend_point = 0;
+                               sysfs_set_num(info, NULL, "suspend_lo", 0);
+                               sysfs_set_num(info, NULL, "suspend_hi", need_backup);
+                       }
+               } else {
+                       /* Need to suspend continually */
+                       if (info->reshape_progress < *suspend_point)
+                               *suspend_point = info->reshape_progress;
+                       if (*suspend_point + target < info->reshape_progress)
+                               /* No need to move suspend region yet */;
+                       else {
+                               if (*suspend_point >= 2 * target)
+                                       *suspend_point -= 2 * target;
+                               else
+                                       *suspend_point = 0;
+                               sysfs_set_num(info, NULL, "suspend_lo",
+                                             *suspend_point);
+                       }
+                       if (max_progress < *suspend_point)
+                               max_progress = *suspend_point;
+               }
+       }
+
+       /* now set sync_max to allow that progress. sync_max, like
+        * sync_completed is a count of sectors written per device, so
+        * we find the difference between max_progress and the start point,
+        * and divide that by after.data_disks to get a sync_max
+        * number.
+        * At the same time we convert wait_point to a similar number
+        * for comparing against sync_completed.
+        */
+       /* scale down max_progress to per_disk */
+       max_progress /= reshape->after.data_disks;
+       /* Round to chunk size as some kernels give an erroneously high number */
+       max_progress /= info->new_chunk/512;
+       max_progress *= info->new_chunk/512;
+       /* And round to old chunk size as the kernel wants that */
+       max_progress /= info->array.chunk_size/512;
+       max_progress *= info->array.chunk_size/512;
+       /* Limit progress to the whole device */
+       if (max_progress > info->component_size)
+               max_progress = info->component_size;
+       wait_point /= reshape->after.data_disks;
+       if (!advancing) {
+               /* switch from 'device offset' to 'processed block count' */
+               max_progress = info->component_size - max_progress;
+               wait_point = info->component_size - wait_point;
+       }
+
+       sysfs_set_num(info, NULL, "sync_max", max_progress);
+
+       /* Now wait.  If we have already reached the point that we were
+        * asked to wait to, don't wait at all, else wait for any change.
+        * We need to select on 'sync_completed' as that is the place that
+        * notifications happen, but we are really interested in
+        * 'reshape_position'
+        */
+       fd = sysfs_get_fd(info, NULL, "sync_completed");
+       if (fd < 0)
+               goto check_progress;
+
+       if (sysfs_fd_get_ll(fd, &completed) < 0) {
+               close(fd);
+               goto check_progress;
+       }
+       while (completed < max_progress && completed < wait_point) {
+               /* Check that sync_action is still 'reshape' to avoid
+                * waiting forever on a dead array
+                */
+               char action[20];
+               fd_set rfds;
+               if (sysfs_get_str(info, NULL, "sync_action",
+                                 action, 20) <= 0 ||
+                   strncmp(action, "reshape", 7) != 0)
+                       break;
+               /* Some kernels reset 'sync_completed' to zero
+                * before setting 'sync_action' to 'idle'.
+                * So we need these extra tests.
+                */
+               if (completed == 0 && advancing
+                   && info->reshape_progress > 0)
+                       break;
+               if (completed == 0 && !advancing
+                   && info->reshape_progress < (info->component_size
+                                                * reshape->after.data_disks))
+                       break;
+               FD_ZERO(&rfds);
+               FD_SET(fd, &rfds);
+               /* rfds is handed to select() as its exception set (4th
+                * argument), with no timeout, so this blocks until the
+                * sync_completed descriptor signals a change.
+                */
+               select(fd+1, NULL, NULL, &rfds, NULL);
+               if (sysfs_fd_get_ll(fd, &completed) < 0) {
+                       close(fd);
+                       goto check_progress;
+               }
+       }
+       /* Some kernels reset 'sync_completed' to zero,
+        * we need to have real point we are in md
+        */
+       if (completed == 0)
+               completed = max_progress;
+
+       /* some kernels can give an incorrectly high 'completed' number */
+       completed /= (info->new_chunk/512);
+       completed *= (info->new_chunk/512);
+       /* Convert 'completed' back in to a 'progress' number */
+       completed *= reshape->after.data_disks;
+       if (!advancing) {
+               completed = info->component_size * reshape->after.data_disks
+                       - completed;
+       }
+       *reshape_completed = completed;
+       
+       close(fd);
+
+       /* We return the need_backup flag.  Caller will decide
+        * how much - a multiple of ->backup_blocks up to *suspend_point
+        */
+       if (advancing)
+               return need_backup > info->reshape_progress;
+       else
+               return need_backup >= info->reshape_progress;
+
+check_progress:
+       /* if we couldn't read a number from sync_completed, then
+        * either the reshape did complete, or it aborted.
+        * We can tell which by checking for 'none' in reshape_position.
+        * Note that *reshape_completed is not updated on this path.
+        */
+       /* Presumably a placeholder so that buf can never compare equal to
+        * "none" unless sysfs_get_str() really stored it - TODO confirm.
+        */
+       strcpy(buf, "hi");
+       if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
+           || strncmp(buf, "none", 4) != 0)
+               return -2; /* abort */
+       else {
+               /* Maybe racing with array shutdown - check state */
+               if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0
+                   || strncmp(buf, "inactive", 8) == 0
+                   || strncmp(buf, "clear",5) == 0)
+                       return -2; /* abort */
+               return -1; /* complete */
+       }
+}
+
+
 /* FIXME return status is never checked */
-int grow_backup(struct mdinfo *sra,
+static int grow_backup(struct mdinfo *sra,
                unsigned long long offset, /* per device */
-               unsigned long stripes, /* per device */
+               unsigned long stripes, /* per device, in old chunks */
                int *sources, unsigned long long *offsets,
                int disks, int chunk, int level, int layout,
                int dests, int *destfd, unsigned long long *destoffsets,
@@ -1325,7 +2661,7 @@ int grow_backup(struct mdinfo *sra,
                odata--;
        if (level == 6)
                odata--;
-       sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata);
+
        /* Check that array hasn't become degraded, else we might backup the wrong data */
        if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0)
                return -1; /* FIXME this error is ignored */
@@ -1416,43 +2752,16 @@ int grow_backup(struct mdinfo *sra,
  * every works.
  */
 /* FIXME return value is often ignored */
-int wait_backup(struct mdinfo *sra,
-               unsigned long long offset, /* per device */
-               unsigned long long blocks, /* per device */
-               unsigned long long blocks2, /* per device - hack */
+static int forget_backup(
                int dests, int *destfd, unsigned long long *destoffsets,
                int part)
 {
-       /* Wait for resync to pass the section that was backed up
-        * then erase the backup and allow IO
+       /* 
+        * Erase backup 'part' (which is 0 or 1)
         */
-       int fd = sysfs_get_fd(sra, NULL, "sync_completed");
-       unsigned long long completed;
        int i;
        int rv;
 
-       if (fd < 0)
-               return -1;
-       sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
-       if (offset == 0)
-               sysfs_set_str(sra, NULL, "sync_action", "reshape");
-       do {
-               char action[20];
-               fd_set rfds;
-               FD_ZERO(&rfds);
-               FD_SET(fd, &rfds);
-               select(fd+1, NULL, NULL, &rfds, NULL);
-               if (sysfs_fd_get_ll(fd, &completed) < 0) {
-                       close(fd);
-                       return -1;
-               }
-               if (sysfs_get_str(sra, NULL, "sync_action",
-                                 action, 20) > 0 &&
-                   strncmp(action, "reshape", 7) != 0)
-                       break;
-       } while (completed < offset + blocks);
-       close(fd);
-
        if (part) {
                bsb.arraystart2 = __cpu_to_le64(0);
                bsb.length2 = __cpu_to_le64(0);
@@ -1572,138 +2881,202 @@ static void validate(int afd, int bfd, unsigned long long offset)
        }
 }
 
-static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
-                     int *fds, unsigned long long *offsets,
-                     int disks, int chunk, int level, int layout, int data,
-                     int dests, int *destfd, unsigned long long *destoffsets)
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+                 struct supertype *st, unsigned long blocks,
+                 int *fds, unsigned long long *offsets,
+                 int dests, int *destfd, unsigned long long *destoffsets)
 {
+       /* Monitor a reshape where backup is being performed using
+        * 'native' mechanism - either to a backup file, or
+        * to some space in a spare.
+        */
        char *buf;
-       int degraded = 0;
+       int degraded = -1;
+       unsigned long long speed;
+       unsigned long long suspend_point, array_size;
+       unsigned long long backup_point, wait_point;
+       unsigned long long reshape_completed;
+       int done = 0;
+       int increasing = reshape->after.data_disks >= reshape->before.data_disks;
+       int part = 0; /* The next part of the backup area to fill.  It may already
+                      * be full, so we need to check */
+       int level = reshape->level;
+       int layout = reshape->before.layout;
+       int data = reshape->before.data_disks;
+       int disks = reshape->before.data_disks + reshape->parity;
+       int chunk = sra->array.chunk_size;
+       struct mdinfo *sd;
+       unsigned long stripes;
 
-       if (posix_memalign((void**)&buf, 4096, disks * chunk))
-               /* Don't start the 'reshape' */
+       /* set up the backup-super-block.  This requires the
+        * uuid from the array.
+        */
+       /* Find a superblock */
+       for (sd = sra->devs; sd; sd = sd->next) {
+               char *dn;
+               int devfd;
+               int ok;
+               if (sd->disk.state & (1<<MD_DISK_FAULTY))
+                       continue;
+               dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+               devfd = dev_open(dn, O_RDONLY);
+               if (devfd < 0)
+                       continue;
+               ok = st->ss->load_super(st, devfd, NULL);
+               close(devfd);
+               if (ok >= 0)
+                       break;
+       }
+       if (!sd) {
+               fprintf(stderr, Name ": Cannot find a superblock\n");
                return 0;
-       sysfs_set_num(sra, NULL, "suspend_hi", 0);
-       sysfs_set_num(sra, NULL, "suspend_lo", 0);
-       grow_backup(sra, 0, stripes,
-                   fds, offsets, disks, chunk, level, layout,
-                   dests, destfd, destoffsets,
-                   0, &degraded, buf);
-       validate(afd, destfd[0], destoffsets[0]);
-       wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512),
-                   dests, destfd, destoffsets,
-                   0);
-       sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
-       free(buf);
-       /* FIXME this should probably be numeric */
-       sysfs_set_str(sra, NULL, "sync_max", "max");
-       return 1;
-}
+       }
 
-static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
-                       int *fds, unsigned long long *offsets,
-                       int disks, int chunk, int level, int layout, int data,
-                       int dests, int *destfd, unsigned long long *destoffsets)
-{
-       char *buf;
-       unsigned long long start;
-       int rv;
-       int degraded = 0;
+       memset(&bsb, 0, 512);
+       memcpy(bsb.magic, "md_backup_data-1", 16);
+       st->ss->uuid_from_super(st, (int*)&bsb.set_uuid);
+       bsb.mtime = __cpu_to_le64(time(0));
+       bsb.devstart2 = blocks;
+
+       stripes = blocks / (sra->array.chunk_size/512) /
+               reshape->before.data_disks;
 
        if (posix_memalign((void**)&buf, 4096, disks * chunk))
+               /* Don't start the 'reshape' */
                return 0;
-       start = sra->component_size - stripes * (chunk/512);
-       sysfs_set_num(sra, NULL, "sync_max", start);
-       sysfs_set_str(sra, NULL, "sync_action", "reshape");
-       sysfs_set_num(sra, NULL, "suspend_lo", 0);
-       sysfs_set_num(sra, NULL, "suspend_hi", 0);
-       rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512),
-                        dests, destfd, destoffsets, 0);
-       if (rv < 0)
-               return 0;
-       grow_backup(sra, 0, stripes,
-                   fds, offsets,
-                   disks, chunk, level, layout,
-                   dests, destfd, destoffsets,
-                   0, &degraded, buf);
-       validate(afd, destfd[0], destoffsets[0]);
-       wait_backup(sra, start, stripes*(chunk/512), 0,
-                   dests, destfd, destoffsets, 0);
-       sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
-       free(buf);
-       /* FIXME this should probably be numeric */
-       sysfs_set_str(sra, NULL, "sync_max", "max");
-       return 1;
-}
+       if (reshape->before.data_disks == reshape->after.data_disks) {
+               sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
+               sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
+       }
 
-static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
-                          int *fds, unsigned long long *offsets,
-                          unsigned long long start,
-                          int disks, int chunk, int level, int layout, int data,
-                          int dests, int *destfd, unsigned long long *destoffsets)
-{
-       unsigned long long size;
-       unsigned long tailstripes = stripes;
-       int part;
-       char *buf;
-       unsigned long long speed;
-       int degraded = 0;
+       if (increasing) {
+               array_size = sra->component_size * reshape->after.data_disks;
+               backup_point = sra->reshape_progress;
+               suspend_point = 0;
+       } else {
+               array_size = sra->component_size * reshape->before.data_disks;
+               backup_point = reshape->backup_blocks;
+               suspend_point = array_size;
+       }
 
+       while (!done) {
+               int rv;
 
-       if (posix_memalign((void**)&buf, 4096, disks * chunk))
-               return 0;
+               /* Want to return as soon as the oldest backup slot can
+                * be released as that allows us to start backing up
+                * some more, providing suspend_point has been
+                * advanced, which it should have.
+                */
+               if (increasing) {
+                       wait_point = array_size;
+                       if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+                               wait_point = (__le64_to_cpu(bsb.arraystart) +
+                                             __le64_to_cpu(bsb.length));
+                       if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+                               wait_point = (__le64_to_cpu(bsb.arraystart2) +
+                                             __le64_to_cpu(bsb.length2));
+               } else {
+                       wait_point = 0;
+                       if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+                               wait_point = __le64_to_cpu(bsb.arraystart);
+                       if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+                               wait_point = __le64_to_cpu(bsb.arraystart2);
+               }
+
+               rv = progress_reshape(sra, reshape,
+                                     backup_point, wait_point,
+                                     &suspend_point, &reshape_completed);
+               /* external metadata would need to ping_monitor here */
+               sra->reshape_progress = reshape_completed;
+
+               /* Clear any backup region that is before 'here' */
+               if (increasing) {
+                       if (__le64_to_cpu(bsb.length) > 0 &&
+                           reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
+                                                 __le64_to_cpu(bsb.length)))
+                               forget_backup(dests, destfd,
+                                             destoffsets, 0);
+                       if (__le64_to_cpu(bsb.length2) > 0 &&
+                           reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
+                                                 __le64_to_cpu(bsb.length2)))
+                               forget_backup(dests, destfd,
+                                             destoffsets, 1);
+               } else {
+                       if (__le64_to_cpu(bsb.length) > 0 &&
+                           reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
+                               forget_backup(dests, destfd,
+                                             destoffsets, 0);
+                       if (__le64_to_cpu(bsb.length2) > 0 &&
+                           reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
+                               forget_backup(dests, destfd,
+                                             destoffsets, 1);
+               }
+
+               if (rv < 0) {
+                       if (rv == -1)
+                               done = 1;
+                       break;
+               }
+               if (rv == 0 && increasing && !st->ss->external) {
+                       /* No longer need to monitor this reshape */
+                       done = 1;
+                       break;
+               }
 
-       sysfs_set_num(sra, NULL, "suspend_lo", 0);
-       sysfs_set_num(sra, NULL, "suspend_hi", 0);
+               while (rv) {
+                       unsigned long long offset;
+                       unsigned long actual_stripes;
+                       /* Need to back up some data.
+                        * If 'part' is not used and the desired
+                        * backup size is suspended, do a backup,
+                        * then consider the next part.
+                        */
+                       /* Check that 'part' is unused */
+                       if (part == 0 && __le64_to_cpu(bsb.length) != 0)
+                               break;
+                       if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
+                               break;
 
-       sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
-       sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
-
-       grow_backup(sra, start, stripes,
-                   fds, offsets,
-                   disks, chunk, level, layout,
-                   dests, destfd, destoffsets,
-                   0, &degraded, buf);
-       grow_backup(sra, (start + stripes) * (chunk/512), stripes,
-                   fds, offsets,
-                   disks, chunk, level, layout,
-                   dests, destfd, destoffsets,
-                   1, &degraded, buf);
-       validate(afd, destfd[0], destoffsets[0]);
-       part = 0;
-       start += stripes * 2; /* where to read next */
-       size = sra->component_size / (chunk/512);
-       while (start < size) {
-               if (wait_backup(sra, (start-stripes*2)*(chunk/512),
-                               stripes*(chunk/512), 0,
-                               dests, destfd, destoffsets,
-                               part) < 0)
-                       return 0;
-               sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data);
-               if (start + stripes > size)
-                       tailstripes = (size - start);
-
-               grow_backup(sra, start*(chunk/512), tailstripes,
-                           fds, offsets,
-                           disks, chunk, level, layout,
-                           dests, destfd, destoffsets,
-                           part, &degraded, buf);
-               start += stripes;
-               part = 1 - part;
-               validate(afd, destfd[0], destoffsets[0]);
-       }
-       if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0,
-                       dests, destfd, destoffsets,
-                       part) < 0)
-               return 0;
-       sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data);
-       wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0,
-                   dests, destfd, destoffsets,
-                   1-part);
-       sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data);
-       sysfs_set_num(sra, NULL, "sync_speed_min", speed);
+                       offset = backup_point / data;
+                       actual_stripes = stripes;
+                       if (increasing) {
+                               if (offset + actual_stripes * (chunk/512) >
+                                   sra->component_size)
+                                       actual_stripes = ((sra->component_size - offset)
+                                                         / (chunk/512));
+                               if (offset + actual_stripes * (chunk/512) >
+                                   suspend_point/data)
+                                       break;
+                       } else {
+                               if (offset < actual_stripes * (chunk/512))
+                                       actual_stripes = offset / (chunk/512);
+                               offset -= actual_stripes * (chunk/512);
+                               if (offset < suspend_point/data)
+                                       break;
+                       }
+                       if (actual_stripes == 0)
+                               break;
+                       grow_backup(sra, offset, actual_stripes,
+                                   fds, offsets,
+                                   disks, chunk, level, layout,
+                                   dests, destfd, destoffsets,
+                                   part, &degraded, buf);
+                       validate(afd, destfd[0], destoffsets[0]);
+                       /* record where 'part' is up to */
+                       part = !part;
+                       if (increasing)
+                               backup_point += actual_stripes * (chunk/512) * data;
+                       else
+                               backup_point -= actual_stripes * (chunk/512) * data;
+               }
+       }
+
+       /* FIXME maybe call progress_reshape one more time instead */
+       abort_reshape(sra); /* remove any remaining suspension */
+       if (reshape->before.data_disks == reshape->after.data_disks)
+               sysfs_set_num(sra, NULL, "sync_speed_min", speed);
        free(buf);
-       return 1;
+       return done;
 }
 
 /*
@@ -1740,6 +3113,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                int fd;
                int bsbsize;
                char *devname, namebuf[20];
+               unsigned long long lo, hi;
 
                /* This was a spare and may have some saved data on it.
                 * Load the superblock, find and load the
@@ -1763,7 +3137,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (st->ss->load_super(st, fd, NULL))
                                continue;
 
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        st->ss->free_super(st);
 
                        if (lseek64(fd,
@@ -1823,42 +3197,52 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                }
 
                if (bsb.magic[15] == '1') {
-               if (info->delta_disks >= 0) {
-                       /* reshape_progress is increasing */
-                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
-                           info->reshape_progress) {
-                       nonew:
-                               if (verbose)
-                                       fprintf(stderr, Name ": backup-metadata found on %s but is not needed\n", devname);
-                               continue; /* No new data here */
+                       if (bsb.length == 0)
+                               continue;
+                       if (info->delta_disks >= 0) {
+                               /* reshape_progress is increasing */
+                               if (__le64_to_cpu(bsb.arraystart)
+                                   + __le64_to_cpu(bsb.length)
+                                   < info->reshape_progress) {
+                               nonew:
+                                       if (verbose)
+                                               fprintf(stderr, Name
+                  ": backup-metadata found on %s but is not needed\n", devname);
+                                       continue; /* No new data here */
+                               }
+                       } else {
+                               /* reshape_progress is decreasing */
+                               if (__le64_to_cpu(bsb.arraystart) >=
+                                   info->reshape_progress)
+                                       goto nonew; /* No new data here */
                        }
                } else {
-                       /* reshape_progress is decreasing */
-                       if (__le64_to_cpu(bsb.arraystart) >=
-                           info->reshape_progress)
-                               goto nonew; /* No new data here */
-               }
-               } else {
-               if (info->delta_disks >= 0) {
-                       /* reshape_progress is increasing */
-                       if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) <
-                           info->reshape_progress &&
-                           __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) <
-                           info->reshape_progress)
-                               goto nonew; /* No new data here */
-               } else {
-                       /* reshape_progress is decreasing */
-                       if (__le64_to_cpu(bsb.arraystart) >=
-                           info->reshape_progress &&
-                           __le64_to_cpu(bsb.arraystart2) >=
-                           info->reshape_progress)
-                               goto nonew; /* No new data here */
-               }
+                       if (bsb.length == 0 && bsb.length2 == 0)
+                               continue;
+                       if (info->delta_disks >= 0) {
+                               /* reshape_progress is increasing */
+                               if ((__le64_to_cpu(bsb.arraystart)
+                                    + __le64_to_cpu(bsb.length)
+                                    < info->reshape_progress)
+                                   &&
+                                   (__le64_to_cpu(bsb.arraystart2)
+                                    + __le64_to_cpu(bsb.length2)
+                                    < info->reshape_progress))
+                                       goto nonew; /* No new data here */
+                       } else {
+                               /* reshape_progress is decreasing */
+                               if (__le64_to_cpu(bsb.arraystart) >=
+                                   info->reshape_progress &&
+                                   __le64_to_cpu(bsb.arraystart2) >=
+                                   info->reshape_progress)
+                                       goto nonew; /* No new data here */
+                       }
                }
                if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
                second_fail:
                        if (verbose)
-                               fprintf(stderr, Name ": Failed to verify secondary backup-metadata block on %s\n",
+                               fprintf(stderr, Name
+                    ": Failed to verify secondary backup-metadata block on %s\n",
                                        devname);
                        continue; /* Cannot seek */
                }
@@ -1881,7 +3265,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                /* FIXME should be this be an error */
                                continue;
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        st->ss->free_super(st);
                        offsets[j] = dinfo.data_offset * 512;
                }
@@ -1922,7 +3306,28 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 
                /* Ok, so the data is restored. Let's update those superblocks. */
 
-               if (info->delta_disks >= 0) {
+               lo = hi = 0;
+               if (bsb.length) {
+                       lo = __le64_to_cpu(bsb.arraystart);
+                       hi = lo + __le64_to_cpu(bsb.length);
+               }
+               if (bsb.magic[15] == '2' && bsb.length2) {
+                       unsigned long long lo1, hi1;
+                       lo1 = __le64_to_cpu(bsb.arraystart2);
+                       hi1 = lo1 + __le64_to_cpu(bsb.length2);
+                       if (lo == hi) {
+                               lo = lo1;
+                               hi = hi1;
+                       } else if (lo < lo1)
+                               hi = hi1;
+                       else
+                               lo = lo1;
+               }
+               if (lo < hi &&
+                   (info->reshape_progress < lo ||
+                    info->reshape_progress > hi))
+                       /* backup does not affect reshape_progress*/ ;
+               else if (info->delta_disks >= 0) {
                        info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
                                __le64_to_cpu(bsb.length);
                        if (bsb.magic[15] == '2') {
@@ -1943,7 +3348,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (fdlist[j] < 0) continue;
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                continue;
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        dinfo.reshape_progress = info->reshape_progress;
                        st->ss->update_super(st, &dinfo,
                                             "_reshape_progress",
@@ -1997,159 +3402,34 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
                  char *backup_file)
 {
-       /* Array is assembled and ready to be started, but
-        * monitoring is probably required.
-        * So:
-        *   - start read-only
-        *   - set upper bound for resync
-        *   - initialise the 'suspend' boundaries
-        *   - switch to read-write
-        *   - fork and continue monitoring
-        */
+       char buf[40];
+       char *container = NULL;
        int err;
-       int backup_list[1];
-       unsigned long long backup_offsets[1];
-       int odisks, ndisks, ochunk, nchunk,odata,ndata;
-       unsigned long a,b,blocks,stripes;
-       int backup_fd;
-       int *fds;
-       unsigned long long *offsets;
-       int d;
-       struct mdinfo *sra, *sd;
-       int rv;
-       unsigned long cache;
-       int done = 0;
 
        err = sysfs_set_str(info, NULL, "array_state", "readonly");
        if (err)
                return err;
+       if (st->ss->external) {
+               fmt_devname(buf, st->container_dev);
+               container = buf;
+               freeze(st);
 
-       /* make sure reshape doesn't progress until we are ready */
-       sysfs_set_str(info, NULL, "sync_max", "0");
-       sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
-
-       sra = sysfs_read(-1, devname2devnum(info->sys_name),
-                        GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
-                        GET_CACHE);
-       if (!sra)
-               return 1;
-
-       /* ndisks is not growing, so raid_disks is old and +delta is new */
-       odisks = info->array.raid_disks;
-       ndisks = odisks + info->delta_disks;
-       odata = odisks - 1;
-       ndata = ndisks - 1;
-       if (info->array.level == 6) {
-               odata--;
-               ndata--;
-       }
-       ochunk = info->array.chunk_size;
-       nchunk = info->new_chunk;
-
-       a = (ochunk/512) * odata;
-       b = (nchunk/512) * ndata;
-       /* Find GCD */
-       while (a != b) {
-               if (a < b)
-                       b -= a;
-               if (b < a)
-                       a -= b;
-       }
-       /* LCM == product / GCD */
-       blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
-
-       if (ndata == odata)
-               while (blocks * 32 < sra->component_size &&
-                      blocks < 16*1024*2)
-                       blocks *= 2;
-       stripes = blocks / (info->array.chunk_size/512) / odata;
-
-       /* check that the internal stripe cache is
-        * large enough, or it won't work.
-        */
-       cache = (nchunk < ochunk) ? ochunk : nchunk;
-       cache = cache * 4 / 4096;
-       if (cache < blocks / 8 / odisks + 16)
-               /* Make it big enough to hold 'blocks' */
-               cache = blocks / 8 / odisks + 16;
-       if (sra->cache_size < cache)
-               sysfs_set_num(sra, NULL, "stripe_cache_size",
-                             cache+1);
-
-       memset(&bsb, 0, 512);
-       memcpy(bsb.magic, "md_backup_data-1", 16);
-       memcpy(&bsb.set_uuid, info->uuid, 16);
-       bsb.mtime = __cpu_to_le64(time(0));
-       bsb.devstart2 = blocks;
+               if (!mdmon_running(st->container_dev))
+                       start_mdmon(st->container_dev);
+               ping_monitor(devnum2devname(st->container_dev));
 
-       backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
-       backup_list[0] = backup_fd;
-       backup_offsets[0] = 8 * 512;
-       fds = malloc(odisks * sizeof(fds[0]));
-       offsets = malloc(odisks * sizeof(offsets[0]));
-       for (d=0; d<odisks; d++)
-               fds[d] = -1;
 
-       for (sd = sra->devs; sd; sd = sd->next) {
-               if (sd->disk.state & (1<<MD_DISK_FAULTY))
-                       continue;
-               if (sd->disk.state & (1<<MD_DISK_SYNC)) {
-                       char *dn = map_dev(sd->disk.major,
-                                          sd->disk.minor, 1);
-                       fds[sd->disk.raid_disk]
-                               = dev_open(dn, O_RDONLY);
-                       offsets[sd->disk.raid_disk] = sd->data_offset*512;
-                       if (fds[sd->disk.raid_disk] < 0) {
-                               fprintf(stderr, Name ": %s: cannot open component %s\n",
-                                       info->sys_name, dn?dn:"-unknown-");
-                               rv = 1;
-                               goto release;
-                       }
-                       free(dn);
+               if (info->reshape_active == 2) {
+                       int cfd = open_dev(st->container_dev);
+                       if (cfd < 0)
+                               return 1;
+                       st->ss->load_container(st, cfd, container);
+                       close(cfd);
+                       return reshape_container(container, NULL,
+                                                st, info, 0, backup_file,
+                                                0, 1);
                }
        }
-
-       switch(fork()) {
-       case 0:
-               close(mdfd);
-               mlockall(MCL_FUTURE);
-               if (info->delta_disks < 0)
-                       done = child_shrink(-1, info, stripes,
-                                           fds, offsets,
-                                           info->array.raid_disks,
-                                           info->array.chunk_size,
-                                           info->array.level, info->array.layout,
-                                           odata,
-                                           1, backup_list, backup_offsets);
-               else if (info->delta_disks == 0) {
-                       /* The 'start' is a per-device stripe number.
-                        * reshape_progress is a per-array sector number.
-                        * So divide by ndata * chunk_size
-                        */
-                       unsigned long long start = info->reshape_progress / ndata;
-                       start /= (info->array.chunk_size/512);
-                       done = child_same_size(-1, info, stripes,
-                                              fds, offsets,
-                                              start,
-                                              info->array.raid_disks,
-                                              info->array.chunk_size,
-                                              info->array.level, info->array.layout,
-                                              odata,
-                                              1, backup_list, backup_offsets);
-               }
-               if (backup_file && done)
-                       unlink(backup_file);
-               /* FIXME should I intuit a level change */
-               exit(0);
-       case -1:
-               fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
-                       strerror(errno));
-               return 1;
-       default:
-               break;
-       }
-release:
-       return 0;
+       return reshape_array(container, mdfd, "array", st, info, 1,
+                            NULL, backup_file, 0, 0, 1);
 }
-
-
index 4d3d181b10fbac9cce3e0dc119120925830fc5d4..300bdca7a35ff4f800cadf03277ef2715f675306 100644 (file)
  */
 
 #include       "mdadm.h"
+#include       <dirent.h>
+#include       <ctype.h>
 
-static int count_active(struct supertype *st, int mdfd, char **availp,
+static int count_active(struct supertype *st, struct mdinfo *sra,
+                       int mdfd, char **availp,
                        struct mdinfo *info);
 static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        int number, __u64 events, int verbose,
                        char *array_name);
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                    struct map_ent *target,
+                    struct supertype *st, int verbose);
+
+static int Incremental_container(struct supertype *st, char *devname,
+                                char *homehost,
+                                int verbose, int runstop, int autof);
+
+static struct mddev_ident *search_mdstat(struct supertype *st,
+                                          struct mdinfo *info,
+                                          char *devname,
+                                          int verbose, int *rvp);
 
 int Incremental(char *devname, int verbose, int runstop,
                struct supertype *st, char *homehost, int require_homehost,
@@ -78,20 +93,59 @@ int Incremental(char *devname, int verbose, int runstop,
         *   start the array (auto-readonly).
         */
        struct stat stb;
-       struct mdinfo info;
-       struct mddev_ident_s *array_list, *match;
+       struct mdinfo info, dinfo;
+       struct mdinfo *sra = NULL, *d;
+       struct mddev_ident *match;
        char chosen_name[1024];
-       int rv;
+       int rv = 1;
        struct map_ent *mp, *map = NULL;
-       int dfd, mdfd;
-       char *avail;
+       int dfd = -1, mdfd = -1;
+       char *avail = NULL;
        int active_disks;
-       int trustworthy = FOREIGN;
+       int trustworthy;
        char *name_to_use;
        mdu_array_info_t ainf;
+       struct dev_policy *policy = NULL;
+       struct map_ent target_array;
+       int have_target;
 
        struct createinfo *ci = conf_get_create_info();
 
+       if (stat(devname, &stb) < 0) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": stat failed for %s: %s.\n",
+                               devname, strerror(errno));
+               return rv;
+       }
+       if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": %s is not a block device.\n",
+                               devname);
+               return rv;
+       }
+       dfd = dev_open(devname, O_RDONLY|O_EXCL);
+       if (dfd < 0) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": cannot open %s: %s.\n",
+                               devname, strerror(errno));
+               return rv;
+       }
+       /* If the device is a container, we do something very different */
+       if (must_be_container(dfd)) {
+               if (!st)
+                       st = super_by_fd(dfd, NULL);
+               if (st && st->ss->load_container)
+                       rv = st->ss->load_container(st, dfd, NULL);
+
+               close(dfd);
+               if (!rv && st->ss->container_content)
+                       return Incremental_container(st, devname, homehost,
+                                                    verbose, runstop, autof);
+
+               fprintf(stderr, Name ": %s is not part of an md array.\n",
+                       devname);
+               return rv;
+       }
 
        /* 1/ Check if device is permitted by mdadm.conf */
 
@@ -100,117 +154,61 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                                ": %s not permitted by mdadm.conf.\n",
                                devname);
-               return 1;
+               goto out;
        }
 
        /* 2/ Find metadata, reject if none appropriate (check
         *            version/name from args) */
 
-       dfd = dev_open(devname, O_RDONLY|O_EXCL);
-       if (dfd < 0) {
-               if (verbose >= 0)
-                       fprintf(stderr, Name ": cannot open %s: %s.\n",
-                               devname, strerror(errno));
-               return 1;
-       }
        if (fstat(dfd, &stb) < 0) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": fstat failed for %s: %s.\n",
                                devname, strerror(errno));
-               close(dfd);
-               return 1;
+               goto out;
        }
        if ((stb.st_mode & S_IFMT) != S_IFBLK) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": %s is not a block device.\n",
                                devname);
-               close(dfd);
-               return 1;
+               goto out;
        }
 
+       dinfo.disk.major = major(stb.st_rdev);
+       dinfo.disk.minor = minor(stb.st_rdev);
+
+       policy = disk_policy(&dinfo);
+       have_target = policy_check_path(&dinfo, &target_array);
+
        if (st == NULL && (st = guess_super(dfd)) == NULL) {
                if (verbose >= 0)
                        fprintf(stderr, Name
                                ": no recognisable superblock on %s.\n",
                                devname);
-               close(dfd);
-               return 1;
+               rv = try_spare(devname, &dfd, policy,
+                              have_target ? &target_array : NULL,
+                              st, verbose);
+               goto out;
        }
-       if (st->ss->load_super(st, dfd, NULL)) {
+       if (st->ss->compare_super == NULL ||
+           st->ss->load_super(st, dfd, NULL)) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": no RAID superblock on %s.\n",
                                devname);
-               close(dfd);
-               return 1;
+               rv = try_spare(devname, &dfd, policy,
+                              have_target ? &target_array : NULL,
+                              st, verbose);
+               free(st);
+               goto out;
        }
-       close (dfd);
+       close (dfd); dfd = -1;
 
        memset(&info, 0, sizeof(info));
-       st->ss->getinfo_super(st, &info);
-       /* 3/ Check if there is a match in mdadm.conf */
+       st->ss->getinfo_super(st, &info, NULL);
 
-       array_list = conf_get_ident(NULL);
-       match = NULL;
-       for (; array_list; array_list = array_list->next) {
-               if (array_list->uuid_set &&
-                   same_uuid(array_list->uuid, info.uuid, st->ss->swapuuid)
-                   == 0) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": UUID differs from %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->name[0] &&
-                   strcasecmp(array_list->name, info.name) != 0) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Name differs from %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->devices &&
-                   !match_oneof(array_list->devices, devname)) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Not a listed device for %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->super_minor != UnSet &&
-                   array_list->super_minor != info.array.md_minor) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Different super-minor to %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (!array_list->uuid_set &&
-                   !array_list->name[0] &&
-                   !array_list->devices &&
-                   array_list->super_minor == UnSet) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                            ": %s doesn't have any identifying information.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               /* FIXME, should I check raid_disks and level too?? */
-
-               if (match) {
-                       if (verbose >= 0) {
-                               if (match->devname && array_list->devname)
-                                       fprintf(stderr, Name
-                  ": we match both %s and %s - cannot decide which to use.\n",
-                                               match->devname, array_list->devname);
-                               else
-                                       fprintf(stderr, Name
-                                               ": multiple lines in mdadm.conf match\n");
-                       }
-                       return 2;
-               }
-               match = array_list;
-       }
+       /* 3/ Check if there is a match in mdadm.conf */
+       match = search_mdstat(st, &info, devname, verbose, &rv);
+       if (!match && rv == 2)
+               goto out;
 
        if (match && match->devname
            && strcasecmp(match->devname, "<ignore>") == 0) {
@@ -218,7 +216,7 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name ": array containing %s is explicitly"
                                " ignored by mdadm.conf\n",
                                devname);
-               return 1;
+               goto out;
        }
 
        /* 3a/ if not, check for homehost match.  If no match, continue
@@ -235,14 +233,14 @@ int Incremental(char *devname, int verbose, int runstop,
                trustworthy = FOREIGN;
 
 
-       if (!match && !conf_test_metadata(st->ss->name,
+       if (!match && !conf_test_metadata(st->ss->name, policy,
                                          (trustworthy == LOCAL))) {
                if (verbose >= 1)
                        fprintf(stderr, Name
                                ": %s has metadata type %s for which "
                                "auto-assembly is disabled\n",
                                devname, st->ss->name);
-               return 1;
+               goto out;
        }
        if (trustworthy == LOCAL_ANY)
                trustworthy = LOCAL;
@@ -257,23 +255,6 @@ int Incremental(char *devname, int verbose, int runstop,
        if (autof == 0)
                autof = ci->autof;
 
-       if (st->ss->container_content && st->loaded_container) {
-               if ((runstop > 0 && info.container_enough >= 0) ||
-                   info.container_enough > 0)
-                       /* pass */;
-               else {
-                       if (verbose)
-                               fprintf(stderr, Name ": not enough devices to start the container\n");
-                       return 0;
-               }
-
-               /* This is a pre-built container array, so we do something
-                * rather different.
-                */
-               return Incremental_container(st, devname, verbose, runstop,
-                                            autof, trustworthy);
-       }
-
        name_to_use = info.name;
        if (name_to_use[0] == 0 &&
            info.array.level == LEVEL_CONTAINER &&
@@ -304,23 +285,21 @@ int Incremental(char *devname, int verbose, int runstop,
                mdfd = -1;
 
        if (mdfd < 0) {
-               struct mdinfo *sra;
-               struct mdinfo dinfo;
 
                /* Couldn't find an existing array, maybe make a new one */
                mdfd = create_mddev(match ? match->devname : NULL,
                                    name_to_use, autof, trustworthy, chosen_name);
 
                if (mdfd < 0)
-                       return 1;
+                       goto out;
 
                sysfs_init(&info, mdfd, 0);
 
                if (set_array_info(mdfd, st, &info) != 0) {
                        fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                chosen_name, strerror(errno));
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
 
                dinfo = info;
@@ -330,10 +309,12 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, 0);
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), GET_DEVS);
+               sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                           GET_OFFSET | GET_SIZE));
+       
                if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
                        /* It really should be 'none' - must be old buggy
                         * kernel, and mdadm -I may not be able to complete.
@@ -343,12 +324,11 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                      ": You have an old buggy kernel which cannot support\n"
                                "      --incremental reliably.  Aborting.\n");
-                       close(mdfd);
                        sysfs_free(sra);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
                info.array.working_disks = 1;
-               sysfs_free(sra);
                /* 6/ Make sure /var/run/mdadm.map contains this array. */
                map_update(&map, fd2devnum(mdfd),
                           info.text_version,
@@ -361,10 +341,12 @@ int Incremental(char *devname, int verbose, int runstop,
                char dn[20];
                int dfd2;
                int err;
-               struct mdinfo *sra;
                struct supertype *st2;
                struct mdinfo info2, *d;
 
+               sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                           GET_OFFSET | GET_SIZE));
+       
                if (mp->path)
                        strcpy(chosen_name, mp->path);
                else
@@ -373,33 +355,31 @@ int Incremental(char *devname, int verbose, int runstop,
                /* It is generally not OK to add non-spare drives to a
                 * running array as they are probably missing because
                 * they failed.  However if runstop is 1, then the
-                * array was possibly started early and our best be is
-                * to add this anyway.  It would probably be good to
-                * allow explicit policy statement about this.
+                * array was possibly started early and our best bet is
+                * to add this anyway.
+                * Also if action policy is re-add or better we allow
+                * re-add.
+                * This doesn't apply to containers as the 'non-spare'
+                * flag has a different meaning.  The test has to happen
+                * at the device level there
                 */
-               if ((info.disk.state & (1<<MD_DISK_SYNC)) != 0
+               if (!st->ss->external
+                   && (info.disk.state & (1<<MD_DISK_SYNC)) != 0
+                   && ! policy_action_allows(policy, st->ss->name,
+                                             act_re_add)
                    && runstop < 1) {
-                       int active = 0;
-                       
-                       if (st->ss->external) {
-                               char *devname = devnum2devname(fd2devnum(mdfd));
-
-                               active = devname && is_container_active(devname);
-                               free(devname);
-                       } else if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0)
-                               active = 1;
-                       if (active) {
+                       if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
                                fprintf(stderr, Name
                                        ": not adding %s to active array (without --run) %s\n",
                                        devname, chosen_name);
-                               close(mdfd);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), (GET_DEVS | GET_STATE));
-               if (!sra)
-                       return 2;
-
+               if (!sra) {
+                       rv = 2;
+                       goto out;
+               }
                if (sra->devs) {
                        sprintf(dn, "%d:%d", sra->devs->disk.major,
                                sra->devs->disk.minor);
@@ -411,13 +391,13 @@ int Incremental(char *devname, int verbose, int runstop,
                                        ": metadata mismatch between %s and "
                                        "chosen array %s\n",
                                        devname, chosen_name);
-                               close(mdfd);
                                close(dfd2);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                        close(dfd2);
                        memset(&info2, 0, sizeof(info2));
-                       st2->ss->getinfo_super(st2, &info2);
+                       st2->ss->getinfo_super(st2, &info2, NULL);
                        st2->ss->free_super(st2);
                        if (info.array.level != info2.array.level ||
                            memcmp(info.uuid, info2.uuid, 16) != 0 ||
@@ -425,8 +405,8 @@ int Incremental(char *devname, int verbose, int runstop,
                                fprintf(stderr, Name
                                        ": unexpected difference between %s and %s.\n",
                                        chosen_name, devname);
-                               close(mdfd);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                }
                info2.disk.major = major(stb.st_rdev);
@@ -446,8 +426,8 @@ int Incremental(char *devname, int verbose, int runstop,
                if (err < 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
                info.array.working_disks = 0;
                for (d = sra->devs; d; d=d->next)
@@ -458,6 +438,7 @@ int Incremental(char *devname, int verbose, int runstop,
        /* 7/ Is there enough devices to possibly start the array? */
        /* 7a/ if not, finish with success. */
        if (info.array.level == LEVEL_CONTAINER) {
+               char *devname = NULL;
                /* Try to assemble within the container */
                map_unlock(&map);
                sysfs_uevent(&info, "change");
@@ -466,7 +447,10 @@ int Incremental(char *devname, int verbose, int runstop,
                                ": container %s now has %d devices\n",
                                chosen_name, info.array.working_disks);
                wait_for(chosen_name, mdfd);
+               if (st->ss->external)
+                       devname = devnum2devname(fd2devnum(mdfd));
                close(mdfd);
+               sysfs_free(sra);
                rv = Incremental(chosen_name, verbose, runstop,
                                 NULL, homehost, require_homehost, autof);
                if (rv == 1)
@@ -474,23 +458,34 @@ int Incremental(char *devname, int verbose, int runstop,
                         * have enough devices to start yet
                         */
                        rv = 0;
+               /* after spare is added, ping monitor for external metadata
+                * so that it can eg. try to rebuild degraded array */
+               if (st->ss->external) {
+                       ping_monitor(devname);
+                       free(devname);
+               }
                return rv;
        }
-       avail = NULL;
-       active_disks = count_active(st, mdfd, &avail, &info);
+
+       /* We have added something to the array, so need to re-read the
+        * state.  Eventually this state should be kept up-to-date as
+        * things change.
+        */
+       sysfs_free(sra);
+       sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                   GET_OFFSET | GET_SIZE));
+       active_disks = count_active(st, sra, mdfd, &avail, &info);
        if (enough(info.array.level, info.array.raid_disks,
                   info.array.layout, info.array.state & 1,
                   avail, active_disks) == 0) {
-               free(avail);
                if (verbose >= 0)
                        fprintf(stderr, Name
                             ": %s attached to %s, not enough to start (%d).\n",
                                devname, chosen_name, active_disks);
                map_unlock(&map);
-               close(mdfd);
-               return 0;
+               rv = 0;
+               goto out;
        }
-       free(avail);
 
        /* 7b/ if yes, */
        /* - if number of OK devices match expected, or -R and there */
@@ -503,14 +498,14 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                           ": %s attached to %s which is already active.\n",
                                devname, chosen_name);
-               close(mdfd);
                map_unlock(&map);
-               return 0;
+               rv = 0;
+               goto out;
        }
 
        map_unlock(&map);
        if (runstop > 0 || active_disks >= info.array.working_disks) {
-               struct mdinfo *sra;
+               struct mdinfo *dsk;
                /* Let's try to start it */
                if (match && match->bitmap_file) {
                        int bmfd = open(match->bitmap_file, O_RDWR);
@@ -518,20 +513,24 @@ int Incremental(char *devname, int verbose, int runstop,
                                fprintf(stderr, Name
                                        ": Could not open bitmap file %s.\n",
                                        match->bitmap_file);
-                               close(mdfd);
-                               return 1;
+                               goto out;
                        }
                        if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
                                close(bmfd);
                                fprintf(stderr, Name
                                        ": Failed to set bitmapfile for %s.\n",
                                        chosen_name);
-                               close(mdfd);
-                               return 1;
+                               goto out;
                        }
                        close(bmfd);
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), 0);
+               /* Need to remove from the array any devices which
+                * 'count_active' discerned were too old or inappropriate
+                */
+               for (d = sra ? sra->devs : NULL ; d ; d = d->next)
+                       if (d->disk.state & (1<<MD_DISK_REMOVED))
+                               remove_disk(mdfd, st, sra, d);
+
                if ((sra == NULL || active_disks >= info.array.working_disks)
                    && trustworthy != FOREIGN)
                        rv = ioctl(mdfd, RUN_ARRAY, NULL);
@@ -541,10 +540,23 @@ int Incremental(char *devname, int verbose, int runstop,
                if (rv == 0) {
                        if (verbose >= 0)
                                fprintf(stderr, Name
-                          ": %s attached to %s, which has been started.\n",
+                                       ": %s attached to %s, which has been started.\n",
                                        devname, chosen_name);
                        rv = 0;
                        wait_for(chosen_name, mdfd);
+                       /* We just started the array, so some devices
+                        * might have been evicted from the array
+                        * because their event counts were too old.
+                        * If the action=re-add policy is in-force for
+                        * those devices we should re-add them now.
+                        */
+                       for (dsk = sra->devs; dsk ; dsk = dsk->next) {
+                               if (disk_action_allows(dsk, st->ss->name, act_re_add) &&
+                                   add_disk(mdfd, st, sra, dsk) == 0)
+                                       fprintf(stderr, Name
+                                               ": %s re-added to %s\n",
+                                               dsk->sys_name, chosen_name);
+                       }
                } else {
                        fprintf(stderr, Name
                              ": %s attached to %s, but failed to start: %s.\n",
@@ -558,10 +570,92 @@ int Incremental(char *devname, int verbose, int runstop,
                                devname, chosen_name);
                rv = 0;
        }
-       close(mdfd);
+out:
+       free(avail);
+       if (dfd >= 0)
+               close(dfd);
+       if (mdfd >= 0)
+               close(mdfd);
+       if (policy)
+               dev_policy_free(policy);
+       if (sra)
+               sysfs_free(sra);
        return rv;
 }
 
+static struct mddev_ident *search_mdstat(struct supertype *st,
+                                          struct mdinfo *info,
+                                          char *devname,
+                                          int verbose, int *rvp)
+{
+       struct mddev_ident *array_list, *match;
+       array_list = conf_get_ident(NULL);
+       match = NULL;
+       for (; array_list; array_list = array_list->next) {
+               if (array_list->uuid_set &&
+                   same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid)
+                   == 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": UUID differs from %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (array_list->name[0] &&
+                   strcasecmp(array_list->name, info->name) != 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Name differs from %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (array_list->devices &&
+                   !match_oneof(array_list->devices, devname)) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Not a listed device for %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (array_list->super_minor != UnSet &&
+                   array_list->super_minor != info->array.md_minor) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Different super-minor to %s.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               if (!array_list->uuid_set &&
+                   !array_list->name[0] &&
+                   !array_list->devices &&
+                   array_list->super_minor == UnSet) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": %s doesn't have any identifying information.\n",
+                                       array_list->devname);
+                       continue;
+               }
+               /* FIXME, should I check raid_disks and level too?? */
+
+               if (match) {
+                       if (verbose >= 0) {
+                               if (match->devname && array_list->devname)
+                                       fprintf(stderr, Name
+                                               ": we match both %s and %s - cannot decide which to use.\n",
+                                               match->devname, array_list->devname);
+                               else
+                                       fprintf(stderr, Name
+                                               ": multiple lines in mdadm.conf match\n");
+                       }
+                       *rvp = 2;
+                       match = NULL;
+                       break;
+               }
+               match = array_list;
+       }
+       return match;
+}
+
 static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        int number, __u64 events, int verbose,
                        char *array_name)
@@ -588,7 +682,7 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        close(dfd);
                        continue;
                }
-               st->ss->getinfo_super(st, &info);
+               st->ss->getinfo_super(st, &info, NULL);
                st->ss->free_super(st);
                close(dfd);
 
@@ -606,20 +700,28 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
        }
 }
 
-static int count_active(struct supertype *st, int mdfd, char **availp,
+static int count_active(struct supertype *st, struct mdinfo *sra,
+                       int mdfd, char **availp,
                        struct mdinfo *bestinfo)
 {
        /* count how many devices in sra think they are active */
        struct mdinfo *d;
-       int cnt = 0, cnt1 = 0;
+       int cnt = 0;
        __u64 max_events = 0;
-       struct mdinfo *sra = sysfs_read(mdfd, -1, GET_DEVS | GET_STATE);
        char *avail = NULL;
+       int *best;
+       char *devmap = NULL;
+       int numdevs = 0;
+       int devnum;
+       int b, i;
+       int raid_disks = 0;
 
        if (!sra)
                return 0;
 
-       for (d = sra->devs ; d ; d = d->next) {
+       for (d = sra->devs ; d ; d = d->next)
+               numdevs++;
+       for (d = sra->devs, devnum=0 ; d ; d = d->next, devnum++) {
                char dn[30];
                int dfd;
                int ok;
@@ -633,15 +735,21 @@ static int count_active(struct supertype *st, int mdfd, char **availp,
                close(dfd);
                if (ok != 0)
                        continue;
-               st->ss->getinfo_super(st, &info);
+               info.array.raid_disks = raid_disks;
+               st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum);
                if (!avail) {
-                       avail = malloc(info.array.raid_disks);
+                       raid_disks = info.array.raid_disks;
+                       avail = calloc(raid_disks, 1);
                        if (!avail) {
                                fprintf(stderr, Name ": out of memory.\n");
                                exit(1);
                        }
-                       memset(avail, 0, info.array.raid_disks);
                        *availp = avail;
+
+                       best = calloc(raid_disks, sizeof(int));
+                       devmap = calloc(raid_disks * numdevs, 1);
+
+                       st->ss->getinfo_super(st, &info, devmap);
                }
 
                if (info.disk.state & (1<<MD_DISK_SYNC))
@@ -650,35 +758,559 @@ static int count_active(struct supertype *st, int mdfd, char **availp,
                                cnt++;
                                max_events = info.events;
                                avail[info.disk.raid_disk] = 2;
-                               st->ss->getinfo_super(st, bestinfo);
+                               best[info.disk.raid_disk] = devnum;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        } else if (info.events == max_events) {
-                               cnt++;
                                avail[info.disk.raid_disk] = 2;
+                               best[info.disk.raid_disk] = devnum;
                        } else if (info.events == max_events-1) {
-                               cnt1++;
-                               avail[info.disk.raid_disk] = 1;
+                               if (avail[info.disk.raid_disk] == 0) {
+                                       avail[info.disk.raid_disk] = 1;
+                                       best[info.disk.raid_disk] = devnum;
+                               }
                        } else if (info.events < max_events - 1)
                                ;
                        else if (info.events == max_events+1) {
                                int i;
-                               cnt1 = cnt;
-                               cnt = 1;
                                max_events = info.events;
-                               for (i=0; i<info.array.raid_disks; i++)
+                               for (i=0; i < raid_disks; i++)
                                        if (avail[i])
                                                avail[i]--;
                                avail[info.disk.raid_disk] = 2;
-                               st->ss->getinfo_super(st, bestinfo);
+                               best[info.disk.raid_disk] = devnum;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        } else { /* info.events much bigger */
-                               cnt = 1; cnt1 = 0;
                                memset(avail, 0, info.disk.raid_disk);
                                max_events = info.events;
-                               st->ss->getinfo_super(st, bestinfo);
+                               avail[info.disk.raid_disk] = 2;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        }
                }
                st->ss->free_super(st);
        }
-       return cnt + cnt1;
+       if (!avail)
+               return 0;
+       /* We need to reject any device that thinks the best device is
+        * failed or missing */
+       for (b = 0; b < raid_disks; b++)
+               if (avail[b] == 2)
+                       break;
+       cnt = 0;
+       for (i = 0 ; i < raid_disks ; i++) {
+               if (i != b && avail[i])
+                       if (devmap[raid_disks * best[i] + b] == 0) {
+                               /* This device thinks 'b' is failed -
+                                * don't use it */
+                               devnum = best[i];
+                               for (d=sra->devs ; devnum; d = d->next)
+                                       devnum--;
+                               d->disk.state |= (1 << MD_DISK_REMOVED);
+                               avail[i] = 0;
+                       }
+               if (avail[i])
+                       cnt++;
+       }
+       free(best);
+       free(devmap);
+       return cnt;
+}
+
+/* Report the worst degradation found among the members of a container.
+ *
+ * Every entry of 'map' that is a subarray, and whose metadata string
+ * (after its first character) resolves through devname2devnum() to
+ * me->devnum, is opened and queried with GET_ARRAY_INFO.  The largest
+ * value of raid_disks - active_disks - spare_disks is returned.
+ * Members that cannot be opened or queried are ignored; 0 is returned
+ * when no member yields a positive value.
+ */
+static int container_members_max_degradation(struct map_ent *map, struct map_ent *me)
+{
+       struct map_ent *ent;
+       int worst = 0;
+
+       for (ent = map; ent; ent = ent->next) {
+               mdu_array_info_t ainfo;
+               int fd;
+
+               /* only subarrays belonging to 'me' are of interest */
+               if (!is_subarray(ent->metadata))
+                       continue;
+               if (devname2devnum(ent->metadata+1) != me->devnum)
+                       continue;
+
+               fd = open_dev(ent->devnum);
+               if (fd < 0)
+                       continue;
+               /* most accurate information regarding array degradation */
+               if (ioctl(fd, GET_ARRAY_INFO, &ainfo) >= 0) {
+                       int missing = ainfo.raid_disks - ainfo.active_disks;
+
+                       missing -= ainfo.spare_disks;
+                       if (worst < missing)
+                               worst = missing;
+               }
+               close(fd);
+       }
+       return worst;
+}
+
+/* Try to add a device that carries no md metadata to an existing array
+ * or container as a spare.
+ *
+ * devname - name of the candidate device, only used in messages
+ * dfdp    - pointer to an open fd on the device; once an array has been
+ *           chosen and opened, the fd is closed and *dfdp is set to -1
+ * pol     - device policy of the candidate
+ * target  - preferred array, may be NULL (see "test against target" below)
+ * bare    - non-zero when the caller found the device to be blank
+ * st      - if set, only arrays with this metadata type are considered
+ * verbose - above 1 explains rejections, above 0 reports the outcome
+ *
+ * Return 0 on success, or some exit code on failure, probably 1.
+ * The mapfile lock is held while arrays are examined and the device is
+ * added, and is released again before returning.
+ */
+static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                          struct map_ent *target, int bare,
+                          struct supertype *st, int verbose)
+{
+       /* This device doesn't have any md metadata
+        * The device policy allows 'spare' and if !bare, it allows spare-same-slot.
+        * If 'st' is not set, then we only know that some metadata allows this,
+        * others possibly don't.
+        * So look for a container or array to attach the device to.
+        * Prefer 'target' if that is set and the array is found.
+        *
+        * If st is set, then only arrays of that type are considered
+        */
+       int rv = 1;
+       struct stat stb;
+       struct map_ent *mp, *map = NULL;
+       struct mdinfo *chosen = NULL;
+       int dfd = *dfdp;
+
+       if (fstat(dfd, &stb) != 0)
+               return 1;
+
+       /*
+        * Now we need to find a suitable array to add this to.
+        * We only accept arrays that:
+        *  - match 'st'
+        *  - are in the same domains as the device
+        *  - are of a size for which the device will be useful
+        * and we choose the one that is the most degraded
+        */
+
+       if (map_lock(&map)) {
+               fprintf(stderr, Name ": failed to get exclusive lock on "
+                       "mapfile\n");
+               return 1;
+       }
+       for (mp = map ; mp ; mp = mp->next) {
+               struct supertype *st2;
+               struct domainlist *dl = NULL;
+               struct mdinfo *sra;
+               unsigned long long devsize = 0;
+               unsigned long long component_size = 0;
+
+               if (is_subarray(mp->metadata))
+                       continue;
+               if (st) {
+                       /* st2 is only needed here to compare metadata versions */
+                       st2 = st->ss->match_metadata_desc(mp->metadata);
+                       if (!st2 ||
+                           (st->minor_version >= 0 &&
+                            st->minor_version != st2->minor_version)) {
+                               if (verbose > 1)
+                                       fprintf(stderr, Name ": not adding %s to %s as metadata type doesn't match\n",
+                                               devname, mp->path);
+                               free(st2);
+                               continue;
+                       }
+                       free(st2);
+               }
+               sra = sysfs_read(-1, mp->devnum,
+                                GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+                                GET_DEGRADED|GET_COMPONENT|GET_VERSION);
+               if (!sra) {
+                       /* Probably a container - no degraded info */
+                       sra = sysfs_read(-1, mp->devnum,
+                                        GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+                                        GET_COMPONENT|GET_VERSION);
+                       if (sra)
+                               sra->array.failed_disks = -1;
+               }
+               if (!sra)
+                       continue;
+               if (st == NULL) {
+                       int i;
+                       st2 = NULL;
+                       for(i=0; !st2 && superlist[i]; i++)
+                               st2 = superlist[i]->match_metadata_desc(
+                                       sra->text_version);
+                       if (!st2) {
+                               if (verbose > 1)
+                                       fprintf(stderr, Name ": not adding %s to %s"
+                                               " as metadata not recognised.\n",
+                                               devname, mp->path);
+                               goto next;
+                       }
+                       /* Need to double check the 'act_spare' permissions applies
+                        * to this metadata.
+                        */
+                       if (!policy_action_allows(pol, st2->ss->name, act_spare))
+                               goto next;
+                       if (!bare && !policy_action_allows(pol, st2->ss->name,
+                                                          act_spare_same_slot))
+                               goto next;
+               } else
+                       st2 = st;
+               /* update number of failed disks for mostly degraded
+                * container member */
+               if (sra->array.failed_disks == -1)
+                       sra->array.failed_disks = container_members_max_degradation(map, mp);
+
+               /* Without a known size the device cannot be judged big
+                * enough, so do not compare an uninitialised value.
+                */
+               if (!get_dev_size(dfd, NULL, &devsize))
+                       goto next;
+               if (sra->component_size == 0) {
+                       /* true for containers, here we must read superblock
+                        * to obtain minimum spare size */
+                       struct supertype *st3 = dup_super(st2);
+                       int mdfd;
+
+                       if (!st3)
+                               goto next;
+                       mdfd = open_dev(mp->devnum);
+                       if (mdfd < 0) {
+                               /* open_dev() signals failure with a negative
+                                * value; release the duplicate before moving on
+                                */
+                               free(st3);
+                               goto next;
+                       }
+                       if (st3->ss->load_container &&
+                           !st3->ss->load_container(st3, mdfd, mp->path)) {
+                               component_size = st3->ss->min_acceptable_spare_size(st3);
+                               st3->ss->free_super(st3);
+                       }
+                       free(st3);
+                       close(mdfd);
+               }
+               if ((sra->component_size > 0 &&
+                    st2->ss->avail_size(st2, devsize) < sra->component_size)
+                   ||
+                   (sra->component_size == 0 && devsize < component_size)) {
+                       if (verbose > 1)
+                               fprintf(stderr, Name ": not adding %s to %s as it is too small\n",
+                                       devname, mp->path);
+                       goto next;
+               }
+               /* test against target.
+                * If 'target' is set and 'bare' is false, we only accept
+                * arrays/containers that match 'target'.
+                * If 'target' is set and 'bare' is true, we prefer the
+                * array which matches 'target'.
+                * target is considered only if we deal with degraded array
+                */
+               if (target && policy_action_allows(pol, st2->ss->name,
+                                                  act_spare_same_slot)) {
+                       if (strcmp(target->metadata, mp->metadata) == 0 &&
+                           memcmp(target->uuid, mp->uuid,
+                                  sizeof(target->uuid)) == 0 &&
+                           sra->array.failed_disks > 0) {
+                               /* This is our target!! */
+                               if (chosen)
+                                       sysfs_free(chosen);
+                               chosen = sra;
+                               sra = NULL;
+                               /* skip to end so we don't check any more */
+                               while (mp->next)
+                                       mp = mp->next;
+                               goto next;
+                       }
+                       /* not our target */
+                       if (!bare)
+                               goto next;
+               }
+
+               dl = domain_from_array(sra, st2->ss->name);
+               if (domain_test(dl, pol, st2->ss->name) != 1) {
+                       /* domain test fails */
+                       if (verbose > 1)
+                               fprintf(stderr, Name ": not adding %s to %s as"
+                                       " it is not in a compatible domain\n",
+                                       devname, mp->path);
+
+                       goto next;
+               }
+               /* all tests passed, OK to add to this array */
+               if (!chosen) {
+                       chosen = sra;
+                       sra = NULL;
+               } else if (chosen->array.failed_disks < sra->array.failed_disks) {
+                       sysfs_free(chosen);
+                       chosen = sra;
+                       sra = NULL;
+               }
+       next:
+               /* sra is NULL here when ownership moved to 'chosen' */
+               if (sra)
+                       sysfs_free(sra);
+               if (st != st2)
+                       free(st2);
+               if (dl)
+                       domain_free(dl);
+       }
+       if (chosen) {
+               /* add current device to chosen array as a spare */
+               int mdfd = open_dev(devname2devnum(chosen->sys_name));
+               if (mdfd >= 0) {
+                       struct mddev_dev devlist;
+                       /* "major:minor" of the new device; kept distinct from
+                        * the 'devname' parameter used in the messages below
+                        */
+                       char dev_id[20];
+                       devlist.next = NULL;
+                       devlist.used = 0;
+                       devlist.re_add = 0;
+                       devlist.writemostly = 0;
+                       devlist.devname = dev_id;
+                       snprintf(dev_id, sizeof(dev_id), "%d:%d",
+                                major(stb.st_rdev), minor(stb.st_rdev));
+                       devlist.disposition = 'a';
+                       close(dfd);
+                       *dfdp = -1;
+                       rv =  Manage_subdevs(chosen->sys_name, mdfd, &devlist,
+                                            -1, 0, NULL);
+                       close(mdfd);
+               }
+               if (verbose > 0) {
+                       if (rv == 0)
+                               fprintf(stderr, Name ": added %s as spare for %s\n",
+                                       devname, chosen->sys_name);
+                       else
+                               fprintf(stderr, Name ": failed to add %s as spare for %s\n",
+                                       devname, chosen->sys_name);
+               }
+               sysfs_free(chosen);
+       }
+       /* pair the successful map_lock() above */
+       map_unlock(&map);
+       return rv;
+}
+
+/* Give a new device the partition metadata of a compatible existing disk.
+ *
+ * devname - path of the device that is to receive the metadata; it is
+ *           re-opened read/write here
+ * dfdp    - not used by this function
+ * pol     - device policy of the new device
+ * st      - if set, the partition metadata type to use; otherwise the
+ *           type is guessed for each candidate disk
+ * verbose - not used by this function
+ *
+ * Returns 1 if /dev/disk/by-path cannot be read or no suitable disk is
+ * found, 0 once a disk has been chosen.
+ * NOTE(review): 0 is also returned when devname cannot be opened for
+ * writing, and the result of store_super() is not checked - confirm
+ * that this best-effort behaviour is intended.
+ */
+static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                              struct supertype *st, int verbose)
+{
+       /* we know that at least one partition virtual-metadata is
+        * allowed to incorporate spares like this device.  We need to
+        * find a suitable device to copy partition information from.
+        *
+        * Getting a list of all disk (not partition) devices is
+        * slightly non-trivial.  We could look at /sys/block, but
+        * that is theoretically due to be removed.  Maybe best to use
+        * /dev/disk/by-path/?* and ignore names ending '-partNN' as
+        * we depend on this directory of 'path' info.  But that fails
+        * to find loop devices and probably others.  Maybe don't
+        * worry about that, they aren't the real target.
+        *
+        * So: check things in /dev/disk/by-path to see if they are in
+        * a compatible domain, then load the partition table and see
+        * if it is OK for the new device, and choose the largest
+        * partition table that fits.
+        */
+       DIR *dir;
+       struct dirent *de;
+       char *chosen = NULL;
+       unsigned long long chosen_size = 0;
+       struct supertype *chosen_st = NULL;
+       int fd;
+
+       dir = opendir("/dev/disk/by-path");
+       if (!dir)
+               return 1;
+       while ((de = readdir(dir)) != NULL) {
+               char *ep;
+               struct dev_policy *pol2 = NULL;
+               struct domainlist *domlist = NULL;
+               int cand_fd = -1;
+               struct mdinfo info;
+               struct supertype *st2 = NULL;
+               char *cand = NULL;
+               unsigned long long devsectors;
+
+               if (de->d_ino == 0 ||
+                   de->d_name[0] == '.' ||
+                   (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN))
+                       goto next;
+
+               /* step back over trailing digits to look for "-partNN" */
+               ep = de->d_name + strlen(de->d_name);
+               while (ep > de->d_name &&
+                      isdigit((unsigned char)ep[-1]))
+                       ep--;
+               if (ep > de->d_name + 5 &&
+                   strncmp(ep-5, "-part", 5) == 0)
+                       /* This is a partition - skip it */
+                       goto next;
+
+               pol2 = path_policy(de->d_name, type_disk);
+
+               domain_merge(&domlist, pol2, st ? st->ss->name : NULL);
+               if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1)
+                       /* new device is incompatible with this device. */
+                       goto next;
+
+               domain_free(domlist);
+               domlist = NULL;
+
+               /* asprintf() returns the number of bytes written, or a
+                * negative value on failure, in which case the pointer
+                * must not be relied upon.
+                */
+               if (asprintf(&cand, "/dev/disk/by-path/%s", de->d_name) < 0) {
+                       cand = NULL;
+                       goto next;
+               }
+               cand_fd = open(cand, O_RDONLY);
+               if (cand_fd < 0)
+                       goto next;
+               if (get_dev_size(cand_fd, cand, &devsectors) == 0)
+                       goto next;
+               devsectors >>= 9;
+
+               if (st)
+                       st2 = dup_super(st);
+               else
+                       st2 = guess_super_type(cand_fd, guess_partitions);
+               if (st2 == NULL ||
+                   st2->ss->load_super(st2, cand_fd, NULL) < 0)
+                       goto next;
+
+               if (!st) {
+                       /* Check domain policy again, this time referring to metadata */
+                       domain_merge(&domlist, pol2, st2->ss->name);
+                       if (domain_test(domlist, pol, st2->ss->name) != 1)
+                               /* Incompatible devices for this metadata type */
+                               goto next;
+                       if (!policy_action_allows(pol, st2->ss->name, act_spare))
+                               /* Some partition types allow sparing, but not
+                                * this one.
+                                */
+                               goto next;
+               }
+
+               st2->ss->getinfo_super(st2, &info, NULL);
+               if (info.component_size > devsectors)
+                       /* This partitioning doesn't fit in the device */
+                       goto next;
+
+               /* This is an acceptable device to copy partition
+                * metadata from.  We could just stop here, but I
+                * think I want to keep looking in case a larger
+                * metadata which makes better use of the device can
+                * be found.
+                */
+               if (chosen == NULL ||
+                   chosen_size < info.component_size) {
+                       chosen_size = info.component_size;
+                       free(chosen);
+                       chosen = cand;
+                       cand = NULL;
+                       if (chosen_st) {
+                               chosen_st->ss->free_super(chosen_st);
+                               free(chosen_st);
+                       }
+                       chosen_st = st2;
+                       st2 = NULL;
+               }
+
+       next:
+               /* cand and st2 are NULL here when they were handed over
+                * to chosen / chosen_st
+                */
+               free(cand);
+               domain_free(domlist);
+               dev_policy_free(pol2);
+               if (st2)
+                       st2->ss->free_super(st2);
+               free(st2);
+
+               if (cand_fd >= 0)
+                       close(cand_fd);
+       }
+       closedir(dir);
+
+       if (!chosen)
+               return 1;
+
+       /* 'chosen' is the best device we can find.  Let's write its
+        * metadata to devname.  dfd is read-only so don't use that.
+        */
+       fd = open(devname, O_RDWR);
+       if (fd >= 0) {
+               chosen_st->ss->store_super(chosen_st, fd);
+               close(fd);
+       }
+       free(chosen);
+       chosen_st->ss->free_super(chosen_st);
+       free(chosen_st);
+       return 0;
+}
+
+/* Return 1 if the 4096 bytes at 'blk' all hold the same value and that
+ * value is 0x00, 0x5a or 0xff; return 0 otherwise.
+ */
+static int block_is_blank(const char *blk)
+{
+       switch (blk[0]) {
+       case '\0':
+       case '\x5a':
+       case '\xff':
+               break;
+       default:
+               return 0;
+       }
+       /* comparing the block with itself shifted by one byte shows
+        * whether every byte equals the first
+        */
+       return memcmp(blk, blk+1, 4095) == 0;
+}
+
+/* Decide whether a device looks unused.
+ *
+ * The first 4096 bytes and the 4096 bytes ending at the size reported
+ * by get_dev_size() must both be readable through 'dfd' and must each
+ * be a run of one byte value out of 0x00, 0x5a and 0xff.
+ * Returns 1 if so, 0 otherwise (including on any seek or read failure).
+ * The file offset of 'dfd' is changed.
+ */
+static int is_bare(int dfd)
+{
+       char space[4096 + 4096];
+       /* first 4096-aligned address above the start of space[] */
+       char *blk = (char*)(((long)space + 4096) & ~4095);
+       unsigned long long devsize = 0;
+
+       if (lseek(dfd, 0, SEEK_SET) != 0)
+               return 0;
+       if (read(dfd, blk, 4096) != 4096)
+               return 0;
+       if (!block_is_blank(blk))
+               return 0;
+
+       /* OK, first 4K appear blank, try the end. */
+       get_dev_size(dfd, NULL, &devsize);
+       if (lseek(dfd, devsize-4096, SEEK_SET) < 0)
+               return 0;
+       if (read(dfd, blk, 4096) != 4096)
+               return 0;
+
+       return block_is_blank(blk);
+}
+
+/* Adding a spare to a regular array and adding one to a
+ * set-of-partitions virtual array are quite different operations.
+ * This function works out which of the two is worth attempting for the
+ * given device and attempts it; arrays take priority over partitions.
+ * Returns 1 when the device is not eligible as a spare, otherwise the
+ * result of array_try_spare() or partition_try_spare().
+ */
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                    struct map_ent *target,
+                    struct supertype *st, int verbose)
+{
+       int may_array = 0;
+       int may_partition = 0;
+       int idx = 0;
+       int bare;
+       int rv = 1;
+
+       /* Can only add a spare if device has at least one domain */
+       if (pol_find(pol, pol_domain) == NULL)
+               return 1;
+       /* And only if some action allows spares */
+       if (!policy_action_allows(pol, st ? st->ss->name : NULL, act_spare))
+               return 1;
+
+       /* Now check if the device is bare.
+        * bare devices can always be added as a spare
+        * non-bare devices can only be added if spare-same-slot is permitted,
+        * and this device is replacing a previous device - in which case 'target'
+        * will be set.
+        * Later - may allow force_spare without target
+        */
+       bare = is_bare(*dfdp) ? 1 : 0;
+       if (!bare &&
+           (!target ||
+            !policy_action_allows(pol, st ? st->ss->name : NULL,
+                                  act_spare_same_slot))) {
+               if (verbose > 1)
+                       fprintf(stderr, Name ": %s is not bare, so not "
+                               "considering as a spare\n",
+                               devname);
+               return 1;
+       }
+
+       /* It might be OK to add this device to an array - need to see
+        * what arrays might be candidates.
+        */
+       if (st) {
+               /* the metadata is known, so it alone decides between
+                * the 'array' and the 'partition' approach
+                */
+               if (!st->ss->add_to_super)
+                       return partition_try_spare(devname, dfdp, pol,
+                                                  st, verbose);
+               return array_try_spare(devname, dfdp, pol, target, bare,
+                                      st, verbose);
+       }
+
+       /* No metadata was specified or found so options are open.
+        * Check for whether any array metadata, or any partition metadata
+        * might allow adding the spare.  This check just helps to avoid
+        * a more costly scan of all arrays when we can be sure that will
+        * fail.
+        */
+       while ((!may_array || !may_partition) && superlist[idx]) {
+               /* metadata with add_to_super counts as 'array' metadata,
+                * metadata without it as 'partition' metadata
+                */
+               int *flag = superlist[idx]->add_to_super ?
+                       &may_array : &may_partition;
+
+               if (!*flag &&
+                   policy_action_allows(pol, superlist[idx]->name, act_spare))
+                       *flag = 1;
+               idx++;
+       }
+
+       if (may_array)
+               rv = array_try_spare(devname, dfdp, pol, target, bare,
+                                    st, verbose);
+       if (rv != 0 && may_partition)
+               rv = partition_try_spare(devname, dfdp, pol, st, verbose);
+       return rv;
 }
 
 int IncrementalScan(int verbose)
@@ -691,7 +1323,7 @@ int IncrementalScan(int verbose)
         */
        struct map_ent *mapl = NULL;
        struct map_ent *me;
-       mddev_ident_t devs, mddev;
+       struct mddev_ident *devs, *mddev;
        int rv = 0;
 
        map_read(&mapl);
@@ -785,26 +1417,70 @@ static char *container2devname(char *devname)
        return mdname;
 }
 
-int Incremental_container(struct supertype *st, char *devname, int verbose,
-                         int runstop, int autof, int trustworthy)
+static int Incremental_container(struct supertype *st, char *devname,
+                                char *homehost, int verbose,
+                                int runstop, int autof)
 {
        /* Collect the contents of this container and for each
         * array, choose a device name and assemble the array.
         */
 
-       struct mdinfo *list = st->ss->container_content(st);
+       struct mdinfo *list;
        struct mdinfo *ra;
        struct map_ent *map = NULL;
+       struct mdinfo info;
+       int trustworthy;
+       struct mddev_ident *match;
+       int rv = 0;
+       struct domainlist *domains;
+       struct map_ent *smp;
+       int suuid[4];
+       int sfd;
+
+       memset(&info, 0, sizeof(info));
+       st->ss->getinfo_super(st, &info, NULL);
+
+       if ((runstop > 0 && info.container_enough >= 0) ||
+           info.container_enough > 0)
+               /* pass */;
+       else {
+               if (verbose)
+                       fprintf(stderr, Name ": not enough devices to start the container\n");
+               return 0;
+       }
+
+       match = search_mdstat(st, &info, devname, verbose, &rv);
+       if (match == NULL && rv == 2)
+               return rv;
 
+       /* Need to compute 'trustworthy' */
+       if (match)
+               trustworthy = LOCAL;
+       else if (st->ss->match_home(st, homehost) == 1)
+               trustworthy = LOCAL;
+       else if (st->ss->match_home(st, "any") == 1)
+               trustworthy = LOCAL;
+       else
+               trustworthy = FOREIGN;
+
+       list = st->ss->container_content(st, NULL);
        if (map_lock(&map))
                fprintf(stderr, Name ": failed to get exclusive lock on "
                        "mapfile\n");
+       /* do not assemble arrays that might have bad blocks */
+       if (list->array.state & (1<<MD_SB_BBM_ERRORS)) {
+               fprintf(stderr, Name ": BBM log found in metadata. "
+                                       "Cannot activate array(s).\n");
+               /* free container data and exit */
+               sysfs_free(list);
+               return 2;
+       }
 
        for (ra = list ; ra ; ra = ra->next) {
                int mdfd;
                char chosen_name[1024];
                struct map_ent *mp;
-               struct mddev_ident_s *match = NULL;
+               struct mddev_ident *match = NULL;
 
                mp = map_by_uuid(&map, ra->uuid);
 
@@ -820,7 +1496,7 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
                         * member == ra->text_version after second slash.
                         */
                        char *sub = strchr(ra->text_version+1, '/');
-                       struct mddev_ident_s *array_list;
+                       struct mddev_ident *array_list;
                        if (sub) {
                                sub++;
                                array_list = conf_get_ident(NULL);
@@ -879,8 +1555,55 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
                }
 
                assemble_container_content(st, mdfd, ra, runstop,
-                                          chosen_name, verbose);
+                                          chosen_name, verbose, NULL);
+               close(mdfd);
        }
+
+       /* Now move all suitable spares from spare container */
+       domains = domain_from_array(list, st->ss->name);
+       memcpy(suuid, uuid_zero, sizeof(int[4]));
+       if (domains &&
+           (smp = map_by_uuid(&map, suuid)) != NULL &&
+           (sfd = open(smp->path, O_RDONLY)) >= 0) {
+               /* spare container found */
+               struct supertype *sst =
+                       super_imsm.match_metadata_desc("imsm");
+               struct mdinfo *sinfo;
+               unsigned long long min_size = 0;
+               if (st->ss->min_acceptable_spare_size)
+                       min_size = st->ss->min_acceptable_spare_size(st);
+               if (!sst->ss->load_container(sst, sfd, NULL)) {
+                       close(sfd);
+                       sinfo = container_choose_spares(sst, min_size,
+                                                       domains, NULL,
+                                                       st->ss->name, 0);
+                       sst->ss->free_super(sst);
+                       if (sinfo){
+                               int count = 0;
+                               struct mdinfo *disks = sinfo->devs;
+                               while (disks) {
+                                       /* move spare from spare
+                                        * container to currently
+                                        * assembled one
+                                        */
+                                       if (move_spare(
+                                                   smp->path,
+                                                   devname,
+                                                   makedev(disks->disk.major,
+                                                           disks->disk.minor)))
+                                               count++;
+                                       disks = disks->next;
+                               }
+                               if (count)
+                                       fprintf(stderr, Name
+                                               ": Added %d spare%s to %s\n",
+                                               count, count>1?"s":"", devname);
+                       }
+                       sysfs_free(sinfo);
+               } else
+                       close(sfd);
+       }
+       domain_free(domains);
        map_unlock(&map);
        return 0;
 }
@@ -890,16 +1613,22 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
  * raid arrays, and if so first fail (if needed) and then remove the device.
  *
  * @devname - The device we want to remove
+ * @id_path - name as found in /dev/disk/by-path for this device
  *
  * Note: the device name must be a kernel name like "sda", so
  * that we can find it in /proc/mdstat
  */
-int IncrementalRemove(char *devname, int verbose)
+int IncrementalRemove(char *devname, char *id_path, int verbose)
 {
        int mdfd;
        int rv;
        struct mdstat_ent *ent;
-       struct mddev_dev_s devlist;
+       struct mddev_dev devlist;
+
+       if (!id_path)
+               dprintf(Name ": incremental removal without --path <id_path> "
+                       "lacks the possibility to re-add new device in this "
+                       "port\n");
 
        if (strchr(devname, '/')) {
                fprintf(stderr, Name ": incremental removal requires a "
@@ -915,14 +1644,42 @@ int IncrementalRemove(char *devname, int verbose)
        mdfd = open_dev(ent->devnum);
        if (mdfd < 0) {
                fprintf(stderr, Name ": Cannot open array %s!!\n", ent->dev);
+               free_mdstat(ent);
                return 1;
        }
+
+       if (id_path) {
+               struct map_ent *map = NULL, *me;
+               me = map_by_devnum(&map, ent->devnum);
+               if (me)
+                       policy_save_path(id_path, me);
+               map_free(map);
+       }
+
        memset(&devlist, 0, sizeof(devlist));
        devlist.devname = devname;
        devlist.disposition = 'f';
-       Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0);
+       /* for a container, we must fail each member array */
+       if (ent->metadata_version &&
+           strncmp(ent->metadata_version, "external:", 9) == 0) {
+               struct mdstat_ent *mdstat = mdstat_read(0, 0);
+               struct mdstat_ent *memb;
+               for (memb = mdstat ; memb ; memb = memb->next)
+                       if (is_container_member(memb, ent->dev)) {
+                               int subfd = open_dev(memb->devnum);
+                               if (subfd >= 0) {
+                                       Manage_subdevs(memb->dev, subfd,
+                                                      &devlist, verbose, 0,
+                                                      NULL);
+                                       close(subfd);
+                               }
+                       }
+               free_mdstat(mdstat);
+       } else
+               Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0, NULL);
        devlist.disposition = 'r';
-       rv = Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0);
+       rv = Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0, NULL);
        close(mdfd);
+       free_mdstat(ent);
        return rv;
 }
diff --git a/Kill.c b/Kill.c
index 3d1810f04fbcc820f4f140caee8abf98411bb6db..29a43ea6bf20fe10071d43f4f9262a766e197a88 100644 (file)
--- a/Kill.c
+++ b/Kill.c
@@ -53,7 +53,7 @@ int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl)
        }
        if (st == NULL)
                st = guess_super(fd);
-       if (st == NULL) {
+       if (st == NULL || st->ss->init_super == NULL) {
                if (!quiet)
                        fprintf(stderr, Name ": Unrecognised md component device - %s\n", dev);
                close(fd);
@@ -96,16 +96,7 @@ int Kill_subarray(char *dev, char *subarray, int quiet)
 
        memset(st, 0, sizeof(*st));
 
-       if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >=
-           (int)sizeof(st->subarray)) {
-               if (!quiet)
-                       fprintf(stderr,
-                               Name ": Input overflow for subarray '%s' > %zu bytes\n",
-                               subarray, sizeof(st->subarray) - 1);
-               return 2;
-       }
-
-       fd = open_subarray(dev, st, quiet);
+       fd = open_subarray(dev, subarray, st, quiet);
        if (fd < 0)
                return 2;
 
index 0cc9a87c2b2de604f4794a27636de00d377e219f..2b888188cbb8497303ce4c1382580050fe18f22c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -71,8 +71,11 @@ CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
 MAP_DIR=/dev/.mdadm
 MAP_FILE = map
 MDMON_DIR = /dev/.mdadm
+# place for autoreplace cookies
+FAILED_SLOTS_DIR = /dev/.mdadm/failed-slots
 DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
 DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
+DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
 CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS)
 
 # The glibc TLS ABI requires applications that call clone(2) to set up
@@ -95,36 +98,42 @@ MAN4DIR = $(MANDIR)/man4
 MAN5DIR = $(MANDIR)/man5
 MAN8DIR = $(MANDIR)/man8
 
-OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
+OBJS =  mdadm.o config.o policy.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
        Incremental.o \
        mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+       super-mbr.o super-gpt.o \
        restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \
        platform-intel.o probe_roms.o
 
-SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
+SRCS =  mdadm.c config.c policy.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
        Incremental.c \
        mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+       super-mbr.c super-gpt.c \
        restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c \
        platform-intel.c probe_roms.c
 
-MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+INCL = mdadm.h part.h bitmap.h
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o policy.o \
        Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+       super-mbr.o super-gpt.o \
        super-ddf.o sha1.o crc32.o msg.o bitmap.o \
        platform-intel.o probe_roms.o
 
-MON_SRCS = mdmon.c monitor.c managemon.c util.c mdstat.c sysfs.c config.c \
+MON_SRCS = mdmon.c monitor.c managemon.c util.c mdstat.c sysfs.c config.c policy.c \
        Kill.c sg_io.c dlink.c ReadMe.c super0.c super1.c super-intel.c \
+       super-mbr.c super-gpt.c \
        super-ddf.c sha1.c crc32.c msg.c bitmap.c \
        platform-intel.c probe_roms.c
 
 STATICSRC = pwgr.c
 STATICOBJS = pwgr.o
 
-ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
+ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \
        super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
-       platform-intel.c probe_roms.c sysfs.c
+       platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c
 ASSEMBLE_AUTO_SRCS := mdopen.c
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
 ifdef MDASSEMBLE_AUTO
@@ -149,20 +158,20 @@ mdadm : $(OBJS)
 mdadm.static : $(OBJS) $(STATICOBJS)
        $(CC) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS)
 
-mdadm.tcc : $(SRCS) mdadm.h
+mdadm.tcc : $(SRCS) $(INCL)
        $(TCC) -o mdadm.tcc $(SRCS)
 
-mdadm.klibc : $(SRCS) mdadm.h
+mdadm.klibc : $(SRCS) $(INCL)
        rm -f $(OBJS) 
        $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
 
-mdadm.Os : $(SRCS) mdadm.h
+mdadm.Os : $(SRCS) $(INCL)
        $(CC) -o mdadm.Os $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
 
-mdadm.O2 : $(SRCS) mdadm.h mdmon.O2
+mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
        $(CC) -o mdadm.O2 $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
 
-mdmon.O2 : $(MON_SRCS) mdadm.h mdmon.h
+mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
        $(CC) -o mdmon.O2 $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
 
 # use '-z now' to guarantee no dynamic linker interactions with the monitor thread
@@ -173,25 +182,25 @@ msg.o: msg.c msg.h
 test_stripe : restripe.c mdadm.h
        $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
 
-mdassemble : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)  $(STATICSRC)
 
-mdassemble.static : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.static : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(CC) $(LDFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC)
 
-mdassemble.auto : $(ASSEMBLE_SRCS) mdadm.h $(ASSEMBLE_AUTO_SRCS)
+mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS)
        rm -f mdassemble.static
        $(MAKE) MDASSEMBLE_AUTO=1 mdassemble.static
        mv mdassemble.static mdassemble.auto
 
-mdassemble.uclibc : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.uclibc : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OJS)
        $(UCLIBC_GCC) $(ASSEMBLE_FLAGS) -DUCLIBC -DHAVE_STDINT_H -static -o mdassemble.uclibc $(ASSEMBLE_SRCS) $(STATICSRC)
 
 # This doesn't work
-mdassemble.klibc : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.klibc : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)
 
@@ -213,8 +222,8 @@ mdadm.conf.man : mdadm.conf.5
 mdassemble.man : mdassemble.8
        nroff -man mdassemble.8 > mdassemble.man
 
-$(OBJS) : mdadm.h mdmon.h bitmap.h
-$(MON_OBJS) : mdadm.h mdmon.h bitmap.h
+$(OBJS) : $(INCL) mdmon.h
+$(MON_OBJS) : $(INCL) mdmon.h
 
 sha1.o : sha1.c sha1.h md5.h
        $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
index 7ed47310745c48fba7cb71e15b206a6a007f72e1..5fc67d771b4456d0e6217d7c8d89d5dcc0a83e1d 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -56,7 +56,6 @@ int Manage_ro(char *devname, int fd, int readonly)
        mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
        if (mdi &&
            mdi->array.major_version == -1 &&
-           mdi->array.level > 0 &&
            is_subarray(mdi->text_version)) {
                char vers[64];
                strcpy(vers, "external:");
@@ -88,6 +87,8 @@ int Manage_ro(char *devname, int fd, int readonly)
                        if (*cp)
                                *cp = 0;
                        ping_monitor(vers+10);
+                       if (mdi->array.level <= 0)
+                               sysfs_set_str(mdi, NULL, "array_state", "active");
                }
                return 0;
        }
@@ -324,7 +325,8 @@ int Manage_resize(char *devname, int fd, long long size, int raid_disks)
 }
 
 int Manage_subdevs(char *devname, int fd,
-                  mddev_dev_t devlist, int verbose, int test)
+                  struct mddev_dev *devlist, int verbose, int test,
+                  char *update)
 {
        /* do something to each dev.
         * devmode can be
@@ -340,15 +342,16 @@ int Manage_subdevs(char *devname, int fd,
         * For 'f' and 'r', the device can also be a kernel-internal
         * name such as 'sdb'.
         */
-       mddev_dev_t add_devlist = NULL;
+       struct mddev_dev *add_devlist = NULL;
        mdu_array_info_t array;
        mdu_disk_info_t disc;
        unsigned long long array_size;
-       mddev_dev_t dv, next = NULL;
+       struct mddev_dev *dv, *next = NULL;
        struct stat stb;
        int j, jnext = 0;
        int tfd = -1;
        struct supertype *st, *tst;
+       char *subarray = NULL;
        int duuid[4];
        int ouuid[4];
        int lfd = -1;
@@ -369,7 +372,7 @@ int Manage_subdevs(char *devname, int fd,
        if (array_size <= 0)
                array_size = array.size * 2;
 
-       tst = super_by_fd(fd);
+       tst = super_by_fd(fd, &subarray);
        if (!tst) {
                fprintf(stderr, Name ": unsupport array - version %d.%d\n",
                        array.major_version, array.minor_version);
@@ -548,7 +551,7 @@ int Manage_subdevs(char *devname, int fd,
                        return 1;
                case 'a':
                        /* add the device */
-                       if (tst->subarray[0]) {
+                       if (subarray) {
                                fprintf(stderr, Name ": Cannot add disks to a"
                                        " \'member\' array, perform this"
                                        " operation on the parent container\n");
@@ -608,7 +611,7 @@ int Manage_subdevs(char *devname, int fd,
                                if (tst->sb)
                                        /* already loaded */;
                                else if (tst->ss->external) {
-                                       tst->ss->load_super(tst, fd, NULL);
+                                       tst->ss->load_container(tst, fd, NULL);
                                } else for (j = 0; j < tst->max_devs; j++) {
                                        char *dev;
                                        int dfd;
@@ -634,7 +637,7 @@ int Manage_subdevs(char *devname, int fd,
                                /* FIXME this is a bad test to be using */
                                if (!tst->sb) {
                                        close(tfd);
-                                       fprintf(stderr, Name ": cannot find valid superblock in this array - HELP\n");
+                                       fprintf(stderr, Name ": cannot load array metadata from %s\n", devname);
                                        return 1;
                                }
 
@@ -664,7 +667,7 @@ int Manage_subdevs(char *devname, int fd,
                                        ;
                                else if (st->sb) {
                                        struct mdinfo mdi;
-                                       st->ss->getinfo_super(st, &mdi);
+                                       st->ss->getinfo_super(st, &mdi, NULL);
                                        st->ss->uuid_from_super(st, ouuid);
                                        if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
                                            !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
@@ -689,6 +692,24 @@ int Manage_subdevs(char *devname, int fd,
                                                remove_partitions(tfd);
                                                close(tfd);
                                                tfd = -1;
+                                               if (update) {
+                                                       int rv = -1;
+                                                       tfd = dev_open(dv->devname, O_RDWR);
+
+                                                       if (tfd >= 0)
+                                                               rv = st->ss->update_super(
+                                                                       st, NULL, update,
+                                                                       devname, verbose, 0, NULL);
+                                                       if (rv == 0)
+                                                               rv = tst->ss->store_super(st, tfd);
+                                                       close(tfd);
+                                                       tfd = -1;
+                                                       if (rv != 0) {
+                                                               fprintf(stderr, Name ": failed to update"
+                                                                       " superblock during re-add\n");
+                                                               return 1;
+                                                       }
+                                               }
                                                /* don't even try if disk is marked as faulty */
                                                errno = 0;
                                                if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
@@ -707,6 +728,7 @@ int Manage_subdevs(char *devname, int fd,
                                        skip_re_add:
                                                re_add_failed = 1;
                                        }
+                                       st->ss->free_super(st);
                                }
                                if (add_dev != dv->devname) {
                                        if (verbose > 0)
@@ -779,17 +801,18 @@ int Manage_subdevs(char *devname, int fd,
                                if (dv->writemostly == 1)
                                        disc.state |= 1 << MD_DISK_WRITEMOSTLY;
                                dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+                               if (tst->ss->external &&
+                                   mdmon_running(tst->container_dev))
+                                       tst->update_tail = &tst->updates;
                                if (tst->ss->add_to_super(tst, &disc, dfd,
                                                          dv->devname)) {
                                        close(dfd);
                                        return 1;
                                }
-                               /* write_init_super will close 'dfd' */
-                               if (tst->ss->external)
-                                       /* mdmon will write the metadata */
+                               if (tst->ss->write_init_super(tst)) {
                                        close(dfd);
-                               else if (tst->ss->write_init_super(tst))
                                        return 1;
+                               }
                        } else if (dv->re_add) {
                                /*  this had better be raid1.
                                 * As we are "--re-add"ing we must find a spare slot
@@ -823,9 +846,8 @@ int Manage_subdevs(char *devname, int fd,
                        if (dv->writemostly == 1)
                                disc.state |= (1 << MD_DISK_WRITEMOSTLY);
                        if (tst->ss->external) {
-                               /* add a disk to an external metadata container
-                                * only if mdmon is around to see it
-                                */
+                               /* add a disk
+                                * to an external metadata container */
                                struct mdinfo new_mdi;
                                struct mdinfo *sra;
                                int container_fd;
@@ -836,13 +858,7 @@ int Manage_subdevs(char *devname, int fd,
                                        fprintf(stderr, Name ": add failed for %s:"
                                                " could not get exclusive access to container\n",
                                                dv->devname);
-                                       return 1;
-                               }
-
-                               if (!mdmon_running(devnum)) {
-                                       fprintf(stderr, Name ": add failed for %s: mdmon not running\n",
-                                               dv->devname);
-                                       close(container_fd);
+                                       tst->ss->free_super(tst);
                                        return 1;
                                }
 
@@ -851,27 +867,35 @@ int Manage_subdevs(char *devname, int fd,
                                        fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n",
                                                dv->devname);
                                        close(container_fd);
+                                       tst->ss->free_super(tst);
                                        return 1;
                                }
                                sra->array.level = LEVEL_CONTAINER;
                                /* Need to set data_offset and component_size */
-                               tst->ss->getinfo_super(tst, &new_mdi);
+                               tst->ss->getinfo_super(tst, &new_mdi, NULL);
                                new_mdi.disk.major = disc.major;
                                new_mdi.disk.minor = disc.minor;
                                new_mdi.recovery_start = 0;
+                               /* Make sure fds are closed as they are O_EXCL which
+                                * would block add_disk */
+                               tst->ss->free_super(tst);
                                if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
                                        fprintf(stderr, Name ": add new device to external metadata"
                                                " failed for %s\n", dv->devname);
                                        close(container_fd);
+                                       sysfs_free(sra);
                                        return 1;
                                }
                                ping_monitor(devnum2devname(devnum));
                                sysfs_free(sra);
                                close(container_fd);
-                       } else if (ioctl(fd, ADD_NEW_DISK, &disc)) {
-                               fprintf(stderr, Name ": add new device failed for %s as %d: %s\n",
-                                       dv->devname, j, strerror(errno));
-                               return 1;
+                       } else {
+                               tst->ss->free_super(tst);
+                               if (ioctl(fd, ADD_NEW_DISK, &disc)) {
+                                       fprintf(stderr, Name ": add new device failed for %s as %d: %s\n",
+                                               dv->devname, j, strerror(errno));
+                                       return 1;
+                               }
                        }
                        if (verbose >= 0)
                                fprintf(stderr, Name ": added %s\n", dv->devname);
@@ -879,7 +903,7 @@ int Manage_subdevs(char *devname, int fd,
 
                case 'r':
                        /* hot remove */
-                       if (tst->subarray[0]) {
+                       if (subarray) {
                                fprintf(stderr, Name ": Cannot remove disks from a"
                                        " \'member\' array, perform this"
                                        " operation on the parent container\n");
@@ -1031,22 +1055,14 @@ int autodetect(void)
        return rv;
 }
 
-int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet)
+int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet)
 {
        struct supertype supertype, *st = &supertype;
        int fd, rv = 2;
 
        memset(st, 0, sizeof(*st));
-       if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >=
-           (signed)sizeof(st->subarray)) {
-               if (!quiet)
-                       fprintf(stderr,
-                               Name ": Input overflow for subarray '%s' > %zu bytes\n",
-                               subarray, sizeof(st->subarray) - 1);
-               return 2;
-       }
 
-       fd = open_subarray(dev, st, quiet);
+       fd = open_subarray(dev, subarray, st, quiet);
        if (fd < 0)
                return 2;
 
@@ -1061,7 +1077,7 @@ int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident
        if (mdmon_running(st->devnum))
                st->update_tail = &st->updates;
 
-       rv = st->ss->update_subarray(st, update, ident);
+       rv = st->ss->update_subarray(st, subarray, update, ident);
 
        if (rv) {
                if (!quiet)
@@ -1083,4 +1099,48 @@ int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident
 
        return rv;
 }
+
+/* Move a spare device from one array to another.
+ * If adding it to the destination array fails, try to add it back
+ * to the original array.
+ * Returns 1 on success, 0 on failure. */
+int move_spare(char *from_devname, char *to_devname, dev_t devid)
+{
+       struct mddev_dev devlist;
+       char devname[20];
+       int moved = 0;
+
+       /* try to remove and add */
+       int fd1 = open(to_devname, O_RDONLY);
+       int fd2 = open(from_devname, O_RDONLY);
+
+       if (fd1 < 0 || fd2 < 0)
+               goto out;
+
+       /* Zero the whole request, as IncrementalRemove() does, so that no
+        * member of struct mddev_dev reaches Manage_subdevs uninitialised */
+       memset(&devlist, 0, sizeof(devlist));
+       devlist.devname = devname;
+       /* the device is named by number ("major:minor"); bound the write */
+       snprintf(devname, sizeof(devname), "%d:%d", major(devid), minor(devid));
+
+       devlist.disposition = 'r';
+       if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL) == 0) {
+               devlist.disposition = 'a';
+               if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, NULL) == 0) {
+                       /* make sure manager is aware of changes */
+                       ping_manager(to_devname);
+                       ping_manager(from_devname);
+                       moved = 1;
+               } else
+                       /* destination refused the disk: add it back */
+                       Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL);
+       }
+out:
+       if (fd1 >= 0)
+               close(fd1);
+       if (fd2 >= 0)
+               close(fd2);
+       return moved;
+}
 #endif
index 0f0adb54502186095c0463cf0818d70fb95b19fc..d3795b1713d8e07226c4f83666ce01459e7452e9 100644 (file)
--- a/Monitor.c
+++ b/Monitor.c
 #include       <limits.h>
 #include       <syslog.h>
 
-static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom,
-                 char *cmd, int dosyslog);
-
 /* The largest number of disks current arrays can manage is 384
  * This really should be dynamically, but that will have to wait
  * At least it isn't MD_SB_DISKS.
  */
 #define MaxDisks 384
-int Monitor(mddev_dev_t devlist,
+struct state { /* one monitored array; records are chained through ->next */
+       char *devname;  /* path of the array device being monitored */
+       int devnum;     /* to sync with mdstat info */
+       long utime;
+       int err;
+       char *spare_group;      /* strdup'd spare_group from the config entry, else NULL */
+       int active, working, failed, spare, raid;
+       int expected_spares;    /* spare_disks from the config entry; -1 if no entry */
+       int devstate[MaxDisks];
+       dev_t devid[MaxDisks];
+       int percent;    /* Monitor() starts this at -2; presumably resync progress - confirm */
+       int parent_dev; /* For subarray, devnum of parent.
+                        * For others, NoMdDev
+                        */
+       struct supertype *metadata;
+       struct state *subarray;/* for a container it is a link to first subarray
+                               * for a subarray it is a link to next subarray
+                               * in the same container */
+       struct state *parent;  /* for a subarray it is a link to its container
+                               */
+       struct state *next;     /* next record on Monitor()'s statelist */
+};
+
+struct alert_info {    /* alert settings filled in by Monitor() and handed to alert() and the check helpers */
+       char *mailaddr;         /* copied from Monitor()'s mailaddr */
+       char *mailfrom;         /* copied from Monitor()'s local mailfrom */
+       char *alert_cmd;        /* copied from Monitor()'s alert_cmd */
+       int dosyslog;           /* copied from Monitor()'s dosyslog argument */
+};
+static int make_daemon(char *pidfile);
+static int check_one_sharer(int scan);
+static void alert(char *event, char *dev, char *disc, struct alert_info *info);
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+                      int test, struct alert_info *info,
+                      int increments);
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+                         int test, struct alert_info *info);
+static void try_spare_migration(struct state *statelist, struct alert_info *info);
+static void link_containers_with_subarrays(struct state *list);
+
+int Monitor(struct mddev_dev *devlist,
            char *mailaddr, char *alert_cmd,
            int period, int daemonise, int scan, int oneshot,
-           int dosyslog, int test, char* pidfile, int increments)
+           int dosyslog, int test, char *pidfile, int increments,
+           int share)
 {
        /*
         * Every few seconds, scan every md device looking for changes
@@ -85,22 +123,11 @@ int Monitor(mddev_dev_t devlist,
         * that appears in /proc/mdstat
         */
 
-       struct state {
-               char *devname;
-               int devnum;     /* to sync with mdstat info */
-               long utime;
-               int err;
-               char *spare_group;
-               int active, working, failed, spare, raid;
-               int expected_spares;
-               int devstate[MaxDisks];
-               unsigned devid[MaxDisks];
-               int percent;
-               struct state *next;
-       } *statelist = NULL;
+       struct state *statelist = NULL;
        int finished = 0;
        struct mdstat_ent *mdstat = NULL;
        char *mailfrom = NULL;
+       struct alert_info info;
 
        if (!mailaddr) {
                mailaddr = conf_get_mailaddr();
@@ -120,44 +147,30 @@ int Monitor(mddev_dev_t devlist,
                fprintf(stderr, Name ": No mail address or alert command - not monitoring.\n");
                return 1;
        }
+       info.alert_cmd = alert_cmd;
+       info.mailaddr = mailaddr;
+       info.mailfrom = mailfrom;
+       info.dosyslog = dosyslog;
 
        if (daemonise) {
-               int pid = fork();
-               if (pid > 0) {
-                       if (!pidfile)
-                               printf("%d\n", pid);
-                       else {
-                               FILE *pid_file;
-                               pid_file=fopen(pidfile, "w");
-                               if (!pid_file)
-                                       perror("cannot create pid file");
-                               else {
-                                       fprintf(pid_file,"%d\n", pid);
-                                       fclose(pid_file);
-                               }
-                       }
-                       return 0;
-               }
-               if (pid < 0) {
-                       perror("daemonise");
-                       return 1;
-               }
-               close(0);
-               open("/dev/null", O_RDWR);
-               dup2(0,1);
-               dup2(0,2);
-               setsid();
+               int rv = make_daemon(pidfile);
+               if (rv >= 0)
+                       return rv;
        }
 
+       if (share) 
+               if (check_one_sharer(scan))
+                       return 1;
+
        if (devlist == NULL) {
-               mddev_ident_t mdlist = conf_get_ident(NULL);
+               struct mddev_ident *mdlist = conf_get_ident(NULL);
                for (; mdlist; mdlist=mdlist->next) {
                        struct state *st;
                        if (mdlist->devname == NULL)
                                continue;
                        if (strcasecmp(mdlist->devname, "<ignore>") == 0)
                                continue;
-                       st = malloc(sizeof *st);
+                       st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        if (mdlist->devname[0] == '/')
@@ -167,33 +180,26 @@ int Monitor(mddev_dev_t devlist,
                                strcpy(strcpy(st->devname, "/dev/md/"),
                                       mdlist->devname);
                        }
-                       st->utime = 0;
                        st->next = statelist;
-                       st->err = 0;
                        st->devnum = INT_MAX;
                        st->percent = -2;
                        st->expected_spares = mdlist->spare_disks;
                        if (mdlist->spare_group)
                                st->spare_group = strdup(mdlist->spare_group);
-                       else
-                               st->spare_group = NULL;
                        statelist = st;
                }
        } else {
-               mddev_dev_t dv;
+               struct mddev_dev *dv;
                for (dv=devlist ; dv; dv=dv->next) {
-                       mddev_ident_t mdlist = conf_get_ident(dv->devname);
-                       struct state *st = malloc(sizeof *st);
+                       struct mddev_ident *mdlist = conf_get_ident(dv->devname);
+                       struct state *st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        st->devname = strdup(dv->devname);
-                       st->utime = 0;
                        st->next = statelist;
-                       st->err = 0;
                        st->devnum = INT_MAX;
                        st->percent = -2;
                        st->expected_spares = -1;
-                       st->spare_group = NULL;
                        if (mdlist) {
                                st->expected_spares = mdlist->spare_disks;
                                if (mdlist->spare_group)
@@ -207,305 +213,27 @@ int Monitor(mddev_dev_t devlist,
        while (! finished) {
                int new_found = 0;
                struct state *st;
+               int anydegraded = 0;
 
                if (mdstat)
                        free_mdstat(mdstat);
                mdstat = mdstat_read(oneshot?0:1, 0);
 
-               for (st=statelist; st; st=st->next) {
-                       struct { int state, major, minor; } info[MaxDisks];
-                       mdu_array_info_t array;
-                       struct mdstat_ent *mse = NULL, *mse2;
-                       char *dev = st->devname;
-                       int fd;
-                       int i;
-
-                       if (test)
-                               alert("TestMessage", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       fd = open(dev, O_RDONLY);
-                       if (fd < 0) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, NULL,
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-/*                                     fprintf(stderr, Name ": cannot open %s: %s\n",
-                                               dev, strerror(errno));
-*/                             st->err=1;
-                               continue;
-                       }
-                       fcntl(fd, F_SETFD, FD_CLOEXEC);
-                       if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, NULL,
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-/*                                     fprintf(stderr, Name ": cannot get array info for %s: %s\n",
-                                               dev, strerror(errno));
-*/                             st->err=1;
-                               close(fd);
-                               continue;
-                       }
-                       /* It's much easier to list what array levels can't
-                        * have a device disappear than all of them that can
-                        */
-                       if (array.level == 0 || array.level == -1) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, "Wrong-Level",
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-                               st->err = 1;
-                               close(fd);
-                               continue;
-                       }
-                       if (st->devnum == INT_MAX) {
-                               struct stat stb;
-                               if (fstat(fd, &stb) == 0 &&
-                                   (S_IFMT&stb.st_mode)==S_IFBLK) {
-                                       if (major(stb.st_rdev) == MD_MAJOR)
-                                               st->devnum = minor(stb.st_rdev);
-                                       else
-                                               st->devnum = -1- (minor(stb.st_rdev)>>6);
-                               }
-                       }
-
-                       for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
-                               if (mse2->devnum == st->devnum) {
-                                       mse2->devnum = INT_MAX; /* flag it as "used" */
-                                       mse = mse2;
-                               }
-
-                       if (array.utime == 0)
-                               /* external arrays don't update utime */
-                               array.utime = time(0);
-
-                       if (st->utime == array.utime &&
-                           st->failed == array.failed_disks &&
-                           st->working == array.working_disks &&
-                           st->spare == array.spare_disks &&
-                           (mse == NULL  || (
-                                   mse->percent == st->percent
-                                   ))) {
-                               close(fd);
-                               st->err = 0;
-                               continue;
-                       }
-                       if (st->utime == 0 && /* new array */
-                           mse &&      /* is in /proc/mdstat */
-                           mse->pattern && strchr(mse->pattern, '_') /* degraded */
-                               )
-                               alert("DegradedArray", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-
-                       if (st->utime == 0 && /* new array */
-                           st->expected_spares > 0 &&
-                           array.spare_disks < st->expected_spares)
-                               alert("SparesMissing", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       if (mse &&
-                           st->percent == -1 &&
-                           mse->percent >= 0)
-                               alert("RebuildStarted", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       if (mse &&
-                           st->percent >= 0 &&
-                           mse->percent >= 0 &&
-                           (mse->percent / increments) > (st->percent / increments)) {
-                               char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
-
-                               if((mse->percent / increments) == 0)
-                                       snprintf(percentalert, sizeof(percentalert), "RebuildStarted");
-                               else
-                                       snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent);
-
-                               alert(percentalert,
-                                     dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       }
-
-                       if (mse &&
-                           mse->percent == -1 &&
-                           st->percent >= 0) {
-                               /* Rebuild/sync/whatever just finished.
-                                * If there is a number in /mismatch_cnt,
-                                * we should report that.
-                                */
-                               struct mdinfo *sra =
-                                      sysfs_read(-1, st->devnum, GET_MISMATCH);
-                               if (sra && sra->mismatch_cnt > 0) {
-                                       char cnt[40];
-                                       sprintf(cnt, " mismatches found: %d", sra->mismatch_cnt);
-                                       alert("RebuildFinished", dev, cnt, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               } else
-                                       alert("RebuildFinished", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               if (sra)
-                                       free(sra);
-                       }
-
-                       if (mse)
-                               st->percent = mse->percent;
-
-
-                       for (i=0; i<MaxDisks && i <= array.raid_disks + array.nr_disks;
-                            i++) {
-                               mdu_disk_info_t disc;
-                               disc.number = i;
-                               if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) {
-                                       info[i].state = disc.state;
-                                       info[i].major = disc.major;
-                                       info[i].minor = disc.minor;
-                               } else
-                                       info[i].major = info[i].minor = 0;
-                       }
-                       close(fd);
-
-                       for (i=0; i<MaxDisks; i++) {
-                               mdu_disk_info_t disc = {0,0,0,0,0};
-                               int newstate=0;
-                               int change;
-                               char *dv = NULL;
-                               disc.number = i;
-                               if (i > array.raid_disks + array.nr_disks) {
-                                       newstate = 0;
-                                       disc.major = disc.minor = 0;
-                               } else if (info[i].major || info[i].minor) {
-                                       newstate = info[i].state;
-                                       dv = map_dev(info[i].major, info[i].minor, 1);
-                                       disc.state = newstate;
-                                       disc.major = info[i].major;
-                                       disc.minor = info[i].minor;
-                               } else if (mse &&  mse->pattern && i < (int)strlen(mse->pattern)) {
-                                       switch(mse->pattern[i]) {
-                                       case 'U': newstate = 6 /* ACTIVE/SYNC */; break;
-                                       case '_': newstate = 0; break;
-                                       }
-                                       disc.major = disc.minor = 0;
-                               }
-                               if (dv == NULL && st->devid[i])
-                                       dv = map_dev(major(st->devid[i]),
-                                                    minor(st->devid[i]), 1);
-                               change = newstate ^ st->devstate[i];
-                               if (st->utime && change && !st->err) {
-                                       if (i < array.raid_disks &&
-                                           (((newstate&change)&(1<<MD_DISK_FAULTY)) ||
-                                            ((st->devstate[i]&change)&(1<<MD_DISK_ACTIVE)) ||
-                                            ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)))
-                                               )
-                                               alert("Fail", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       else if (i >= array.raid_disks &&
-                                                (disc.major || disc.minor) &&
-                                                st->devid[i] == makedev(disc.major, disc.minor) &&
-                                                ((newstate&change)&(1<<MD_DISK_FAULTY))
-                                               )
-                                               alert("FailSpare", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       else if (i < array.raid_disks &&
-                                                ! (newstate & (1<<MD_DISK_REMOVED)) &&
-                                                (((st->devstate[i]&change)&(1<<MD_DISK_FAULTY)) ||
-                                                 ((newstate&change)&(1<<MD_DISK_ACTIVE)) ||
-                                                 ((newstate&change)&(1<<MD_DISK_SYNC)))
-                                               )
-                                               alert("SpareActive", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               }
-                               st->devstate[i] = newstate;
-                               st->devid[i] = makedev(disc.major, disc.minor);
-                       }
-                       st->active = array.active_disks;
-                       st->working = array.working_disks;
-                       st->spare = array.spare_disks;
-                       st->failed = array.failed_disks;
-                       st->utime = array.utime;
-                       st->raid = array.raid_disks;
-                       st->err = 0;
-               }
+               for (st=statelist; st; st=st->next)
+                       if (check_array(st, mdstat, test, &info, increments))
+                               anydegraded = 1;
+               
                /* now check if there are any new devices found in mdstat */
-               if (scan) {
-                       struct mdstat_ent *mse;
-                       for (mse=mdstat; mse; mse=mse->next)
-                               if (mse->devnum != INT_MAX &&
-                                   mse->level &&
-                                   (strcmp(mse->level, "raid0")!=0 &&
-                                    strcmp(mse->level, "linear")!=0)
-                                       ) {
-                                       struct state *st = malloc(sizeof *st);
-                                       mdu_array_info_t array;
-                                       int fd;
-                                       if (st == NULL)
-                                               continue;
-                                       st->devname = strdup(get_md_name(mse->devnum));
-                                       if ((fd = open(st->devname, O_RDONLY)) < 0 ||
-                                           ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
-                                               /* no such array */
-                                               if (fd >=0) close(fd);
-                                               put_md_name(st->devname);
-                                               free(st->devname);
-                                               free(st);
-                                               continue;
-                                       }
-                                       close(fd);
-                                       st->utime = 0;
-                                       st->next = statelist;
-                                       st->err = 1;
-                                       st->devnum = mse->devnum;
-                                       st->percent = -2;
-                                       st->spare_group = NULL;
-                                       st->expected_spares = -1;
-                                       statelist = st;
-                                       if (test)
-                                               alert("TestMessage", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       alert("NewArray", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       new_found = 1;
-                               }
-               }
+               if (scan)
+                       new_found = add_new_arrays(mdstat, &statelist, test,
+                                                  &info);
+
                /* If an array has active < raid && spare == 0 && spare_group != NULL
                 * Look for another array with spare > 0 and active == raid and same spare_group
                 *  if found, choose a device and hotremove/hotadd
                 */
-               for (st = statelist; st; st=st->next)
-                       if (st->active < st->raid &&
-                           st->spare == 0 &&
-                           st->spare_group != NULL) {
-                               struct state *st2;
-                               for (st2=statelist ; st2 ; st2=st2->next)
-                                       if (st2 != st &&
-                                           st2->spare > 0 &&
-                                           st2->active == st2->raid &&
-                                           st2->spare_group != NULL &&
-                                           strcmp(st->spare_group, st2->spare_group) == 0) {
-                                               /* try to remove and add */
-                                               int fd1 = open(st->devname, O_RDONLY);
-                                               int fd2 = open(st2->devname, O_RDONLY);
-                                               int dev = -1;
-                                               int d;
-                                               if (fd1 < 0 || fd2 < 0) {
-                                                       if (fd1>=0) close(fd1);
-                                                       if (fd2>=0) close(fd2);
-                                                       continue;
-                                               }
-                                               for (d=st2->raid; d < MaxDisks; d++) {
-                                                       if (st2->devid[d] > 0 &&
-                                                           st2->devstate[d] == 0) {
-                                                               dev = st2->devid[d];
-                                                               break;
-                                                       }
-                                               }
-                                               if (dev > 0) {
-                                                       struct mddev_dev_s devlist;
-                                                       char devname[20];
-                                                       devlist.next = NULL;
-                                                       devlist.used = 0;
-                                                       devlist.re_add = 0;
-                                                       devlist.writemostly = 0;
-                                                       devlist.devname = devname;
-                                                       sprintf(devname, "%d:%d", major(dev), minor(dev));
-
-                                                       devlist.disposition = 'r';
-                                                       if (Manage_subdevs(st2->devname, fd2, &devlist, -1, 0) == 0) {
-                                                               devlist.disposition = 'a';
-                                                               if (Manage_subdevs(st->devname, fd1, &devlist, -1, 0) == 0) {
-                                                                       alert("MoveSpare", st->devname, st2->devname, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                                                       close(fd1);
-                                                                       close(fd2);
-                                                                       break;
-                                                               }
-                                                               else Manage_subdevs(st2->devname, fd2, &devlist, -1, 0);
-                                                       }
-                                               }
-                                               close(fd1);
-                                               close(fd2);
-                                       }
-                       }
+               if (share && anydegraded)
+                       try_spare_migration(statelist, &info);
                if (!new_found) {
                        if (oneshot)
                                break;
@@ -519,18 +247,100 @@ int Monitor(mddev_dev_t devlist,
        return 0;
 }
 
+static int make_daemon(char *pidfile)
+{
+       /* Return:
+        * -1 in the forked daemon
+        *  0 in the parent
+        *  1 on error
+        * so a non-negative becomes the exit code.
+        */
+       int pid = fork();
+       if (pid > 0) {
+               if (!pidfile)
+                       printf("%d\n", pid);
+               else {
+                       FILE *pid_file;
+                       pid_file=fopen(pidfile, "w");
+                       if (!pid_file)
+                               perror("cannot create pid file");
+                       else {
+                               fprintf(pid_file,"%d\n", pid);
+                               fclose(pid_file);
+                       }
+               }
+               return 0;
+       }
+       if (pid < 0) {
+               perror("daemonise");
+               return 1;
+       }
+       close(0);
+       open("/dev/null", O_RDWR);
+       dup2(0,1);
+       dup2(0,2);
+       setsid();
+       return -1;
+}
+
+static int check_one_sharer(int scan)
+{
+       int pid, rv;
+       FILE *fp;
+       char dir[20];
+       struct stat buf;
+       fp = fopen("/var/run/mdadm/autorebuild.pid", "r");
+       if (fp) {
+               if (fscanf(fp, "%d", &pid) != 1)
+                       pid = -1;
+               sprintf(dir, "/proc/%d", pid);
+               rv = stat(dir, &buf);
+               if (rv != -1) {
+                       if (scan) {
+                               fprintf(stderr, Name ": Only one "
+                                       "autorebuild process allowed"
+                                       " in scan mode, aborting\n");
+                               fclose(fp);
+                               return 1;
+                       } else {
+                               fprintf(stderr, Name ": Warning: One"
+                                       " autorebuild process already"
+                                       " running.\n");
+                       }
+               }
+               fclose(fp);
+       }
+       if (scan) {
+               if (mkdir("/var/run/mdadm", S_IRWXU) < 0 &&
+                   errno != EEXIST) {
+                       fprintf(stderr, Name ": Can't create "
+                               "autorebuild.pid file\n");
+               } else {
+                       fp = fopen("/var/run/mdadm/autorebuild.pid", "w");
+                       if (!fp)
+                               fprintf(stderr, Name ": Cannot create"
+                                       " autorebuild.pid"
+                                       " file\n");
+                       else {
+                               pid = getpid();
+                               fprintf(fp, "%d\n", pid);
+                               fclose(fp);
+                       }
+               }
+       }
+       return 0;
+}
 
-static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd,
-                 int dosyslog)
+static void alert(char *event, char *dev, char *disc, struct alert_info *info)
 {
        int priority;
 
-       if (!cmd && !mailaddr) {
+       if (!info->alert_cmd && !info->mailaddr) {
                time_t now = time(0);
 
                printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device");
        }
-       if (cmd) {
+       if (info->alert_cmd) {
                int pid = fork();
                switch(pid) {
                default:
@@ -539,11 +349,12 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                case -1:
                        break;
                case 0:
-                       execl(cmd, cmd, event, dev, disc, NULL);
+                       execl(info->alert_cmd, info->alert_cmd,
+                             event, dev, disc, NULL);
                        exit(2);
                }
        }
-       if (mailaddr &&
+       if (info->mailaddr &&
            (strncmp(event, "Fail", 4)==0 ||
             strncmp(event, "Test", 4)==0 ||
             strncmp(event, "Spares", 6)==0 ||
@@ -554,20 +365,27 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        char hname[256];
                        gethostname(hname, sizeof(hname));
                        signal(SIGPIPE, SIG_IGN);
-                       if (mailfrom)
-                               fprintf(mp, "From: %s\n", mailfrom);
+                       if (info->mailfrom)
+                               fprintf(mp, "From: %s\n", info->mailfrom);
                        else
                                fprintf(mp, "From: " Name " monitoring <root>\n");
-                       fprintf(mp, "To: %s\n", mailaddr);
-                       fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
+                       fprintf(mp, "To: %s\n", info->mailaddr);
+                       fprintf(mp, "Subject: %s event on %s:%s\n\n",
+                               event, dev, hname);
 
-                       fprintf(mp, "This is an automatically generated mail message from " Name "\n");
+                       fprintf(mp,
+                               "This is an automatically generated"
+                               " mail message from " Name "\n");
                        fprintf(mp, "running on %s\n\n", hname);
 
-                       fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
+                       fprintf(mp,
+                               "A %s event had been detected on"
+                               " md device %s.\n\n", event, dev);
 
                        if (disc && disc[0] != ' ')
-                               fprintf(mp, "It could be related to component device %s.\n\n", disc);
+                               fprintf(mp,
+                                       "It could be related to"
+                                       " component device %s.\n\n", disc);
                        if (disc && disc[0] == ' ')
                                fprintf(mp, "Extra information:%s.\n\n", disc);
 
@@ -577,18 +395,19 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        if (mdstat) {
                                char buf[8192];
                                int n;
-                               fprintf(mp, "\nP.S. The /proc/mdstat file currently contains the following:\n\n");
+                               fprintf(mp,
+                                       "\nP.S. The /proc/mdstat file"
+                                       " currently contains the following:\n\n");
                                while ( (n=fread(buf, 1, sizeof(buf), mdstat)) > 0)
-                                       n=fwrite(buf, 1, n, mp); /* yes, i don't care about the result */
+                                       n=fwrite(buf, 1, n, mp);
                                fclose(mdstat);
                        }
                        pclose(mp);
                }
-
        }
 
        /* log the event to syslog maybe */
-       if (dosyslog) {
+       if (info->dosyslog) {
                /* Log at a different severity depending on the event.
                 *
                 * These are the critical events:  */
@@ -606,10 +425,526 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        priority = LOG_INFO;
 
                if (disc)
-                       syslog(priority, "%s event detected on md device %s, component device %s", event, dev, disc);
+                       syslog(priority,
+                              "%s event detected on md device %s,"
+                              " component device %s", event, dev, disc);
                else
-                       syslog(priority, "%s event detected on md device %s", event, dev);
+                       syslog(priority,
+                              "%s event detected on md device %s",
+                              event, dev);
+       }
+}
+
+/*
+ * check_array - refresh the recorded state of one monitored array and
+ * send an alert for every change that is noticed.
+ *
+ * st:         state record to update; st->devname is the device opened
+ * mdstat:     list of mdstat entries; the entry whose devnum matches
+ *             st->devnum gets its devnum overwritten with INT_MAX to
+ *             flag it as "used"
+ * test:       if non-zero, a "TestMessage" alert is sent first
+ * ainfo:      passed unchanged to every alert() call
+ * increments: step, in percent, between "RebuildNN" alerts
+ *
+ * Returns 1 if the array has fewer active devices than raid devices and
+ * no spare, otherwise 0.  Every early error return yields 0.
+ */
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+                      int test, struct alert_info *ainfo,
+                      int increments)
+{
+       /* Update the state 'st' to reflect any changes shown in mdstat,
+        * or found by directly examining the array, and return
+        * '1' if the array is degraded, or '0' if it is optimal (or dead).
+        */
+       struct { int state, major, minor; } info[MaxDisks];
+       mdu_array_info_t array;
+       struct mdstat_ent *mse = NULL, *mse2;
+       char *dev = st->devname;
+       int fd;
+       int i;
+
+       if (test)
+               alert("TestMessage", dev, NULL, ainfo);
+       fd = open(dev, O_RDONLY);
+       if (fd < 0) {
+               /* st->err remembers that the failure was already
+                * reported, so the alert is only sent once */
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, NULL, ainfo);
+               st->err=1;
+               return 0;
+       }
+       fcntl(fd, F_SETFD, FD_CLOEXEC);
+       if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, NULL, ainfo);
+               st->err=1;
+               close(fd);
+               return 0;
         }
+       /* It's much easier to list what array levels can't
+        * have a device disappear than all of them that can
+        */
+       if (array.level == 0 || array.level == -1) {
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, "Wrong-Level", ainfo);
+               st->err = 1;
+               close(fd);
+               return 0;
+       }
+       if (st->devnum == INT_MAX) {
+               /* devnum not known yet: derive it from the device node.
+                * Devices on MD_MAJOR use the minor number directly, any
+                * other major is mapped to a negative number
+                * (presumably partitionable md devices - TODO confirm).
+                */
+               struct stat stb;
+               if (fstat(fd, &stb) == 0 &&
+                   (S_IFMT&stb.st_mode)==S_IFBLK) {
+                       if (major(stb.st_rdev) == MD_MAJOR)
+                               st->devnum = minor(stb.st_rdev);
+                       else
+                               st->devnum = -1- (minor(stb.st_rdev)>>6);
+               }
+       }
+
+       for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
+               if (mse2->devnum == st->devnum) {
+                       mse2->devnum = INT_MAX; /* flag it as "used" */
+                       mse = mse2;
+               }
+
+       if (!mse) {
+               /* duplicated array in statelist
+                * or re-created after reading mdstat*/
+               st->err = 1;
+               close(fd);
+               return 0;
+       }
+       /* this array is in /proc/mdstat */
+       if (array.utime == 0)
+               /* external arrays don't update utime, so
+                * just make sure it is always different. */
+               array.utime = st->utime + 1;
+
+       /* Nothing changed since the last pass: skip the detailed scan.
+        * 'mse' is known to be non-NULL here.
+        */
+       if (st->utime == array.utime &&
+           st->failed == array.failed_disks &&
+           st->working == array.working_disks &&
+           st->spare == array.spare_disks &&
+           mse->percent == st->percent) {
+               close(fd);
+               st->err = 0;
+               if ((st->active < st->raid) && st->spare == 0)
+                       return 1;
+               else
+                       return 0;
+       }
+       if (st->utime == 0 && /* new array */
+           mse->pattern && strchr(mse->pattern, '_') /* degraded */
+               )
+               alert("DegradedArray", dev, NULL, ainfo);
+
+       if (st->utime == 0 && /* new array */
+           st->expected_spares > 0 &&
+           array.spare_disks < st->expected_spares)
+               alert("SparesMissing", dev, NULL, ainfo);
+       if (st->percent == -1 &&
+           mse->percent >= 0)
+               alert("RebuildStarted", dev, NULL, ainfo);
+       if (st->percent >= 0 &&
+           mse->percent >= 0 &&
+           (mse->percent / increments) > (st->percent / increments)) {
+               char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
+
+               if((mse->percent / increments) == 0)
+                       snprintf(percentalert, sizeof(percentalert), "RebuildStarted");
+               else
+                       snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent);
+
+               alert(percentalert, dev, NULL, ainfo);
+       }
+
+       if (mse->percent == -1 &&
+           st->percent >= 0) {
+               /* Rebuild/sync/whatever just finished.
+                * If there is a number in /mismatch_cnt,
+                * we should report that.
+                */
+               struct mdinfo *sra =
+                       sysfs_read(-1, st->devnum, GET_MISMATCH);
+               if (sra && sra->mismatch_cnt > 0) {
+                       char cnt[40];
+                       sprintf(cnt, " mismatches found: %d", sra->mismatch_cnt);
+                       alert("RebuildFinished", dev, cnt, ainfo);
+               } else
+                       alert("RebuildFinished", dev, NULL, ainfo);
+               if (sra)
+                       free(sra);
+       }
+       st->percent = mse->percent;
+
+       /* Snapshot the per-slot disk information while the array is
+        * still open; a slot whose query fails is recorded as 0:0.
+        */
+       for (i=0; i<MaxDisks && i <= array.raid_disks + array.nr_disks;
+            i++) {
+               mdu_disk_info_t disc;
+               disc.number = i;
+               if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) {
+                       info[i].state = disc.state;
+                       info[i].major = disc.major;
+                       info[i].minor = disc.minor;
+               } else
+                       info[i].major = info[i].minor = 0;
+       }
+
+       /* Guard against an mdstat entry that carries no metadata version
+        * string: handing a NULL pointer to strncmp() is undefined.
+        */
+       if (mse->metadata_version &&
+           strncmp(mse->metadata_version, "external:", 9) == 0 &&
+           is_subarray(mse->metadata_version+9))
+               st->parent_dev =
+                       devname2devnum(mse->metadata_version+10);
+       else
+               st->parent_dev = NoMdDev;
+       if (st->metadata == NULL &&
+           st->parent_dev == NoMdDev)
+               st->metadata = super_by_fd(fd, NULL);
+
+       close(fd);
+
+       /* Compare every slot with the state remembered from the previous
+        * pass and alert on the transitions of interest.
+        */
+       for (i=0; i<MaxDisks; i++) {
+               mdu_disk_info_t disc = {0,0,0,0,0};
+               int newstate=0;
+               int change;
+               char *dv = NULL;
+               disc.number = i;
+               if (i > array.raid_disks + array.nr_disks) {
+                       /* beyond the slots that were queried above */
+                       newstate = 0;
+                       disc.major = disc.minor = 0;
+               } else if (info[i].major || info[i].minor) {
+                       newstate = info[i].state;
+                       dv = map_dev(info[i].major, info[i].minor, 1);
+                       disc.state = newstate;
+                       disc.major = info[i].major;
+                       disc.minor = info[i].minor;
+               } else if (mse->pattern && i < (int)strlen(mse->pattern)) {
+                       /* no device number available: fall back to the
+                        * mdstat pattern string for this slot */
+                       switch(mse->pattern[i]) {
+                       case 'U': newstate = 6 /* ACTIVE/SYNC */; break;
+                       case '_': newstate = 0; break;
+                       }
+                       disc.major = disc.minor = 0;
+               }
+               if (dv == NULL && st->devid[i])
+                       dv = map_dev(major(st->devid[i]),
+                                    minor(st->devid[i]), 1);
+               /* bits that differ from the previously recorded state */
+               change = newstate ^ st->devstate[i];
+               if (st->utime && change && !st->err) {
+                       if (i < array.raid_disks &&
+                           (((newstate&change)&(1<<MD_DISK_FAULTY)) ||
+                            ((st->devstate[i]&change)&(1<<MD_DISK_ACTIVE)) ||
+                            ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)))
+                               )
+                               alert("Fail", dev, dv, ainfo);
+                       else if (i >= array.raid_disks &&
+                                (disc.major || disc.minor) &&
+                                st->devid[i] == makedev(disc.major, disc.minor) &&
+                                ((newstate&change)&(1<<MD_DISK_FAULTY))
+                               )
+                               alert("FailSpare", dev, dv, ainfo);
+                       else if (i < array.raid_disks &&
+                                ! (newstate & (1<<MD_DISK_REMOVED)) &&
+                                (((st->devstate[i]&change)&(1<<MD_DISK_FAULTY)) ||
+                                 ((newstate&change)&(1<<MD_DISK_ACTIVE)) ||
+                                 ((newstate&change)&(1<<MD_DISK_SYNC)))
+                               )
+                               alert("SpareActive", dev, dv, ainfo);
+               }
+               st->devstate[i] = newstate;
+               st->devid[i] = makedev(disc.major, disc.minor);
+       }
+       st->active = array.active_disks;
+       st->working = array.working_disks;
+       st->spare = array.spare_disks;
+       st->failed = array.failed_disks;
+       st->utime = array.utime;
+       st->raid = array.raid_disks;
+       st->err = 0;
+       if ((st->active < st->raid) && st->spare == 0)
+               return 1;
+       return 0;
+}
+
+/*
+ * add_new_arrays - create state records for mdstat entries that are not
+ * monitored yet.
+ *
+ * mdstat:    list of mdstat entries; entries whose devnum is INT_MAX are
+ *            skipped (check_array flags the entries it used that way)
+ * statelist: head of the state list; new records are pushed on the front
+ * test:      if non-zero, a "TestMessage" alert is sent per new array
+ * info:      passed unchanged to alert()
+ *
+ * Entries whose level is "raid0" or "linear" are ignored; entries with
+ * no level at all are kept.
+ *
+ * Returns 1 if at least one array was added, otherwise 0.
+ */
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+                         int test, struct alert_info *info)
+{
+       struct mdstat_ent *mse;
+       int new_found = 0;
+
+       for (mse=mdstat; mse; mse=mse->next)
+               if (mse->devnum != INT_MAX &&
+                   (!mse->level  || /* retrieve containers */
+                    (strcmp(mse->level, "raid0") != 0 &&
+                     strcmp(mse->level, "linear") != 0))
+                       ) {
+                       struct state *st = calloc(1, sizeof *st);
+                       mdu_array_info_t array;
+                       char *name;
+                       int fd;
+                       if (st == NULL)
+                               continue;
+                       /* Check the name before copying it: strdup()
+                        * must not be handed a NULL pointer.
+                        */
+                       name = get_md_name(mse->devnum);
+                       if (name == NULL) {
+                               free(st);
+                               continue;
+                       }
+                       st->devname = strdup(name);
+                       if (st->devname == NULL) {
+                               /* out of memory: release the name and
+                                * skip this array for now */
+                               put_md_name(name);
+                               free(st);
+                               continue;
+                       }
+                       if ((fd = open(st->devname, O_RDONLY)) < 0 ||
+                           ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
+                               /* no such array */
+                               if (fd >=0) close(fd);
+                               put_md_name(st->devname);
+                               free(st->devname);
+                               /* st came from calloc() and no metadata
+                                * was attached, so nothing else to free */
+                               free(st);
+                               continue;
+                       }
+                       close(fd);
+                       st->next = *statelist;
+                       /* err starts at 1, so check_array stays quiet
+                        * if the device cannot be opened on its first
+                        * look at this array */
+                       st->err = 1;
+                       st->devnum = mse->devnum;
+                       st->percent = -2;
+                       st->expected_spares = -1;
+                       /* Guard against an entry without a metadata
+                        * version string before comparing it.
+                        */
+                       if (mse->metadata_version &&
+                           strncmp(mse->metadata_version, "external:", 9) == 0 &&
+                           is_subarray(mse->metadata_version+9))
+                               st->parent_dev =
+                                       devname2devnum(mse->metadata_version+10);
+                       else
+                               st->parent_dev = NoMdDev;
+                       *statelist = st;
+                       if (test)
+                               alert("TestMessage", st->devname, NULL, info);
+                       alert("NewArray", st->devname, NULL, info);
+                       new_found = 1;
+               }
+       return new_found;
+}
+
+/*
+ * Ask the metadata handler of 'st' for the smallest spare it accepts.
+ *
+ * On success 0 is returned and *sizep holds the size; *sizep is 0 when
+ * there is no metadata or the handler has no min_acceptable_spare_size
+ * method.  1 is returned if the device cannot be opened or no
+ * superblock is available after loading.
+ */
+static int get_min_spare_size_required(struct state *st, unsigned long long *sizep)
+{
+       struct supertype *meta = st->metadata;
+       int devfd;
+
+       if (meta == NULL || meta->ss->min_acceptable_spare_size == NULL) {
+               /* nothing to ask: no size restriction */
+               *sizep = 0;
+               return 0;
+       }
+
+       devfd = open(st->devname, O_RDONLY);
+       if (devfd < 0)
+               return 1;
+
+       /* external metadata is loaded as a container, the rest as a
+        * plain superblock; success is judged by ->sb below */
+       if (!meta->ss->external)
+               meta->ss->load_super(meta, devfd, st->devname);
+       else
+               meta->ss->load_container(meta, devfd, st->devname);
+       close(devfd);
+
+       if (meta->sb == NULL)
+               return 1;
+
+       *sizep = meta->ss->min_acceptable_spare_size(meta);
+       meta->ss->free_super(meta);
+       return 0;
+}
+
+/*
+ * check_donor - decide whether 'from' may give up a spare for 'to'.
+ *
+ * Returns 1 if 'from' is an acceptable donor, otherwise 0.  An array is
+ * rejected when it is the target itself, is a member of a container, is
+ * in error, has no known metadata, has a degraded subarray, is itself
+ * degraded (non-external metadata only) or has no spare.
+ */
+static int check_donor(struct state *from, struct state *to)
+{
+       struct state *sub;
+
+       if (from == to)
+               return 0;
+       if (from->parent)
+               /* Cannot move from a member */
+               return 0;
+       if (from->err)
+               return 0;
+       if (!from->metadata)
+               /* check_array only loads metadata for arrays that are
+                * not subarrays, so a subarray whose container was not
+                * linked as its parent arrives here without any.
+                * It cannot be used as a donor.
+                */
+               return 0;
+       for (sub = from->subarray; sub; sub = sub->subarray)
+               /* If source array has degraded subarrays, don't
+                * remove anything
+                */
+               if (sub->active < sub->raid)
+                       return 0;
+       if (from->metadata->ss->external == 0)
+               if (from->active < from->raid)
+                       return 0;
+       if (from->spare <= 0)
+               return 0;
+       return 1;
+}
+
+/*
+ * Look through the slots of 'from' at and above from->raid for a device
+ * that may be given to 'to'.
+ *
+ * A slot qualifies when it holds a device whose recorded state is 0,
+ * the device passes the partition and size filters below, and
+ * domain_test() returns 1 for it against 'domlist'.
+ *
+ * Returns the device number of the first match, or 0 if there is none.
+ */
+static dev_t choose_spare(struct state *from, struct state *to,
+                       struct domainlist *domlist, unsigned long long min_size)
+{
+       dev_t chosen = 0;
+       int slot;
+
+       for (slot = from->raid; !chosen && slot < MaxDisks; slot++) {
+               struct dev_policy *pol;
+               unsigned long long dev_size;
+
+               if (from->devid[slot] <= 0 || from->devstate[slot] != 0)
+                       continue;
+
+               /* external-metadata targets skip devices for which
+                * test_partition_from_id() is non-zero
+                * (presumably partitions - TODO confirm) */
+               if (to->metadata->ss->external &&
+                   test_partition_from_id(from->devid[slot]))
+                       continue;
+
+               /* too small, when a minimum was requested and the
+                * size lookup reported non-zero */
+               if (min_size &&
+                   dev_size_from_id(from->devid[slot], &dev_size) &&
+                   dev_size < min_size)
+                       continue;
+
+               pol = devnum_policy(from->devid[slot]);
+               if (from->spare_group)
+                       pol_add(&pol, pol_domain,
+                               from->spare_group, NULL);
+               if (domain_test(domlist, pol, to->metadata->ss->name) == 1)
+                       chosen = from->devid[slot];
+               dev_policy_free(pol);
+       }
+       return chosen;
+}
+
+/*
+ * Container flavour of choose_spare: devstate cannot be trusted for a
+ * container, so the metadata is loaded and consulted instead.
+ *
+ * Returns the device number of a suitable spare, or 0 if none is found
+ * or the container cannot be read.  When 'from' and 'to' are the same
+ * container, 1 is returned if the disk list cannot be obtained or if
+ * more in-sync, non-faulty disks are found than 'active'.
+ */
+static dev_t container_choose_spare(struct state *from, struct state *to,
+                                   struct domainlist *domlist,
+                                   unsigned long long min_size, int active)
+{
+       struct supertype *super = from->metadata;
+       struct mdinfo *found;
+       dev_t chosen = 0;
+       int load_failed;
+       int fd;
+
+       fd = open(from->devname, O_RDONLY);
+       if (fd < 0)
+               return 0;
+       if (!super->ss->getinfo_super_disks) {
+               close(fd);
+               return 0;
+       }
+
+       load_failed = super->ss->load_container(super, fd, NULL);
+       close(fd);
+       if (load_failed)
+               return 0;
+
+       if (from == to) {
+               /* We must check if number of active disks has not increased
+                * since ioctl in main loop. mdmon may have added spare
+                * to subarray. If so we do not need to look for more spares
+                * so return non zero value */
+               struct mdinfo *disks = super->ss->getinfo_super_disks(super);
+               struct mdinfo *dp;
+               int in_sync = 0;
+
+               if (!disks) {
+                       super->ss->free_super(super);
+                       return 1;
+               }
+               for (dp = disks->devs; dp; dp = dp->next)
+                       if ((dp->disk.state & (1<<MD_DISK_SYNC)) &&
+                           !(dp->disk.state & (1<<MD_DISK_FAULTY)))
+                               in_sync++;
+               sysfs_free(disks);
+               if (active < in_sync) {
+                       /* Spare just activated.*/
+                       super->ss->free_super(super);
+                       return 1;
+               }
+       }
+
+       /* We only need one spare so full list not needed */
+       found = container_choose_spares(super, min_size, domlist,
+                                       from->spare_group,
+                                       to->metadata->ss->name, 1);
+       if (found) {
+               if (found->devs)
+                       chosen = makedev(found->devs->disk.major,
+                                        found->devs->disk.minor);
+               sysfs_free(found);
+       }
+       super->ss->free_super(super);
+       return chosen;
+}
+
+
+/*
+ * try_spare_migration - for every array that is degraded and has no
+ * spare, try to move one spare in from another monitored array.
+ *
+ * statelist: all monitored arrays; the container/subarray links are
+ *            rebuilt before the scan
+ * info:      passed to alert() when a spare was moved ("MoveSpare")
+ *
+ * A subarray is served through its container.  A donor is accepted by
+ * check_donor(), and the spare must pass the domain test built from the
+ * target's current devices and spare group.
+ */
+static void try_spare_migration(struct state *statelist, struct alert_info *info)
+{
+       struct state *from;
+       struct state *st;
+
+       link_containers_with_subarrays(statelist);
+       for (st = statelist; st; st = st->next)
+               if (st->active < st->raid &&
+                   st->spare == 0 && !st->err) {
+                       struct domainlist *domlist = NULL;
+                       int d;
+                       struct state *to = st;
+                       unsigned long long min_size;
+
+                       if (to->parent_dev != NoMdDev && !to->parent)
+                               /* subarray monitored without parent container
+                                * we can't move spares here */
+                               continue;
+
+                       if (to->parent)
+                               /* member of a container */
+                               to = to->parent;
+
+                       if (get_min_spare_size_required(to, &min_size))
+                               continue;
+                       if (!to->metadata)
+                               /* get_min_spare_size_required() succeeds
+                                * when there is no metadata, but everything
+                                * below needs the metadata handler */
+                               continue;
+                       if (to->metadata->ss->external) {
+                               /* We must make sure there is
+                                * no suitable spare in container already.
+                                * If there is we don't add more */
+                               dev_t devid = container_choose_spare(
+                                       to, to, NULL, min_size, st->active);
+                               if (devid > 0)
+                                       continue;
+                       }
+                       for (d = 0; d < MaxDisks; d++)
+                               if (to->devid[d])
+                                       domainlist_add_dev(&domlist,
+                                                          to->devid[d],
+                                                          to->metadata->ss->name);
+                       if (to->spare_group)
+                               domain_add(&domlist, to->spare_group);
+                       /*
+                        * No spare migration if the destination
+                        * has no domain. Skip this array.
+                        */
+                       if (!domlist)
+                               continue;
+                       for (from=statelist ; from ; from=from->next) {
+                               dev_t devid;
+                               if (!check_donor(from, to))
+                                       continue;
+                               if (from->metadata->ss->external)
+                                       devid = container_choose_spare(
+                                               from, to, domlist, min_size, 0);
+                               else
+                                       devid = choose_spare(from, to, domlist,
+                                                            min_size);
+                               /* stop at the first donor that works */
+                               if (devid > 0
+                                   && move_spare(from->devname, to->devname, devid)) {
+                                       alert("MoveSpare", to->devname, from->devname, info);
+                                       break;
+                               }
+                       }
+                       domain_free(domlist);
+               }
+}
+
+/* Connect every external-metadata subarray in the state list with its
+ * container.
+ * The whole tree is rebuilt from scratch on each call, as that is the
+ * safest approach given that entries may disappear or change.
+ * Subarrays of one container are chained through ->subarray, starting
+ * at the container, with the most recently linked one first.
+ */
+static void link_containers_with_subarrays(struct state *list)
+{
+       struct state *member;
+       struct state *candidate;
+
+       /* forget every link made on an earlier pass */
+       for (member = list; member; member = member->next) {
+               member->parent = NULL;
+               member->subarray = NULL;
+       }
+
+       for (member = list; member; member = member->next) {
+               if (member->parent_dev == NoMdDev)
+                       continue;
+               for (candidate = list; candidate; candidate = candidate->next) {
+                       /* the parent must be error free, must not be a
+                        * subarray itself and must carry the device
+                        * number the member points at */
+                       if (candidate->err ||
+                           candidate->parent_dev != NoMdDev ||
+                           candidate->devnum != member->parent_dev)
+                               continue;
+                       member->parent = candidate;
+                       member->subarray = candidate->subarray;
+                       candidate->subarray = member;
+                       break;
+               }
+       }
 }
 
 /* Not really Monitor but ... */
diff --git a/Query.c b/Query.c
index 8847be7ec0b6a1e0880865d672b38f9f0374d2e2..f9857d6d9e2ddadd86726db70046324bf4b0ee7d 100644 (file)
--- a/Query.c
+++ b/Query.c
@@ -90,7 +90,7 @@ int Query(char *dev)
        close(fd);
        if (superror == 0) {
                /* array might be active... */
-               st->ss->getinfo_super(st, &info);
+               st->ss->getinfo_super(st, &info, NULL);
                if (st->ss == &super0) {
                        mddev = get_md_name(info.array.md_minor);
                        disc.number = info.disk.number;
index e1f8c6867851ed532ef483c386736261d85698c7..90cab8713058583299ef2a517e1f4b6e8e3ac8e3 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -24,7 +24,7 @@
 
 #include "mdadm.h"
 
-char Version[] = Name " - v3.1.4 - 31st August 2010\n";
+char Version[] = Name " - v3.2 DEVELOPER_ONLY - 1st February 2011 (USE WITH CARE)\n";
 
 /*
  * File: ReadMe.c
@@ -93,8 +93,8 @@ char short_bitmap_auto_options[]=
                    "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
 
 struct option long_options[] = {
-    {"manage",    0, 0, '@'},
-    {"misc",      0, 0, '#'},
+    {"manage",    0, 0, ManageOpt},
+    {"misc",      0, 0, MiscOpt},
     {"assemble",  0, 0, 'A'},
     {"build",     0, 0, 'B'},
     {"create",    0, 0, 'C'},
@@ -110,35 +110,37 @@ struct option long_options[] = {
     {"detail-platform", 0, 0, DetailPlatform},
     {"kill-subarray", 1, 0, KillSubarray},
     {"update-subarray", 1, 0, UpdateSubarray},
+    {"udev-rules", 2, 0, UdevRules},
 
     /* synonyms */
     {"monitor",   0, 0, 'F'},
 
     /* after those will normally come the name of the md device */
     {"help",      0, 0, 'h'},
-    {"help-options",0,0,'h'},
+    {"help-options",0,0, HelpOptions},
     {"version",          0, 0, 'V'},
     {"verbose",   0, 0, 'v'},
     {"quiet",    0, 0, 'q'},
 
     /* For create or build: */
-    {"chunk",    1, 0, 'c'},
-    {"rounding",  1, 0, 'c'}, /* for linear, chunk is really a rounding number */
+    {"chunk",    1, 0, ChunkSize},
+    {"rounding",  1, 0, ChunkSize}, /* for linear, chunk is really a
+                                    * rounding number */
     {"level",     1, 0, 'l'}, /* 0,1,4,5,6,linear */
-    {"parity",    1, 0, 'p'}, /* {left,right}-{a,}symmetric */
-    {"layout",    1, 0, 'p'},
+    {"parity",    1, 0, Layout}, /* {left,right}-{a,}symmetric */
+    {"layout",    1, 0, Layout},
     {"raid-disks",1, 0, 'n'},
     {"raid-devices",1, 0, 'n'},
     {"spare-disks",1,0, 'x'},
     {"spare-devices",1,0, 'x'},
     {"size",     1, 0, 'z'},
-    {"auto",     1, 0, 'a'}, /* also for --assemble */
+    {"auto",     1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
     {"metadata",  1, 0, 'e'}, /* superblock format */
-    {"bitmap",   1, 0, 'b'},
+    {"bitmap",   1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
     {"write-behind", 2, 0, WriteBehind},
-    {"write-mostly",0, 0, 'W'},
+    {"write-mostly",0, 0, WriteMostly},
     {"re-add",    0, 0,  ReAdd},
     {"homehost",  1, 0,  HomeHost},
 #if 0
@@ -148,49 +150,54 @@ struct option long_options[] = {
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
-    {"super-minor",1,0, 'm'},
+    {"super-minor",1,0, SuperMinor},
     {"name",     1, 0, 'N'},
-    {"config",    1, 0, 'c'},
+    {"config",    1, 0, ConfigFile},
     {"scan",      0, 0, 's'},
-    {"force",    0, 0, 'f'},
+    {"force",    0, 0, Force},
     {"update",   1, 0, 'U'},
 
     /* Management */
-    {"add",       0, 0, 'a'},
-    {"remove",    0, 0, 'r'},
-    {"fail",      0, 0, 'f'},
-    {"set-faulty",0, 0, 'f'},
+    {"add",       0, 0, Add},
+    {"remove",    0, 0, Remove},
+    {"fail",      0, 0, Fail},
+    {"set-faulty",0, 0, Fail},
     {"run",       0, 0, 'R'},
     {"stop",      0, 0, 'S'},
     {"readonly",  0, 0, 'o'},
     {"readwrite", 0, 0, 'w'},
     {"no-degraded",0,0,  NoDegraded },
-    {"wait",     0, 0, 'W'},
+    {"wait",     0, 0,  WaitOpt},
     {"wait-clean", 0, 0, Waitclean },
 
     /* For Detail/Examine */
-    {"brief",    0, 0, 'b'},
+    {"brief",    0, 0, Brief},
     {"export",   0, 0, 'Y'},
     {"sparc2.2",  0, 0, Sparc22},
     {"test",      0, 0, 't'},
 
     /* For Follow/monitor */
-    {"mail",      1, 0, 'm'},
-    {"program",   1, 0, 'p'},
-    {"alert",     1, 0, 'p'},
-    {"increment", 1, 0, 'r'},
+    {"mail",      1, 0, EMail},
+    {"program",   1, 0, ProgramOpt},
+    {"alert",     1, 0, ProgramOpt},
+    {"increment", 1, 0, Increment},
     {"delay",     1, 0, 'd'},
-    {"daemonise", 0, 0, 'f'},
-    {"daemonize", 0, 0, 'f'},
+    {"daemonise", 0, 0, Fork},
+    {"daemonize", 0, 0, Fork},
     {"oneshot",   0, 0, '1'},
     {"pid-file",  1, 0, 'i'},
     {"syslog",    0, 0, 'y'},
+    {"no-sharing", 0, 0, NoSharing},
+
     /* For Grow */
     {"backup-file", 1,0, BackupFile},
+    {"invalid-backup",0,0,InvalidBackup},
     {"array-size", 1, 0, 'Z'},
 
     /* For Incremental */
-    {"rebuild-map", 0, 0, 'r'},
+    {"rebuild-map", 0, 0, RebuildMapOpt},
+    {"path", 1, 0, IncrementalPath},
+
     {0, 0, 0, 0}
 };
 
index 541a85d83e7ff17b7a0c508cb4c2d86d380fcb43..1f78c689ba5d4407d4331967c9cdb90ae86d8519 100644 (file)
--- a/config.c
+++ b/config.c
@@ -75,7 +75,7 @@ char DefaultConfFile[] = CONFFILE;
 char DefaultAltConfFile[] = CONFFILE2;
 
 enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
-               Homehost, AutoMode, LTEnd };
+               Homehost, AutoMode, Policy, PartPolicy, LTEnd };
 char *keywords[] = {
        [Devices]  = "devices",
        [Array]    = "array",
@@ -85,6 +85,8 @@ char *keywords[] = {
        [CreateDev]= "create",
        [Homehost] = "homehost",
        [AutoMode] = "auto",
+       [Policy]   = "policy",
+       [PartPolicy]="part-policy",
        [LTEnd]    = NULL
 };
 
@@ -229,11 +231,11 @@ struct conf_dev {
     char *name;
 } *cdevlist = NULL;
 
-mddev_dev_t load_partitions(void)
+struct mddev_dev *load_partitions(void)
 {
        FILE *f = fopen("/proc/partitions", "r");
        char buf[1024];
-       mddev_dev_t rv = NULL;
+       struct mddev_dev *rv = NULL;
        if (f == NULL) {
                fprintf(stderr, Name ": cannot open /proc/partitions\n");
                return NULL;
@@ -241,7 +243,7 @@ mddev_dev_t load_partitions(void)
        while (fgets(buf, 1024, f)) {
                int major, minor;
                char *name, *mp;
-               mddev_dev_t d;
+               struct mddev_dev *d;
 
                buf[1023] = '\0';
                if (buf[0] != ' ')
@@ -258,19 +260,18 @@ mddev_dev_t load_partitions(void)
                d->devname = strdup(name);
                d->next = rv;
                d->used = 0;
-               d->content = NULL;
                rv = d;
        }
        fclose(f);
        return rv;
 }
 
-mddev_dev_t load_containers(void)
+struct mddev_dev *load_containers(void)
 {
        struct mdstat_ent *mdstat = mdstat_read(1, 0);
        struct mdstat_ent *ent;
-       mddev_dev_t d;
-       mddev_dev_t rv = NULL;
+       struct mddev_dev *d;
+       struct mddev_dev *rv = NULL;
 
        if (!mdstat)
                return NULL;
@@ -288,7 +289,6 @@ mddev_dev_t load_containers(void)
                        }
                        d->next = rv;
                        d->used = 0;
-                       d->content = NULL;
                        rv = d;
                }
        free_mdstat(mdstat);
@@ -440,8 +440,8 @@ void devline(char *line)
        }
 }
 
-mddev_ident_t mddevlist = NULL;
-mddev_ident_t *mddevlp = &mddevlist;
+struct mddev_ident *mddevlist = NULL;
+struct mddev_ident **mddevlp = &mddevlist;
 
 static int is_number(char *w)
 {
@@ -458,8 +458,8 @@ void arrayline(char *line)
 {
        char *w;
 
-       struct mddev_ident_s mis;
-       mddev_ident_t mi;
+       struct mddev_ident mis;
+       struct mddev_ident *mi;
 
        mis.uuid_set = 0;
        mis.super_minor = UnSet;
@@ -675,24 +675,113 @@ void homehostline(char *line)
        }
 }
 
-static char *auto_options = NULL;
+char auto_yes[] = "yes";
+char auto_no[] = "no";
+char auto_homehost[] = "homehost";
+
+static int auto_seen = 0;
 void autoline(char *line)
 {
        char *w;
+       char *seen;
+       int super_cnt;
+       char *dflt = auto_yes;
+       int homehost = 0;
+       int i;
 
-       if (auto_options) {
+       if (auto_seen) {
                fprintf(stderr, Name ": AUTO line may only be give once."
                        "  Subsequent lines ignored\n");
                return;
        }
+       /* Parse the 'auto' line creating policy statements for the 'auto' policy.
+        *
+        * The default is 'yes' but the 'auto' line might over-ride that.
+        * Words in the line are processed in order with the first
+        * match winning.
+        * word can be:
+        *   +version   - that version can be assembled
+        *   -version   - that version cannot be auto-assembled
+        *   yes or +all - any other version can be assembled
+        *   no or -all  - no other version can be assembled.
+        *   homehost   - any array associated by 'homehost' to this
+        *                host can be assembled.
+        *
+        * Thus:
+        *   +ddf -0.90 homehost -all
+        * will auto-assemble any ddf array, no 0.90 array, and
+        * any other array (imsm, 1.x) if and only if it is identified
+        * as belonging to this host.
+        *
+        * We translate that to policy by creating 'auto=yes' when we see
+        * a '+version' line, 'auto=no' if we see '-version' before 'homehost',
+        * or 'auto=homehost' if we see '-version' after 'homehost'.
+        * When we see yes, no, +all or -all we stop, and any version that hasn't
+        * been seen gets an appropriate auto= entry.
+        */
 
-       auto_options = dl_strdup(line);
-       dl_init(auto_options);
+       for (super_cnt = 0; superlist[super_cnt]; super_cnt++)
+               ;
+       seen = calloc(super_cnt, 1);
 
-       for (w=dl_next(line); w != line ; w=dl_next(w)) {
-               char *w2 = dl_strdup(w);
-               dl_add(auto_options, w2);
+       for (w = dl_next(line); w != line ; w = dl_next(w)) {
+               char *val;
+
+               if (strcasecmp(w, "yes") == 0) {
+                       dflt = auto_yes;
+                       break;
+               }
+               if (strcasecmp(w, "no") == 0) {
+                       if (homehost)
+                               dflt = auto_homehost;
+                       else
+                               dflt = auto_no;
+                       break;
+               }
+               if (strcasecmp(w, "homehost") == 0) {
+                       homehost = 1;
+                       continue;
+               }
+               if (w[0] == '+')
+                       val = auto_yes;
+               else if (w[0] == '-') {
+                       if (homehost)
+                               val = auto_homehost;
+                       else
+                               val = auto_no;
+               } else
+                       continue;
+
+               if (strcasecmp(w+1, "all") == 0) {
+                       dflt = val;
+                       break;
+               }
+               for (i = 0; superlist[i]; i++) {
+                       const char *version = superlist[i]->name;
+                       if (strcasecmp(w+1, version) == 0)
+                               break;
+                       /* 1 matches 1.x, 0 matches 0.90 */
+                       if (version[1] == '.' &&
+                           strlen(w+1) == 1 &&
+                           w[1] == version[0])
+                               break;
+                       /* 1.anything matches 1.x */
+                       if (strcmp(version, "1.x") == 0 &&
+                           strncmp(w+1, "1.", 2) == 0)
+                               break;
+               }
+               if (superlist[i] == NULL)
+                       /* ignore this word */
+                       continue;
+               if (seen[i])
+                       /* already know about this metadata */
+                       continue;
+               policy_add(rule_policy, pol_auto, val, pol_metadata, superlist[i]->name, NULL);
+               seen[i] = 1;
        }
+       for (i = 0; i < super_cnt; i++)
+               if (!seen[i])
+                       policy_add(rule_policy, pol_auto, dflt, pol_metadata, superlist[i]->name, NULL);
 }
 
 int loaded = 0;
@@ -767,6 +856,12 @@ void load_conffile(void)
                case AutoMode:
                        autoline(line);
                        break;
+               case Policy:
+                       policyline(line, rule_policy);
+                       break;
+               case PartPolicy:
+                       policyline(line, rule_part);
+                       break;
                default:
                        fprintf(stderr, Name ": Unknown keyword %s\n", line);
                }
@@ -810,9 +905,9 @@ struct createinfo *conf_get_create_info(void)
        return &createinfo;
 }
 
-mddev_ident_t conf_get_ident(char *dev)
+struct mddev_ident *conf_get_ident(char *dev)
 {
-       mddev_ident_t rv;
+       struct mddev_ident *rv;
        load_conffile();
        rv = mddevlist;
        while (dev && rv && (rv->devname == NULL
@@ -821,23 +916,23 @@ mddev_ident_t conf_get_ident(char *dev)
        return rv;
 }
 
-static void append_dlist(mddev_dev_t *dlp, mddev_dev_t list)
+static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list)
 {
        while (*dlp)
                dlp = &(*dlp)->next;
        *dlp = list;
 }
 
-mddev_dev_t conf_get_devs()
+struct mddev_dev *conf_get_devs()
 {
        glob_t globbuf;
        struct conf_dev *cd;
        int flags = 0;
-       static mddev_dev_t dlist = NULL;
+       static struct mddev_dev *dlist = NULL;
        unsigned int i;
 
        while (dlist) {
-               mddev_dev_t t = dlist;
+               struct mddev_dev *t = dlist;
                dlist = dlist->next;
                free(t->devname);
                free(t);
@@ -863,11 +958,10 @@ mddev_dev_t conf_get_devs()
        }
        if (flags & GLOB_APPEND) {
                for (i=0; i<globbuf.gl_pathc; i++) {
-                       mddev_dev_t t = malloc(sizeof(*t));
+                       struct mddev_dev *t = malloc(sizeof(*t));
                        t->devname = strdup(globbuf.gl_pathv[i]);
                        t->next = dlist;
                        t->used = 0;
-                       t->content = NULL;
                        dlist = t;
 /*     printf("one dev is %s\n", t->devname);*/
                }
@@ -892,64 +986,30 @@ int conf_test_dev(char *devname)
        return 0;
 }
 
-int conf_test_metadata(const char *version, int is_homehost)
+int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost)
 {
-       /* Check if the given metadata version is allowed
-        * to be auto-assembled.
-        * The default is 'yes' but the 'auto' line might over-ride that.
-        * Words in auto_options are processed in order with the first
-        * match winning.
-        * word can be:
-        *   +version   - that version can be assembled
-        *   -version   - that version cannot be auto-assembled
-        *   yes or +all - any other version can be assembled
-        *   no or -all  - no other version can be assembled.
-        *   homehost   - any array associated by 'homehost' to this
-        *                host can be assembled.
-        *
-        * Thus:
-        *   +ddf -0.90 homehost -all
-        * will auto-assemble any ddf array, no 0.90 array, and
-        * any other array (imsm, 1.x) if and only if it is identified
-        * as belonging to this host.
+       /* If anyone said 'yes', that sticks.
+        * else if homehost applies, use that
+        * else if there is a 'no', say 'no'.
+        * else 'yes'.
         */
-       char *w;
+       struct dev_policy *p;
+       int no=0, found_auto=0;
        load_conffile();
-       if (!auto_options)
-               return 1;
-       for (w = dl_next(auto_options); w != auto_options; w = dl_next(w)) {
-               int rv;
-               if (strcasecmp(w, "yes") == 0)
+
+       pol = pol_find(pol, pol_auto);
+       pol_for_each(p, pol, version) {
+               if (strcmp(p->value, "yes") == 0)
                        return 1;
-               if (strcasecmp(w, "no") == 0)
-                       return 0;
-               if (strcasecmp(w, "homehost") == 0) {
-                       if (is_homehost)
-                               return 1;
-                       else
-                               continue;
-               }
-               if (w[0] == '+')
-                       rv = 1;
-               else if (w[0] == '-')
-                       rv = 0;
-               else continue;
-
-               if (strcasecmp(w+1, "all") == 0)
-                       return rv;
-               if (strcasecmp(w+1, version) == 0)
-                       return rv;
-               /* allow  '0' to match version '0.90'
-                * and 1 or 1.whatever to match version '1.x'
-                */
-               if (version[1] == '.' &&
-                   strlen(w+1) == 1 &&
-                   w[1] == version[0])
-                       return rv;
-               if (version[1] == '.' && version[2] == 'x' &&
-                   strncmp(w+1, version, 2) == 0)
-                       return rv;
+               if (strcmp(p->value, "auto") == 0)
+                       found_auto = 1;
+               if (strcmp(p->value, "no") == 0)
+                       no = 1;
        }
+       if (is_homehost && found_auto)
+               return 1;
+       if (no)
+               return 0;
        return 1;
 }
 
@@ -959,7 +1019,6 @@ int match_oneof(char *devices, char *devname)
      * matches devname
      */
 
-
     while (devices && *devices) {
        char patn[1024];
        char *p = devices;
@@ -1016,7 +1075,7 @@ int conf_name_is_free(char *name)
         * It can be taken either by a match on devname, name, or
         * even super-minor.
         */
-       mddev_ident_t dev;
+       struct mddev_ident *dev;
 
        load_conffile();
        for (dev = mddevlist; dev; dev = dev->next) {
@@ -1033,9 +1092,9 @@ int conf_name_is_free(char *name)
        return 1;
 }
 
-struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st)
+struct mddev_ident *conf_match(struct mdinfo *info, struct supertype *st)
 {
-       struct mddev_ident_s *array_list, *match;
+       struct mddev_ident *array_list, *match;
        int verbose = 0;
        char *devname = NULL;
        array_list = conf_get_ident(NULL);
diff --git a/external-reshape-design.txt b/external-reshape-design.txt
new file mode 100644 (file)
index 0000000..4eb04a2
--- /dev/null
@@ -0,0 +1,280 @@
+External Reshape
+
+1 Problem statement
+
+External (third-party metadata) reshape differs from native-metadata
+reshape in three key ways:
+
+1.1 Format specific constraints
+
+In the native case reshape is limited by what is implemented in the
+generic reshape routine (Grow_reshape()) and what is supported by the
+kernel.  There are exceptional cases where Grow_reshape() may block
+operations when it knows that the kernel implementation is broken, but
+otherwise the kernel is relied upon to be the final arbiter of what
+reshape operations are supported.
+
+In the external case the kernel, and the generic checks in
+Grow_reshape(), become the super-set of what reshapes are possible.  The
+metadata format may not support, or may have yet to implement, a given
+reshape type.  The implication for Grow_reshape() is that it must query
+the metadata handler and effect changes in the metadata before the new
+geometry is posted to the kernel.  The ->reshape_super method allows
+Grow_reshape() to validate the requested operation and post the metadata
+update.
+
+1.2 Scope of reshape
+
+Native metadata reshape is always performed at the array scope (no
+metadata relationship with sibling arrays on the same disks).  External
+reshape, depending on the format, may not allow the number of member
+disks to be changed in a subarray unless the change is simultaneously
+applied to all subarrays in the container.  For example the imsm format
+requires all member disks to be a member of all subarrays, so a 4-disk
+raid5 in a container that also houses a 4-disk raid10 array could not be
+reshaped to 5 disks as the imsm format does not support a 5-disk raid10
+representation.  This requires the ->reshape_super method to check the
+contents of the array and ask the user to run the reshape at container
+scope (if all subarrays are agreeable to the change), or report an
+error in the case where one subarray cannot support the change.
+
+1.3 Monitoring / checkpointing
+
+Reshape, unlike rebuild/resync, requires strict checkpointing to survive
+interrupted reshape operations.  For example when expanding a raid5
+array the first few stripes of the array will be overwritten in a
+destructive manner.  When restarting the reshape process we need to know
+the exact location of the last successfully written stripe, and we need
+to restore the data in any partially overwritten stripe.  Native
+metadata stores this backup data in the unused portion of spares that
+are being promoted to array members, or in an external backup file
+(located on a non-involved block device).
+
+The kernel is in charge of recording checkpoints of reshape progress,
+but mdadm is delegated the task of managing the backup space which
+involves:
+1/ Identifying what data will be overwritten in the next unit of reshape
+   operation
+2/ Suspending access to that region so that a snapshot of the data can
+   be transferred to the backup space.
+3/ Allowing the kernel to reshape the saved region and setting the
+   boundary for the next backup.
+
+In the external reshape case we want to preserve this mdadm
+'reshape-manager' arrangement, but have a third actor, mdmon, to
+consider.  It is tempting to give the role of managing reshape to mdmon,
+but that is counter to its role as a monitor, and conflicts with the
+existing capabilities and role of mdadm to manage the progress of
+reshape.  For clarity the external reshape implementation maintains the
+role of mdmon as a (mostly) passive recorder of raid events, and mdadm
+treats it as it would the kernel in the native reshape case (modulo
+needing to send explicit metadata update messages and checking that
+mdmon took the expected action).
+
+External reshape can use the generic md backup file as a fallback, but in the
+optimal/firmware-compatible case the reshape-manager will use the metadata
+specific areas for managing reshape.  The implementation also needs to spawn a
+reshape-manager per subarray when the reshape is being carried out at the
+container level.  For these two reasons the ->manage_reshape() method is
+introduced.  This method in addition to base tasks mentioned above:
+1/ Processes each subarray one at a time in series - where appropriate.
+2/ Uses either generic routines in Grow.c for md-style backup file
+   support, or uses the metadata-format specific location for storing
+   recovery data.
+This aims to avoid a "midlayer mistake"[1] and lets the metadata handler
+optionally take advantage of generic infrastructure in Grow.c
+
+2 Details for specific reshape requests
+
+There are quite a few moving pieces spread out across md, mdadm, and mdmon for
+the support of external reshape, and there are several different types of
+reshape that need to be comprehended by the implementation.  A rundown of
+these details follows.
+
+2.0 General provisions:
+
+Obtain an exclusive open on the container to make sure we are not
+running concurrently with a Create() event.
+
+2.1 Freezing sync_action
+
+   Before making any attempt at a reshape we 'freeze' every array in
+   the container to ensure no spare assignment or recovery happens.
+   This involves writing 'frozen' to sync_action and changing the '/'
+   after 'external:' in metadata_version to a '-'. mdmon knows that
+   this means not to perform any management.
+
+   Before doing this we check that all sync_actions are 'idle', which
+   is racy but still useful.
+   Afterwards we check that all member arrays have no spares
+   or partial spares (recovery_start != 'none') which would indicate a
+   race.  If they do, we unfreeze again.
+
+   Once this completes we know all the arrays are stable.  They may
+   still have failed devices as devices can fail at any time.  However
+   we treat those like failures that happen during the reshape.
+
+2.2 Reshape size
+
+   1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+      initializes st->update_tail
+   2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change
+      is allowed (being performed at subarray scope / enough room) and prepares
+      a metadata update
+   3/ mdadm::Grow_reshape(): flushes the metadata update (via
+      flush_metadata_update(), or ->sync_metadata())
+   4/ mdadm::Grow_reshape(): post the new size to the kernel
+
+
+2.3 Reshape level (simple-takeover)
+
+"simple-takeover" implies the level change can be satisfied without touching
+sync_action
+
+    1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+       initializes st->update_tail
+    2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change
+       is allowed (being performed at subarray scope) and prepares a
+       metadata update
+       2a/ raid10 --> raid0: degrade all mirror legs prior to calling
+           ->reshape_super
+    3/ mdadm::Grow_reshape(): flushes the metadata update (via
+       flush_metadata_update(), or ->sync_metadata())
+    4/ mdadm::Grow_reshape(): post the new level to the kernel
+
+2.4 Reshape chunk, layout
+
+2.5 Reshape raid disks (grow)
+
+    1/ mdadm::Grow_reshape(): unconditionally initializes st->update_tail
+       because only redundant raid levels can modify the number of raid disks
+    2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the change
+       in raid disks is allowed (being performed at proper scope / permissible
+       geometry / proper spares available in the container), chooses
+       the spares to use, and prepares a metadata update.
+    3/ mdadm::Grow_reshape(): Converts each subarray in the container to the
+       raid level that can perform the reshape and starts mdmon.
+    4/ mdadm::Grow_reshape(): Pushes the update to mdmon.
+    5/ mdadm::Grow_reshape(): uses container_content to find details of
+       the spares and passes them to the kernel.
+    6/ mdadm::Grow_reshape(): gives raid_disks update to the kernel,
+       sets sync_max, sync_min, suspend_lo, suspend_hi all to zero,
+       and starts the reshape by writing 'reshape' to sync_action.
+    7/ mdmon::monitor notices the sync_action change and tells
+       managemon to check for new devices.  managemon notices the new
+       devices, opens relevant sysfs file, and passes them all to
+       monitor.
+    8/ mdadm::Grow_reshape() calls ->manage_reshape to oversee the
+       rest of the reshape.
+       
+    9/ mdadm::<format>->manage_reshape(): saves data that will be overwritten by
+       the kernel to either the backup file or the metadata specific location,
+       advances sync_max, waits for reshape, pings mdmon, and repeats.
+       Meanwhile mdmon::read_and_act(): records checkpoints.
+       Specifically:
+
+       9a/ if the 'next' stripe to be reshaped will over-write
+           itself during reshape then:
+       9a.1/ increase suspend_hi to cover a suitable number of
+           stripes.
+       9a.2/ backup those stripes safely.
+       9a.3/ advance sync_max to allow those stripes to be reshaped
+       9a.4/ when sync_completed indicates that those stripes have
+           been reshaped, manage_reshape must ping_manager
+       9a.5/ when mdmon notices that sync_completed has been updated,
+           it records the new checkpoint in the metadata
+       9a.6/ after the ping_manager, manage_reshape will increase
+           suspend_lo to allow access to those stripes again
+
+       9b/ if the 'next' stripe to be reshaped will over-write unused
+           space during reshape then we apply same process as above,
+          except that there is no need to back anything up.
+          Note that we *do* need to keep suspend_hi progressing as
+          it is not safe to write to the area-under-reshape.  For
+          kernel-managed-metadata this protection is provided by
+          ->reshape_safe, but that does not protect us in the case
+          of user-space-managed-metadata.
+          
+   10/ mdadm::<format>->manage_reshape(): Once reshape completes changes the raid
+       level back to the nominal raid level (if necessary)
+
+       FIXME: native metadata does not have the capability to record the original
+       raid level in reshape-restart case because the kernel always records current
+       raid level to the metadata, whereas external metadata can masquerade at an
+       alternate level based on the reshape state.
+
+2.6 Reshape raid disks (shrink)
+
+3 Interaction with metadata handler.
+
+  The following calls are made into the metadata handler to assist
+  with initiating and monitoring a 'reshape'.
+
+  1/ ->reshape_super is called quite early (after only minimal
+     checks) to make sure that the metadata can record the new shape
+     and any necessary transitions.  It may be passed a 'container'
+     or an individual array within a container, and it should notice
+     the difference and act accordingly.
+     When a reshape is requested against a container it is expected
+     that it should be applied to every array in the container,
+     however it is up to the metadata handler to determine final
+     policy.
+
+     If the reshape is supportable, the internal copy of the metadata
+     should be updated, and a metadata update suitable for sending
+     to mdmon should be queued.
+
+     If the reshape will involve converting spares into array members,
+     this must be recorded in the metadata too.
+
+  2/ ->container_content will be called to find out the new state
+     of the array, or of all arrays in the container.  Any newly
+     added devices (with state==0 and raid_disk >= 0) will be added
+     to the array as spares with the relevant slot number.
+
+     It is likely that the info returned by  ->container_content will
+     have ->reshape_active set, ->reshape_progress set to e.g. 0, and
+     new_* set appropriately.  mdadm will use this information to
+     cause the correct reshape to start at an appropriate time.
+
+  3/ ->set_array_state will be called by mdmon when reshape has
+     started and again periodically as it progresses.  This should
+     record the ->last_checkpoint as the point where reshape has
+     progressed to.  When the reshape finishes this will be called
+     again and it should notice that ->curr_action is no longer
+     'reshape' and so should record that the reshape has finished
+     providing 'last_checkpoint' has progressed suitably.
+
+  4/ ->manage_reshape will be called once the reshape has been set
+     up in the kernel but before sync_max has been moved from 0, so
+     no actual reshape will have happened.
+
+     ->manage_reshape should call progress_reshape() to allow the
+     reshape to progress, and should back-up any data as indicated
+     by the return value.  See the documentation of that function
+     for more details.
+     ->manage_reshape will be called multiple times when a
+     container is being reshaped, once for each member array in
+     the container.
+
+
+   The progress of the metadata is as follows:
+    1/ mdadm sends a metadata update to mdmon which marks the array
+       as undergoing a reshape. This is set up by
+       ->reshape_super and applied by ->process_update
+       For container-wide reshape, this happens once for the whole
+       container.
+    2/ mdmon notices progress via the sysfs files and calls
+       ->set_array_state to update the state periodically
+       For container-wide reshape, this happens repeatedly for
+       one array, then repeatedly for the next, etc.
+    3/ mdmon notices when reshape has finished and calls
+       ->set_array_state to record that the reshape is complete.
+       For container-wide reshape, this happens once for each
+       member array.
+     
+     
+   
+...
+
+[1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/
index af153e971f0fc0f2de28908bd28614e7d644db57..d960099840ab91f71cf573f7a5d9af0c49870b13 100755 (executable)
--- a/inventory
+++ b/inventory
@@ -8,6 +8,7 @@ ANNOUNCE-3.1.1
 ANNOUNCE-3.1.2
 ANNOUNCE-3.1.3
 ANNOUNCE-3.1.4
+ANNOUNCE-3.2
 Assemble.c
 bitmap.c
 bitmap.h
@@ -22,6 +23,7 @@ Detail.c
 dlink.c
 dlink.h
 Examine.c
+external-reshape-design.txt
 .gitignore
 Grow.c
 Incremental.c
@@ -50,6 +52,7 @@ mdassemble.8
 mdassemble.c
 mdmon.8
 mdmon.c
+mdmon-design.txt
 mdmon.h
 mdopen.c
 md_p.h
@@ -62,8 +65,10 @@ monitor.c
 Monitor.c
 msg.c
 msg.h
+part.h
 platform-intel.c
 platform-intel.h
+policy.c
 probe_roms.c
 probe_roms.h
 pwgr.c
@@ -78,7 +83,9 @@ sha1.h
 super0.c
 super1.c
 super-ddf.c
+super-gpt.c
 super-intel.c
+super-mbr.c
 swap_super.c
 sysfs.c
 test
@@ -135,12 +142,48 @@ tests/08imsm-overlap
 tests/09imsm-assemble
 tests/09imsm-create-fail-rebuild
 tests/10ddf-create
+tests/11spare-migration
+tests/12imsm-r0_2d-grow-r0_3d
+tests/12imsm-r0_2d-grow-r0_4d
+tests/12imsm-r0_2d-grow-r0_5d
+tests/12imsm-r0_3d-grow-r0_4d
+tests/12imsm-r5_3d-grow-r5_4d
+tests/12imsm-r5_3d-grow-r5_5d
+tests/13imsm-r0_r0_2d-grow-r0_r0_4d
+tests/13imsm-r0_r0_2d-grow-r0_r0_5d
+tests/13imsm-r0_r0_3d-grow-r0_r0_4d
+tests/13imsm-r0_r5_3d-grow-r0_r5_4d
+tests/13imsm-r0_r5_3d-grow-r0_r5_5d
+tests/13imsm-r5_r0_3d-grow-r5_r0_4d
+tests/13imsm-r5_r0_3d-grow-r5_r0_5d
+tests/14imsm-r0_3d_no_spares-migrate-r5_3d
+tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d
+tests/14imsm-r0_r0_2d-takeover-r10_4d
+tests/14imsm-r10_4d-grow-r10_5d
+tests/14imsm-r10_r5_4d-takeover-r0_2d
+tests/14imsm-r1_2d-grow-r1_3d
+tests/14imsm-r1_2d-takeover-r0_2d
+tests/14imsm-r5_3d-grow-r5_5d-no-spares
+tests/14imsm-r5_3d-migrate-r4_3d
+tests/15imsm-r0_3d_64k-migrate-r0_3d_256k
+tests/15imsm-r5_3d_4k-migrate-r5_3d_256k
+tests/15imsm-r5_3d_64k-migrate-r5_3d_256k
+tests/15imsm-r5_6d_4k-migrate-r5_6d_256k
+tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k
+tests/16imsm-r0_3d-migrate-r5_4d
+tests/16imsm-r0_5d-migrate-r5_6d
+tests/16imsm-r5_3d-migrate-r0_3d
+tests/16imsm-r5_5d-migrate-r0_5d
+tests/18imsm-1d-takeover-r0_1d
+tests/18imsm-1d-takeover-r1_2d
+tests/18imsm-r0_2d-takeover-r10_4d
+tests/18imsm-r10_4d-takeover-r0_2d
 tests/check
-tests/env-08imsm-overlap
-tests/env-09imsm-assemble
-tests/env-09imsm-create-fail-rebuild
+tests/env-imsm-template
+tests/imsm-grow-template
 tests/testdev
 tests/ToTest
+tests/utils
 TODO
 udev-md-raid.rules
 util.c
index 3cf2389a3ad523bfc9db1d25cafbff19eef901f8..19e5f4179075b0043a774807e350ef7cf8a443f2 100644 (file)
@@ -120,6 +120,8 @@ static void close_aa(struct active_array *aa)
        close(aa->action_fd);
        close(aa->info.state_fd);
        close(aa->resync_start_fd);
+       close(aa->metadata_fd);
+       close(aa->sync_completed_fd);
 }
 
 static void free_aa(struct active_array *aa)
@@ -215,10 +217,16 @@ static void free_updates(struct metadata_update **update)
 {
        while (*update) {
                struct metadata_update *this = *update;
+               void **space_list = this->space_list;
 
                *update = this->next;
                free(this->buf);
                free(this->space);
+               while (space_list) {
+                       void *space = space_list;
+                       space_list = *space_list;
+                       free(space);
+               }
                free(this);
        }
 }
@@ -276,7 +284,7 @@ static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
         */
        st2 = dup_super(st);
        if (st2->ss->load_super(st2, dfd, NULL) == 0) {
-               st2->ss->getinfo_super(st, &info);
+               st2->ss->getinfo_super(st, &info, NULL);
                if (st->ss->compare_super(st, st2) == 0 &&
                    info.disk.raid_disk >= 0) {
                        /* Looks like a good member of array.
@@ -297,12 +305,44 @@ static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
        st->update_tail = NULL;
 }
 
+/*
+ * Create and queue update structure about the removed disks.
+ * The update is prepared by super type handler and passed to the monitor
+ * thread.
+ */
+static void remove_disk_from_container(struct supertype *st, struct mdinfo *sd)
+{
+       struct metadata_update *update = NULL;
+       mdu_disk_info_t dk = {
+               .number = -1,
+               .major = sd->disk.major,
+               .minor = sd->disk.minor,
+               .raid_disk = -1,
+               .state = 0,
+       };
+       /* nothing to do if super type handler does not support
+        * remove disk primitive
+        */
+       if (!st->ss->remove_from_super)
+               return;
+       dprintf("%s: remove %d:%d from container\n",
+               __func__, sd->disk.major, sd->disk.minor);
+
+       st->update_tail = &update;
+       st->ss->remove_from_super(st, &dk);
+       st->ss->write_init_super(st);
+       queue_metadata_update(update);
+       st->update_tail = NULL;
+}
+
 static void manage_container(struct mdstat_ent *mdstat,
                             struct supertype *container)
 {
-       /* The only thing of interest here is if a new device
-        * has been added to the container.  We add it to the
-        * array ignoring any metadata on it.
+       /* Of interest here are:
+        * - if a new device has been added to the container, we 
+        *   add it to the array ignoring any metadata on it.
+        * - if a device has been removed from the container, we
+        *   remove it from the device list and update the metadata.
         * FIXME should we look for compatible metadata and take hints
         * about spare assignment.... probably not.
         */
@@ -334,6 +374,7 @@ static void manage_container(struct mdstat_ent *mdstat,
                        if (!found) {
                                cd = *cdp;
                                *cdp = (*cdp)->next;
+                               remove_disk_from_container(container, cd);
                                free(cd);
                        } else
                                cdp = &(*cdp)->next;
@@ -385,20 +426,47 @@ static void manage_member(struct mdstat_ent *mdstat,
         * We do not need to look for device state changes here, that
         * is dealt with by the monitor.
         *
-        * We just look for changes which suggest that a reshape is
-        * being requested.
-        * Unfortunately decreases in raid_disks don't show up in
-        * mdstat until the reshape completes FIXME.
+        * If a reshape is being requested, monitor will have noticed
+        * that sync_action changed and will have set check_reshape.
+        * We just need to see if new devices have appeared.  All metadata
+        * updates will already have been processed.
         *
-        * Actually, we also want to handle degraded arrays here by
+        * We also want to handle degraded arrays here by
         * trying to find and assign a spare.
         * We do that whenever the monitor tells us too.
         */
+       char buf[64];
+       int frozen;
+
        // FIXME
        a->info.array.raid_disks = mdstat->raid_disks;
        // MORE
 
-       if (a->check_degraded) {
+       /* honor 'frozen' */
+       if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0)
+               frozen = buf[9] == '-';
+       else
+               frozen = 1; /* can't read metadata_version assume the worst */
+
+       /* If sync_action is not 'idle' then don't try recovery now */
+       if (!frozen
+           && sysfs_get_str(&a->info, NULL, "sync_action", buf, sizeof(buf)) > 0
+           && strncmp(buf, "idle", 4) != 0)
+               frozen = 1;
+
+       if (mdstat->level) {
+               int level = map_name(pers, mdstat->level);
+               if (a->info.array.level != level && level >= 0) {
+                       struct active_array *newa = duplicate_aa(a);
+                       if (newa) {
+                               newa->info.array.level = level;
+                               replace_array(a->container, a, newa);
+                               a = newa;
+                       }
+               }
+       }
+
+       if (a->check_degraded && !frozen) {
                struct metadata_update *updates = NULL;
                struct mdinfo *newdev = NULL;
                struct active_array *newa;
@@ -444,6 +512,52 @@ static void manage_member(struct mdstat_ent *mdstat,
                }
                free_updates(&updates);
        }
+
+       if (a->check_reshape) {
+               /* mdadm might have added some devices to the array.
+                * We want to disk_init_and_add any such device to a
+                * duplicate_aa and replace a with that.
+                * mdstat doesn't have enough info so we sysfs_read
+                * and look for new stuff.
+                */
+               struct mdinfo *info, *d, *d2, *newd;
+               unsigned long long array_size;
+               struct active_array *newa = NULL;
+               a->check_reshape = 0;
+               info = sysfs_read(-1, mdstat->devnum,
+                                 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+               if (!info)
+                       goto out2;
+               for (d = info->devs; d; d = d->next) {
+                       if (d->disk.raid_disk < 0)
+                               continue;
+                       for (d2 = a->info.devs; d2; d2 = d2->next)
+                               if (d2->disk.raid_disk ==
+                                   d->disk.raid_disk)
+                                       break;
+                       if (d2)
+                               /* already have this one */
+                               continue;
+                       if (!newa) {
+                               newa = duplicate_aa(a);
+                               if (!newa)
+                                       break;
+                       }
+                       newd = malloc(sizeof(*newd));
+                       if (!newd)
+                               continue;
+                       disk_init_and_add(newd, d, newa);
+               }
+               if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0
+                   && a->info.custom_array_size > array_size*2) {
+                       sysfs_set_num(info, NULL, "array_size",
+                                     a->info.custom_array_size/2);
+               }
+       out2:
+               sysfs_free(info);
+               if (newa)
+                       replace_array(a->container, a, newa);
+       }
 }
 
 static int aa_ready(struct active_array *aa)
@@ -482,6 +596,7 @@ static void manage_new(struct mdstat_ent *mdstat,
        char *inst;
        int i;
        int failed = 0;
+       char buf[40];
 
        /* check if array is ready to be monitored */
        if (!mdstat->active)
@@ -510,7 +625,7 @@ static void manage_new(struct mdstat_ent *mdstat,
 
        new->container = container;
 
-       inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+       inst = to_subarray(mdstat, container->devname);
 
        new->info.array = mdi->array;
        new->info.component_size = mdi->component_size;
@@ -543,6 +658,29 @@ static void manage_new(struct mdstat_ent *mdstat,
        dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
                new->action_fd, new->info.state_fd);
 
+       /* reshape_position is set by mdadm in sysfs
+        * read this information for new arrays only (empty victim)
+        */
+       if ((victim == NULL) &&
+           (sysfs_get_str(mdi, NULL, "sync_action", buf, 40) > 0) &&
+           (strncmp(buf, "reshape", 7) == 0)) {
+               if (sysfs_get_ll(mdi, NULL, "reshape_position",
+                       &new->last_checkpoint) != 0)
+                       new->last_checkpoint = 0;
+               else {
+                       int data_disks = mdi->array.raid_disks;
+                       if (mdi->array.level == 4 || mdi->array.level == 5)
+                               data_disks--;
+                       if (mdi->array.level == 6)
+                               data_disks -= 2;
+
+                       new->last_checkpoint /= data_disks;
+               }
+               dprintf("mdmon: New monitored array is under reshape.\n"
+                       "       Last checkpoint is: %llu\n",
+                       new->last_checkpoint);
+       }
+
        sysfs_free(mdi);
 
        /* if everything checks out tell the metadata handler we want to
@@ -626,6 +764,7 @@ static void handle_message(struct supertype *container, struct metadata_update *
                mu->buf = msg->buf;
                msg->buf = NULL;
                mu->space = NULL;
+               mu->space_list = NULL;
                mu->next = NULL;
                if (container->ss->prepare_update)
                        container->ss->prepare_update(container, mu);
@@ -655,7 +794,13 @@ void read_sock(struct supertype *container)
                /* read and validate the message */
                if (receive_message(fd, &msg, tmo) == 0) {
                        handle_message(container, &msg);
-                       if (ack(fd, tmo) < 0)
+                       if (msg.len == 0) {
+                               /* ping reply with version */
+                               msg.buf = Version;
+                               msg.len = strlen(Version) + 1;
+                               if (send_message(fd, &msg, tmo) < 0)
+                                       terminate = 1;
+                       } else if (ack(fd, tmo) < 0)
                                terminate = 1;
                } else
                        terminate = 1;
index f334822bf697783ba01845b7aee814c7fff7989b..ff1e97360dc20cf06e586567e22d58e1c982da97 100644 (file)
--- a/mapfile.c
+++ b/mapfile.c
@@ -334,31 +334,20 @@ struct map_ent *map_by_name(struct map_ent **map, char *name)
  * version super_by_fd does this automatically, this routine is meant as
  * a supplement for guess_super()
  */
-static void set_member_info(struct supertype *st, struct mdstat_ent *ent)
+static char *get_member_info(struct mdstat_ent *ent)
 {
 
-       st->subarray[0] = '\0';
-
        if (ent->metadata_version == NULL ||
            strncmp(ent->metadata_version, "external:", 9) != 0)
-               return;
+               return NULL;
 
        if (is_subarray(&ent->metadata_version[9])) {
-               char version[strlen(ent->metadata_version)+1];
                char *subarray;
-               char *name = &version[10];
-
-               strcpy(version, ent->metadata_version);
-               subarray = strrchr(version, '/');
-               name = &version[10];
 
-               if (!subarray)
-                       return;
-               *subarray++ = '\0';
-
-               st->container_dev = devname2devnum(name);
-               strncpy(st->subarray, subarray, sizeof(st->subarray));
+               subarray = strrchr(ent->metadata_version, '/');
+               return subarray + 1;
        }
+       return NULL;
 }
 
 void RebuildMap(void)
@@ -391,8 +380,9 @@ void RebuildMap(void)
                        int dfd;
                        int ok;
                        struct supertype *st;
+                       char *subarray = NULL;
                        char *path;
-                       struct mdinfo info;
+                       struct mdinfo *info;
 
                        sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor);
                        dfd = dev_open(dn, O_RDONLY);
@@ -402,13 +392,14 @@ void RebuildMap(void)
                        if ( st == NULL)
                                ok = -1;
                        else {
-                               set_member_info(st, md);
+                               subarray = get_member_info(md);
                                ok = st->ss->load_super(st, dfd, NULL);
                        }
                        close(dfd);
                        if (ok != 0)
                                continue;
-                       st->ss->getinfo_super(st, &info);
+                       info = st->ss->container_content(st, subarray);
+
                        if (md->devnum >= 0)
                                path = map_dev(MD_MAJOR, md->devnum, 0);
                        else
@@ -428,7 +419,7 @@ void RebuildMap(void)
                                 *   find a unique name based on metadata name.
                                 *   
                                 */
-                               struct mddev_ident_s *match = conf_match(&info, st);
+                               struct mddev_ident *match = conf_match(info, st);
                                struct stat stb;
                                if (match && match->devname && match->devname[0] == '/') {
                                        path = match->devname;
@@ -446,13 +437,13 @@ void RebuildMap(void)
                                             st->ss->match_home(st, homehost) != 1) &&
                                            st->ss->match_home(st, "any") != 1 &&
                                            (require_homehost
-                                            || ! conf_name_is_free(info.name)))
+                                            || ! conf_name_is_free(info->name)))
                                                /* require a numeric suffix */
                                                unum = 0;
                                        else
                                                /* allow name to be used as-is if no conflict */
                                                unum = -1;
-                                       name = info.name;
+                                       name = info->name;
                                        if (!*name) {
                                                name = st->ss->name;
                                                if (!isdigit(name[strlen(name)-1]) &&
@@ -485,9 +476,10 @@ void RebuildMap(void)
                                }
                        }
                        map_add(&map, md->devnum,
-                               info.text_version,
-                               info.uuid, path);
+                               info->text_version,
+                               info->uuid, path);
                        st->ss->free_super(st);
+                       free(info);
                        break;
                }
                sysfs_free(sra);
diff --git a/md.4 b/md.4
index 29b7cb7eb1805dff36679c3e9678b484683fa4da..5e796393ed621f080628ab1f977968e3353cfe18 100644 (file)
--- a/md.4
+++ b/md.4
@@ -584,8 +584,12 @@ array (so the stripes are wider), changing the chunk size (so stripes
 are deeper or shallower), or changing the arrangement of data and
 parity (possibly changing the raid level, e.g. 1 to 5 or 5 to 6).
 
-As of Linux 2.6.17, md can reshape a raid5 array to have more
-devices.  Other possibilities may follow in future kernels.
+As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to
+have a different number of devices (more or fewer) and to have a
+different layout or chunk size.  It can also convert between these
+different RAID levels.  It can also convert between RAID0 and RAID10,
+and between RAID0 and RAID4 or RAID5.
+Other possibilities may follow in future kernels.
 
 During any stripe process there is a 'critical section' during which
 live data is being overwritten on disk.  For the operation of
@@ -595,6 +599,9 @@ and new number of devices).  After this critical section is passed,
 data is only written to areas of the array which no longer hold live
 data \(em the live data has already been located away.
 
+For a reshape which reduces the number of devices, the 'critical
+section' is at the end of the reshape process.
+
 md is not able to ensure data preservation if there is a crash
 (e.g. power failure) during the critical section.  If md is asked to
 start an array which failed during a critical section of restriping,
@@ -622,8 +629,7 @@ For operations that do not change the size of the array, like simply
 increasing chunk size, or converting RAID5 to RAID6 with one extra
 device, the entire process is the critical section.  In this case, the
 restripe will need to progress in stages, as a section is suspended,
-backed up,
-restriped, and released; this is not yet implemented.
+backed up, restriped, and released.
 
 .SS SYSFS INTERFACE
 Each block device appears as a directory in
diff --git a/md_p.h b/md_p.h
index 4594a36a89ff90217c6f0d9fbfcd87e3326c458e..6c79a3d12f347f54aa05ecfac01265f787e82dad 100644 (file)
--- a/md_p.h
+++ b/md_p.h
@@ -100,6 +100,7 @@ typedef struct mdp_device_descriptor_s {
  */
 #define MD_SB_CLEAN            0
 #define MD_SB_ERRORS           1
+#define MD_SB_BBM_ERRORS       2
 
 #define        MD_SB_BITMAP_PRESENT    8 /* bitmap may be present nearby */
 
index 08e425584725997b337455f43b5a14309c24b46f..d99bc05dbdcc9c5818da1cdcee1391e0f791cbd2 100644 (file)
@@ -5,7 +5,7 @@
 .\"   the Free Software Foundation; either version 2 of the License, or
 .\"   (at your option) any later version.
 .\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v3.1.4
+.TH MDADM 8 "" v3.2
 .SH NAME
 mdadm \- manage MD devices
 .I aka
@@ -122,9 +122,10 @@ missing, spare, or failed drives, so there is nothing to monitor.
 .B "Grow"
 Grow (or shrink) an array, or otherwise reshape it in some way.
 Currently supported growth options including changing the active size
-of component devices and changing the number of active devices in RAID
-levels 1/4/5/6, changing the RAID level between 1, 5, and 6, changing
-the chunk size and layout for RAID5 and RAID5, as well as adding or
+of component devices and changing the number of active devices in
+Linear and RAID levels 0/1/4/5/6,
+changing the RAID level between 0, 1, 5, and 6, and between 0 and 10,
+changing the chunk size and layout for RAID 0,4,5,6, as well as adding or
 removing a write-intent bitmap.
 
 .TP
@@ -414,6 +415,9 @@ If this is not specified
 size, though if there is a variance among the drives of greater than 1%, a warning is
 issued.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 This value can be set with
 .B \-\-grow
 for RAID level 1/4/5/6.  If the array was created with a size smaller
@@ -440,6 +444,39 @@ This value can not be used with
 .B CONTAINER
 metadata such as DDF and IMSM.
 
+.TP
+.BR \-Z ", " \-\-array\-size=
+This is only meaningful with
+.B \-\-grow
+and its effect is not persistent: when the array is stopped and
+restarted the default array size will be restored.
+
+Setting the array-size causes the array to appear smaller to programs
+that access the data.  This is particularly needed before reshaping an
+array so that it will be smaller.  As the reshape is not reversible,
+but setting the size with
+.B \-\-array-size
+is, it is required that the array size is reduced as appropriate
+before the number of devices in the array is reduced.
+
+Before reducing the size of the array you should make sure that space
+isn't needed.  If the device holds a filesystem, you would need to
+resize the filesystem to use less space.
+
+After reducing the array size you should check that the data stored in
+the device is still available.  If the device holds a filesystem, then
+an 'fsck' of the filesystem is a minimum requirement.  If there are
+problems the array can be made bigger again with no loss with another
+.B "\-\-grow \-\-array\-size="
+command.
+
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+A value of
+.B max
+restores the apparent size of the array to be whatever the real
+amount of available space is.
+
 .TP
 .BR \-c ", " \-\-chunk=
 Specify chunk size of kibibytes.  The default when creating an
@@ -447,6 +484,9 @@ array is 512KB.  To ensure compatibility with earlier versions, the
 default when Building and array with no persistent metadata is 64KB.
 This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 .TP
 .BR \-\-rounding=
 Specify rounding factor for a Linear array.  The size of each
@@ -619,6 +659,9 @@ When using an
 bitmap, the chunksize defaults to 64Meg, or larger if necessary to
 fit the bitmap into the available space.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 .TP
 .BR \-W ", " \-\-write\-mostly
 subsequent devices listed in a
@@ -667,37 +710,6 @@ or layout.  See the GROW MODE section below on RAID\-DEVICES CHANGES.
 The file must be stored on a separate device, not on the RAID array
 being reshaped.
 
-.TP
-.BR \-\-array-size= ", " \-Z
-This is only meaningful with
-.B \-\-grow
-and its effect is not persistent: when the array is stopped an
-restarted the default array size will be restored.
-
-Setting the array-size causes the array to appear smaller to programs
-that access the data.  This is particularly needed before reshaping an
-array so that it will be smaller.  As the reshape is not reversible,
-but setting the size with
-.B \-\-array-size
-is, it is required that the array size is reduced as appropriate
-before the number of devices in the array is reduced.
-
-A value of
-.B max
-restores the apparent size of the array to be whatever the real
-amount of available space is.
-
-Before reducing the size of the array you should make sure that space
-isn't needed.  If the device holds a filesystem, you would need to
-resize the filesystem to use less space.
-
-After reducing the array size you should check that the data stored in
-the device is still available.  If the device holds a filesystem, then
-an 'fsck' of the filesystem is a minimum requirement.  If there are
-problems the array can be made bigger again with no loss with another
-.B "\-\-grow \-\-array\-size="
-command.
-
 .TP
 .BR \-N ", " \-\-name=
 Set a
@@ -889,6 +901,28 @@ not as reliable as you would like.
 .BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}"
 See this option under Create and Build options.
 
+.TP
+.BR \-a ", " "\-\-add"
+This option can be used in Grow mode in two cases.
+
+If the target array is a Linear array, then
+.B \-\-add
+can be used to add one or more devices to the array.  They
+are simply catenated on to the end of the array.  Once added, the
+devices cannot be removed.
+
+If the
+.B \-\-raid\-disks
+option is being used to increase the number of devices in an array,
+then
+.B \-\-add
+can be used to add some extra devices to be included in the array.
+In most cases this is not needed as the extra devices can be added as
+spares first, and then the number of raid-disks can be changed.
+However for RAID0, it is not possible to add spares.  So to increase
+the number of devices in a RAID0, it is necessary to set the new
+number of devices, and to add the new devices, in the same command.
+
 .TP
 .BR \-b ", " \-\-bitmap=
 Specify the bitmap file that was given when the array was created.  If
@@ -900,15 +934,25 @@ bitmap, there is no need to specify this when assembling the array.
 .BR \-\-backup\-file=
 If
 .B \-\-backup\-file
-was used when requesting a grow, shrink, RAID level change or other
-reshape, and the system crashed during the critical section, then the
-same
+was used while reshaping an array (e.g. changing number of devices or
+chunk size) and the system crashed during the critical section, then the same
 .B \-\-backup\-file
 must be presented to
 .B \-\-assemble
 to allow possibly corrupted data to be restored, and the reshape
 to be completed.
 
+.TP
+.BR \-\-invalid\-backup
+If the file needed for the above option is not available for any
+reason an empty file can be given together with this option to
+indicate that the backup file is invalid.  In this case the data that
+was being rearranged at the time of the crash could be irrecoverably
+lost, but the rest of the array may still be recoverable.  This option
+should only be used as a last resort if there is no way to recover the
+backup file.
+
+
 .TP
 .BR \-U ", " \-\-update=
 Update the superblock on each device while assembling the array.  The
@@ -1055,7 +1099,7 @@ will report failure if these specifiers didn't find any match.
 .BR \-a ", " \-\-add
 hot-add listed devices.
 If a device appears to have recently been part of the array
-(possibly it failed or was removed) the device is re-added as describe
+(possibly it failed or was removed) the device is re\-added as described
 in the next point.
 If that fails or the device was never part of the array, the device is
 added as a hot-spare.
@@ -1081,6 +1125,13 @@ When used on an array that has no metadata (i.e. it was built with
 it will be assumed that bitmap-based recovery is enough to make the
 device fully consistent with the array.
 
+The
+.B \-\-re\-add
+option can be accompanied by
+.BR \-\-update=devicesize .
+See the description of this option when used in Assemble mode for an
+explanation of its use.
+
 If the device name given is
 .B missing
 then mdadm will try to find any device that looks like it should be
@@ -1325,6 +1376,16 @@ The device name given should be a kernel device name such as "sda",
 not a name in
 .IR /dev .
 
+.TP
+.BR \-\-path=
+Only used with \-\-fail.  The 'path' given will be recorded so that if
+a new device appears at the same location it can be automatically
+added to the same array.  This allows the failed device to be
+automatically replaced by a new device without metadata if it appears
+at the specified path.  This option is normally only set by a
+.I udev
+script.
+
 .SH For Monitor mode:
 .TP
 .BR \-m ", " \-\-mail
@@ -1396,6 +1457,14 @@ alert for every array found at startup.  This alert gets mailed and
 passed to the alert program.  This can be used for testing that alert
 message do get through successfully.
 
+.TP
+.BR \-\-no\-sharing
+This inhibits the functionality for moving spares between arrays.
+Only one monitoring process started with
+.B \-\-scan
+but without this flag is allowed, otherwise the two could interfere
+with each other.
+
 .SH ASSEMBLE MODE
 
 .HP 12
@@ -1797,7 +1866,7 @@ or
 .B \-\-scan
 will cause the output to be less detailed and the format to be
 suitable for inclusion in
-.BR /etc/mdadm.conf .
+.BR mdadm.conf .
 The exit status of
 .I mdadm
 will normally be 0 unless
@@ -1874,7 +1943,7 @@ or
 is given, then multiple devices that are components of the one array
 are grouped together and reported in a single entry suitable
 for inclusion in
-.BR /etc/mdadm.conf .
+.BR mdadm.conf .
 
 Having
 .B \-\-scan
@@ -1946,6 +2015,8 @@ As well as reporting events,
 may move a spare drive from one array to another if they are in the
 same
 .B spare-group
+or
+.B domain
 and if the destination array has a failed drive but no spares.
 
 If any devices are listed on the command line,
@@ -2052,6 +2123,8 @@ notices that an array is degraded when it first sees the array.
 .B MoveSpare
 A spare drive has been moved from one array in a
 .B spare-group
+or
+.B domain
 to another to allow a failed drive to be replaced.
 (syslog priority: Info)
 
@@ -2104,6 +2177,7 @@ For
 to move spares from one array to another, the different arrays need to
 be labeled with the same
 .B spare-group
+or the spares must be allowed to migrate through matching POLICY domains
 in the configuration file.  The
 .B spare-group
 name can be any string; it is only necessary that different spare
@@ -2120,35 +2194,43 @@ first.
 If the removal succeeds but the adding fails, then it is added back to
 the original array.
 
+If the spare group for a degraded array is not defined,
+.I mdadm
+will look at the rules of spare migration specified by POLICY lines in
+.B mdadm.conf
+and then follow similar steps as above if a matching spare is found.
+
 .SH GROW MODE
 The GROW mode is used for changing the size or shape of an active
 array.
 For this to work, the kernel must support the necessary change.
-Various types of growth are being added during 2.6 development,
-including restructuring a RAID5 array to have more active devices.
+Various types of growth are being added during 2.6 development.
 
-Currently the only support available is to
+Currently the supported changes include
 .IP \(bu 4
-change the "size" attribute
-for RAID1, RAID5 and RAID6.
+change the "size" attribute for RAID1, RAID4, RAID5 and RAID6.
 .IP \(bu 4
-increase or decrease the "raid\-devices" attribute of RAID1, RAID5,
-and RAID6.
+increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4,
+RAID5, and RAID6.
 .IP \bu 4
-change the chunk-size and layout of RAID5 and RAID6.
+change the chunk-size and layout of RAID0, RAID4, RAID5 and RAID6.
 .IP \bu 4
-convert between RAID1 and RAID5, and between RAID5 and RAID6.
+convert between RAID1 and RAID5, between RAID5 and RAID6, between
+RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode).
 .IP \(bu 4
 add a write-intent bitmap to any array which supports these bitmaps, or
 remove a write-intent bitmap from such an array.
 .PP
 
-GROW mode is not currently supported for
-.B CONTAINERS
-or arrays inside containers.
+Using GROW on containers is currently only supported for Intel's IMSM
+container format.  The number of devices in a container can be
+increased - which affects all arrays in the container - or an array
+in a container can be converted between levels where those levels are
+supported by the container, and the conversion is one of those listed
+above.
 
 .SS SIZE CHANGES
-Normally when an array is built the "size" it taken from the smallest
+Normally when an array is built the "size" is taken from the smallest
 of the drives.  If all the small drives in an arrays are, one at a
 time, removed and replaced with larger drives, then you could have an
 array of large drives with only a small amount used.  In this
@@ -2187,6 +2269,16 @@ increase the number of devices in a RAID5 safely, including restarting
 an interrupted "reshape".  From 2.6.31, the Linux Kernel is able to
 increase or decrease the number of devices in a RAID5 or RAID6.
 
+From 2.6.35, the Linux Kernel is able to convert a RAID0 into a RAID4
+or RAID5.
+.I mdadm
+uses this functionality and the ability to add
+devices to a RAID4 to allow devices to be added to a RAID0.  When
+requested to do this,
+.I mdadm
+will convert the RAID0 to a RAID4, add the necessary disks and make
+the reshape happen, and then convert the RAID4 back to RAID0.
+
 When decreasing the number of devices, the size of the array will also
 decrease.  If there was data in the array, it could get destroyed and
 this is not reversible.  To help prevent accidents,
@@ -2302,9 +2394,10 @@ adds the device to the array and conditionally starts the array.
 
 Note that
 .I mdadm
-will only add devices to an array which were previously working
-(active or spare) parts of that array.  It does not currently support
-automatic inclusion of a new drive as a spare in some array.
+will normally only add devices to an array which were previously working
+(active or spare) parts of that array.  The support for automatic
+inclusion of a new drive as a spare in some array requires
+a configuration through POLICY in config file.
 
 The tests that
 .I mdadm
@@ -2336,7 +2429,8 @@ then only that style of metadata is accepted, otherwise
 .I mdadm
 finds any known version of metadata.  If no
 .I md
-metadata is found, the device is rejected.
+metadata is found, the device may be still added to an array
+as a spare if POLICY allows.
 
 .ig
 .IP +
@@ -2636,7 +2730,9 @@ avoid conflicted between multiple arrays that have the same name.  If
 .I mdadm
 can reasonably determine that the array really is meant for this host,
 either by a hostname in the metadata, or by the presence of the array
-in /etc/mdadm.conf, then it will leave off the suffix if possible.
+in
+.BR mdadm.conf ,
+then it will leave off the suffix if possible.
 Also if the homehost is specified as
 .B <ignore>
 .I mdadm
diff --git a/mdadm.c b/mdadm.c
index efba8dd84cd512a74aad7b70f26324935a2cd5b7..c44c43228067c536eeccd18d2480310bc0dac444 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -47,23 +47,24 @@ int main(int argc, char *argv[])
        char *layout_str = NULL;
        int raiddisks = 0;
        int sparedisks = 0;
-       struct mddev_ident_s ident;
+       struct mddev_ident ident;
        char *configfile = NULL;
        char *cp;
        char *update = NULL;
        int scan = 0;
-       char devmode = 0;
+       int devmode = 0;
        int runstop = 0;
        int readonly = 0;
        int write_behind = 0;
        int bitmap_fd = -1;
        char *bitmap_file = NULL;
        char *backup_file = NULL;
+       int invalid_backup = 0;
        int bitmap_chunk = UnSet;
        int SparcAdjust = 0;
-       mddev_dev_t devlist = NULL;
-       mddev_dev_t *devlistend = & devlist;
-       mddev_dev_t dv;
+       struct mddev_dev *devlist = NULL;
+       struct mddev_dev **devlistend = & devlist;
+       struct mddev_dev *dv;
        int devs_found = 0;
        int verbose = 0;
        int quiet = 0;
@@ -95,6 +96,7 @@ int main(int argc, char *argv[])
        int daemonise = 0;
        char *pidfile = NULL;
        int oneshot = 0;
+       int spare_sharing = 1;
        struct supertype *ss = NULL;
        int writemostly = 0;
        int re_add = 0;
@@ -103,6 +105,8 @@ int main(int argc, char *argv[])
        int rebuild_map = 0;
        int auto_update_home = 0;
        char *subarray = NULL;
+       char *remove_path = NULL;
+       char *udev_filename = NULL;
 
        int print_help = 0;
        FILE *outf;
@@ -132,12 +136,11 @@ int main(int argc, char *argv[])
                int newmode = mode;
                /* firstly, some mode-independent options */
                switch(opt) {
+               case HelpOptions:
+                       print_help = 2;
+                       continue;
                case 'h':
-                       if (option_index > 0 &&
-                           strcmp(long_options[option_index].name, "help-options")==0)
-                               print_help = 2;
-                       else
-                               print_help = 1;
+                       print_help = 1;
                        continue;
 
                case 'V':
@@ -151,9 +154,11 @@ int main(int argc, char *argv[])
                        continue;
 
                case 'b':
-                       if (mode == ASSEMBLE || mode == BUILD || mode == CREATE || mode == GROW ||
-                           mode == INCREMENTAL || mode == MANAGE)
+                       if (mode == ASSEMBLE || mode == BUILD || mode == CREATE
+                           || mode == GROW || mode == INCREMENTAL
+                           || mode == MANAGE)
                                break; /* b means bitmap */
+               case Brief:
                        brief = 1;
                        continue;
 
@@ -178,13 +183,16 @@ int main(int argc, char *argv[])
                 */
 
                switch(opt) {
-               case '@': /* just incase they say --manage */
+               case ManageOpt:
                        newmode = MANAGE;
                        shortopt = short_bitmap_options;
                        break;
                case 'a':
+               case Add:
                case 'r':
+               case Remove:
                case 'f':
+               case Fail:
                case ReAdd: /* re-add */
                        if (!mode) {
                                newmode = MANAGE;
@@ -204,7 +212,7 @@ int main(int argc, char *argv[])
                case AutoDetect:
                        newmode = AUTODETECT; break;
 
-               case '#':
+               case MiscOpt:
                case 'D':
                case 'E':
                case 'X':
@@ -214,18 +222,22 @@ int main(int argc, char *argv[])
                case 'o':
                case 'w':
                case 'W':
+               case WaitOpt:
                case Waitclean:
                case DetailPlatform:
                case KillSubarray:
                case UpdateSubarray:
                        if (opt == KillSubarray || opt == UpdateSubarray) {
                                if (subarray) {
-                                       fprintf(stderr, Name ": subarray can only be specified once\n");
+                                       fprintf(stderr, Name ": subarray can only"
+                                               " be specified once\n");
                                        exit(2);
                                }
                                subarray = optarg;
                        }
+               case UdevRules:
                case 'K': if (!mode) newmode = MISC; break;
+               case NoSharing: newmode = MONITOR; break;
                }
                if (mode && newmode == mode) {
                        /* everybody happy ! */
@@ -244,7 +256,7 @@ int main(int argc, char *argv[])
                        mode = newmode;
                } else {
                        /* special case of -c --help */
-                       if (opt == 'c' &&
+                       if ((opt == 'c' || opt == ConfigFile) &&
                            ( strncmp(optarg, "--h", 3)==0 ||
                              strncmp(optarg, "-h", 2)==0)) {
                                fputs(Help_config, stdout);
@@ -264,7 +276,6 @@ int main(int argc, char *argv[])
                                        dv->writemostly = writemostly;
                                        dv->re_add = re_add;
                                        dv->used = 0;
-                                       dv->content = NULL;
                                        dv->next = NULL;
                                        *devlistend = dv;
                                        devlistend = &dv->next;
@@ -287,8 +298,8 @@ int main(int argc, char *argv[])
 
                /* if we just set the mode, then done */
                switch(opt) {
-               case '@':
-               case '#':
+               case ManageOpt:
+               case MiscOpt:
                case 'A':
                case 'B':
                case 'C':
@@ -301,12 +312,14 @@ int main(int argc, char *argv[])
                if (opt == 1) {
                        /* an undecorated option - must be a device name.
                         */
-                       if (devs_found > 0 && mode == '@' && !devmode) {
-                               fprintf(stderr, Name ": Must give one of -a/-r/-f for subsequent devices at %s\n", optarg);
+                       if (devs_found > 0 && mode == MANAGE && !devmode) {
+                               fprintf(stderr, Name ": Must give one of -a/-r/-f"
+                                       " for subsequent devices at %s\n", optarg);
                                exit(2);
                        }
-                       if (devs_found > 0 && mode == 'G' && !devmode) {
-                               fprintf(stderr, Name ": Must give one of -a for devices do add: %s\n", optarg);
+                       if (devs_found > 0 && mode == GROW && !devmode) {
+                               fprintf(stderr, Name ": Must give -a/--add for"
+                                       " devices to add: %s\n", optarg);
                                exit(2);
                        }
                        dv = malloc(sizeof(*dv));
@@ -319,7 +332,6 @@ int main(int argc, char *argv[])
                        dv->writemostly = writemostly;
                        dv->re_add = re_add;
                        dv->used = 0;
-                       dv->content = NULL;
                        dv->next = NULL;
                        *devlistend = dv;
                        devlistend = &dv->next;
@@ -330,22 +342,27 @@ int main(int argc, char *argv[])
 
                /* We've got a mode, and opt is now something else which
                 * could depend on the mode */
-#define O(a,b) ((a<<8)|b)
+#define O(a,b) ((a<<16)|b)
                switch (O(mode,opt)) {
                case O(GROW,'c'):
+               case O(GROW,ChunkSize):
                case O(CREATE,'c'):
+               case O(CREATE,ChunkSize):
                case O(BUILD,'c'): /* chunk or rounding */
+               case O(BUILD,ChunkSize): /* chunk or rounding */
                        if (chunk) {
                                fprintf(stderr, Name ": chunk/rounding may only be specified once. "
                                        "Second value is %s.\n", optarg);
                                exit(2);
                        }
-                       chunk = strtol(optarg, &c, 10);
-                       if (!optarg[0] || *c || chunk<4 || ((chunk-1)&chunk)) {
+                       chunk = parse_size(optarg);
+                       if (chunk < 8 || ((chunk-1)&chunk)) {
                                fprintf(stderr, Name ": invalid chunk/rounding value: %s\n",
                                        optarg);
                                exit(2);
                        }
+                       /* Convert sectors to K */
+                       chunk /= 2;
                        continue;
 
 #if 0
@@ -371,8 +388,11 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MANAGE,'W'):
+               case O(MANAGE,WriteMostly):
                case O(BUILD,'W'):
+               case O(BUILD,WriteMostly):
                case O(CREATE,'W'):
+               case O(CREATE,WriteMostly):
                        /* set write-mostly for following devices */
                        writemostly = 1;
                        continue;
@@ -454,6 +474,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(GROW, 'p'): /* new layout */
+               case O(GROW, Layout):
                        if (layout_str) {
                                fprintf(stderr,Name ": layout may only be sent once.  "
                                        "Second value was %s\n", optarg);
@@ -464,7 +485,9 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(CREATE,'p'): /* raid5 layout */
+               case O(CREATE,Layout):
                case O(BUILD,'p'): /* faulty layout */
+               case O(BUILD,Layout):
                        if (layout != UnSet) {
                                fprintf(stderr,Name ": layout may only be sent once.  "
                                        "Second value was %s\n", optarg);
@@ -559,9 +582,13 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(CREATE,'a'):
+               case O(CREATE,Auto):
                case O(BUILD,'a'):
+               case O(BUILD,Auto):
                case O(INCREMENTAL,'a'):
-               case O(ASSEMBLE,'a'): /* auto-creation of device node */
+               case O(INCREMENTAL,Auto):
+               case O(ASSEMBLE,'a'):
+               case O(ASSEMBLE,Auto): /* auto-creation of device node */
                        autof = parse_auto(optarg, "--auto flag", 0);
                        continue;
 
@@ -572,10 +599,15 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(BUILD,'f'): /* force honouring '-n 1' */
+               case O(BUILD,Force): /* force honouring '-n 1' */
                case O(GROW,'f'): /* ditto */
+               case O(GROW,Force): /* ditto */
                case O(CREATE,'f'): /* force honouring of device list */
+               case O(CREATE,Force): /* force honouring of device list */
                case O(ASSEMBLE,'f'): /* force assembly */
+               case O(ASSEMBLE,Force): /* force assembly */
                case O(MISC,'f'): /* force zero */
+               case O(MISC,Force): /* force zero */
                        force=1;
                        continue;
 
@@ -616,6 +648,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(ASSEMBLE,'m'): /* super-minor for array */
+               case O(ASSEMBLE,SuperMinor):
                        if (ident.super_minor != UnSet) {
                                fprintf(stderr, Name ": super-minor cannot be set twice.  "
                                        "Second value: %s.\n", optarg);
@@ -635,12 +668,14 @@ int main(int argc, char *argv[])
                case O(ASSEMBLE,'U'): /* update the superblock */
                case O(MISC,'U'):
                        if (update) {
-                               fprintf(stderr, Name ": Can only update one aspect of superblock, both %s and %s given.\n",
+                               fprintf(stderr, Name ": Can only update one aspect"
+                                       " of superblock, both %s and %s given.\n",
                                        update, optarg);
                                exit(2);
                        }
                        if (mode == MISC && !subarray) {
-                               fprintf(stderr, Name ": Only subarrays can be updated in misc mode\n");
+                               fprintf(stderr, Name ": Only subarrays can be"
+                                       " updated in misc mode\n");
                                exit(2);
                        }
                        update = optarg;
@@ -664,13 +699,17 @@ int main(int argc, char *argv[])
                                continue;
                        if (strcmp(update, "byteorder")==0) {
                                if (ss) {
-                                       fprintf(stderr, Name ": must not set metadata type with --update=byteorder.\n");
+                                       fprintf(stderr,
+                                               Name ": must not set metadata"
+                                               " type with --update=byteorder.\n");
                                        exit(2);
                                }
                                for(i=0; !ss && superlist[i]; i++)
-                                       ss = superlist[i]->match_metadata_desc("0.swap");
+                                       ss = superlist[i]->match_metadata_desc(
+                                               "0.swap");
                                if (!ss) {
-                                       fprintf(stderr, Name ": INTERNAL ERROR cannot find 0.swap\n");
+                                       fprintf(stderr, Name ": INTERNAL ERROR"
+                                               " cannot find 0.swap\n");
                                        exit(2);
                                }
 
@@ -692,6 +731,27 @@ int main(int argc, char *argv[])
                "     'no-bitmap'\n");
                        exit(outf == stdout ? 0 : 2);
 
+               case O(MANAGE,'U'):
+                       /* update=devicesize is allowed with --re-add */
+                       if (devmode != 'a' || re_add != 1) {
+                               fprintf(stderr, Name "--update in Manage mode only"
+                                       " allowed with --re-add.\n");
+                               exit(1);
+                       }
+                       if (update) {
+                               fprintf(stderr, Name ": Can only update one aspect"
+                                       " of superblock, both %s and %s given.\n",
+                                       update, optarg);
+                               exit(2);
+                       }
+                       update = optarg;
+                       if (strcmp(update, "devicesize") != 0) {
+                               fprintf(stderr, Name ": only 'devicesize' can be"
+                                       " updated with --re-add\n");
+                               exit(2);
+                       }
+                       continue;
+
                case O(INCREMENTAL,NoDegraded):
                        fprintf(stderr, Name ": --no-degraded is deprecated in Incremental mode\n");
                case O(ASSEMBLE,NoDegraded): /* --no-degraded */
@@ -699,10 +759,14 @@ int main(int argc, char *argv[])
                                       * so we overload slightly */
                        continue;
 
-               case O(ASSEMBLE,'c'): /* config file */
+               case O(ASSEMBLE,'c'):
+               case O(ASSEMBLE,ConfigFile):
                case O(INCREMENTAL, 'c'):
+               case O(INCREMENTAL, ConfigFile):
                case O(MISC, 'c'):
+               case O(MISC, ConfigFile):
                case O(MONITOR,'c'):
+               case O(MONITOR,ConfigFile):
                        if (configfile) {
                                fprintf(stderr, Name ": configfile cannot be set twice.  "
                                        "Second value is %s.\n", optarg);
@@ -720,6 +784,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'m'): /* mail address */
+               case O(MONITOR,EMail):
                        if (mailaddr)
                                fprintf(stderr, Name ": only specify one mailaddress. %s ignored.\n",
                                        optarg);
@@ -728,6 +793,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'p'): /* alert program */
+               case O(MONITOR,ProgramOpt): /* alert program */
                        if (program)
                                fprintf(stderr, Name ": only specify one alter program. %s ignored.\n",
                                        optarg);
@@ -736,6 +802,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'r'): /* rebuild increments */
+               case O(MONITOR,Increment):
                        increments = atoi(optarg);
                        if (increments>99 || increments<1) {
                                fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n");
@@ -760,6 +827,7 @@ int main(int argc, char *argv[])
                        }
                        continue;
                case O(MONITOR,'f'): /* daemonise */
+               case O(MONITOR,Fork):
                        daemonise = 1;
                        continue;
                case O(MONITOR,'i'): /* pid */
@@ -779,12 +847,16 @@ int main(int argc, char *argv[])
                        openlog("mdadm", LOG_PID, SYSLOG_FACILITY);
                        dosyslog = 1;
                        continue;
-
+               case O(MONITOR, NoSharing):
+                       spare_sharing = 0;
+                       continue;
                        /* now the general management options.  Some are applicable
                         * to other modes. None have arguments.
                         */
                case O(GROW,'a'):
-               case O(MANAGE,'a'): /* add a drive */
+               case O(GROW,Add):
+               case O(MANAGE,'a'):
+               case O(MANAGE,Add): /* add a drive */
                        devmode = 'a';
                        re_add = 0;
                        continue;
@@ -793,10 +865,14 @@ int main(int argc, char *argv[])
                        re_add = 1;
                        continue;
                case O(MANAGE,'r'): /* remove a drive */
+               case O(MANAGE,Remove):
                        devmode = 'r';
                        continue;
                case O(MANAGE,'f'): /* set faulty */
-               case O(INCREMENTAL,'f'): /* r for incremental is taken, use f
+               case O(MANAGE,Fail):
+               case O(INCREMENTAL,'f'):
+               case O(INCREMENTAL,Remove):
+               case O(INCREMENTAL,Fail): /* r for incremental is taken, use f
                                          * even though we will both fail and
                                          * remove the device */
                        devmode = 'f';
@@ -833,6 +909,7 @@ int main(int argc, char *argv[])
                case O(MISC,'o'):
                case O(MISC,'w'):
                case O(MISC,'W'):
+               case O(MISC, WaitOpt):
                case O(MISC, Waitclean):
                case O(MISC, DetailPlatform):
                case O(MISC, KillSubarray):
@@ -854,6 +931,20 @@ int main(int argc, char *argv[])
                        }
                        devmode = opt;
                        continue;
+               case O(MISC, UdevRules):
+                      if (devmode && devmode != opt) {
+                               fprintf(stderr, Name ": --udev-rules must"
+                                      " be the only option.\n");
+                      } else {
+                              if (udev_filename)
+                                      fprintf(stderr, Name ": only specify one udev "
+                                              "rule filename. %s ignored.\n",
+                                              optarg);
+                              else
+                                      udev_filename = optarg;
+                      }
+                      devmode = opt;
+                      continue;
                case O(MISC,'t'):
                        test = 1;
                        continue;
@@ -867,6 +958,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */
+               case O(ASSEMBLE,Bitmap):
                        if (!optarg) {
                                fprintf(stderr, Name ": bitmap file needed with -b in --assemble mode\n");
                                exit(2);
@@ -895,8 +987,17 @@ int main(int argc, char *argv[])
                        backup_file = optarg;
                        continue;
 
+               case O(ASSEMBLE, InvalidBackup):
+                       /* Acknowledge that the backupfile is invalid, but ask
+                        * to continue anyway
+                        */
+                       invalid_backup = 1;
+                       continue;
+
                case O(BUILD,'b'):
-               case O(CREATE,'b'): /* here we create the bitmap */
+               case O(BUILD,Bitmap):
+               case O(CREATE,'b'):
+               case O(CREATE,Bitmap): /* here we create the bitmap */
                        if (strcmp(optarg, "none") == 0) {
                                fprintf(stderr, Name ": '--bitmap none' only"
                                        " support for --grow\n");
@@ -904,6 +1005,7 @@ int main(int argc, char *argv[])
                        }
                        /* FALL THROUGH */
                case O(GROW,'b'):
+               case O(GROW,Bitmap):
                        if (strcmp(optarg, "internal")== 0 ||
                            strcmp(optarg, "none")== 0 ||
                            strchr(optarg, '/') != NULL) {
@@ -918,15 +1020,16 @@ int main(int argc, char *argv[])
                case O(GROW,BitmapChunk):
                case O(BUILD,BitmapChunk):
                case O(CREATE,BitmapChunk): /* bitmap chunksize */
-                       bitmap_chunk = strtol(optarg, &c, 10);
-                       if (!optarg[0] || *c || bitmap_chunk < 0 ||
-                                       bitmap_chunk & (bitmap_chunk - 1)) {
-                               fprintf(stderr, Name ": invalid bitmap chunksize: %s\n",
-                                               optarg);
+                       bitmap_chunk = parse_size(optarg);
+                       if (bitmap_chunk < 0 ||
+                           bitmap_chunk & (bitmap_chunk - 1)) {
+                               fprintf(stderr,
+                                       Name ": invalid bitmap chunksize: %s\n",
+                                       optarg);
                                exit(2);
                        }
-                       /* convert K to B, chunk of 0K means 512B */
-                       bitmap_chunk = bitmap_chunk ? bitmap_chunk * 1024 : 512;
+                       /* convert sectors to B, chunk of 0 means 512B */
+                       bitmap_chunk = bitmap_chunk ? bitmap_chunk * 512 : 512;
                        continue;
 
                case O(GROW, WriteBehind):
@@ -944,8 +1047,12 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(INCREMENTAL, 'r'):
+               case O(INCREMENTAL, RebuildMapOpt):
                        rebuild_map = 1;
                        continue;
+               case O(INCREMENTAL, IncrementalPath):
+                       remove_path = optarg;
+                       continue;
                }
                /* We have now processed all the valid options. Anything else is
                 * an error
@@ -1089,7 +1196,8 @@ int main(int argc, char *argv[])
                        rv = Manage_ro(devlist->devname, mdfd, readonly);
                if (!rv && devs_found>1)
                        rv = Manage_subdevs(devlist->devname, mdfd,
-                                           devlist->next, verbose-quiet, test);
+                                           devlist->next, verbose-quiet, test,
+                                           update);
                if (!rv && readonly < 0)
                        rv = Manage_ro(devlist->devname, mdfd, readonly);
                if (!rv && runstop)
@@ -1099,7 +1207,7 @@ int main(int argc, char *argv[])
                if (devs_found == 1 && ident.uuid_set == 0 &&
                    ident.super_minor == UnSet && ident.name[0] == 0 && !scan ) {
                        /* Only a device has been given, so get details from config file */
-                       mddev_ident_t array_ident = conf_get_ident(devlist->devname);
+                       struct mddev_ident *array_ident = conf_get_ident(devlist->devname);
                        if (array_ident == NULL) {
                                fprintf(stderr, Name ": %s not identified in config file.\n",
                                        devlist->devname);
@@ -1110,14 +1218,14 @@ int main(int argc, char *argv[])
                                if (array_ident->autof == 0)
                                        array_ident->autof = autof;
                                rv |= Assemble(ss, devlist->devname, array_ident,
-                                              NULL, backup_file,
+                                              NULL, backup_file, invalid_backup,
                                               readonly, runstop, update,
                                               homehost, require_homehost,
                                               verbose-quiet, force);
                        }
                } else if (!scan)
                        rv = Assemble(ss, devlist->devname, &ident,
-                                     devlist->next, backup_file,
+                                     devlist->next, backup_file, invalid_backup,
                                      readonly, runstop, update,
                                      homehost, require_homehost,
                                      verbose-quiet, force);
@@ -1131,7 +1239,7 @@ int main(int argc, char *argv[])
                                exit(1);
                        }
                        for (dv = devlist ; dv ; dv=dv->next) {
-                               mddev_ident_t array_ident = conf_get_ident(dv->devname);
+                               struct mddev_ident *array_ident = conf_get_ident(dv->devname);
                                if (array_ident == NULL) {
                                        fprintf(stderr, Name ": %s not identified in config file.\n",
                                                dv->devname);
@@ -1141,14 +1249,14 @@ int main(int argc, char *argv[])
                                if (array_ident->autof == 0)
                                        array_ident->autof = autof;
                                rv |= Assemble(ss, dv->devname, array_ident,
-                                              NULL, backup_file,
+                                              NULL, backup_file, invalid_backup,
                                               readonly, runstop, update,
                                               homehost, require_homehost,
                                               verbose-quiet, force);
                        }
                } else {
-                       mddev_ident_t a, array_list =  conf_get_ident(NULL);
-                       mddev_dev_t devlist = conf_get_devs();
+                       struct mddev_ident *a, *array_list =  conf_get_ident(NULL);
+                       struct mddev_dev *devlist = conf_get_devs();
                        int cnt = 0;
                        int failures, successes;
                        if (devlist == NULL) {
@@ -1182,7 +1290,7 @@ int main(int argc, char *argv[])
                                
                                        r = Assemble(ss, a->devname,
                                                     a,
-                                                    NULL, NULL,
+                                                    NULL, NULL, 0,
                                                     readonly, runstop, NULL,
                                                     homehost, require_homehost,
                                                     verbose-quiet, force);
@@ -1204,12 +1312,12 @@ int main(int argc, char *argv[])
                                int acnt;
                                ident.autof = autof;
                                do {
-                                       mddev_dev_t devlist = conf_get_devs();
+                                       struct mddev_dev *devlist = conf_get_devs();
                                        acnt = 0;
                                        do {
                                                rv2 = Assemble(ss, NULL,
                                                               &ident,
-                                                              devlist, NULL,
+                                                              devlist, NULL, 0,
                                                               readonly, runstop, NULL,
                                                               homehost, require_homehost,
                                                               verbose-quiet, force);
@@ -1231,12 +1339,15 @@ int main(int argc, char *argv[])
                                        do {
                                                acnt = 0;
                                                do {
-                                                       rv2 = Assemble(ss, NULL,
-                                                                      &ident,
-                                                                      NULL, NULL,
-                                                                      readonly, runstop, "homehost",
-                                                                      homehost, require_homehost,
-                                                                      verbose-quiet, force);
+                                                       rv2 = Assemble(
+                                                               ss, NULL,
+                                                               &ident,
+                                                               NULL, NULL, 0,
+                                                               readonly, runstop,
+                                                               "homehost",
+                                                               homehost,
+                                                               require_homehost,
+                                                               verbose-quiet, force);
                                                        if (rv2==0) {
                                                                cnt++;
                                                                acnt++;
@@ -1398,6 +1509,8 @@ int main(int argc, char *argv[])
                                                free_mdstat(ms);
                                        } while (!last && err);
                                        if (err) rv |= 1;
+                               } else if (devmode == UdevRules) {
+                                       rv = Write_rules(udev_filename);
                                } else {
                                        fprintf(stderr, Name ": No devices given.\n");
                                        exit(2);
@@ -1427,6 +1540,7 @@ int main(int argc, char *argv[])
                                case 'X':
                                        rv |= ExamineBitmap(dv->devname, brief, ss); continue;
                                case 'W':
+                               case WaitOpt:
                                        rv |= Wait(dv->devname); continue;
                                case Waitclean:
                                        rv |= WaitClean(dv->devname, -1, verbose-quiet); continue;
@@ -1473,7 +1587,7 @@ int main(int argc, char *argv[])
                        break;
                }
                if (delay == 0) {
-                       if (get_linux_version() > 20616)
+                       if (get_linux_version() > 2006016)
                                /* mdstat responds to poll */
                                delay = 1000;
                        else
@@ -1481,7 +1595,7 @@ int main(int argc, char *argv[])
                }
                rv= Monitor(devlist, mailaddr, program,
                            delay?delay:60, daemonise, scan, oneshot,
-                           dosyslog, test, pidfile, increments);
+                           dosyslog, test, pidfile, increments, spare_sharing);
                break;
 
        case GROW:
@@ -1516,22 +1630,26 @@ int main(int argc, char *argv[])
                                break;
                        }
                }
-               if (devs_found > 1) {
-
+               if (devs_found > 1 && raiddisks == 0) {
                        /* must be '-a'. */
-                       if (size >= 0 || raiddisks || chunk || layout_str != NULL || bitmap_file) {
-                               fprintf(stderr, Name ": --add cannot be used with other geometry changes in --grow mode\n");
+                       if (size >= 0 || chunk || layout_str != NULL || bitmap_file) {
+                               fprintf(stderr, Name ": --add cannot be used with "
+                                       "other geometry changes in --grow mode\n");
                                rv = 1;
                                break;
                        }
                        for (dv=devlist->next; dv ; dv=dv->next) {
-                               rv = Grow_Add_device(devlist->devname, mdfd, dv->devname);
+                               rv = Grow_Add_device(devlist->devname, mdfd,
+                                                    dv->devname);
                                if (rv)
                                        break;
                        }
                } else if (bitmap_file) {
-                       if (size >= 0 || raiddisks || chunk || layout_str != NULL) {
-                               fprintf(stderr, Name ": --bitmap changes cannot be used with other geometry changes in --grow mode\n");
+                       if (size >= 0 || raiddisks || chunk ||
+                           layout_str != NULL || devs_found) {
+                               fprintf(stderr, Name ": --bitmap changes cannot be "
+                                       "used with other geometry changes "
+                                       "in --grow mode\n");
                                rv = 1;
                                break;
                        }
@@ -1542,7 +1660,9 @@ int main(int argc, char *argv[])
                } else if (size >= 0 || raiddisks != 0 || layout_str != NULL
                           || chunk != 0 || level != UnSet) {
                        rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file,
-                                         size, level, layout_str, chunk, raiddisks);
+                                         size, level, layout_str, chunk, raiddisks,
+                                         devlist->next,
+                                         force);
                } else if (array_size < 0)
                        fprintf(stderr, Name ": no changes to --grow\n");
                break;
@@ -1577,12 +1697,13 @@ int main(int argc, char *argv[])
                        rv = 1;
                        break;
                }
-               if (devmode == 'f') {
-                       rv = IncrementalRemove(devlist->devname, verbose-quiet);
-                       break;
-               }
-               rv = Incremental(devlist->devname, verbose-quiet, runstop,
-                                ss, homehost, require_homehost, autof);
+               if (devmode == 'f')
+                       rv = IncrementalRemove(devlist->devname, remove_path,
+                                              verbose-quiet);
+               else
+                       rv = Incremental(devlist->devname, verbose-quiet,
+                                        runstop, ss, homehost,
+                                        require_homehost, autof);
                break;
        case AUTODETECT:
                autodetect();
index e677ba9ec4ddd65c0b2377f7455875407a35f79d..65b00293723c7a5834f400456b4391293a7ebef4 100644 (file)
@@ -418,6 +418,86 @@ The known metadata types are
 .BR ddf ,
 .BR imsm .
 
+.TP
+.B POLICY
+This is used to specify what automatic behavior is allowed on devices 
+newly appearing in the system and provides a way of marking spares that can
+be moved to other arrays as well as the migration domains.
+.I Domain
+can be defined through
+.I policy
+line by specifying a domain name for a number of paths from
+.BR /dev/disk/by-path/ .
+A device may belong to several domains. The domain of an array is a union
+of domains of all devices in that array.  A spare can be automatically
+moved from one array to another if the set of the destination array's
+.I domains
+contains all the
+.I domains
+of the new disk or if both arrays have the same
+.IR spare-group .
+
+To update hot plug configuration it is necessary to execute
+.B mdadm \-\-udev\-rules
+command after changing the config file.
+
+Key words used in the
+.I POLICY
+line and supported values are:
+
+.RS 7
+.TP
+.B domain=
+any arbitrary string
+.TP
+.B metadata=
+0.9 1.x ddf or imsm
+.TP
+.B path=
+file glob matching anything from \fB/dev/disk/by-path\fP
+.TP
+.B type=
+either 
+.B disk
+or
+.BR part .
+.TP
+.B action=
+include, re-add, spare, spare-same-slot, or force-spare
+
+.P
+The
+.I action
+item determines the automatic behavior allowed for devices matching the
+.I path
+and
+.I type
+in the same line.  If a device matches several lines with different
+.I  actions
+then the most permissive will apply. The ordering of policy lines
+is irrelevant to the end result.
+.TP
+.B include
+allows adding a disk to an array if metadata on that disk matches that array
+.TP
+.B re-add
+will include the device in the array if it appears to be a current member
+or a member that was recently removed
+.TP
+.B spare
+as above and additionally: if the device is bare it can
+become a spare if there is any array that it is a candidate for based
+on domains and metadata.
+.TP
+.B spare\-same\-slot
+as above and additionally if given slot was used by an array that went
+degraded recently and the device plugged in has no metadata then it will
+be automatically added to that array (or its container)
+.TP
+.B force-spare
+as above and the disk will become a spare in remaining cases
+.RE
+
 .SH EXAMPLE
 DEVICE /dev/sd[bcdjkl]1
 .br
@@ -456,7 +536,25 @@ ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977
 ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b
 .br
            auto=part
-
+.br
+POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-*
+.br
+           action=spare
+.br
+POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]*
+.br
+           action=include
+.br
+# One domain comprising devices attached to specified paths is defined.
+.br
+# Bare device matching first path will be made an imsm spare on hot plug.
+.br
+# If more than one array is created on devices belonging to domain1 and
+.br
+# one of them becomes degraded, then any imsm spare matching any path for
+.br
+# given domain name can be migrated.
+.br
 MAILADDR root@mydomain.tld
 .br
 PROGRAM /usr/sbin/handle\-mdadm\-events
diff --git a/mdadm.h b/mdadm.h
index fcb0e3b32970634ffa80979abdb74290165f1994..d3ed50a6d82e8811f5008c4ec92a2326ba1a797c 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -64,6 +64,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
 #endif
 
+#define DEFAULT_CHUNK 512
 #define DEFAULT_BITMAP_CHUNK 4096
 #define DEFAULT_BITMAP_DELAY 5
 #define DEFAULT_MAX_WRITE_BEHIND 256
@@ -93,6 +94,14 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define MDMON_DIR "/dev/.mdadm/"
 #endif /* MDMON_DIR */
 
+/* FAILED_SLOTS_DIR is where to save files recording the recent removal
+ * of an array member, in order to allow future reuse of a disk inserted
+ * in the same slot for array recovery
+ */
+#ifndef FAILED_SLOTS_DIR
+#define FAILED_SLOTS_DIR "/dev/.mdadm/failed-slots"
+#endif /* FAILED_SLOTS_DIR */
+
 #include       "md_u.h"
 #include       "md_p.h"
 #include       "bitmap.h"
@@ -261,17 +270,16 @@ extern char Version[], Usage[], Help[], OptionHelp[],
        Help_manage[], Help_misc[], Help_monitor[], Help_config[];
 
 /* for option that don't have short equivilents, we assign arbitrary
- * small numbers.  '1' means an undecorated option, so we start at '2'.
- * (note we must stop before we get to 65 i.e. 'A')
+ * numbers later than any 'short' character option.
  */
 enum special_options {
-       AssumeClean = 2,
+       AssumeClean = 300,
        BitmapChunk,
        WriteBehind,
        ReAdd,
        NoDegraded,
        Sparc22,
-       BackupFile, /* 8 */
+       BackupFile,
        HomeHost,
        AutoHomeHost,
        Symlinks,
@@ -279,7 +287,32 @@ enum special_options {
        Waitclean,
        DetailPlatform,
        KillSubarray,
-       UpdateSubarray, /* 16 */
+       UpdateSubarray,
+       IncrementalPath,
+       NoSharing,
+       HelpOptions,
+       Brief,
+       ManageOpt,
+       Add,
+       Remove,
+       Fail,
+       MiscOpt,
+       WaitOpt,
+       ConfigFile,
+       ChunkSize,
+       WriteMostly,
+       Layout,
+       Auto,
+       Force,
+       SuperMinor,
+       EMail,
+       ProgramOpt,
+       Increment,
+       Fork,
+       Bitmap,
+       RebuildMapOpt,
+       InvalidBackup,
+       UdevRules,
 };
 
 /* structures read from config file */
@@ -293,7 +326,7 @@ enum special_options {
  * devices is considered
  */
 #define UnSet (0xfffe)
-typedef struct mddev_ident_s {
+struct mddev_ident {
        char    *devname;
 
        int     uuid_set;
@@ -321,26 +354,24 @@ typedef struct mddev_ident_s {
                                 */
        char    *member;        /* subarray within a container */
 
-       struct mddev_ident_s *next;
+       struct mddev_ident *next;
        union {
                /* fields needed by different users of this structure */
                int assembled;  /* set when assembly succeeds */
        };
-} *mddev_ident_t;
+};
 
 /* List of device names - wildcards expanded */
-typedef struct mddev_dev_s {
+struct mddev_dev {
        char *devname;
-       char disposition;       /* 'a' for add, 'r' for remove, 'f' for fail.
+       int disposition;        /* 'a' for add, 'r' for remove, 'f' for fail.
                                 * Not set for names read from .config
                                 */
        char writemostly;       /* 1 for 'set writemostly', 2 for 'clear writemostly' */
        char re_add;
        char used;              /* set when used */
-       struct mdinfo *content; /* If devname is a container, this might list
-                                * the remaining member arrays. */
-       struct mddev_dev_s *next;
-} *mddev_dev_t;
+       struct mddev_dev *next;
+};
 
 typedef struct mapping {
        char *name;
@@ -355,7 +386,7 @@ struct mdstat_ent {
        char            *level;
        char            *pattern; /* U or up, _ for down */
        int             percent; /* -1 if no resync */
-       int             resync; /* 1 if resync, 0 if recovery */
+       int             resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */
        int             devcnt;
        int             raid_disks;
        char *          metadata_version;
@@ -372,6 +403,7 @@ extern void mdstat_wait(int seconds);
 extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
 extern int mddev_busy(int devnum);
 extern struct mdstat_ent *mdstat_by_component(char *name);
+extern struct mdstat_ent *mdstat_by_subdev(char *subdev, int container);
 
 struct map_ent {
        struct map_ent *next;
@@ -435,6 +467,8 @@ extern int sysfs_fd_get_ll(int fd, unsigned long long *val);
 extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                        char *name, unsigned long long *val);
 extern int sysfs_fd_get_str(int fd, char *val, int size);
+extern int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev,
+                                    char *name);
 extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, char *val, int size);
 extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
@@ -442,8 +476,28 @@ extern int sysfs_set_array(struct mdinfo *info, int vers);
 extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume);
 extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
 extern int sysfs_unique_holder(int devnum, long rdev);
+extern int sysfs_freeze_array(struct mdinfo *sra);
 extern int load_sys(char *path, char *buf);
-
+extern int reshape_prepare_fdlist(char *devname,
+                                 struct mdinfo *sra,
+                                 int raid_disks,
+                                 int nrdisks,
+                                 unsigned long blocks,
+                                 char *backup_file,
+                                 int *fdlist,
+                                 unsigned long long *offsets);
+extern void reshape_free_fdlist(int *fdlist,
+                               unsigned long long *offsets,
+                               int size);
+extern int reshape_open_backup_file(char *backup,
+                                   int fd,
+                                   char *devname,
+                                   long blocks,
+                                   int *fdlist,
+                                   unsigned long long *offsets,
+                                   int restart);
+extern unsigned long compute_backup_blocks(int nchunk, int ochunk,
+                                          unsigned int ndata, unsigned int odata);
 
 extern int save_stripes(int *source, unsigned long long *offsets,
                        int raid_disks, int chunk_size, int level, int layout,
@@ -470,6 +524,31 @@ extern char *map_dev(int major, int minor, int create);
 struct active_array;
 struct metadata_update;
 
+
+/* 'struct reshape' records the intermediate states of
+ * a general reshape.
+ * The starting geometry is converted to the 'before' geometry
+ * by at most an atomic level change. They could be the same.
+ * Similarly the 'after' geometry is converted to the final
+ * geometry by at most a level change.
+ * Note that 'before' and 'after' must have the same level.
+ * 'backup_blocks' is the minimum number of sectors for a reshape unit.
+ * This will be a multiple of the stripe size in each of the
+ * 'before' and 'after' geometries.
+ * If 'backup_blocks' is 0, no restriping is necessary.
+ */
+struct reshape {
+       int level;
+       int parity; /* number of parity blocks/devices */
+       struct {
+               int layout;
+               int data_disks;
+       } before, after;
+       unsigned long long backup_blocks;
+       unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/
+       unsigned long long new_size; /* New size of array in sectors */
+};
+
 /* A superswitch provides entry point the a metadata handler.
  *
  * The super_switch primarily operates on some "metadata" that
@@ -533,9 +612,13 @@ extern struct superswitch {
         * The particular device should be:
         *   The last device added by add_to_super
         *   The device the metadata was loaded from by load_super
+        * If 'map' is present, then it is an array raid_disks long
+        * (raid_disks must already be set and correct) and it is filled
+        * with 1 for slots that are thought to be active and 0 for slots which
+        * appear to be failed/missing.
         */
-       void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
-
+       void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map);
+       struct mdinfo *(*getinfo_super_disks)(struct supertype *st);
        /* Check if the given metadata is flagged as belonging to "this"
         * host.  0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
         */
@@ -580,7 +663,11 @@ extern struct superswitch {
         * when hot-adding a spare.
         */
        int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
-                            int fd, char *devname);
+                           int fd, char *devname);
+       /* update the metadata to delete a device,
+        * when hot-removing.
+        */
+       int (*remove_from_super)(struct supertype *st, mdu_disk_info_t *dinfo);
 
        /* Write metadata to one device when fixing problems or adding
         * a new device.
@@ -592,8 +679,10 @@ extern struct superswitch {
        int (*write_init_super)(struct supertype *st);
        int (*compare_super)(struct supertype *st, struct supertype *tst);
        int (*load_super)(struct supertype *st, int fd, char *devname);
+       int (*load_container)(struct supertype *st, int fd, char *devname);
        struct supertype * (*match_metadata_desc)(char *arg);
        __u64 (*avail_size)(struct supertype *st, __u64 size);
+       unsigned long long (*min_acceptable_spare_size)(struct supertype *st);
        int (*add_internal_bitmap)(struct supertype *st, int *chunkp,
                                   int delay, int write_behind,
                                   unsigned long long size, int may_change, int major);
@@ -608,23 +697,36 @@ extern struct superswitch {
         * added to validate changing size and new devices.  If there are
         * inter-device dependencies, it should record sufficient details
         * so these can be validated.
-        * Both 'size' and '*freesize' are in sectors.  chunk is bytes.
+        * Both 'size' and '*freesize' are in sectors.  chunk is KiB.
         */
        int (*validate_geometry)(struct supertype *st, int level, int layout,
                                 int raiddisks,
-                                int chunk, unsigned long long size,
+                                int *chunk, unsigned long long size,
                                 char *subdev, unsigned long long *freesize,
                                 int verbose);
 
-       struct mdinfo *(*container_content)(struct supertype *st);
-       /* Allow a metadata handler to override mdadm's default layouts */
-       int (*default_layout)(int level); /* optional */
-       /* query the supertype for default chunk size */
-       int (*default_chunk)(struct supertype *st); /* optional */
+       struct mdinfo *(*container_content)(struct supertype *st, char *subarray);
+       /* query the supertype for default geometry */
+       void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */
        /* Permit subarray's to be deleted from inactive containers */
        int (*kill_subarray)(struct supertype *st); /* optional */
        /* Permit subarray's to be modified */
-       int (*update_subarray)(struct supertype *st, char *update, mddev_ident_t ident); /* optional */
+       int (*update_subarray)(struct supertype *st, char *subarray,
+                              char *update, struct mddev_ident *ident); /* optional */
+       /* Check if reshape is supported for this external format.
+        * st is obtained from super_by_fd() where st->subarray[0] is
+        * initialized to indicate if reshape is being performed at the
+        * container or subarray level
+        */
+       int (*reshape_super)(struct supertype *st, long long size, int level,
+                            int layout, int chunksize, int raid_disks,
+                            int delta_disks, char *backup, char *dev,
+                            int verbose); /* optional */
+       int (*manage_reshape)( /* optional */
+               int afd, struct mdinfo *sra, struct reshape *reshape,
+               struct supertype *st, unsigned long blocks,
+               int *fds, unsigned long long *offsets,
+               int dests, int *destfd, unsigned long long *destoffsets);
 
 /* for mdmon */
        int (*open_new)(struct supertype *c, struct active_array *a,
@@ -666,18 +768,38 @@ extern struct superswitch {
         */
        struct mdinfo *(*activate_spare)(struct active_array *a,
                                         struct metadata_update **updates);
+       /*
+        * Return statically allocated string that represents metadata specific
+        * controller domain of the disk. The domain is used in disk domain
+        * matching functions. Disks belong to the same domain if they have
+        * the same domain from mdadm.conf and belong to the same metadata domain.
+        * Returning NULL or not providing this handler means that metadata
+        * does not distinguish the differences between disks that belong to
+        * different controllers. They are in the domain specified by
+        * configuration file (mdadm.conf).
+        * When the metadata has a notion of domains based on disk,
+        * it shall return NULL for disks that do not belong to a controller
+        * in the supported domains. Such disks will form another domain and won't
+        * be mixed with supported ones.
+        */
+       const char *(*get_disk_controller_domain)(const char *path);
 
        int swapuuid; /* true if uuid is bigending rather than hostendian */
        int external;
        const char *name; /* canonical metadata name */
-} super0, super1, super_ddf, *superlist[];
+} *superlist[];
 
-extern struct superswitch super_imsm;
+extern struct superswitch super0, super1;
+extern struct superswitch super_imsm, super_ddf;
+extern struct superswitch mbr, gpt;
 
 struct metadata_update {
        int     len;
        char    *buf;
        void    *space; /* allocated space that monitor will use */
+       void    **space_list; /* list of allocated spaces that monitor can
+                              * use or that it returned.
+                              */
        struct metadata_update *next;
 };
 
@@ -703,11 +825,8 @@ struct supertype {
        int minor_version;
        int max_devs;
        int container_dev;    /* devnum of container */
-       char subarray[32];      /* name of array inside container */
        void *sb;
        void *info;
-       int loaded_container;   /* Set if load_super found a container,
-                                * not just one device */
 
        struct metadata_update *updates;
        struct metadata_update **update_tail;
@@ -725,14 +844,117 @@ struct supertype {
 
 };
 
-extern struct supertype *super_by_fd(int fd);
-extern struct supertype *guess_super(int fd);
+extern struct supertype *super_by_fd(int fd, char **subarray);
+enum guess_types { guess_any, guess_array, guess_partitions };
+extern struct supertype *guess_super_type(int fd, enum guess_types guess_type);
+static inline struct supertype *guess_super(int fd) {
+       return guess_super_type(fd, guess_any);
+}
 extern struct supertype *dup_super(struct supertype *st);
 extern int get_dev_size(int fd, char *dname, unsigned long long *sizep);
+extern int must_be_container(int fd);
+extern int dev_size_from_id(dev_t id, unsigned long long *size);
 extern void get_one_disk(int mdfd, mdu_array_info_t *ainf,
                         mdu_disk_info_t *disk);
 void wait_for(char *dev, int fd);
 
+/*
+ * Data structures for policy management.
+ * Each device can have a policy structure that lists
+ * various name/value pairs each possibly with a metadata associated.
+ * The policy list is sorted by name/value/metadata
+ */
+struct dev_policy {
+       struct dev_policy *next;
+       char *name;     /* None of these strings are allocated.  They are
+                        * all just references to strings which are known
+                        * to exist elsewhere.
+                        * name and metadata can be compared by address equality.
+                        */
+       const char *metadata;
+       const char *value;
+};
+
+extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[];
+
+/* iterate over the sublist starting at list, having the same
+ * 'name' as 'list', and matching the given metadata (where
+ * NULL matches anything)
+ */
+#define pol_for_each(item, list, _metadata)                            \
+       for (item = list;                                               \
+            item && item->name == list->name;                          \
+            item = item->next)                                         \
+               if (!(!_metadata || !item->metadata || _metadata == item->metadata)) \
+                       ; else
+
+/*
+ * policy records read from mdadm are largely just name-value pairs.
+ * The names are constants, not strdupped
+ */
+struct pol_rule {
+       struct pol_rule *next;
+       char *type;     /* rule_policy or rule_part */
+       struct rule {
+               struct rule *next;
+               char *name;
+               char *value;
+               char *dups; /* duplicates of 'value' with a partNN appended */
+       } *rule;
+};
+
+extern char rule_policy[], rule_part[];
+extern char rule_path[], rule_type[];
+extern char type_part[], type_disk[];
+
+extern void policyline(char *line, char *type);
+extern void policy_add(char *type, ...);
+extern void policy_free(void);
+
+extern struct dev_policy *path_policy(char *path, char *type);
+extern struct dev_policy *disk_policy(struct mdinfo *disk);
+extern struct dev_policy *devnum_policy(int dev);
+extern void dev_policy_free(struct dev_policy *p);
+
+//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern struct dev_policy *pol_find(struct dev_policy *pol, char *name);
+
+enum policy_action {
+       act_default,
+       act_include,
+       act_re_add,
+       act_spare,      /* This only applies to bare devices */
+       act_spare_same_slot, /* this allows non-bare devices,
+                             * but only if recent removal */
+       act_force_spare, /* this allows non-bare devices in any case */
+       act_err
+};
+
+extern int policy_action_allows(struct dev_policy *plist, const char *metadata,
+                               enum policy_action want);
+extern int disk_action_allows(struct mdinfo *disk, const char *metadata,
+                             enum policy_action want);
+
+struct domainlist {
+       struct domainlist *next;
+       const char *dom;
+};
+
+extern int domain_test(struct domainlist *dom, struct dev_policy *pol,
+                      const char *metadata);
+extern struct domainlist *domain_from_array(struct mdinfo *mdi,
+                                           const char *metadata);
+extern void domainlist_add_dev(struct domainlist **dom, int devnum,
+                              const char *metadata);
+extern void domain_free(struct domainlist *dl);
+extern void domain_merge(struct domainlist **domp, struct dev_policy *pol,
+                        const char *metadata);
+void domain_add(struct domainlist **domp, char *domain);
+
+extern void policy_save_path(char *id_path, struct map_ent *array);
+extern int policy_check_path(struct mdinfo *disk, struct map_ent *array);
+
 #if __GNUC__ < 3
 struct stat64;
 #endif
@@ -778,27 +1000,31 @@ extern int Manage_ro(char *devname, int fd, int readonly);
 extern int Manage_runstop(char *devname, int fd, int runstop, int quiet);
 extern int Manage_resize(char *devname, int fd, long long size, int raid_disks);
 extern int Manage_subdevs(char *devname, int fd,
-                         mddev_dev_t devlist, int verbose, int test);
+                         struct mddev_dev *devlist, int verbose, int test,
+                         char *update);
 extern int autodetect(void);
 extern int Grow_Add_device(char *devname, int fd, char *newdev);
 extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force);
 extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        long long size,
-                       int level, char *layout_str, int chunksize, int raid_disks);
+                       int level, char *layout_str, int chunksize, int raid_disks,
+                       struct mddev_dev *devlist,
+                       int force);
 extern int Grow_restart(struct supertype *st, struct mdinfo *info,
                        int *fdlist, int cnt, char *backup_file, int verbose);
 extern int Grow_continue(int mdfd, struct supertype *st,
                         struct mdinfo *info, char *backup_file);
 
 extern int Assemble(struct supertype *st, char *mddev,
-                   mddev_ident_t ident,
-                   mddev_dev_t devlist, char *backup_file,
+                   struct mddev_ident *ident,
+                   struct mddev_dev *devlist,
+                   char *backup_file, int invalid_backup,
                    int readonly, int runstop,
                    char *update, char *homehost, int require_homehost,
                    int verbose, int force);
 
 extern int Build(char *mddev, int chunk, int level, int layout,
-                int raiddisks, mddev_dev_t devlist, int assume_clean,
+                int raiddisks, struct mddev_dev *devlist, int assume_clean,
                 char *bitmap_file, int bitmap_chunk, int write_behind,
                 int delay, int verbose, int autof, unsigned long long size);
 
@@ -806,46 +1032,46 @@ extern int Build(char *mddev, int chunk, int level, int layout,
 extern int Create(struct supertype *st, char *mddev,
                  int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
                  char *name, char *homehost, int *uuid,
-                 int subdevs, mddev_dev_t devlist,
+                 int subdevs, struct mddev_dev *devlist,
                  int runstop, int verbose, int force, int assume_clean,
                  char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof);
 
 extern int Detail(char *dev, int brief, int export, int test, char *homehost);
 extern int Detail_Platform(struct superswitch *ss, int scan, int verbose);
 extern int Query(char *dev);
-extern int Examine(mddev_dev_t devlist, int brief, int export, int scan,
+extern int Examine(struct mddev_dev *devlist, int brief, int export, int scan,
                   int SparcAdjust, struct supertype *forcest, char *homehost);
-extern int Monitor(mddev_dev_t devlist,
+extern int Monitor(struct mddev_dev *devlist,
                   char *mailaddr, char *alert_cmd,
                   int period, int daemonise, int scan, int oneshot,
-                  int dosyslog, int test, char *pidfile, int increments);
+                  int dosyslog, int test, char *pidfile, int increments,
+                  int share);
 
 extern int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl);
 extern int Kill_subarray(char *dev, char *subarray, int quiet);
-extern int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet);
+extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet);
 extern int Wait(char *dev);
 extern int WaitClean(char *dev, int sock, int verbose);
 
 extern int Incremental(char *devname, int verbose, int runstop,
                       struct supertype *st, char *homehost, int require_homehost,
                       int autof);
-extern int Incremental_container(struct supertype *st, char *devname,
-                                int verbose, int runstop, int autof,
-                                int trustworthy);
 extern void RebuildMap(void);
 extern int IncrementalScan(int verbose);
-extern int IncrementalRemove(char *devname, int verbose);
+extern int IncrementalRemove(char *devname, char *path, int verbose);
 extern int CreateBitmap(char *filename, int force, char uuid[16],
                        unsigned long chunksize, unsigned long daemon_sleep,
                        unsigned long write_behind,
                        unsigned long long array_size,
                        int major);
 extern int ExamineBitmap(char *filename, int brief, struct supertype *st);
+extern int Write_rules(char *rule_name);
 extern int bitmap_update_uuid(int fd, int *uuid, int swap);
 extern unsigned long bitmap_sectors(struct bitmap_super_s *bsb);
 
 extern int md_get_version(int fd);
 extern int get_linux_version(void);
+extern int mdadm_version(char *version);
 extern long long parse_size(char *size);
 extern int parse_uuid(char *str, int uuid[4]);
 extern int parse_layout_10(char *layout);
@@ -853,7 +1079,9 @@ extern int parse_layout_faulty(char *layout);
 extern int check_ext2(int fd, char *name);
 extern int check_reiser(int fd, char *name);
 extern int check_raid(int fd, char *name);
-extern int check_partitions(int fd, char *dname, unsigned long long freesize);
+extern int check_partitions(int fd, char *dname,
+                           unsigned long long freesize,
+                           unsigned long long size);
 
 extern int get_mdp_major(void);
 extern int dev_open(char *dev, int flags);
@@ -863,10 +1091,10 @@ extern int is_standard(char *dev, int *nump);
 extern int same_dev(char *one, char *two);
 
 extern int parse_auto(char *str, char *msg, int config);
-extern mddev_ident_t conf_get_ident(char *dev);
-extern mddev_dev_t conf_get_devs(void);
+extern struct mddev_ident *conf_get_ident(char *dev);
+extern struct mddev_dev *conf_get_devs(void);
 extern int conf_test_dev(char *devname);
-extern int conf_test_metadata(const char *version, int is_homehost);
+extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost);
 extern struct createinfo *conf_get_create_info(void);
 extern void set_conffile(char *file);
 extern char *conf_get_mailaddr(void);
@@ -877,12 +1105,13 @@ extern char *conf_line(FILE *file);
 extern char *conf_word(FILE *file, int allow_key);
 extern int conf_name_is_free(char *name);
 extern int devname_matches(char *name, char *match);
-extern struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st);
+extern struct mddev_ident *conf_match(struct mdinfo *info, struct supertype *st);
+extern int experimental(void);
 
 extern void free_line(char *line);
 extern int match_oneof(char *devices, char *devname);
 extern void uuid_from_super(int uuid[4], mdp_super_t *super);
-extern const int uuid_match_any[4];
+extern const int uuid_zero[4];
 extern int same_uuid(int a[4], int b[4], int swapuuid);
 extern void copy_uuid(void *a, int b[4], int swapuuid);
 extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep);
@@ -896,16 +1125,25 @@ extern int ask(char *mesg);
 extern unsigned long long get_component_size(int fd);
 extern void remove_partitions(int fd);
 extern int test_partition(int fd);
+extern int test_partition_from_id(dev_t id);
 extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
                                   int chunksize, unsigned long long devsize);
 extern int flush_metadata_updates(struct supertype *st);
 extern void append_metadata_update(struct supertype *st, void *buf, int len);
 extern int assemble_container_content(struct supertype *st, int mdfd,
                                      struct mdinfo *content, int runstop,
-                                     char *chosen_name, int verbose);
-
+                                     char *chosen_name, int verbose,
+                                     char *backup_file);
+extern struct mdinfo *container_choose_spares(struct supertype *st,
+                                             unsigned long long min_size,
+                                             struct domainlist *domlist,
+                                             char *spare_group,
+                                             const char *metadata, int get_one);
+extern int move_spare(char *from_devname, char *to_devname, dev_t devid);
 extern int add_disk(int mdfd, struct supertype *st,
                    struct mdinfo *sra, struct mdinfo *info);
+extern int remove_disk(int mdfd, struct supertype *st,
+                      struct mdinfo *sra, struct mdinfo *info);
 extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
 unsigned long long min_recovery_start(struct mdinfo *array);
 
@@ -932,8 +1170,7 @@ extern int open_mddev(char *dev, int report_errors);
 extern int open_container(int fd);
 extern int is_container_member(struct mdstat_ent *ent, char *devname);
 extern int is_subarray_active(char *subarray, char *devname);
-int is_container_active(char *devname);
-extern int open_subarray(char *dev, struct supertype *st, int quiet);
+extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet);
 extern struct superswitch *version_to_superswitch(char *vers);
 
 extern int mdmon_running(int devnum);
@@ -942,7 +1179,13 @@ extern int check_env(char *name);
 extern __u32 random32(void);
 extern int start_mdmon(int devnum);
 
+extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+                        struct supertype *st, unsigned long stripes,
+                        int *fds, unsigned long long *offsets,
+                        int dests, int *destfd, unsigned long long *destoffsets);
+
 extern char *devnum2devname(int num);
+extern void fmt_devname(char *name, int num);
 extern int devname2devnum(char *name);
 extern int stat2devnum(struct stat *st);
 extern int fd2devnum(int fd);
@@ -972,13 +1215,20 @@ static inline int is_subarray(char *vers)
        /* The version string for a 'subarray' (an array in a container)
         * is 
         *    /containername/componentname    for normal read-write arrays
-        *    -containername/componentname    for read-only arrays.
+        *    -containername/componentname    for arrays which mdmon must not
+        *                                    reconfigure.  They might be read-only
+        *                                    or might be undergoing reshape etc.
         * containername is e.g. md0, md_d1
         * componentname is dependant on the metadata. e.g. '1' 'S1' ...
         */
        return (*vers == '/' || *vers == '-');
 }
 
+static inline char *to_subarray(struct mdstat_ent *ent, char *container)
+{
+       /* metadata_version reads "external:[/-]<container>/<subarray>".
+        * Step over "external:" plus its '/' or '-' marker (10 bytes),
+        * then the container name, then the '/' that follows it, which
+        * leaves us pointing at the subarray name.
+        */
+       char *name = ent->metadata_version;
+
+       name += 10;
+       name += strlen(container);
+       name += 1;
+       return name;
+}
+
 #ifdef DEBUG
 #define dprintf(fmt, arg...) \
        fprintf(stderr, fmt, ##arg)
index bae85aff8af7b0cd4b2b90eb6905bbbdd27f246c..e96086dd37517989fbb525fb3ba65757b84ecd28 100644 (file)
@@ -1,6 +1,6 @@
 Summary:     mdadm is used for controlling Linux md devices (aka RAID arrays)
 Name:        mdadm
-Version:     3.1.4
+Version:     3.2
 Release:     1
 Source:      http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tgz
 URL:         http://neil.brown.name/blog/mdadm
index 986432cc9c2855515a09920ef98e56df7a467c14..5c442ee3c81277fa47f42b70aabd02561fc11db8 100644 (file)
@@ -1,5 +1,5 @@
 .\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v3.1.4
+.TH MDASSEMBLE 8 "" v3.2
 .SH NAME
 mdassemble \- assemble MD devices
 .I aka
index d0d0707e123412b6233fad96285ad6eb8114c137..a8b78ceb45b214e5de5c21691c7ed9881380eee0 100644 (file)
@@ -88,7 +88,7 @@ int verbose = 0;
 int force = 0;
 
 int main(int argc, char *argv[]) {
-       mddev_ident_t array_list =  conf_get_ident(NULL);
+       struct mddev_ident *array_list =  conf_get_ident(NULL);
        if (!array_list) {
                fprintf(stderr, Name ": No arrays found in config file\n");
                rv = 1;
@@ -105,7 +105,7 @@ int main(int argc, char *argv[]) {
                        if (mdfd >= 0)
                                close(mdfd);
                        rv |= Assemble(array_list->st, array_list->devname,
-                                      array_list, NULL, NULL,
+                                      array_list, NULL, NULL, 0,
                                       readonly, runstop, NULL, NULL, 0,
                                       verbose, force);
                }
diff --git a/mdmon-design.txt b/mdmon-design.txt
new file mode 100644 (file)
index 0000000..f09184a
--- /dev/null
@@ -0,0 +1,146 @@
+
+When managing a RAID1 array which uses metadata other than the
+"native" metadata understood by the kernel, mdadm makes use of a
+partner program named 'mdmon' to manage some aspects of updating
+that metadata and synchronising the metadata with the array state.
+
+This document provides some details on how mdmon works.
+
+Containers
+----------
+
+As background: mdadm makes a distinction between an 'array' and a
+'container'.  Other sources sometimes use the term 'volume' or
+'device' for an 'array', and may use the term 'array' for a
+'container'.
+
+For our purposes:
+ - a 'container' is a collection of devices which are described by a
+   single set of metadata.  The metadata may be stored equally
+   on all devices, or different devices may have quite different
+   subsets of the total metadata.  But there is conceptually one set
+   of metadata that unifies the devices.
+
+ - an 'array' is a set of data blocks from various devices which
+   together are used to present the abstraction of a single linear
+   sequence of blocks, which may provide data redundancy or enhanced
+   performance.
+
+So a container has some metadata and provides a number of arrays which
+are described by that metadata.
+
+Sometimes this model doesn't work perfectly.  For example, global
+spares may have their own metadata which is quite different from the
+metadata from any device that participates in one or more arrays.
+Such a global spare might still need to belong to some container so
+that it is available to be used should a failure arise.  In that case
+we consider the 'metadata' to be the union of the metadata on the
+active devices which describes the arrays, and the metadata on the
+global spares which only describes the spares.  In this case different
+devices in the one container will have quite different metadata.
+
+
+Purpose
+-------
+
+The main purpose of mdmon is to update the metadata in response to
+changes to the array which need to be reflected in the metadata before
+future writes to the array can safely be performed.
+These include:
+ - transitions from 'clean' to 'dirty'.
+ - recording that devices have failed.
+ - recording the progress of a 'reshape'
+
+This requires mdmon to be running at any time that the array is
+writable (a read-only array does not require mdmon to be running).
+
+Because mdmon must be able to process these metadata updates at any
+time, it must (when running) have exclusive write access to the
+metadata.  Any other changes (e.g. reconfiguration of the array) must
+go through mdmon.
+
+A secondary role for mdmon is to activate spares when a device fails.
+This role is much less time-critical than the other metadata updates,
+so it could be performed by a separate process, possibly
+"mdadm --monitor" which has a related role of moving devices between
+arrays.  A main reason for including this functionality in mdmon is
+that in the native-metadata case this function is handled in the
+kernel, and mdmon's reason for existence is to provide functionality
+which is otherwise handled by the kernel.
+
+
+Design overview
+---------------
+
+mdmon is structured as two threads with a common address space and
+common data structures.  These threads are known as the 'monitor' and
+the 'manager'.
+
+The 'monitor' has the primary role of monitoring the array for
+important state changes and updating the metadata accordingly.  As
+writes to the array can be blocked until 'monitor' completes and
+acknowledges the update, it must be very careful not to block itself.
+In particular it must not block waiting for any write to complete else
+it could deadlock.  This means that it must not allocate memory as
+doing this can require dirty memory to be written out and if the
+system chooses to write to the array that mdmon is monitoring, the
+memory allocation could deadlock.
+
+So 'monitor' must never allocate memory and must limit the number of
+other system calls it performs. It may:
+ - use select (or poll) to wait for activity on a file descriptor
+ - read from a sysfs file descriptor
+ - write to a sysfs file descriptor
+ - write the metadata out to the block devices using O_DIRECT
+ - send a signal (kill) to the manager thread
+
+It must not e.g. open files or do anything similar that might allocate
+resources.
+
+The 'manager' thread does everything else that is needed.  If any
+files are to be opened (e.g. because a device has been added to the
+array), the manager does that.  If any memory needs to be allocated
+(e.g. to hold data about a new array as can happen when one set of
+metadata describes several arrays), the manager performs that
+allocation.
+
+The 'manager' is also responsible for communicating with mdadm and
+assigning spares to replace failed devices.
+
+
+Handling metadata updates
+-------------------------
+
+There are a number of cases in which mdadm needs to update the
+metadata which mdmon is managing.  These include:
+ - creating a new array in an active container
+ - adding a device to a container
+ - reconfiguring an array
+etc.
+
+To complete these updates, mdadm must send a message to mdmon which
+will merge the update into the metadata as it is at that moment.
+
+To achieve this, mdmon creates a Unix Domain Socket which the manager
+thread listens on.  mdadm sends a message over this socket.  The
+manager thread examines the message to see if it will require
+allocating any memory and allocates it.  This is done in the
+'prepare_update' metadata method.
+
+The update message is then queued for handling by the monitor thread
+which it will do when convenient.  The monitor thread calls
+->process_update which should atomically make the required changes to
+the metadata, making use of the pre-allocated memory as required.  Any
+memory that is no longer needed can be placed back in the request and
+the manager thread will free it.
+
+The exact format of a metadata update is up to the implementer of the
+metadata handlers.  It will simply describe a change that needs to be
+made.  It will sometimes contain fragments of the metadata to be
+copied in to place.  However the ->process_update routine must make
+sure not to over-write any field that the monitor thread might have
+updated, such as a 'device failed' or 'array is dirty' state.
+
+When the monitor thread has completed the update and written it to the
+devices, an acknowledgement message is sent back over the socket so
+that mdadm knows it is complete.
diff --git a/mdmon.8 b/mdmon.8
index 2ccb2792766d173f7ffc1f8eb5b8d3e29683ddba..efa30898635cf1889b3976543273b846f46b6d9e 100644 (file)
--- a/mdmon.8
+++ b/mdmon.8
@@ -1,5 +1,5 @@
 .\" See file COPYING in distribution for details.
-.TH MDMON 8 "" v3.1.4
+.TH MDMON 8 "" v3.2-devel
 .SH NAME
 mdmon \- monitor MD external metadata arrays
 
diff --git a/mdmon.c b/mdmon.c
index e416b2e42f4559c69b896252f6d4cb4f8efa4447..1f39f165d620035aebc06112a7ee872ceb34269b 100644 (file)
--- a/mdmon.c
+++ b/mdmon.c
@@ -398,7 +398,6 @@ static int mdmon(char *devname, int devnum, int must_fork, int takeover)
        container->devnum = devnum;
        container->devname = devname;
        container->arrays = NULL;
-       container->subarray[0] = 0;
        container->sock = -1;
 
        if (!container->devname) {
@@ -469,7 +468,7 @@ static int mdmon(char *devname, int devnum, int must_fork, int takeover)
                }
                close(victim_sock);
        }
-       if (container->ss->load_super(container, mdfd, devname)) {
+       if (container->ss->load_container(container, mdfd, devname)) {
                fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
                        devname);
                exit(3);
@@ -518,3 +517,12 @@ static int mdmon(char *devname, int devnum, int must_fork, int takeover)
 
        exit(0);
 }
+
+/* Stub definition so that the super-* code can be linked into mdmon.
+ * None of the arguments are examined; the call always returns 0.
+ */
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+                 struct supertype *st, unsigned long blocks,
+                 int *fds, unsigned long long *offsets,
+                 int dests, int *destfd, unsigned long long *destoffsets)
+{
+       const int rv = 0;
+
+       return rv;
+}
diff --git a/mdmon.h b/mdmon.h
index 5c515663f4a4225d1c23ec6c38c571537eca9cb2..6d1776f9281e5ab6e0a47ed29ed416ed2d483328 100644 (file)
--- a/mdmon.h
+++ b/mdmon.h
@@ -46,6 +46,7 @@ struct active_array {
        enum sync_action prev_action, curr_action, next_action;
 
        int check_degraded; /* flag set by mon, read by manage */
+       int check_reshape; /* flag set by mon, read by manage */
 
        int devnum;
 };
index 580e87c568edd4e1443ebcdaa053bee152e3dc64..3d2edadb494dcad1a7db3528e493e60414741d8a 100644 (file)
--- a/mdstat.c
+++ b/mdstat.c
@@ -240,11 +240,27 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                                   w[l-1] == '%' &&
                                   (eq=strchr(w, '=')) != NULL ) {
                                ent->percent = atoi(eq+1);
-                               if (strncmp(w,"resync", 4)==0)
+                               if (strncmp(w,"resync", 6)==0)
                                        ent->resync = 1;
+                               else if (strncmp(w, "reshape", 7)==0)
+                                       ent->resync = 2;
+                               else
+                                       ent->resync = 0;
                        } else if (ent->percent == -1 &&
-                                  strncmp(w, "resync", 4)==0) {
-                               ent->resync = 1;
+                                  (w[0] == 'r' || w[0] == 'c')) {
+                               if (strncmp(w, "resync", 4)==0)
+                                       ent->resync = 1;
+                               if (strncmp(w, "reshape", 7)==0)
+                                       ent->resync = 2;
+                               if (strncmp(w, "recovery", 8)==0)
+                                       ent->resync = 0;
+                               if (strncmp(w, "check", 5)==0)
+                                       ent->resync = 3;
+
+                               if (l > 8 && strcmp(w+l-8, "=DELAYED"))
+                                       ent->percent = 0;
+                               if (l > 8 && strcmp(w+l-8, "=PENDING"))
+                                       ent->percent = 0;
                        } else if (ent->percent == -1 &&
                                   w[0] >= '0' &&
                                   w[0] <= '9' &&
@@ -367,3 +383,36 @@ struct mdstat_ent *mdstat_by_component(char *name)
        }
        return NULL;
 }
+
+/* Find the /proc/mdstat entry for subarray 'subdev' of container
+ * md<container>.
+ *
+ * Returns the matching entry with its 'next' link cleared, or NULL when
+ * nothing matches.  All other entries read from /proc/mdstat are freed
+ * here; the returned entry is left for the caller to release with
+ * free_mdstat().
+ */
+struct mdstat_ent *mdstat_by_subdev(char *subdev, int container)
+{
+       struct mdstat_ent *mdstat = mdstat_read(0, 0);
+
+       while (mdstat) {
+               struct mdstat_ent *ent;
+               char *ver = mdstat->metadata_version;
+               char *pos;
+               /* metadata version must match:
+                *   external:[/-]md%d/%s
+                * where %d is 'container' and %s is 'subdev'
+                *
+                * The marker byte is compared explicitly rather than with
+                * strchr("/-", ...): strchr() also matches the terminating
+                * NUL, which would let a bare "external:" through and make
+                * the following tests read beyond the end of the string.
+                */
+               if (ver &&
+                   strncmp(ver, "external:", 9) == 0 &&
+                   (ver[9] == '/' || ver[9] == '-') &&
+                   strncmp(ver+10, "md", 2) == 0 &&
+                   strtoul(ver+12, &pos, 10)
+                   == (unsigned)container &&
+                   pos > ver+12 &&
+                   *pos == '/' &&
+                   strcmp(pos+1, subdev) == 0
+                       ) {
+                       /* keep only this entry; drop the rest of the list */
+                       free_mdstat(mdstat->next);
+                       mdstat->next = NULL;
+                       return mdstat;
+               }
+               /* no match: unlink and free this entry, then move on */
+               ent = mdstat;
+               mdstat = mdstat->next;
+               ent->next = NULL;
+               free_mdstat(ent);
+       }
+       return NULL;
+}
index 0a7d0f471e74d93e65f44f2a520d00b3e9ae47e7..8e0f1b7ee503dd1c509467e1cc5273b4caa300c7 100644 (file)
--- a/monitor.c
+++ b/monitor.c
@@ -215,6 +215,7 @@ static int read_and_act(struct active_array *a)
 {
        unsigned long long sync_completed;
        int check_degraded = 0;
+       int check_reshape = 0;
        int deactivate = 0;
        struct mdinfo *mdi;
        int dirty = 0;
@@ -235,6 +236,13 @@ static int read_and_act(struct active_array *a)
                }
        }
 
+       if (a->curr_state > inactive &&
+           a->prev_state == inactive) {
+               /* array has been started
+                * possible that container operation has to be completed
+                */
+               a->container->ss->set_array_state(a, 0);
+       }
        if (a->curr_state <= inactive &&
            a->prev_state > inactive) {
                /* array has been stopped */
@@ -306,6 +314,15 @@ static int read_and_act(struct active_array *a)
                }
        }
 
+       if (!deactivate &&
+           a->curr_action == reshape &&
+           a->prev_action != reshape)
+               /* reshape was requested by mdadm.  Need to see if
+                * new devices have been added.  Manager does that
+                * when it sees check_reshape
+                */
+               check_reshape = 1;
+
        /* Check for failures and if found:
         * 1/ Record the failure in the metadata and unblock the device.
         *    FIXME update the kernel to stop notifying on failed drives when
@@ -330,8 +347,8 @@ static int read_and_act(struct active_array *a)
 
        /* Check for recovery checkpoint notifications.  We need to be a
         * minimum distance away from the last checkpoint to prevent
-        * over checkpointing.  Note reshape checkpointing is not
-        * handled here.
+        * over checkpointing.  Note reshape checkpointing is handled
+        * in the second branch.
         */
        if (sync_completed > a->last_checkpoint &&
            sync_completed - a->last_checkpoint > a->info.component_size >> 4 &&
@@ -341,7 +358,37 @@ static int read_and_act(struct active_array *a)
                 */
                a->last_checkpoint = sync_completed;
                a->container->ss->set_array_state(a, a->curr_state <= clean);
-       } else if (sync_completed > a->last_checkpoint)
+       } else if ((a->curr_action == idle && a->prev_action == reshape) ||
+                  (a->curr_action == reshape
+                   && sync_completed > a->last_checkpoint) ) {
+               /* Reshape has progressed or completed so we need to
+                * update the array state - and possibly the array size
+                */
+               if (sync_completed != 0)
+                       a->last_checkpoint = sync_completed;
+               /* We might need to update last_checkpoint depending on
+                * the reason that reshape finished.
+                * if array reshape is really finished:
+                *        set check point to the end, this allows
+                *        set_array_state() to finalize reshape in metadata
+                * if reshape if broken: do not set checkpoint to the end
+                *        this allows for reshape restart from checkpoint
+                */
+               if ((a->curr_action != reshape) &&
+                   (a->prev_action == reshape)) {
+                       char buf[40];
+                       if ((sysfs_get_str(&a->info, NULL,
+                                         "reshape_position",
+                                         buf,
+                                         sizeof(buf)) >= 0) &&
+                            strncmp(buf, "none", 4) == 0)
+                               a->last_checkpoint = a->info.component_size;
+               }
+               a->container->ss->set_array_state(a, a->curr_state <= clean);
+               a->last_checkpoint = sync_completed;
+       }
+
+       if (sync_completed > a->last_checkpoint)
                a->last_checkpoint = sync_completed;
 
        a->container->ss->sync_metadata(a->container);
@@ -395,9 +442,12 @@ static int read_and_act(struct active_array *a)
                mdi->next_state = 0;
        }
 
-       if (check_degraded) {
+       if (check_degraded || check_reshape) {
                /* manager will do the actual check */
-               a->check_degraded = 1;
+               if (check_degraded)
+                       a->check_degraded = 1;
+               if (check_reshape)
+                       a->check_reshape = 1;
                signal_manager();
        }
 
@@ -485,7 +535,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                /* once an array has been deactivated we want to
                 * ask the manager to discard it.
                 */
-               if (!a->container) {
+               if (!a->container || (a->info.array.level == 0)) {
                        if (discard_this) {
                                ap = &(*ap)->next;
                                continue;
@@ -527,6 +577,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                                remove_pidfile(container->devname);
                        exit_now = 1;
                        signal_manager();
+                       close(fd);
                        exit(0);
                }
        }
diff --git a/msg.c b/msg.c
index aabfa8f566072b64d900b91c8e50cfaf0fde0057..a1f4bc6e0b5540556549b08578f4190b9e1d704e 100644 (file)
--- a/msg.c
+++ b/msg.c
@@ -135,7 +135,15 @@ int ack(int fd, int tmo)
 int wait_reply(int fd, int tmo)
 {
        struct metadata_update msg;
-       return receive_message(fd, &msg, tmo);
+       int err = receive_message(fd, &msg, tmo);
+
+       /* mdmon sent extra data, but caller only cares that we got a
+        * successful reply
+        */
+       if (err == 0 && msg.len > 0)
+               free(msg.buf);
+
+       return err;
 }
 
 int connect_monitor(char *devname)
@@ -195,7 +203,6 @@ int fping_monitor(int sfd)
        return err;
 }
 
-
 /* give the monitor a chance to update the metadata */
 int ping_monitor(char *devname)
 {
@@ -206,6 +213,223 @@ int ping_monitor(char *devname)
        return err;
 }
 
+/* Ping the mdmon instance managing 'devname' and return the payload of
+ * its reply (block_monitor() parses it as a version string and then
+ * frees it).  Returns NULL if the monitor cannot be reached, the
+ * exchange fails, or the reply carries no data.
+ */
+static char *ping_monitor_version(char *devname)
+{
+       struct metadata_update reply;
+       int sock = connect_monitor(devname);
+       int failed;
+
+       if (sock < 0)
+               return NULL;
+
+       /* the reply is only read when our ack went through */
+       failed = ack(sock, 20) != 0 ||
+                receive_message(sock, &reply, 20) != 0;
+
+       close(sock);
+
+       if (failed || !reply.len || !reply.buf)
+               return NULL;
+       return reply.buf;
+}
+
+/* Mark subarray 'sra' as reconfigurable again by rewriting its
+ * metadata_version with the '/' marker and, when 'unfreeze' is set and
+ * the array has a sync_action attribute, writing "idle" to it.
+ * Returns 0 on success, -1 if 'sra' is NULL or a sysfs write fails.
+ */
+int unblock_subarray(struct mdinfo *sra, const int unfreeze)
+{
+       char version[64];
+
+       if (!sra)
+               return -1;
+
+       sprintf(version, "external:%s\n", sra->text_version);
+       /* byte 9 is the marker directly after "external:" */
+       version[9] = '/';
+
+       if (sysfs_set_str(sra, NULL, "metadata_version", version))
+               return -1;
+
+       if (unfreeze &&
+           sysfs_attribute_available(sra, NULL, "sync_action") &&
+           sysfs_set_str(sra, NULL, "sync_action", "idle"))
+               return -1;
+
+       return 0;
+}
+
+/* Flag subarray 'sra' as "not to be reconfigured" by rewriting its
+ * metadata_version with the '-' marker.
+ * Returns 0 on success, -1 if the sysfs write fails.
+ */
+int block_subarray(struct mdinfo *sra)
+{
+       char version[64];
+
+       sprintf(version, "external:%s\n", sra->text_version);
+       /* byte 9 is the marker directly after "external:" */
+       version[9] = '-';
+
+       return sysfs_set_str(sra, NULL, "metadata_version", version) ? -1 : 0;
+}
+/**
+ * block_monitor - prevent mdmon spare assignment
+ * @container - container to block
+ * @freeze - flag to additionally freeze sync_action
+ *
+ * This is used by the reshape code to freeze the container, and the
+ * auto-rebuild implementation to atomically move spares.
+ * In both cases we need to stop mdmon from assigning spares to replace
+ * failed devices as we might have other plans for the spare.
+ * For the reshape case we also need to 'freeze' sync_action so that
+ * no recovery happens until we have fully prepared for the reshape.
+ *
+ * We tell mdmon that the array is frozen by marking the 'metadata' name
+ * with a leading '-'.  This previously told mdmon "Don't make this array
+ * read/write, leave it readonly".  Now it means a more general "Don't
+ * reconfigure this array at all".
+ * As older versions of mdmon (which might run from initrd) don't understand
+ * this, we first check that the running mdmon is new enough.
+ *
+ * Returns 0 when the loop over the container's member arrays ran to
+ * completion, or -1 on failure.  When a member cannot be blocked, the
+ * members visited before it are unblocked again before returning.
+ */
+int block_monitor(char *container, const int freeze)
+{
+       int devnum = devname2devnum(container);
+       struct mdstat_ent *ent, *e, *e2;
+       struct mdinfo *sra = NULL;
+       char *version = NULL;
+       char buf[64];
+       int rv = 0;
+
+       if (!mdmon_running(devnum)) {
+               /* if mdmon is not active we assume that any instance that is
+                * later started will match the current mdadm version, if this
+                * assumption is violated we may inadvertently rebuild an array
+                * that was meant for reshape, or start rebuild on a spare that
+                * was to be moved to another container
+                */
+               /* pass */;
+       } else {
+               int ver;
+
+               /* a missing reply is treated like a too-old mdmon (ver = -1) */
+               version = ping_monitor_version(container);
+               ver = version ? mdadm_version(version) : -1;
+               free(version);
+               /* NOTE(review): 3002000 presumably encodes release 3.2 in
+                * the scheme used by mdadm_version() - confirm
+                */
+               if (ver < 3002000) {
+                       fprintf(stderr, Name
+                               ": mdmon instance for %s cannot be disabled\n",
+                               container);
+                       return -1;
+               }
+       }
+
+       ent = mdstat_read(0, 0);
+       if (!ent) {
+               fprintf(stderr, Name
+                       ": failed to read /proc/mdstat while disabling mdmon\n");
+               return -1;
+       }
+
+       /* freeze container contents */
+       for (e = ent; e; e = e->next) {
+               if (!is_container_member(e, container))
+                       continue;
+               sysfs_free(sra);
+               sra = sysfs_read(-1, e->devnum, GET_VERSION);
+               if (!sra) {
+                       fprintf(stderr, Name
+                               ": failed to read sysfs for subarray%s\n",
+                               to_subarray(e, container));
+                       break;
+               }
+               /* can't reshape an array that we can't monitor */
+               if (sra->text_version[0] == '-')
+                       break;
+
+               if (freeze && sysfs_freeze_array(sra) < 1)
+                       break;
+               /* flag this array to not be modified by mdmon (close race with
+                * takeover in reshape case and spare reassignment in the
+                * auto-rebuild case)
+                */
+               if (block_subarray(sra))
+                       break;
+               ping_monitor(container);
+
+               /* check that we did not race with recovery */
+               if ((freeze &&
+                    !sysfs_attribute_available(sra, NULL, "sync_action")) ||
+                   (freeze &&
+                    sysfs_attribute_available(sra, NULL, "sync_action") &&
+                    sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 &&
+                    strcmp(buf, "frozen\n") == 0))
+                       /* pass */;
+               else {
+                       unblock_subarray(sra, 0);
+                       break;
+               }
+               /* Double check against races - there should be no spares
+                * or part-spares
+                */
+               sysfs_free(sra);
+               sra = sysfs_read(-1, e->devnum, GET_DEVS | GET_STATE);
+               if (sra && sra->array.spare_disks > 0) {
+                       unblock_subarray(sra, freeze);
+                       break;
+               }
+       }
+
+       /* e is non-NULL only when the loop above stopped early via 'break',
+        * in which case it is the member that could not be frozen
+        */
+       if (e) {
+               fprintf(stderr, Name ": failed to freeze subarray%s\n",
+                       to_subarray(e, container));
+
+               /* thaw the partially frozen container */
+               for (e2 = ent; e2 && e2 != e; e2 = e2->next) {
+                       if (!is_container_member(e2, container))
+                               continue;
+                       sysfs_free(sra);
+                       sra = sysfs_read(-1, e2->devnum, GET_VERSION);
+                       if (unblock_subarray(sra, freeze))
+                               fprintf(stderr, Name ": Failed to unfreeze %s\n", e2->dev);
+               }
+
+               ping_monitor(container); /* cleared frozen */
+               rv = -1;
+       }
+
+       sysfs_free(sra);
+       free_mdstat(ent);
+
+       return rv;
+}
+
+/* Reverse block_monitor(): unblock every member array of 'container'
+ * (see unblock_subarray()) and, if at least one member has a RAID level
+ * greater than 0, ping mdmon afterwards.
+ *
+ * @container - name of the container whose members are to be unblocked
+ * @unfreeze  - passed through to unblock_subarray(); when set, sync_action
+ *              is also set back to "idle"
+ *
+ * Failures are reported on stderr; processing continues with the next
+ * member array.
+ */
+void unblock_monitor(char *container, const int unfreeze)
+{
+       struct mdstat_ent *ent, *e;
+       struct mdinfo *sra = NULL;
+       int to_ping = 0;
+
+       ent = mdstat_read(0, 0);
+       if (!ent) {
+               fprintf(stderr, Name
+                       ": failed to read /proc/mdstat while unblocking container\n");
+               return;
+       }
+
+       /* unfreeze container contents */
+       for (e = ent; e; e = e->next) {
+               if (!is_container_member(e, container))
+                       continue;
+               sysfs_free(sra);
+               sra = sysfs_read(-1, e->devnum, GET_VERSION|GET_LEVEL);
+               if (!sra) {
+                       /* sysfs_read() can fail (block_monitor() checks for
+                        * this too); report it instead of dereferencing NULL
+                        */
+                       fprintf(stderr, Name ": Failed to unfreeze %s\n", e->dev);
+                       continue;
+               }
+               if (sra->array.level > 0)
+                       to_ping++;
+               if (unblock_subarray(sra, unfreeze))
+                       fprintf(stderr, Name ": Failed to unfreeze %s\n", e->dev);
+       }
+       if (to_ping)
+               ping_monitor(container);
+
+       sysfs_free(sra);
+       free_mdstat(ent);
+}
+
+
+
 /* give the manager a chance to view the updated container state.  This
  * would naturally happen due to the manager noticing a change in
  * /proc/mdstat; however, pinging encourages this detection to happen
diff --git a/msg.h b/msg.h
index f8e89fdccc989a29923026127d8cfdbba8a4ec3d..91a77987fe362e66d50b3a5de25fe9ec4341cb89 100644 (file)
--- a/msg.h
+++ b/msg.h
@@ -27,6 +27,10 @@ extern int ack(int fd, int tmo);
 extern int wait_reply(int fd, int tmo);
 extern int connect_monitor(char *devname);
 extern int ping_monitor(char *devname);
+extern int block_subarray(struct mdinfo *sra);
+extern int unblock_subarray(struct mdinfo *sra, const int unfreeze);
+extern int block_monitor(char *container, const int freeze);
+extern void unblock_monitor(char *container, const int unfreeze);
 extern int fping_monitor(int sock);
 extern int ping_manager(char *devname);
 
diff --git a/part.h b/part.h
new file mode 100644 (file)
index 0000000..0afea33
--- /dev/null
+++ b/part.h
@@ -0,0 +1,82 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/* Structure definitions for MBR and GPT partition tables
+ */
+
+
+#define        MBR_SIGNATURE_MAGIC     __cpu_to_le16(0xAA55)
+#define MBR_PARTITIONS               4
+
+/* One 16-byte partition record of an MBR (8 single-byte fields followed
+ * by two 32-bit fields).  Declared packed so that its alignment is 1:
+ * the records sit at offset 446 inside the packed struct MBR, where the
+ * 32-bit members are not naturally aligned.
+ */
+struct MBR_part_record {
+  __u8 bootable;
+  __u8 first_head;
+  __u8 first_sector;
+  __u8 first_cyl;
+  __u8 part_type;
+  __u8 last_head;
+  __u8 last_sector;
+  __u8 last_cyl;
+  __u32 first_sect_lba;
+  __u32 blocks_num;
+} __attribute__((packed));
+
+/* Packed image of an MBR: 446 bytes of padding, MBR_PARTITIONS
+ * partition records (16 bytes each) and a 16-bit magic, i.e.
+ * 446 + 4*16 + 2 = 512 bytes.
+ */
+struct MBR {
+       __u8 pad[446];
+       struct MBR_part_record parts[MBR_PARTITIONS];
+       __u16 magic;
+} __attribute__((packed));
+
+
+
+#define        GPT_SIGNATURE_MAGIC     __cpu_to_le64(0x5452415020494645ULL)
+#define MBR_GPT_PARTITION_TYPE       0xEE
+
+/* Packed image of one GPT partition entry:
+ * 16 + 16 + 8 + 8 + 8 + 72 = 128 bytes.
+ */
+struct GPT_part_entry {
+       unsigned char type_guid[16];
+       unsigned char partition_guid[16];
+       __u64 starting_lba;
+       __u64 ending_lba;
+       unsigned char attr_bits[8];
+       unsigned char name[72];
+} __attribute__((packed));
+
+/* Packed image of a GPT header.  The named fields occupy 92 bytes;
+ * pad2 extends the structure to 512 bytes in total.
+ */
+struct GPT {
+       __u64 magic;
+       __u32 revision;
+       __u32 header_size;
+       __u32 crc;
+       __u32 pad1;
+       __u64 current_lba;
+       __u64 backup_lba;
+       __u64 first_lba;
+       __u64 last_lba;
+       __u8 guid[16];
+       __u64 part_start;
+       __u32 part_cnt;
+       __u32 part_size;
+       __u32 part_crc;
+       __u8 pad2[420];
+} __attribute__((packed));
index 61749085307b4b826ceee3528f2786448107a681..8003da6e3864b27b45ee69db9a90942fda4b3379 100644 (file)
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <limits.h>
+
+
+static int devpath_to_ll(const char *dev_path, const char *entry,
+                        unsigned long long *val);
+
+static __u16 devpath_to_vendor(const char *dev_path);
 
 void free_sys_dev(struct sys_dev **list)
 {
@@ -51,6 +58,15 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
        struct dirent *de;
        struct sys_dev *head = NULL;
        struct sys_dev *list = NULL;
+       enum sys_dev_type type;
+       unsigned long long dev_id;
+
+       if (strcmp(driver, "isci") == 0)
+               type = SYS_DEV_SAS;
+       else if (strcmp(driver, "ahci") == 0)
+               type = SYS_DEV_SATA;
+       else
+               type = SYS_DEV_UNKNOWN;
 
        sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver);
        driver_dir = opendir(path);
@@ -74,6 +90,16 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
                if (strncmp(bus, c+1, strlen(bus)) != 0)
                        continue;
 
+               sprintf(path, "/sys/bus/%s/drivers/%s/%s",
+                       bus, driver, de->d_name);
+
+               /* if it's not Intel device skip it. */
+               if (devpath_to_vendor(path) != 0x8086)
+                       continue;
+
+               if (devpath_to_ll(path, "device", &dev_id) != 0)
+                       continue;
+
                /* start / add list entry */
                if (!head) {
                        head = malloc(sizeof(*head));
@@ -88,17 +114,48 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
                        break;
                }
 
-               /* generate canonical path name for the device */
-               sprintf(path, "/sys/bus/%s/drivers/%s/%s",
-                       bus, driver, de->d_name);
+               list->dev_id = (__u16) dev_id;
+               list->type = type;
                list->path = canonicalize_file_name(path);
                list->next = NULL;
+               if ((list->pci_id = strrchr(list->path, '/')) != NULL)
+                       list->pci_id++;
        }
        closedir(driver_dir);
        return head;
 }
 
-__u16 devpath_to_vendor(const char *dev_path)
+
+static struct sys_dev *intel_devices=NULL;
+
+static enum sys_dev_type device_type_by_id(__u16 device_id)
+{
+       /* Walk the intel_devices list and report the type of the first
+        * entry whose dev_id equals 'device_id'; SYS_DEV_UNKNOWN when
+        * the list is empty or holds no such entry.
+        */
+       struct sys_dev *dev = intel_devices;
+
+       while (dev != NULL) {
+               if (dev->dev_id == device_id)
+                       return dev->type;
+               dev = dev->next;
+       }
+       return SYS_DEV_UNKNOWN;
+}
+
+static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val)
+{
+       /* Open "<dev_path>/<entry>" read-only and hand the descriptor to
+        * sysfs_fd_get_ll() to fill in *val.
+        * Returns -1 if the file cannot be opened, otherwise whatever
+        * sysfs_fd_get_ll() returned.
+        */
+       /* room for dev_path, '/', entry and the terminating NUL */
+       char attr[strlen(dev_path) + strlen(entry) + 2];
+       int rv;
+       int fd;
+
+       sprintf(attr, "%s/%s", dev_path, entry);
+
+       fd = open(attr, O_RDONLY);
+       if (fd >= 0) {
+               rv = sysfs_fd_get_ll(fd, val);
+               close(fd);
+               return rv;
+       }
+       return -1;
+}
+
+
+static __u16 devpath_to_vendor(const char *dev_path)
 {
        char path[strlen(dev_path) + strlen("/vendor") + 1];
        char vendor[7];
@@ -122,70 +179,140 @@ __u16 devpath_to_vendor(const char *dev_path)
        return id;
 }
 
-static int platform_has_intel_ahci(void)
+struct sys_dev *find_intel_devices(void)
 {
-       struct sys_dev *devices = find_driver_devices("pci", "ahci");
-       struct sys_dev *dev;
-       int ret = 0;
-
-       for (dev = devices; dev; dev = dev->next)
-               if (devpath_to_vendor(dev->path) == 0x8086) {
-                       ret = 1;
-                       break;
-               }
-
-       free_sys_dev(&devices);
-
-       return ret;
+       struct sys_dev *ahci, *isci;
+
+       isci = find_driver_devices("pci", "isci");
+       ahci = find_driver_devices("pci", "ahci");
+
+       if (!ahci) {
+               ahci = isci;
+       } else {
+               struct sys_dev *elem = ahci;
+               while (elem->next)
+                       elem = elem->next;
+               elem->next = isci;
+       }
+       return ahci;
 }
 
+/*
+ * PCI Expansion ROM Data Structure Format.
+ * Eight bytes are declared here.  scan() insists that 0x18 bytes are
+ * available at the structure's address but reads only vendorID and
+ * deviceID, both converted with __le16_to_cpu().
+ */
+struct pciExpDataStructFormat {
+       __u8  ver[4];
+       __u16 vendorID;
+       __u16 deviceID;
+} __attribute__ ((packed));
 
-static struct imsm_orom imsm_orom;
-static int scan(const void *start, const void *end)
+static struct imsm_orom imsm_orom[SYS_DEV_MAX];
+static int populated_orom[SYS_DEV_MAX];
+
+static int scan(const void *start, const void *end, const void *data)
 {
        int offset;
        const struct imsm_orom *imsm_mem;
+       int dev;
        int len = (end - start);
+       struct pciExpDataStructFormat *ptr= (struct pciExpDataStructFormat *)data;
+
+       if (data + 0x18 > end) {
+               dprintf("cannot find pciExpDataStruct \n");
+               return 0;
+       }
+
+       dprintf("ptr->vendorID: %lx __le16_to_cpu(ptr->deviceID): %lx \n",
+               (ulong) __le16_to_cpu(ptr->vendorID),
+               (ulong) __le16_to_cpu(ptr->deviceID));
+
+       if (__le16_to_cpu(ptr->vendorID) == 0x8086) {
+               /* search attached Intel devices by the device id from the OROM */
+               dev = device_type_by_id(__le16_to_cpu(ptr->deviceID));
+               if (dev == SYS_DEV_UNKNOWN)
+                       return 0;
+       }
+       else
+               return 0;
 
        for (offset = 0; offset < len; offset += 4) {
                imsm_mem = start + offset;
-               if (memcmp(imsm_mem->signature, "$VER", 4) == 0) {
-                       imsm_orom = *imsm_mem;
-                       return 1;
+               if ((memcmp(imsm_mem->signature, "$VER", 4) == 0) ||
+                   (memcmp(imsm_mem->signature, "$OEM", 4) == 0)) {
+                       imsm_orom[dev] = *imsm_mem;
+                       populated_orom[dev] = 1;
+                       return populated_orom[SYS_DEV_SATA] && populated_orom[SYS_DEV_SAS];
                }
        }
-
        return 0;
 }
 
-const struct imsm_orom *find_imsm_orom(void)
-{
-       static int populated = 0;
-       unsigned long align;
-
-       /* it's static data so we only need to read it once */
-       if (populated)
-               return &imsm_orom;
 
-       if (check_env("IMSM_TEST_OROM")) {
-               memset(&imsm_orom, 0, sizeof(imsm_orom));
-               imsm_orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+const struct imsm_orom *imsm_platform_test(enum sys_dev_type hba_id, int *populated,
+                                          struct imsm_orom *imsm_orom)
+{
+       memset(imsm_orom, 0, sizeof(*imsm_orom));
+       imsm_orom->rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
                                IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5;
-               imsm_orom.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
+       imsm_orom->sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
                                IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB |
                                IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB |
                                IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB |
                                IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB;
-               imsm_orom.dpa = 6;
-               imsm_orom.tds = 6;
-               imsm_orom.vpa = 2;
-               imsm_orom.vphba = 4;
-               imsm_orom.attr = imsm_orom.rlc | IMSM_OROM_ATTR_ChecksumVerify;
-               populated = 1;
-               return &imsm_orom;
+       imsm_orom->dpa = IMSM_OROM_DISKS_PER_ARRAY;
+       imsm_orom->tds = IMSM_OROM_TOTAL_DISKS;
+       imsm_orom->vpa = IMSM_OROM_VOLUMES_PER_ARRAY;
+       imsm_orom->vphba = IMSM_OROM_VOLUMES_PER_HBA;
+       imsm_orom->attr = imsm_orom->rlc | IMSM_OROM_ATTR_ChecksumVerify;
+       *populated = 1;
+
+       if (check_env("IMSM_TEST_OROM_NORAID5")) {
+               imsm_orom->rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+                               IMSM_OROM_RLC_RAID10;
+       }
+       if (check_env("IMSM_TEST_AHCI_EFI_NORAID5") && (hba_id == SYS_DEV_SAS)) {
+               imsm_orom->rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+                               IMSM_OROM_RLC_RAID10;
+       }
+       if (check_env("IMSM_TEST_SCU_EFI_NORAID5") && (hba_id == SYS_DEV_SATA)) {
+               imsm_orom->rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+                               IMSM_OROM_RLC_RAID10;
+       }
+
+       return imsm_orom;
+}
+
+
+
+static const struct imsm_orom *find_imsm_hba_orom(enum sys_dev_type hba_id)
+{
+       unsigned long align;
+
+       if (hba_id >= SYS_DEV_MAX)
+               return NULL;
+
+       /* it's static data so we only need to read it once */
+       if (populated_orom[hba_id]) {
+               dprintf("OROM CAP: %p, pid: %d pop: %d\n",
+                       &imsm_orom[hba_id], (int) getpid(), populated_orom[hba_id]);
+               return &imsm_orom[hba_id];
+       }
+       if (check_env("IMSM_TEST_OROM")) {
+               dprintf("OROM CAP: %p,  pid: %d pop: %d\n",
+                     &imsm_orom[hba_id], (int) getpid(), populated_orom[hba_id]);
+               return imsm_platform_test(hba_id, &populated_orom[hba_id], &imsm_orom[hba_id]);
        }
+       /* return empty OROM capabilities in EFI test mode */
+       if (check_env("IMSM_TEST_AHCI_EFI") ||
+           check_env("IMSM_TEST_SCU_EFI"))
+               return NULL;
+
+
+       if (intel_devices != NULL)
+               free_sys_dev(&intel_devices);
+
+       intel_devices = find_intel_devices();
 
-       if (!platform_has_intel_ahci())
+       if (intel_devices == NULL)
                return NULL;
 
        /* scan option-rom memory looking for an imsm signature */
@@ -196,11 +323,107 @@ const struct imsm_orom *find_imsm_orom(void)
        if (probe_roms_init(align) != 0)
                return NULL;
        probe_roms();
-       populated = scan_adapter_roms(scan);
+       /* ignore return value - true is returned only if both adapter ROMs are found */
+       scan_adapter_roms(scan);
        probe_roms_exit();
 
-       if (populated)
-               return &imsm_orom;
+       if (intel_devices != NULL)
+               free_sys_dev(&intel_devices);
+       intel_devices = NULL;
+
+       if (populated_orom[hba_id])
+               return &imsm_orom[hba_id];
+       return NULL;
+}
+
+#define GUID_STR_MAX   37  /* according to GUID format:
+                            * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" */
+
+#define EFI_GUID(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+((struct efi_guid) \
+{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+  (b) & 0xff, ((b) >> 8) & 0xff, \
+  (c) & 0xff, ((c) >> 8) & 0xff, \
+  (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+
+#define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars"
+#define SCU_PROP "RstScuV"
+#define AHCI_PROP "RstSataV"
+
+#define VENDOR_GUID \
+       EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6)
+
+int populated_efi[SYS_DEV_MAX] = { 0, 0 };
+
+static struct imsm_orom imsm_efi[SYS_DEV_MAX];
+
+const struct imsm_orom *find_imsm_efi(enum sys_dev_type hba_id)
+{
+       /* Return the capability table for 'hba_id' as read from its EFI
+        * variable file under SYS_EFI_VAR_PATH, or NULL if hba_id is out
+        * of range, the file cannot be opened, or fewer bytes than a
+        * whole struct imsm_orom were read.  A successful read is
+        * remembered in populated_efi[] so later calls return the
+        * cached imsm_efi[] entry.
+        */
+       struct imsm_orom *cap;
+       const char *prop;
+       char var_path[PATH_MAX];
+       char guid[GUID_STR_MAX];
+       int fd = -1;
+       int got;
+
+       if (hba_id >= SYS_DEV_MAX)
+               return NULL;
+       cap = &imsm_efi[hba_id];
+
+       dprintf("EFI CAP: %p,  pid: %d pop: %d\n",
+               cap, (int) getpid(), populated_efi[hba_id]);
+
+       /* it's static data so we only need to read it once */
+       if (populated_efi[hba_id]) {
+               dprintf("EFI CAP: %p, pid: %d pop: %d\n",
+                       cap, (int) getpid(), populated_efi[hba_id]);
+               return cap;
+       }
+
+       /* EFI test mode: hand back the built-in test capabilities */
+       if (check_env("IMSM_TEST_AHCI_EFI") ||
+           check_env("IMSM_TEST_SCU_EFI")) {
+               dprintf("OROM CAP: %p,  pid: %d pop: %d\n",
+                       cap, (int) getpid(), populated_efi[hba_id]);
+               return imsm_platform_test(hba_id, &populated_efi[hba_id], cap);
+       }
+
+       /* OROM test is set, return that there is no EFI capabilities */
+       if (check_env("IMSM_TEST_OROM"))
+               return NULL;
+
+       /* the SAS controller uses the SCU variable, everything else AHCI */
+       prop = (hba_id == SYS_DEV_SAS) ? SCU_PROP : AHCI_PROP;
+       snprintf(var_path, PATH_MAX, "%s/%s-%s", SYS_EFI_VAR_PATH, prop,
+                guid_str(guid, VENDOR_GUID));
+
+       dprintf("EFI VAR: path=%s\n", var_path);
+
+       fd = open(var_path, O_RDONLY);
+       if (fd < 0) {
+               populated_efi[hba_id] = 0;
+               return NULL;
+       }
+       got = read(fd, cap, sizeof(imsm_efi[0]));
+       close(fd);
+       if (got < (int) (sizeof(imsm_efi[0])))
+               return NULL;
+
+       populated_efi[hba_id] = 1;
+       return cap;
+}
+
+/*
+ * backward interface compatibility:
+ * the argument-less entry point is kept and is answered with the
+ * option-ROM lookup for SYS_DEV_SATA.
+ */
+const struct imsm_orom *find_imsm_orom(void)
+{
+       return find_imsm_hba_orom(SYS_DEV_SATA);
+}
+
+const struct imsm_orom *find_imsm_capability(enum sys_dev_type hba_id)
+{
+       const struct imsm_orom *cap=NULL;
+
+
+       if ((cap = find_imsm_efi(hba_id)) != NULL)
+               return cap;
+       if ((cap = find_imsm_hba_orom(hba_id)) != NULL)
+               return cap;
        return NULL;
 }
 
@@ -212,7 +435,7 @@ char *devt_to_devpath(dev_t dev)
        return canonicalize_file_name(device);
 }
 
-static char *diskfd_to_devpath(int fd)
+char *diskfd_to_devpath(int fd)
 {
        /* return the device path for a disk, return NULL on error or fd
         * refers to a partition
@@ -231,9 +454,14 @@ int path_attached_to_hba(const char *disk_path, const char *hba_path)
 {
        int rc;
 
+       if (check_env("IMSM_TEST_AHCI_DEV") ||
+           check_env("IMSM_TEST_SCU_DEV")) {
+               return 1;
+       }
+
        if (!disk_path || !hba_path)
                return 0;
-
+       dprintf("hba: %s - disk: %s\n", hba_path, disk_path);
        if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0)
                rc = 1;
        else
@@ -263,4 +491,3 @@ int disk_attached_to_hba(int fd, const char *hba_path)
 
        return rc;
 }
-
index 908843618de969bfc86120cecada0513a9d9c389..e24ae37928b3b422664e41faed530e979de05708 100644 (file)
@@ -19,7 +19,7 @@
 #include <asm/types.h>
 #include <strings.h>
 
-/* The IMSM OROM Version Table definition */
+/* The IMSM Capability (IMSM AHCI and ISCU OROM/EFI variable) Version Table definition */
 struct imsm_orom {
        __u8 signature[4];
        __u8 table_ver_major; /* Currently 2 (can change with future revs) */
@@ -58,9 +58,13 @@ struct imsm_orom {
        #define IMSM_OROM_SSS_32MB (1 << 14)
        #define IMSM_OROM_SSS_64MB (1 << 15)
        __u16 dpa; /* Disks Per Array supported */
+       #define IMSM_OROM_DISKS_PER_ARRAY 6
        __u16 tds; /* Total Disks Supported */
+       #define IMSM_OROM_TOTAL_DISKS 6
        __u8 vpa; /* # Volumes Per Array supported */
+       #define IMSM_OROM_VOLUMES_PER_ARRAY 2
        __u8 vphba; /* # Volumes Per Host Bus Adapter supported */
+       #define IMSM_OROM_VOLUMES_PER_HBA 4
        /* Attributes supported. This should map to the
         * attributes in the MPB. Also, lower 16 bits
         * should match/duplicate RLC bits above.
@@ -75,8 +79,20 @@ struct imsm_orom {
        #define IMSM_OROM_ATTR_2TB (1 << 29)
        #define IMSM_OROM_ATTR_PM (1 << 30)
        #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31)
-       __u32 reserved1;
-       __u32 reserved2;
+       __u32 capabilities;
+       #define IMSM_OROM_CAPABILITIES_Ext_SATA (1 << 0)
+       #define IMSM_OROM_CAPABILITIES_TurboMemory (1 << 1)
+       #define IMSM_OROM_CAPABILITIES_HddPassword (1 << 2)
+       #define IMSM_OROM_CAPABILITIES_DiskCoercion (1 << 3)
+       __u32 driver_features;
+       #define IMSM_OROM_CAPABILITIES_HDDUnlock (1 << 0)
+       #define IMSM_OROM_CAPABILITIES_LEDLoc (1 << 1)
+       #define IMSM_OROM_CAPABILITIES_EnterpriseSystem (1 << 2)
+       #define IMSM_OROM_CAPABILITIES_Zpodd (1 << 3)
+       #define IMSM_OROM_CAPABILITIES_LargeDramCache (1 << 4)
+       #define IMSM_OROM_CAPABILITIES_Rohi (1 << 5)
+       #define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6)
+       #define IMSM_OROM_CAPABILITIES_XorHw (1 << 7)
 } __attribute__((packed));
 
 static inline int imsm_orom_has_raid0(const struct imsm_orom *orom)
@@ -115,6 +131,7 @@ static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
        return !!(orom->sss & (1 << (fs - 1)));
 }
 
+
 /**
  * fls - find last (most-significant) bit set
  * @x: the word to search
@@ -164,15 +181,46 @@ static inline int imsm_orom_default_chunk(const struct imsm_orom *orom)
        return min(512, (1 << fs));
 }
 
+
+/* Kind of controller a sys_dev entry describes.  find_driver_devices()
+ * assigns SYS_DEV_SAS for the "isci" driver, SYS_DEV_SATA for "ahci" and
+ * SYS_DEV_UNKNOWN for anything else.  SYS_DEV_MAX is not a device type:
+ * it is used as the size of the per-type capability arrays and as the
+ * upper bound when validating an hba_id.
+ */
+enum sys_dev_type {
+       SYS_DEV_UNKNOWN = 0,
+       SYS_DEV_SAS,
+       SYS_DEV_SATA,
+       SYS_DEV_MAX
+};
+
+
 struct sys_dev {
+       enum sys_dev_type type;
        char *path;
+       char *pci_id;
+       __u16  dev_id;
        struct sys_dev *next;
 };
 
+struct efi_guid {
+       __u8 b[16];
+};
+
+/* Format 'guid' into 'buf' as lower-case hex in the shape
+ * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx and return 'buf'.
+ * The bytes of the first three groups are printed in reverse of their
+ * order in guid.b[]; the remaining bytes are printed in storage order.
+ * 'buf' must have room for 36 characters plus the terminating NUL.
+ */
+static inline char *guid_str(char *buf, struct efi_guid guid)
+{
+       const __u8 *g = guid.b;
+
+       sprintf(buf,
+               "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+               g[3], g[2], g[1], g[0],
+               g[5], g[4],
+               g[7], g[6],
+               g[8], g[9],
+               g[10], g[11], g[12], g[13], g[14], g[15]);
+       return buf;
+}
+
+char *diskfd_to_devpath(int fd);
 struct sys_dev *find_driver_devices(const char *bus, const char *driver);
-__u16 devpath_to_vendor(const char *dev_path);
+struct sys_dev *find_intel_devices(void);
 void free_sys_dev(struct sys_dev **list);
+const struct imsm_orom *find_imsm_capability(enum sys_dev_type hba_id);
 const struct imsm_orom *find_imsm_orom(void);
 int disk_attached_to_hba(int fd, const char *hba_path);
 char *devt_to_devpath(dev_t dev);
 int path_attached_to_hba(const char *disk_path, const char *hba_path);
+const char *get_sys_dev_type(enum sys_dev_type);
+
diff --git a/policy.c b/policy.c
new file mode 100644 (file)
index 0000000..ebb1481
--- /dev/null
+++ b/policy.c
@@ -0,0 +1,902 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <dirent.h>
+#include <fnmatch.h>
+#include <ctype.h>
+#include "dlink.h"
+/*
+ * Policy module for mdadm.
+ * A policy statement about a device lists a set of values for each
+ * of a set of names.  Each value can have a metadata type as context.
+ *
+ * names include:
+ *   action - the actions that can be taken on hot-plug
+ *   domain - the domain(s) that the device is part of
+ *
+ * Policy information is extracted from various sources, but
+ * particularly from a set of policy rules in mdadm.conf
+ */
+
+/* Allocate a new policy entry holding name/val and push it on the
+ * front of the list at *pol.
+ * 'name' and 'val' are stored by pointer, not copied.
+ * 'metadata', if given, is normalised to the matching superlist[] name
+ * ("1", "1.0", "1.1" and "1.2" all map to super1.name); a name that is
+ * not recognised is reported once per distinct pointer and stored as
+ * "unknown".  With no metadata the entry's metadata field is NULL.
+ * If the allocation fails an error is printed and the list is left
+ * unchanged.
+ */
+static void pol_new(struct dev_policy **pol, char *name, const char *val,
+                   const char *metadata)
+{
+       struct dev_policy *n = malloc(sizeof(*n));
+       const char *real_metadata = NULL;
+       int i;
+
+       if (!n) {
+               /* don't dereference a failed allocation */
+               fprintf(stderr, Name ": failed to allocate policy entry\n");
+               return;
+       }
+
+       n->name = name;
+       n->value = val;
+
+       /* We need to normalise the metadata name */
+       if (metadata) {
+               for (i = 0; superlist[i] ; i++)
+                       if (strcmp(metadata, superlist[i]->name) == 0) {
+                               real_metadata = superlist[i]->name;
+                               break;
+                       }
+               if (!real_metadata) {
+                       if (strcmp(metadata, "1") == 0 ||
+                           strcmp(metadata, "1.0") == 0 ||
+                           strcmp(metadata, "1.1") == 0 ||
+                           strcmp(metadata, "1.2") == 0)
+                               real_metadata = super1.name;
+               }
+               if (!real_metadata) {
+                       /* remember the last pointer complained about so
+                        * the same string is not reported repeatedly
+                        */
+                       static const char *prev = NULL;
+                       if (prev != metadata) {
+                               fprintf(stderr, Name ": metadata=%s unrecognised - ignoring rule\n",
+                                       metadata);
+                               prev = metadata;
+                       }
+                       real_metadata = "unknown";
+               }
+       }
+
+       n->metadata = real_metadata;
+       n->next = *pol;
+       *pol = n;
+}
+
+/* Ordering predicate for policy entries: returns 1 when a <= b, else 0.
+ * Entries are compared by name pointer first, then by value string
+ * (strcmp), and finally by metadata pointer.
+ */
+static int pol_lesseq(struct dev_policy *a, struct dev_policy *b)
+{
+       int order;
+
+       if (a->name != b->name)
+               return a->name < b->name;
+
+       order = strcmp(a->value, b->value);
+       if (order != 0)
+               return order < 0;
+
+       return a->metadata <= b->metadata;
+}
+
+static void pol_sort(struct dev_policy **pol)
+{
+       /* sort policy list in *pol by name/value/metadata
+        * (the order in which pol_lesseq() compares entries)
+        * using merge sort.
+        * Each pass of the outer loop merges the two lists pl[0] and
+        * pl[1] and redistributes the result over pl[0] and pl[1];
+        * the loop ends once one of the two lists is empty.
+        */
+
+       struct dev_policy *pl[2];
+       pl[0] = *pol;
+       pl[1] = NULL;
+
+       do {
+               struct dev_policy **plp[2], *p[2];
+               int curr = 0;
+               struct dev_policy nul = { NULL, NULL, NULL, NULL };
+               struct dev_policy *prev = &nul;
+               int next = 0;
+
+               /* p[] are the two lists that we are merging.
+                * plp[] are the ends of the two lists we create
+                * from the merge.
+                * 'curr' is which of plp[] that we are currently
+                *   adding items to.
+                * 'next' is which of p[] we will take the next
+                *   item from.
+                * 'prev' is the last value, which was placed in
+                * plp[curr].
+                */
+               plp[0] = &pl[0];
+               plp[1] = &pl[1];
+               p[0] = pl[0];
+               p[1] = pl[1];
+
+               /* take least of p[0] and p[1]
+                * if it is larger than prev, add to
+                * plp[curr], else swap curr then add
+                */
+               while (p[0] || p[1]) {
+                       if (p[next] == NULL ||
+                           (p[1-next] != NULL &&
+                            !(pol_lesseq(prev, p[1-next])
+                              ^pol_lesseq(prev, p[next])
+                              ^pol_lesseq(p[next], p[1-next])))
+                               )
+                               next = 1 - next;
+
+                       /* the chosen item is smaller than the previous
+                        * one, so continue on the other output list
+                        */
+                       if (!pol_lesseq(prev, p[next]))
+                               curr = 1 - curr;
+
+                       *plp[curr] = prev = p[next];
+                       plp[curr] = &p[next]->next;
+                       p[next] = p[next]->next;
+               }
+               /* terminate both output lists */
+               *plp[0] = NULL;
+               *plp[1] = NULL;
+       } while (pl[0] && pl[1]);
+       /* whichever list is non-empty holds the result */
+       if (pl[0])
+               *pol = pl[0];
+       else
+               *pol = pl[1];
+}
+
+static void pol_dedup(struct dev_policy *pol)
+{
+       /* The list is sorted, so an entry that is not greater than its
+        * predecessor duplicates it: unlink it and free it.  The head
+        * entry is never removed.
+        */
+       struct dev_policy *dup;
+
+       if (!pol)
+               return;
+
+       while (pol->next) {
+               if (!pol_lesseq(pol->next, pol)) {
+                       pol = pol->next;
+                       continue;
+               }
+               dup = pol->next;
+               pol->next = dup->next;
+               free(dup);
+       }
+}
+
+/*
+ * pol_find returns the first entry of the policy list whose name
+ * is 'name', or NULL if there is none.  Names are compared as
+ * pointers and the scan stops at the first entry whose name
+ * pointer is not below 'name'.
+ * A non-NULL result means at least one value exists; the caller
+ * has to walk the list to find out how many.
+ */
+struct dev_policy *pol_find(struct dev_policy *pol, char *name)
+{
+       for (; pol && pol->name < name; pol = pol->next)
+               ;
+
+       return (pol && pol->name == name) ? pol : NULL;
+}
+
+/* Find the /dev/disk/by-path/ symlink that resolves to the block device
+ * disk->disk.major:disk->disk.minor.
+ * Returns a strdup()ed copy of the link's name (without the directory),
+ * which the caller must free, or NULL if the directory cannot be opened
+ * or no entry matches.
+ */
+static char *disk_path(struct mdinfo *disk)
+{
+       struct stat stb;
+       int prefix_len;
+       DIR *by_path;
+       char symlink[PATH_MAX] = "/dev/disk/by-path/";
+       char *result = NULL;
+       struct dirent *ent;
+
+       by_path = opendir(symlink);
+       if (!by_path)
+               return NULL;
+       prefix_len = strlen(symlink);
+
+       while ((ent = readdir(by_path)) != NULL) {
+               if (ent->d_type != DT_LNK)
+                       continue;
+               /* snprintf always NUL-terminates, unlike strncpy */
+               snprintf(symlink + prefix_len, sizeof(symlink) - prefix_len,
+                        "%s", ent->d_name);
+               /* stat() follows the link, so stb describes its target */
+               if (stat(symlink, &stb) < 0)
+                       continue;
+               if ((stb.st_mode & S_IFMT) != S_IFBLK)
+                       continue;
+               if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor))
+                       continue;
+               /* copy the name before closedir(): 'ent' belongs to the
+                * directory stream and must not be used once it is closed
+                */
+               result = strdup(ent->d_name);
+               break;
+       }
+       closedir(by_path);
+       return result;
+}
+
+char type_part[] = "part";
+char type_disk[] = "disk";
+/* Classify 'disk': returns type_part when
+ * /sys/dev/block/<major>:<minor>/partition can be stat()ed,
+ * otherwise type_disk.
+ */
+static char *disk_type(struct mdinfo *disk)
+{
+       struct stat st;
+       char attr[30+20+20];
+
+       sprintf(attr, "/sys/dev/block/%d:%d/partition",
+               disk->disk.major, disk->disk.minor);
+
+       return stat(attr, &st) == 0 ? type_part : type_disk;
+}
+
+static int pol_match(struct rule *rule, char *path, char *type)
+{
+       /* Decide whether this rule applies to the given path and type.
+        * For each of the two criteria the state is:
+        *    0  the rule has no such entry (no constraint)
+        *    1  at least one entry matched
+        *   -1  entries exist but none has matched
+        * path entries are fnmatch() patterns, type entries are
+        * compared with strcmp().  The rule matches unless one of
+        * the criteria ended up at -1.
+        */
+       int path_state = 0;
+       int type_state = 0;
+
+       for (; rule; rule = rule->next) {
+               if (rule->name == rule_path) {
+                       if (fnmatch(rule->value, path, 0) == 0)
+                               path_state = 1;
+                       else if (path_state == 0)
+                               path_state = -1;
+               }
+               if (rule->name == rule_type) {
+                       if (strcmp(rule->value, type) == 0)
+                               type_state = 1;
+                       else if (type_state == 0)
+                               type_state = -1;
+               }
+       }
+       return !(path_state < 0 || type_state < 0);
+}
+
+static void pol_merge(struct dev_policy **pol, struct rule *rule)
+{
+       /* Add a policy entry to *pol for every action and domain
+        * assignment in 'rule'.  Every entry gets the rule's metadata
+        * value; if the rule carries several, the last one is used.
+        */
+       struct rule *cur;
+       char *meta = NULL;
+
+       cur = rule;
+       while (cur) {
+               if (cur->name == pol_metadata)
+                       meta = cur->value;
+               cur = cur->next;
+       }
+
+       for (cur = rule; cur; cur = cur->next) {
+               if (cur->name != pol_act && cur->name != pol_domain)
+                       continue;
+               pol_new(pol, cur->name, cur->value, meta);
+       }
+}
+
+static int path_has_part(char *path, char **part)
+{
+       /* check if path ends with "-partNN" and
+        * if it does, place a pointer to "-partNN"
+        * in 'part' and return 1.  Otherwise return 0
+        * and leave 'part' untouched.
+        * The pointer must address the '-' itself: path_policy()
+        * overwrites *part with NUL to obtain the whole-disk path,
+        * passes part+1 on as the "partNN" suffix and then restores
+        * the '-'.
+        */
+       int l = strlen(path);
+       /* skip the trailing digits; cast keeps isdigit() well defined
+        * for chars with the high bit set
+        */
+       while (l > 1 && isdigit((unsigned char)path[l-1]))
+               l--;
+       if (l < 5 || strncmp(path+l-5, "-part", 5) != 0)
+               return 0;
+       *part = path+l-5;
+       return 1;
+}
+
+static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part)
+{
+       /* copy any name assignments from rule into pol, appending
+        * -part to any domain.  The string with -part appended is
+        * stored with the rule so it has a lifetime to match
+        * the rule.
+        * Action assignments are copied unchanged.
+        */
+       struct rule *r;
+       char *metadata = NULL;
+       /* the last metadata entry in the rule applies to every entry added */
+       for (r = rule; r ; r = r->next)
+               if (r->name == pol_metadata)
+                       metadata = r->value;
+
+       for (r = rule; r ; r = r->next) {
+               if (r->name == pol_act)
+                       pol_new(pol, r->name, r->value, metadata);
+               else if (r->name == pol_domain) {
+                       char *dom;
+                       int len;
+                       /* r->dups caches the "<value>-<part>" strings
+                        * already built for this rule entry
+                        */
+                       if (r->dups == NULL)
+                               r->dups = dl_head();
+                       len = strlen(r->value);
+                       /* every cached string starts with r->value and
+                        * a '-', so the part suffix is at offset len+1
+                        */
+                       for (dom = dl_next(r->dups); dom != r->dups;
+                            dom = dl_next(dom))
+                               if (strcmp(dom+len+1, part)== 0)
+                                       break;
+                       if (dom == r->dups) {
+                               /* not cached yet: build and remember it.
+                                * NOTE(review): presumably dl_strndup()
+                                * copies r->value into a buffer with room
+                                * for the given length plus NUL - confirm
+                                * against dlink.c
+                                */
+                               char *newdom = dl_strndup(
+                                       r->value, len + 1 + strlen(part));
+                               strcat(strcat(newdom, "-"), part);
+                               dl_add(r->dups, newdom);
+                               dom = newdom;
+                       }
+                       pol_new(pol, r->name, dom, metadata);
+               }
+       }
+}
+
+/* rule sets collected by policyline() and policy_add(); both push new
+ * entries onto the front of this list
+ */
+static struct pol_rule *config_rules = NULL;
+/* set to &config_rules by policyline() and cleared by policy_free();
+ * not otherwise used by the functions visible in this part of the file
+ */
+static struct pol_rule **config_rules_end = NULL;
+/* non-zero once policyline() has accepted a "path=" rule; disk_policy()
+ * only looks up a device path when this is set
+ */
+static int config_rules_has_path = 0;
+
+/*
+ * Most policy comes from a set of policy rules that are
+ * read from the config file.
+ * path_policy() gathers the policy information that applies to
+ * the device identified by the given 'path' and 'type'.
+ * Returns NULL when 'type' is NULL; otherwise returns the list that
+ * was built, after it has been passed through pol_sort() and
+ * pol_dedup().
+ */
+struct dev_policy *path_policy(char *path, char *type)
+{
+       struct dev_policy *result = NULL;
+       struct pol_rule *pr;
+       int idx;
+
+       if (type == NULL)
+               return NULL;
+
+       for (pr = config_rules; pr != NULL; pr = pr->next) {
+               char *suffix;
+
+               if (pr->type == rule_policy &&
+                   pol_match(pr->rule, path, type))
+                       pol_merge(&result, pr->rule);
+
+               /* part-policy rules only apply to partitions whose
+                * path carries a partition suffix
+                */
+               if (pr->type != rule_part || strcmp(type, type_part) != 0)
+                       continue;
+               if (!path_has_part(path, &suffix))
+                       continue;
+
+               /* terminate the path at 'suffix' while matching it as a
+                * disk, then write the '-' back
+                */
+               *suffix = 0;
+               if (pol_match(pr->rule, path, type_disk))
+                       pol_merge_part(&result, pr->rule, suffix+1);
+               *suffix = '-';
+       }
+
+       /* Now add any metadata-specific internal knowledge
+        * about this path
+        */
+       for (idx = 0; superlist[idx] != NULL; idx++) {
+               const char *dom;
+
+               if (!superlist[idx]->get_disk_controller_domain)
+                       continue;
+               dom = superlist[idx]->get_disk_controller_domain(path);
+               if (dom)
+                       pol_new(&result, pol_domain, dom, superlist[idx]->name);
+       }
+
+       pol_sort(&result);
+       pol_dedup(result);
+       return result;
+}
+
+void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata)
+{
+       /* Add one name/value entry (tagged with 'metadata') to '*pol',
+        * then re-run pol_sort() and pol_dedup() on the list.
+        */
+       pol_new(pol, name, val, metadata);
+       pol_sort(pol);
+       pol_dedup(*pol);
+}
+
+
+/*
+ * disk_policy() gathers policy information for the
+ * disk described in the given mdinfo (disk.{major,minor}).
+ * NULL is returned when the device type or path cannot be
+ * determined, or when no configured rule mentions a path.
+ */
+struct dev_policy *disk_policy(struct mdinfo *disk)
+{
+       struct dev_policy *result;
+       char *devpath;
+       char *devtype = disk_type(disk);
+
+       if (devtype == NULL)
+               return NULL;
+       /* the path is only looked up when some rule refers to one */
+       if (!config_rules_has_path)
+               return NULL;
+       devpath = disk_path(disk);
+       if (devpath == NULL)
+               return NULL;
+
+       result = path_policy(devpath, devtype);
+
+       free(devpath);
+       return result;
+}
+
+struct dev_policy *devnum_policy(int dev)
+{
+       /* Wrap a device number in a scratch mdinfo and hand it to
+        * disk_policy(); only disk.major and disk.minor are filled in.
+        */
+       struct mdinfo scratch;
+
+       scratch.disk.major = major(dev);
+       scratch.disk.minor = minor(dev);
+
+       return disk_policy(&scratch);
+}
+
+/*
+ * process policy rules read from config file.
+ */
+
+/* Keyword strings.  Rule names and rule-set types are stored as pointers
+ * to these arrays and are compared by address (e.g. r->name == pol_act),
+ * so a name must always be one of these objects, never a copy.
+ */
+char rule_path[] = "path";
+char rule_type[] = "type";
+
+/* values used for pol_rule.type */
+char rule_policy[] = "policy";
+char rule_part[] = "part-policy";
+
+/* further "key=value" keys accepted by policyline() */
+char pol_metadata[] = "metadata";
+char pol_act[] = "action";
+char pol_domain[] = "domain";
+char pol_auto[] = "auto";
+
+static int try_rule(char *w, char *name, struct rule **rp)
+{
+       /* If word 'w' has the form "<name>=<value>", push a new rule
+        * holding 'name' and a private copy of <value> onto the front
+        * of '*rp' and return 1.  Otherwise change nothing and return 0.
+        */
+       int keylen = strlen(name);
+       struct rule *fresh;
+
+       if (strncmp(w, name, keylen) == 0 && w[keylen] == '=') {
+               fresh = malloc(sizeof(*fresh));
+               fresh->name = name;
+               fresh->value = strdup(w + keylen + 1);
+               fresh->dups = NULL;
+               fresh->next = *rp;
+               *rp = fresh;
+               return 1;
+       }
+       return 0;
+}
+
+void policyline(char *line, char *type)
+{
+       /* Build a pol_rule of the given 'type' from the words of
+        * 'line' (walked with dl_next()) and push it onto the front of
+        * config_rules.  Words that are not a recognised "key=value"
+        * are reported on stderr and skipped.
+        */
+       struct pol_rule *entry;
+       char *word;
+
+       if (!config_rules_end)
+               config_rules_end = &config_rules;
+
+       entry = malloc(sizeof(*entry));
+       entry->rule = NULL;
+       entry->type = type;
+
+       for (word = dl_next(line); word != line; word = dl_next(word)) {
+               if (try_rule(word, rule_path, &entry->rule)) {
+                       /* remember that device paths are now relevant */
+                       config_rules_has_path = 1;
+                       continue;
+               }
+               if (try_rule(word, rule_type, &entry->rule) ||
+                   try_rule(word, pol_metadata, &entry->rule) ||
+                   try_rule(word, pol_act, &entry->rule) ||
+                   try_rule(word, pol_domain, &entry->rule) ||
+                   try_rule(word, pol_auto, &entry->rule))
+                       continue;
+               fprintf(stderr, Name ": policy rule %s unrecognised and ignored\n",
+                       word);
+       }
+
+       entry->next = config_rules;
+       config_rules = entry;
+}
+
+void policy_add(char *type, ...)
+{
+       /* Create a rule set of the given 'type' from the variable
+        * arguments and push it onto the front of config_rules.
+        * The arguments are (char *name, char *value) pairs, ended by
+        * a NULL name.  'name' is stored as given (it is later compared
+        * by address), 'value' is copied with strdup().
+        */
+       va_list ap;
+       struct pol_rule *pr;
+       char *name, *val;
+
+       pr = malloc(sizeof(*pr));
+       pr->type = type;
+       pr->rule = NULL;
+
+       va_start(ap, type);
+       while ((name = va_arg(ap, char*)) != NULL) {
+               struct rule *r;
+
+               val = va_arg(ap, char*);
+               r = malloc(sizeof(*r));
+               r->next = pr->rule;
+               r->name = name;
+               r->value = strdup(val);
+               r->dups = NULL;
+               pr->rule = r;
+       }
+       /* every va_start() must be paired with va_end() before return */
+       va_end(ap);
+
+       pr->next = config_rules;
+       config_rules = pr;
+}
+
+void policy_free(void)
+{
+       /* Release every rule set on config_rules, including the copied
+        * values and any strings cached on a rule's 'dups' list, then
+        * reset the file-level state to its initial values.
+        */
+       struct pol_rule *pr;
+
+       while ((pr = config_rules) != NULL) {
+               struct rule *r = pr->rule;
+
+               config_rules = pr->next;
+
+               while (r != NULL) {
+                       struct rule *dead = r;
+
+                       r = r->next;
+                       free(dead->value);
+                       if (dead->dups)
+                               free_line(dead->dups);
+                       free(dead);
+               }
+               free(pr);
+       }
+       config_rules_has_path = 0;
+       config_rules_end = NULL;
+}
+
+void dev_policy_free(struct dev_policy *p)
+{
+       /* free each node of the list; the strings the nodes point to
+        * are left alone
+        */
+       while (p != NULL) {
+               struct dev_policy *following = p->next;
+
+               free(p);
+               p = following;
+       }
+}
+
+static enum policy_action map_act(const char *act)
+{
+       /* Translate the text of an "action" value into the matching
+        * enum policy_action; unrecognised text yields act_err.
+        */
+       static const struct {
+               const char *text;
+               enum policy_action code;
+       } known[] = {
+               { "include",            act_include },
+               { "re-add",             act_re_add },
+               { "spare",              act_spare },
+               { "spare-same-slot",    act_spare_same_slot },
+               { "force-spare",        act_force_spare },
+       };
+       unsigned int k;
+
+       for (k = 0; k < sizeof(known) / sizeof(known[0]); k++)
+               if (strcmp(act, known[k].text) == 0)
+                       return known[k].code;
+       return act_err;
+}
+
+static enum policy_action policy_action(struct dev_policy *plist, const char *metadata)
+{
+       /* Return the largest action found among the "action" entries
+        * that pol_for_each() selects for 'metadata', or act_default
+        * when none of them is larger than that.
+        */
+       struct dev_policy *ent;
+       enum policy_action best = act_default;
+
+       plist = pol_find(plist, pol_act);
+       pol_for_each(ent, plist, metadata) {
+               enum policy_action cand = map_act(ent->value);
+
+               best = (cand > best) ? cand : best;
+       }
+       return best;
+}
+
+int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want)
+{
+       /* 1 if the action configured for 'metadata' is at least 'want',
+        * 0 if it is lower or is act_err.
+        */
+       enum policy_action got = policy_action(plist, metadata);
+
+       return got != act_err && got >= want;
+}
+
+int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want)
+{
+       /* Same test as policy_action_allows(), applied to the policy
+        * gathered for 'disk'; the temporary policy list is freed
+        * before returning.
+        */
+       struct dev_policy *devpol = disk_policy(disk);
+       int allowed;
+
+       allowed = policy_action_allows(devpol, metadata, want);
+       dev_policy_free(devpol);
+
+       return allowed;
+}
+
+
+/* Domain policy:
+ * Any device can have a list of domains asserted by different policy
+ * statements.
+ * An array also has a list of domains comprising all the domains of
+ * all the devices in an array.
+ * Where an array has a spare-group, that becomes an additional domain for
+ * every device in the array and thus for the array.
+ *
+ * We keep the list of domains in a sorted linked list
+ * As dev policies are already sorted, this is fairly easy to manage.
+ */
+
+static struct domainlist **domain_merge_one(struct domainlist **domp,
+                                           const char *domain)
+{
+       /* Make sure 'domain' is present in the sorted list starting at
+        * '*domp' and return the link that points at its node (either
+        * the existing one or the one just inserted).  The new node
+        * stores the 'domain' pointer itself, not a copy.
+        */
+       struct domainlist *cur;
+
+       for (cur = *domp; cur != NULL; cur = *domp) {
+               int order = strcmp(cur->dom, domain);
+
+               if (order == 0)
+                       return domp;    /* already listed */
+               if (order > 0)
+                       break;          /* insert in front of 'cur' */
+               domp = &cur->next;
+       }
+
+       cur = malloc(sizeof(*cur));
+       cur->next = *domp;
+       cur->dom = domain;
+       *domp = cur;
+       return domp;
+}
+
+#if (DEBUG)
+/* debugging aid: print every entry of a policy list via dprintf() */
+void dump_policy(struct dev_policy *policy)
+{
+       struct dev_policy *ent;
+
+       for (ent = policy; ent != NULL; ent = ent->next)
+               dprintf("policy: %p name: %s value: %s metadata: %s\n",
+                       ent, ent->name, ent->value, ent->metadata);
+}
+#endif
+
+void domain_merge(struct domainlist **domp, struct dev_policy *pollist,
+                        const char *metadata)
+{
+       /* Merge into 'domp' each "domain" value of 'pollist' that
+        * pol_for_each() selects for 'metadata'; values that are
+        * already listed are not added again.
+        */
+       struct dev_policy *ent;
+
+       pollist = pol_find(pollist, pol_domain);
+       pol_for_each(ent, pollist, metadata) {
+               domain_merge_one(domp, ent->value);
+       }
+}
+
+int domain_test(struct domainlist *dom, struct dev_policy *pol,
+               const char *metadata)
+{
+       /* Check that every domain in pol (for metadata) also appears
+        * in dom.  Both lists are sorted, so dom is only ever walked
+        * forwards.
+        * If pol has no domains we know nothing about the device and
+        * leave the decision to the caller:
+        * -1:  has no domains
+        *  0:  has domains, not all match
+        *  1:  has domains, all match
+        */
+       struct dev_policy *ent;
+       int verdict = -1;
+
+       pol = pol_find(pol, pol_domain);
+       pol_for_each(ent, pol, metadata) {
+               verdict = 1;
+               /* skip list entries that sort before this domain */
+               for (; dom != NULL; dom = dom->next)
+                       if (strcmp(dom->dom, ent->value) >= 0)
+                               break;
+               if (dom == NULL || strcmp(dom->dom, ent->value) != 0)
+                       return 0;
+       }
+       return verdict;
+}
+
+void domainlist_add_dev(struct domainlist **dom, int devnum, const char *metadata)
+{
+       /* merge the domains of device 'devnum' (for 'metadata') into
+        * '*dom'; the policy list used on the way is freed again
+        */
+       struct dev_policy *devpol;
+
+       devpol = devnum_policy(devnum);
+       domain_merge(dom, devpol, metadata);
+       dev_policy_free(devpol);
+}
+
+struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata)
+{
+       /* Build the domain list of an array by merging the domains of
+        * each device on mdi->devs.  The result is NULL when nothing
+        * was added.
+        */
+       struct domainlist *collected = NULL;
+       struct mdinfo *dev;
+
+       for (dev = mdi->devs; dev != NULL; dev = dev->next) {
+               int devnum = makedev(dev->disk.major, dev->disk.minor);
+
+               domainlist_add_dev(&collected, devnum, metadata);
+       }
+
+       return collected;
+}
+
+void domain_add(struct domainlist **domp, char *domain)
+{
+       /* public wrapper around domain_merge_one(); the returned
+        * insertion point is not needed here
+        */
+       (void)domain_merge_one(domp, domain);
+}
+
+
+void domain_free(struct domainlist *dl)
+{
+       /* free the list nodes only; the 'dom' strings are not freed */
+       struct domainlist *following;
+
+       for (; dl != NULL; dl = following) {
+               following = dl->next;
+               free(dl);
+       }
+}
+
+/*
+ * same-path policy.
+ * Some policy decisions are guided by knowledge of which
+ * array previously owned the device at a given physical location (path).
+ * When removing a device from an array we might record the array against
+ * the path, and when finding a new device, we might look for which
+ * array previously used that path.
+ *
+ * The 'array' is described by a map_ent, and the path by the disk in an
+ * mdinfo, or by a string.
+ */
+
+void policy_save_path(char *id_path, struct map_ent *array)
+{
+       /* Record the metadata name and UUID of 'array' in a file named
+        * after 'id_path' inside FAILED_SLOTS_DIR, creating that
+        * directory if needed.  Failures are reported on stderr;
+        * nothing is returned to the caller.
+        */
+       char fname[PATH_MAX];
+       FILE *fp;
+       int written;
+
+       if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) {
+               fprintf(stderr, Name ": can't create file to save path "
+                       "to old disk: %s\n", strerror(errno));
+               return;
+       }
+
+       snprintf(fname, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path);
+       fp = fopen(fname, "w");
+       if (fp == NULL) {
+               fprintf(stderr, Name ": can't create file to"
+                       " save path to old disk: %s\n",
+                       strerror(errno));
+               return;
+       }
+
+       written = fprintf(fp, "%s %08x:%08x:%08x:%08x\n",
+                         array->metadata,
+                         array->uuid[0], array->uuid[1],
+                         array->uuid[2], array->uuid[3]);
+       if (written <= 0)
+               fprintf(stderr, Name ": Failed to write to "
+                       "<id_path> cookie\n");
+
+       fclose(fp);
+}
+
+int policy_check_path(struct mdinfo *disk, struct map_ent *array)
+{
+       /* Look in FAILED_SLOTS_DIR for a record (in the format written
+        * by policy_save_path()) named after the path of 'disk'.
+        * If it exists and parses completely, array->metadata and
+        * array->uuid[0..3] are filled in and 1 is returned;
+        * otherwise 0 is returned.
+        */
+       char path[PATH_MAX];
+       FILE *f;
+       char *id_path = disk_path(disk);
+       int rv;
+
+       if (!id_path)
+               return 0;
+
+       snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path);
+       /* disk_path() returns an allocated string (disk_policy() frees
+        * its result as well); it is not needed once 'path' is built,
+        * so release it here to cover every return below.
+        */
+       free(id_path);
+
+       f = fopen(path, "r");
+       if (!f)
+               return 0;
+
+       /* NOTE(review): %s is unbounded - confirm array->metadata is
+        * large enough for anything policy_save_path() may have written.
+        */
+       rv = fscanf(f, " %s %x:%x:%x:%x\n",
+                   array->metadata,
+                   array->uuid,
+                   array->uuid+1,
+                   array->uuid+2,
+                   array->uuid+3);
+       fclose(f);
+       return rv == 5;
+}
+
+/* invocation of udev rule file:
+ * fixed header text that Write_rules() emits before the generated rules
+ */
+char udev_template_start[] =
+"# do not edit this file, it is automatically generated by mdadm\n"
+"\n";
+
+/* Find the first rule whose name is 'rule_type' and return its value,
+ * or NULL if there is none.  Names are compared by address, so
+ * 'rule_type' must be one of the keyword arrays themselves.
+ */
+char *find_rule(struct rule *rule, char *rule_type)
+{
+       struct rule *r;
+
+       for (r = rule; r != NULL; r = r->next)
+               if (r->name == rule_type)
+                       return r->value;
+       return NULL;
+}
+
+/* printf format of one generated udev rule; arguments are the
+ * DEVTYPE value and the ID_PATH value, in that order
+ */
+#define UDEV_RULE_FORMAT \
+"ACTION==\"add\", SUBSYSTEM==\"block\", " \
+"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \
+"RUN+=\"/sbin/mdadm --incremental $env{DEVNAME}\"\n"
+
+/* same rule without the DEVTYPE match; the only argument is the
+ * ID_PATH value
+ */
+#define UDEV_RULE_FORMAT_NOTYPE \
+"ACTION==\"add\", SUBSYSTEM==\"block\", " \
+"ENV{ID_PATH}==\"%s\", " \
+"RUN+=\"/sbin/mdadm --incremental $env{DEVNAME}\"\n"
+
+/* Format one udev rule for 'rule' (see UDEV_RULE_FORMAT) and write it
+ * to 'fd'.  With 'force_part' set the rule's own type is replaced by
+ * type_part.
+ * Returns -1 if the rule has no path, 1 if the whole line was written
+ * and 0 if it was not.
+ */
+int write_rule(struct rule *rule, int fd, int force_part)
+{
+       char buf[1024];
+       char *path_val = find_rule(rule, rule_path);
+       char *type_val = find_rule(rule, rule_type);
+       size_t want;
+
+       if (path_val == NULL)
+               return -1;
+
+       if (force_part)
+               type_val = type_part;
+
+       if (type_val == NULL)
+               snprintf(buf, sizeof(buf) - 1, UDEV_RULE_FORMAT_NOTYPE, path_val);
+       else
+               snprintf(buf, sizeof(buf) - 1, UDEV_RULE_FORMAT, type_val, path_val);
+
+       want = strlen(buf);
+       return write(fd, buf, want) == (int)want;
+}
+
+/* Return the "path" value of 'pr' if it can produce a udev entry:
+ * it must be a policy or part-policy rule set, carry an action of at
+ * least act_spare_same_slot, and have a path.  NULL otherwise.
+ */
+static char *udev_rule_path(struct pol_rule *pr)
+{
+       char *act;
+
+       if (pr->type != rule_policy && pr->type != rule_part)
+               return NULL;
+       act = find_rule(pr->rule, pol_act);
+       if (!act || map_act(act) < act_spare_same_slot)
+               return NULL;
+       return find_rule(pr->rule, rule_path);
+}
+
+/* Generate the udev rule entries for the POLICY lines found in the
+ * config file.  Only rule sets with a path and with an action that
+ * supports handling of spares (>=act_spare_same_slot) are used, and
+ * when several of them share a path only the first in config_rules
+ * is written.
+ * Returns 1 on success, 0 as soon as writing an entry fails.
+ */
+int generate_entries(int fd)
+{
+       struct pol_rule *loop, *dup;
+
+       for (loop = config_rules; loop; loop = loop->next) {
+               char *loop_path = udev_rule_path(loop);
+               int duplicate = 0;
+
+               if (!loop_path)
+                       continue;
+
+               /* an earlier qualifying rule set with the same path has
+                * already produced the entry; qualification is decided
+                * by the earlier entry's own type, not by 'loop'
+                */
+               for (dup = config_rules; dup != loop; dup = dup->next) {
+                       char *dup_path = udev_rule_path(dup);
+
+                       if (dup_path && strcmp(loop_path, dup_path) == 0) {
+                               duplicate = 1;
+                               break;
+                       }
+               }
+
+               /* not a dup or first occurrence */
+               if (!duplicate &&
+                   !write_rule(loop->rule, fd, loop->type == rule_part))
+                       return 0;
+       }
+       return 1;
+}
+
+/* Write_rules routine creates dynamic udev rules used to handle
+ * hot-plug events for bare devices (and making them spares).
+ *
+ * With a 'rule_name' the rules are written to "<rule_name>.temp",
+ * which is renamed to 'rule_name' once complete and removed again on
+ * failure.  With a NULL 'rule_name' they are written to file
+ * descriptor 1.
+ * Returns 0 on success, 1 on any failure.
+ */
+int Write_rules(char *rule_name)
+{
+       int fd;
+       char udev_rule_file[PATH_MAX];
+
+       if (rule_name) {
+               /* refuse names that do not fit instead of overflowing */
+               int n = snprintf(udev_rule_file, sizeof(udev_rule_file),
+                                "%s.temp", rule_name);
+               if (n < 0 || n >= (int)sizeof(udev_rule_file))
+                       return 1;
+               fd = creat(udev_rule_file,
+                          S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+               if (fd == -1)
+                       return 1;
+       } else
+               fd = 1;
+
+       /* write static invocation */
+       if (write(fd, udev_template_start,
+                 sizeof(udev_template_start) - 1)
+           != (int)sizeof(udev_template_start)-1)
+               goto abort;
+
+       /* generate_entries() reports failure by returning 0, so test
+        * for anything that is not a positive result
+        */
+       if (generate_entries(fd) <= 0)
+               goto abort;
+
+       fsync(fd);
+       if (rule_name) {
+               close(fd);
+               /* do not claim success if the file never reached its
+                * final name
+                */
+               if (rename(udev_rule_file, rule_name) != 0) {
+                       unlink(udev_rule_file);
+                       return 1;
+               }
+       }
+       return 0;
+abort:
+       if (rule_name) {
+               close(fd);
+               unlink(udev_rule_file);
+       }
+       return 1;
+}
index a8e3a5896f76f6a5d3fb85c9ffd6498e8b253ea3..6a0b98d83ad6cb4c1e05477d6ea28a84831436f0 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include "probe_roms.h"
+#include "mdadm.h"
 #include <unistd.h>
 #include <signal.h>
 #include <fcntl.h>
@@ -130,50 +131,60 @@ static void *isa_bus_to_virt(unsigned long addr)
 struct resource {
        unsigned long start;
        unsigned long end;
+       unsigned long data;
        const char *name;
 };
 
 static struct resource system_rom_resource = {
        .name   = "System ROM",
        .start  = 0xf0000,
+       .data   = 0,
        .end    = 0xfffff,
 };
 
 static struct resource extension_rom_resource = {
        .name   = "Extension ROM",
        .start  = 0xe0000,
+       .data   = 0,
        .end    = 0xeffff,
 };
 
 static struct resource adapter_rom_resources[] = { {
        .name   = "Adapter ROM",
        .start  = 0xc8000,
+       .data   = 0,
        .end    = 0,
 }, {
        .name   = "Adapter ROM",
        .start  = 0,
+       .data   = 0,
        .end    = 0,
 }, {
        .name   = "Adapter ROM",
        .start  = 0,
+       .data   = 0,
        .end    = 0,
 }, {
        .name   = "Adapter ROM",
        .start  = 0,
+       .data   = 0,
        .end    = 0,
 }, {
        .name   = "Adapter ROM",
        .start  = 0,
+       .data   = 0,
        .end    = 0,
 }, {
        .name   = "Adapter ROM",
        .start  = 0,
+       .data   = 0,
        .end    = 0,
 } };
 
 static struct resource video_rom_resource = {
        .name   = "Video ROM",
        .start  = 0xc0000,
+       .data   = 0,
        .end    = 0xc7fff,
 };
 
@@ -211,7 +222,8 @@ int scan_adapter_roms(scan_fn fn)
 
                if (res->start) {
                        found = fn(isa_bus_to_virt(res->start),
-                                  isa_bus_to_virt(res->end));
+                                  isa_bus_to_virt(res->end),
+                                  isa_bus_to_virt(res->data));
                        if (found)
                                break;
                } else
@@ -232,6 +244,7 @@ void probe_roms(void)
        unsigned long start, length, upper;
        unsigned char c;
        unsigned int i;
+       __u16 val=0;
 
        if (rom_fd < 0)
                return;
@@ -284,14 +297,23 @@ void probe_roms(void)
                /* 0 < length <= 0x7f * 512, historically */
                length = c * 512;
 
+               /* Retrieve 16-bit pointer to PCI Data Structure (offset 18h-19h)
+                * The data can be within 64KB forward of the first location
+                * of this code image. The pointer is in little-endian order
+                */
+
+               if (probe_address16(rom + 0x18, &val) != 0)
+                       continue;
+               val = __le16_to_cpu(val);
+
                /* but accept any length that fits if checksum okay */
                if (!length || start + length > upper || !romchecksum(rom, length))
                        continue;
 
                adapter_rom_resources[i].start = start;
+               adapter_rom_resources[i].data = start + (unsigned long) val;
                adapter_rom_resources[i].end = start + length - 1;
 
                start = adapter_rom_resources[i++].end & ~(rom_align - 1);
        }
 }
-
index a1e291a10cceaf989553871473eae1677ac83b07..6d70411a3d1840b087126ffff2fdd984ff382fdd 100644 (file)
@@ -19,6 +19,6 @@
 
 void probe_roms_exit(void);
 int probe_roms_init(unsigned long align);
-typedef int (*scan_fn)(const void *start, const void *end);
+typedef int (*scan_fn)(const void *start, const void *end, const void *data);
 int scan_adapter_roms(scan_fn fn);
 void probe_roms(void);
index 3074693bd2c3f8f36faa4f63ed76a01cacaf07d1..a26f9e52f134a0e87daf1f77f5562329f6fd173e 100644 (file)
@@ -43,6 +43,11 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks,
         */
        int pd;
 
+       /* layout is not relevant for raid0 and raid4 */
+       if ((level == 0) ||
+           (level == 4))
+               layout = 0;
+
        switch(level*100 + layout) {
        case 000:
        case 400:
@@ -280,10 +285,13 @@ uint8_t raid6_gfmul[256][256];
 uint8_t raid6_gfexp[256];
 uint8_t raid6_gfinv[256];
 uint8_t raid6_gfexi[256];
+uint8_t raid6_gflog[256];
+uint8_t raid6_gfilog[256];
 void make_tables(void)
 {
        int i, j;
        uint8_t v;
+       uint32_t b, log;
 
        /* Compute multiplication table */
        for (i = 0; i < 256; i++)
@@ -307,6 +315,21 @@ void make_tables(void)
        for (i = 0; i < 256; i ++)
                raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
 
+       /* Compute log and inverse log */
+       /* Modified code from:
+        *    http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
+        */
+       b = 1;
+       raid6_gflog[0] = 0;
+       raid6_gfilog[255] = 0;
+
+       for (log = 0; log < 255; log++) {
+               raid6_gflog[b] = (uint8_t) log;
+               raid6_gfilog[log] = (uint8_t) b;
+               b = b << 1;
+               if (b & 256) b = b ^ 0435;
+       }
+
        tables_ready = 1;
 }
 
@@ -382,6 +405,67 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs)
        }
 }
 
+/* Try to find out if a specific disk has a problem.
+ *
+ * 'p' and 'q' are compared byte by byte, over 'chunk_size' bytes,
+ * with stripes[diskP] and stripes[diskQ] (test_stripes() passes the
+ * freshly computed P and Q as 'p' and 'q').  For each byte, with
+ *   Px = stripes[diskP][i] ^ p[i]   and   Qx = stripes[diskQ][i] ^ q[i]:
+ *   only Px non-zero: diskP is suspect
+ *   only Qx non-zero: diskQ is suspect
+ *   both non-zero:    the data block numbered
+ *                     raid6_gflog[Qx] - raid6_gflog[Px] (mod 255)
+ *                     is suspect; geo_map() turns it into a disk
+ *
+ * Returns the suspect disk when all mismatching bytes agree on it,
+ * -1 when no byte mismatches, and -2 when the bytes point at
+ * different disks or at a disk number >= data_disks + 2.
+ */
+int raid6_check_disks(int data_disks, int start, int chunk_size,
+                     int level, int layout, int diskP, int diskQ,
+                     char *p, char *q, char **stripes)
+{
+       int i;
+       int data_id, diskD;
+       uint8_t Px, Qx;
+       int curr_broken_disk = -1;
+       int prev_broken_disk = -1;
+       /* 0: nothing found yet, 1: one suspect so far, 2: undecidable */
+       int broken_status = 0;
+
+       for(i = 0; i < chunk_size; i++) {
+               Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
+               Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
+
+               if((Px != 0) && (Qx == 0))
+                       curr_broken_disk = diskP;
+
+
+               if((Px == 0) && (Qx != 0))
+                       curr_broken_disk = diskQ;
+
+
+               if((Px != 0) && (Qx != 0)) {
+                       data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
+                       if(data_id < 0) data_id += 255;
+                       diskD = geo_map(data_id, start/chunk_size,
+                                       data_disks + 2, level, layout);
+                       curr_broken_disk = diskD;
+               }
+
+               /* Px == 0 and Qx == 0: this byte is consistent, keep
+                * whatever the earlier bytes established
+                */
+
+               if(curr_broken_disk >= data_disks + 2)
+                       broken_status = 2;
+
+               switch(broken_status) {
+               case 0:
+                       if(curr_broken_disk != -1) {
+                               prev_broken_disk = curr_broken_disk;
+                               broken_status = 1;
+                       }
+                       break;
+
+               case 1:
+                       if(curr_broken_disk != prev_broken_disk) {
+                               /* two different suspects: give up at
+                                * once, so the verdict is also correct
+                                * when this is the last byte checked
+                                */
+                               broken_status = 2;
+                               curr_broken_disk = prev_broken_disk = -2;
+                       }
+                       break;
+
+               case 2:
+               default:
+                       curr_broken_disk = prev_broken_disk = -2;
+                       break;
+               }
+       }
+
+       return curr_broken_disk;
+}
+
 /* Save data:
  * We are given:
  *  A list of 'fds' of the active disks.  Some may be absent.
@@ -668,7 +752,12 @@ int test_stripes(int *source, unsigned long long *offsets,
        char *q = malloc(chunk_size);
 
        int i;
+       int diskP, diskQ;
        int data_disks = raid_disks - (level == 5 ? 1: 2);
+
+       if (!tables_ready)
+               make_tables();
+
        for ( i = 0 ; i < raid_disks ; i++)
                stripes[i] = stripe_buf + i * chunk_size;
 
@@ -688,18 +777,27 @@ int test_stripes(int *source, unsigned long long *offsets,
                switch(level) {
                case 6:
                        qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
-                       disk = geo_map(-1, start/chunk_size, raid_disks,
+                       diskP = geo_map(-1, start/chunk_size, raid_disks,
                                       level, layout);
-                       if (memcmp(p, stripes[disk], chunk_size) != 0) {
-                               printf("P(%d) wrong at %llu\n", disk,
+                       if (memcmp(p, stripes[diskP], chunk_size) != 0) {
+                               printf("P(%d) wrong at %llu\n", diskP,
                                       start / chunk_size);
                        }
-                       disk = geo_map(-2, start/chunk_size, raid_disks,
+                       diskQ = geo_map(-2, start/chunk_size, raid_disks,
                                       level, layout);
-                       if (memcmp(q, stripes[disk], chunk_size) != 0) {
-                               printf("Q(%d) wrong at %llu\n", disk,
+                       if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
+                               printf("Q(%d) wrong at %llu\n", diskQ,
                                       start / chunk_size);
                        }
+                       disk = raid6_check_disks(data_disks, start, chunk_size,
+                                                level, layout, diskP, diskQ,
+                                                p, q, stripes);
+                       if(disk >= 0) {
+                         printf("Possible failed disk: %d\n", disk);
+                       }
+                       if(disk == -2) {
+                         printf("Failure detected, but disk unknown\n");
+                       }
                        break;
                }
                length -= chunk_size;
index be15e43ebc9b9f74f1a94bd7ada1d564e9531162..8c5456834ff7c294f0d9c66580dbf10e827fdca1 100644 (file)
@@ -760,7 +760,7 @@ static int load_ddf_local(int fd, struct ddf_super *super,
 
 #ifndef MDASSEMBLE
 static int load_super_ddf_all(struct supertype *st, int fd,
-                             void **sbp, char *devname, int keep_fd);
+                             void **sbp, char *devname);
 #endif
 
 static void free_super_ddf(struct supertype *st);
@@ -772,14 +772,6 @@ static int load_super_ddf(struct supertype *st, int fd,
        struct ddf_super *super;
        int rv;
 
-#ifndef MDASSEMBLE
-       /* if 'fd' is a container, load metadata from all the devices */
-       if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
-               return 0;
-#endif
-       if (st->subarray[0])
-               return 1; /* FIXME Is this correct */
-
        if (get_dev_size(fd, devname, &dsize) == 0)
                return 1;
 
@@ -844,26 +836,6 @@ static int load_super_ddf(struct supertype *st, int fd,
                return rv;
        }
 
-       if (st->subarray[0]) {
-               unsigned long val;
-               struct vcl *v;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free(super);
-                       return 1;
-               }
-
-               for (v = super->conflist; v; v = v->next)
-                       if (v->vcnum == val)
-                               super->currentconf = v;
-               if (!super->currentconf) {
-                       free(super);
-                       return 1;
-               }
-       }
-
        /* Should possibly check the sections .... */
 
        st->sb = super;
@@ -872,7 +844,6 @@ static int load_super_ddf(struct supertype *st, int fd,
                st->minor_version = 0;
                st->max_devs = 512;
        }
-       st->loaded_container = 0;
        return 0;
 
 }
@@ -924,6 +895,7 @@ static struct supertype *match_metadata_desc_ddf(char *arg)
 
        st = malloc(sizeof(*st));
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super_ddf;
        st->max_devs = 512;
        st->minor_version = 0;
@@ -1199,7 +1171,7 @@ static void examine_super_ddf(struct supertype *st, char *homehost)
        examine_pds(sb);
 }
 
-static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map);
 
 static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
 
@@ -1209,7 +1181,7 @@ static void brief_examine_super_ddf(struct supertype *st, int verbose)
         */
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
 
        printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5);
@@ -1223,7 +1195,7 @@ static void brief_examine_subarrays_ddf(struct supertype *st, int verbose)
        struct mdinfo info;
        unsigned int i;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
 
        for (i = 0; i < __be16_to_cpu(ddf->virt->max_vdes); i++) {
@@ -1245,7 +1217,7 @@ static void export_examine_super_ddf(struct supertype *st)
 {
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("MD_METADATA=ddf\n");
        printf("MD_LEVEL=container\n");
@@ -1271,7 +1243,7 @@ static void brief_detail_super_ddf(struct supertype *st)
 //     struct ddf_super *ddf = st->sb;
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf,':');
        printf(" UUID=%s", nbuf + 5);
 }
@@ -1358,14 +1330,15 @@ static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
        memcpy(uuid, buf, 4*4);
 }
 
-static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map);
 
-static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct ddf_super *ddf = st->sb;
+       int map_disks = info->array.raid_disks;
 
        if (ddf->currentconf) {
-               getinfo_super_ddf_bvd(st, info);
+               getinfo_super_ddf_bvd(st, info, map);
                return;
        }
 
@@ -1409,17 +1382,29 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
 
        uuid_from_super_ddf(st, info->uuid);
 
+       if (map) {
+               int i;
+               for (i = 0 ; i < map_disks; i++) {
+                       if (i < info->array.raid_disks &&
+                           (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) &&
+                           !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed))
+                               map[i] = 1;
+                       else
+                               map[i] = 0;
+               }
+       }
 }
 
 static int rlq_to_layout(int rlq, int prl, int raiddisks);
 
-static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct ddf_super *ddf = st->sb;
        struct vcl *vc = ddf->currentconf;
        int cd = ddf->currentdev;
        int j;
        struct dl *dl;
+       int map_disks = info->array.raid_disks;
 
        /* FIXME this returns BVD info - what if we want SVD ?? */
 
@@ -1471,9 +1456,9 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
 
        info->array.major_version = -1;
        info->array.minor_version = -2;
-       sprintf(info->text_version, "/%s/%s",
+       sprintf(info->text_version, "/%s/%d",
                devnum2devname(st->container_dev),
-               st->subarray);
+               info->container_member);
        info->safe_mode_delay = 200;
 
        memcpy(info->name, ddf->virt->entries[info->container_member].name, 16);
@@ -1481,6 +1466,18 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
        for(j=0; j<16; j++)
                if (info->name[j] == ' ')
                        info->name[j] = 0;
+
+       if (map)
+               for (j = 0; j < map_disks; j++) {
+                       map[j] = 0;
+                       if (j <  info->array.raid_disks) {
+                               int i = find_phys(ddf, vc->conf.phys_refnum[j]);
+                               if (i >= 0 && 
+                                   (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) &&
+                                   !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed))
+                                       map[i] = 1;
+                       }
+               }
 }
 
 
@@ -1526,28 +1523,27 @@ static int update_super_ddf(struct supertype *st, struct mdinfo *info,
 
        if (strcmp(update, "grow") == 0) {
                /* FIXME */
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
 //             info->resync_checkpoint = 0;
-       }
-       /* We ignore UUID updates as they make even less sense
-        * with DDF
-        */
-       if (strcmp(update, "homehost") == 0) {
+       } else if (strcmp(update, "homehost") == 0) {
                /* homehost is stored in controller->vendor_data,
                 * or it is when we are the vendor
                 */
 //             if (info->vendor_is_local)
 //                     strcpy(ddf->controller.vendor_data, homehost);
-       }
-       if (strcmp(update, "name") == 0) {
+               rv = -1;
+       } else if (strcmp(update, "name") == 0) {
                /* name is stored in virtual_entry->name */
 //             memset(ve->name, ' ', 16);
 //             strncpy(ve->name, info->name, 16);
-       }
-       if (strcmp(update, "_reshape_progress") == 0) {
+               rv = -1;
+       } else if (strcmp(update, "_reshape_progress") == 0) {
                /* We don't support reshape yet */
-       }
+       } else if (strcmp(update, "assemble") == 0 ) {
+               /* Do nothing, just succeed */
+               rv = 0;
+       } else
+               rv = -1;
 
 //     update_all_csum(ddf);
 
@@ -2023,7 +2019,6 @@ static int init_super_ddf_bvd(struct supertype *st,
        }
        vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
        vcl->vcnum = venum;
-       sprintf(st->subarray, "%d", venum);
        vcl->block_sizes = NULL; /* FIXME not for CONCAT */
 
        vc = &vcl->conf;
@@ -2289,7 +2284,7 @@ static int add_to_super_ddf(struct supertype *st,
 
 static unsigned char null_conf[4096+512];
 
-static int __write_init_super_ddf(struct supertype *st, int do_close)
+static int __write_init_super_ddf(struct supertype *st)
 {
 
        struct ddf_super *ddf = st->sb;
@@ -2396,12 +2391,6 @@ static int __write_init_super_ddf(struct supertype *st, int do_close)
                successes++;
        }
 
-       if (do_close)
-               for (d = ddf->dlist; d; d=d->next) {
-                       close(d->fd);
-                       d->fd = -1;
-               }
-
        return attempts != successes;
 }
 
@@ -2454,7 +2443,7 @@ static int write_init_super_ddf(struct supertype *st)
                struct dl *d;
                for (d = ddf->dlist; d; d=d->next)
                        while (Kill(d->devname, NULL, 0, 1, 1) == 0);
-               return __write_init_super_ddf(st, 1);
+               return __write_init_super_ddf(st);
        }
 }
 
@@ -2568,13 +2557,13 @@ validate_geometry_ddf_container(struct supertype *st,
 
 static int validate_geometry_ddf_bvd(struct supertype *st,
                                     int level, int layout, int raiddisks,
-                                    int chunk, unsigned long long size,
+                                    int *chunk, unsigned long long size,
                                     char *dev, unsigned long long *freesize,
                                     int verbose);
 
 static int validate_geometry_ddf(struct supertype *st,
                                 int level, int layout, int raiddisks,
-                                int chunk, unsigned long long size,
+                                int *chunk, unsigned long long size,
                                 char *dev, unsigned long long *freesize,
                                 int verbose)
 {
@@ -2589,10 +2578,14 @@ static int validate_geometry_ddf(struct supertype *st,
         * If given BVDs, we make an SVD, changing all the GUIDs in the process.
         */
 
+       if (chunk && *chunk == UnSet)
+               *chunk = DEFAULT_CHUNK;
+
+
        if (level == LEVEL_CONTAINER) {
                /* Must be a fresh device to add to a container */
                return validate_geometry_ddf_container(st, level, layout,
-                                                      raiddisks, chunk,
+                                                      raiddisks, chunk?*chunk:0,
                                                       size, dev, freesize,
                                                       verbose);
        }
@@ -2619,7 +2612,7 @@ static int validate_geometry_ddf(struct supertype *st,
                         * chosen so that add_to_super/getinfo_super
                         * can return them.
                         */
-                       return reserve_space(st, raiddisks, size, chunk, freesize);
+                       return reserve_space(st, raiddisks, size, chunk?*chunk:0, freesize);
                }
                return 1;
        }
@@ -2683,7 +2676,7 @@ static int validate_geometry_ddf(struct supertype *st,
                 * and try to create a bvd
                 */
                struct ddf_super *ddf;
-               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) {
                        st->sb = ddf;
                        st->container_dev = fd2devnum(cfd);
                        close(cfd);
@@ -2736,7 +2729,7 @@ validate_geometry_ddf_container(struct supertype *st,
 
 static int validate_geometry_ddf_bvd(struct supertype *st,
                                     int level, int layout, int raiddisks,
-                                    int chunk, unsigned long long size,
+                                    int *chunk, unsigned long long size,
                                     char *dev, unsigned long long *freesize,
                                     int verbose)
 {
@@ -2830,7 +2823,7 @@ static int validate_geometry_ddf_bvd(struct supertype *st,
 }
 
 static int load_super_ddf_all(struct supertype *st, int fd,
-                             void **sbp, char *devname, int keep_fd)
+                             void **sbp, char *devname)
 {
        struct mdinfo *sra;
        struct ddf_super *super;
@@ -2886,49 +2879,35 @@ static int load_super_ddf_all(struct supertype *st, int fd,
                int rv;
 
                sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
-               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               dfd = dev_open(nm, O_RDWR);
                if (dfd < 0)
                        return 2;
                rv = load_ddf_headers(dfd, super, NULL);
                if (rv == 0)
-                       rv = load_ddf_local(dfd, super, NULL, keep_fd);
-               if (!keep_fd) close(dfd);
+                       rv = load_ddf_local(dfd, super, NULL, 1);
                if (rv)
                        return 1;
        }
-       if (st->subarray[0]) {
-               unsigned long val;
-               struct vcl *v;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free(super);
-                       return 1;
-               }
-
-               for (v = super->conflist; v; v = v->next)
-                       if (v->vcnum == val)
-                               super->currentconf = v;
-               if (!super->currentconf) {
-                       free(super);
-                       return 1;
-               }
-       }
 
        *sbp = super;
        if (st->ss == NULL) {
                st->ss = &super_ddf;
                st->minor_version = 0;
                st->max_devs = 512;
-               st->container_dev = fd2devnum(fd);
        }
-       st->loaded_container = 1;
+       st->container_dev = fd2devnum(fd);
        return 0;
 }
+
+/* Load the DDF metadata of the container open on 'fd' into st->sb.
+ * This only forwards to load_super_ddf_all() and hands back its
+ * return value unchanged.
+ */
+static int load_container_ddf(struct supertype *st, int fd,
+                             char *devname)
+{
+       return load_super_ddf_all(st, fd, &st->sb, devname);
+}
+
 #endif /* MDASSEMBLE */
 
-static struct mdinfo *container_content_ddf(struct supertype *st)
+static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray)
 {
        /* Given a container loaded by load_super_ddf_all,
         * extract information about all the arrays into
@@ -2947,6 +2926,13 @@ static struct mdinfo *container_content_ddf(struct supertype *st)
                unsigned int i;
                unsigned int j;
                struct mdinfo *this;
+               char *ep;
+
+               if (subarray &&
+                   (strtoul(subarray, &ep, 10) != vc->vcnum ||
+                    *ep != '\0'))
+                       continue;
+
                this = malloc(sizeof(*this));
                memset(this, 0, sizeof(*this));
                this->next = rest;
@@ -3293,7 +3279,7 @@ static void ddf_sync_metadata(struct supertype *st)
        if (!ddf->updates_pending)
                return;
        ddf->updates_pending = 0;
-       __write_init_super_ddf(st, 0);
+       __write_init_super_ddf(st);
        dprintf("ddf: sync_metadata\n");
 }
 
@@ -3717,6 +3703,15 @@ static int ddf_level_to_layout(int level)
        }
 }
 
+/* Supply DDF defaults for geometry values the caller left UnSet.
+ * An unset level becomes LEVEL_CONTAINER; an unset layout is then
+ * derived from the (possibly just defaulted) level.  Nothing is done
+ * when 'level' is NULL, and 'chunk' is never touched.
+ */
+static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk)
+{
+       /* without a level there is nothing to default or derive from */
+       if (!level)
+               return;
+
+       if (*level == UnSet)
+               *level = LEVEL_CONTAINER;
+
+       if (!layout || *layout != UnSet)
+               return;
+
+       *layout = ddf_level_to_layout(*level);
+}
+
 struct superswitch super_ddf = {
 #ifndef        MDASSEMBLE
        .examine_super  = examine_super_ddf,
@@ -3728,6 +3723,7 @@ struct superswitch super_ddf = {
        .validate_geometry = validate_geometry_ddf,
        .write_init_super = write_init_super_ddf,
        .add_to_super   = add_to_super_ddf,
+       .load_container = load_container_ddf,
 #endif
        .match_home     = match_home_ddf,
        .uuid_from_super= uuid_from_super_ddf,
@@ -3744,7 +3740,7 @@ struct superswitch super_ddf = {
        .free_super     = free_super_ddf,
        .match_metadata_desc = match_metadata_desc_ddf,
        .container_content = container_content_ddf,
-       .default_layout = ddf_level_to_layout,
+       .default_geometry = default_geometry_ddf,
 
        .external       = 1,
 
diff --git a/super-gpt.c b/super-gpt.c
new file mode 100644 (file)
index 0000000..6f852aa
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'gpt' is a pseudo metadata type for devices which have a
+ * GPT partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the buffer attached to st->sb and clear the pointer so that
+ * a repeated call (load_gpt() always calls this first) is harmless.
+ */
+static void free_gpt(struct supertype *st)
+{
+       free(st->sb);
+       st->sb = NULL;
+}
+
+#ifndef MDASSEMBLE
+/* Print the GPT header magic and revision, followed by one line per
+ * partition entry (gpt->part_cnt of them) giving the entry's length in
+ * sectors and its starting LBA.
+ * The header is taken from byte 512 of st->sb and the entry array from
+ * byte 1024.  'homehost' is not used.
+ */
+static void examine_gpt(struct supertype *st, char *homehost)
+{
+       struct GPT *gpt = st->sb + 512;
+       struct GPT_part_entry *gpe = st->sb + 1024;
+       unsigned int i;
+
+       printf("    GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic));
+       printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision));
+       for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) {
+               unsigned long long first =
+                       (unsigned long long)__le64_to_cpu(gpe[i].starting_lba);
+               unsigned long long last =
+                       (unsigned long long)__le64_to_cpu(gpe[i].ending_lba);
+
+               /* the sector count must come first so that the numbers
+                * agree with the "<count> sectors at <start>" wording
+                */
+               printf("  Partition[%02u] : %12llu sectors at %12llu\n",
+                      i, last - first + 1, first);
+       }
+}
+#endif /* MDASSEMBLE */
+
+/* Read the protective MBR, the GPT header and the GPT partition
+ * entries, in that order, from the start of 'fd' into one freshly
+ * allocated buffer and attach it to st->sb.  Whatever 'st' held before
+ * is freed first.
+ *
+ * Returns 0 on success.  Returns 1 when the buffer cannot be allocated,
+ * when a read comes up short, or when no acceptable GPT is present (bad
+ * MBR/GPT signature, 128 or more entries, or an entry array that does
+ * not fit the buffer).  For the read and not-found cases a message is
+ * printed only if 'devname' is given.
+ */
+static int load_gpt(struct supertype *st, int fd, char *devname)
+{
+       /* 32 sectors shared by the MBR, the GPT header and the entries */
+       const size_t buf_size = 32*512;
+       struct MBR *super;
+       struct GPT *gpt_head;
+       int to_read;
+
+       free_gpt(st);
+
+       if (posix_memalign((void**)&super, 512, buf_size) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, 0);
+       if (read(fd, super, sizeof(*super)) != sizeof(*super))
+               goto no_read;
+       if (super->magic != MBR_SIGNATURE_MAGIC ||
+           super->parts[0].part_type != MBR_GPT_PARTITION_TYPE)
+               goto not_found;
+
+       /* Seem to have GPT, load the header */
+       gpt_head = (struct GPT*)(super+1);
+       if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head))
+               goto no_read;
+       if (gpt_head->magic != GPT_SIGNATURE_MAGIC)
+               goto not_found;
+       if (__le32_to_cpu(gpt_head->part_cnt) >= 128)
+               goto not_found;
+
+       to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry);
+       to_read =  ((to_read+511)/512) * 512;
+       /* The entries are stored behind the MBR and the header, so only
+        * the remainder of the buffer is available for them.  Refuse a
+        * table that would run past the end of the allocation.
+        */
+       if ((size_t)to_read > buf_size - sizeof(*super) - sizeof(*gpt_head))
+               goto not_found;
+       if (read(fd, gpt_head+1, to_read) != to_read)
+               goto no_read;
+
+       st->sb = super;
+
+       if (st->ss == NULL) {
+               st->ss = &gpt;
+               st->minor_version = 0;
+               st->max_devs = 1;
+               st->info = NULL;
+       }
+       return 0;
+
+no_read:
+       if (devname)
+               fprintf(stderr, Name ": Cannot read partition table on %s\n",
+                       devname);
+       free(super);
+       return 1;
+
+not_found:
+       if (devname)
+               fprintf(stderr, Name ": No partition table found on %s\n",
+                       devname);
+       free(super);
+       return 1;
+}
+
+/* Write the image held in st->sb (MBR, GPT header, partition entries)
+ * back to the start of 'fd', flush it and ask the kernel to re-read
+ * the partition table.
+ *
+ * Returns 0 on success and 4 when the image is larger than the buffer
+ * load_gpt() allocates or when the write fails or is short.
+ */
+static int store_gpt(struct supertype *st, int fd)
+{
+       /* FIXME should I save the boot loader */
+       /* need to write two copies! */
+       /* FIXME allow for blocks != 512 bytes
+        *etc
+        */
+       struct MBR *super = st->sb;
+       struct GPT *gpt;
+       int to_write;
+
+       gpt = (struct GPT*)(super+1);
+
+       to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry);
+       to_write =  ((to_write+511)/512) * 512;
+       /* The write starts at the MBR, so the MBR and the GPT header
+        * that precede the entries have to be counted as well; without
+        * them the tail of the entry array would never reach the disk.
+        */
+       to_write += sizeof(*super) + sizeof(*gpt);
+       /* load_gpt() allocates 32 sectors for st->sb; never read past it */
+       if (to_write > 32*512)
+               return 4;
+
+       lseek(fd, 0, 0);
+       if (write(fd, st->sb, to_write) != to_write)
+               return 4;
+
+       fsync(fd);
+       ioctl(fd, BLKRRPART, 0);
+       return 0;
+}
+
+/* Describe the loaded GPT to the generic code: the array and disk
+ * details are zeroed, text_version and name are both "gpt", and
+ * component_size is the largest ending LBA found among the partition
+ * entries (0 if there are none).  'map' is not used.
+ */
+static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map)
+{
+       struct GPT *header = st->sb + 512;
+       struct GPT_part_entry *entry = st->sb + 1024;
+       unsigned int remaining;
+
+       memset(&info->array, 0, sizeof(info->array));
+       memset(&info->disk, 0, sizeof(info->disk));
+       strcpy(info->text_version, "gpt");
+       strcpy(info->name, "gpt");
+       info->component_size = 0;
+
+       /* walk the entry array, remembering the furthest end seen */
+       remaining = __le32_to_cpu(header->part_cnt);
+       while (remaining) {
+               unsigned long long end =
+                       (unsigned long long)__le64_to_cpu(entry->ending_lba);
+
+               if (end > info->component_size)
+                       info->component_size = end;
+               entry++;
+               remaining--;
+       }
+}
+
+/* Return a freshly allocated supertype for the "gpt" pseudo metadata
+ * when 'arg' is exactly "gpt".  Returns NULL for any other name and
+ * when the allocation fails.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+       struct supertype *st;
+
+       /* test the name before allocating so a mismatch leaks nothing */
+       if (strcmp(arg, "gpt") != 0)
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               return st;
+
+       /* start from a known state, as the ddf and imsm variants do */
+       memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
+       st->ss = &gpt;
+       st->info = NULL;
+       st->minor_version = 0;
+       st->max_devs = 1;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+/* 'gpt' cannot describe an array, so every request is turned down:
+ * a message is printed and 0 is returned regardless of the arguments.
+ * NOTE(review): 0 presumably means "geometry not acceptable" to the
+ * callers of ->validate_geometry - confirm.
+ */
+static int validate_geometry(struct supertype *st, int level,
+                            int layout, int raiddisks,
+                            int *chunk, unsigned long long size,
+                            char *subdev, unsigned long long *freesize,
+                            int verbose)
+{
+       fprintf(stderr, Name ": gpt metadata cannot be used this way\n");
+       return 0;
+}
+#endif
+
+/* Method table for the 'gpt' pseudo metadata.  Only the handlers
+ * listed here are provided; every other superswitch member is left
+ * zero-initialised.  examine and validate_geometry are compiled out
+ * when MDASSEMBLE is defined.
+ */
+struct superswitch gpt = {
+#ifndef MDASSEMBLE
+       .examine_super = examine_gpt,
+       .validate_geometry = validate_geometry,
+#endif
+       .match_metadata_desc = match_metadata_desc,
+       .load_super = load_gpt,
+       .store_super = store_gpt,
+       .getinfo_super = getinfo_gpt,
+       .free_super = free_gpt,
+       .name = "gpt",
+};
index b3a116f90308d75f2229c6ba1b70a3e78a7a1ce0..44c100b5e97bf22a375cf687fcd5b1b5fe19f435 100644 (file)
@@ -233,6 +233,17 @@ struct intel_dev {
        unsigned index;
 };
 
+/* One HBA that member disks are attached to.  Nodes are built by
+ * alloc_intel_hba() and chained through 'next' into a singly linked
+ * list.
+ */
+struct intel_hba {
+       enum sys_dev_type type; /* copied from the sys_dev it was built from */
+       char *path;             /* private strdup() of the sys_dev path */
+       char *pci_id;           /* points into 'path', just past its last '/' */
+       struct intel_hba *next; /* next HBA in the list, NULL at the end */
+};
+
+/* Operation requested for a disk entry (stored in the 'action' member
+ * of the disk list entry).
+ * NOTE(review): presumably consumed when the disk_mgmt_list is
+ * processed - confirm.  Values start at 1, so 0 names no action.
+ */
+enum action {
+       DISK_REMOVE = 1,
+       DISK_ADD
+};
 /* internal representation of IMSM metadata */
 struct intel_super {
        union {
@@ -258,11 +269,13 @@ struct intel_super {
                int extent_cnt;
                struct extent *e; /* for determining freespace @ create */
                int raiddisk; /* slot to fill in autolayout */
+               enum action action;
        } *disks;
-       struct dl *add; /* list of disks to add while mdmon active */
+       struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon
+                                     active */
        struct dl *missing; /* disks removed while we weren't looking */
        struct bbm_log *bbm_log;
-       const char *hba; /* device path of the raid controller for this metadata */
+       struct intel_hba *hba; /* device path of the raid controller for this metadata */
        const struct imsm_orom *orom; /* platform firmware support */
        struct intel_super *next; /* (temp) list for disambiguating family_num */
 };
@@ -278,13 +291,21 @@ struct extent {
        unsigned long long start, size;
 };
 
+/* definitions of reshape process types */
+enum imsm_reshape_type {
+       CH_TAKEOVER,
+       CH_MIGRATION,
+};
+
 /* definition of messages passed to imsm_process_update */
 enum imsm_update_type {
        update_activate_spare,
        update_create_array,
        update_kill_array,
        update_rename_array,
-       update_add_disk,
+       update_add_remove_disk,
+       update_reshape_container_disks,
+       update_takeover
 };
 
 struct imsm_update_activate_spare {
@@ -295,6 +316,33 @@ struct imsm_update_activate_spare {
        struct imsm_update_activate_spare *next;
 };
 
+struct geo_params {
+       int dev_id;
+       char *dev_name;
+       long long size;
+       int level;
+       int layout;
+       int chunksize;
+       int raid_disks;
+};
+
+enum takeover_direction {
+       R10_TO_R0,
+       R0_TO_R10
+};
+struct imsm_update_takeover {
+       enum imsm_update_type type;
+       int subarray;
+       enum takeover_direction direction;
+};
+
+/* Update message carrying the old and new member-disk counts of a
+ * reshape together with the device numbers of the disks being added.
+ */
+struct imsm_update_reshape {
+       enum imsm_update_type type;
+       int old_raid_disks;
+       int new_raid_disks;
+       /* (new_raid_disks - old_raid_disks) makedev numbers.
+        * NOTE(review): declared with one element; presumably the sender
+        * over-allocates the message to hold the rest - confirm.
+        */
+       int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */
+};
+
 struct disk_info {
        __u8 serial[MAX_RAID_SERIAL_LEN];
 };
@@ -316,10 +364,121 @@ struct imsm_update_rename_array {
        int dev_idx;
 };
 
-struct imsm_update_add_disk {
+struct imsm_update_add_remove_disk {
        enum imsm_update_type type;
 };
 
+
+static const char *_sys_dev_type[] = {
+       [SYS_DEV_UNKNOWN] = "Unknown",
+       [SYS_DEV_SAS] = "SAS",
+       [SYS_DEV_SATA] = "SATA"
+};
+
+/* Map a sys_dev_type to its printable name from _sys_dev_type[].
+ * Any value at or beyond SYS_DEV_MAX is reported as SYS_DEV_UNKNOWN.
+ */
+const char *get_sys_dev_type(enum sys_dev_type type)
+{
+       return _sys_dev_type[type >= SYS_DEV_MAX ? SYS_DEV_UNKNOWN : type];
+}
+
+#ifndef MDASSEMBLE
+/* Build a list node describing 'device': its type, a private copy of
+ * its path, and pci_id pointing just past the last '/' of that copy
+ * (NULL when the path contains no '/').
+ *
+ * Returns the new node, or NULL when either allocation fails.
+ */
+static struct intel_hba * alloc_intel_hba(struct sys_dev *device)
+{
+       struct intel_hba *result = malloc(sizeof(*result));
+
+       if (!result)
+               return NULL;
+
+       result->type = device->type;
+       result->next = NULL;
+       result->pci_id = NULL;
+       result->path = strdup(device->path);
+       if (!result->path) {
+               /* find_intel_hba() hands 'path' to strcmp(), so a node
+                * without one must never make it into the list
+                */
+               free(result);
+               return NULL;
+       }
+
+       result->pci_id = strrchr(result->path, '/');
+       if (result->pci_id)
+               result->pci_id++;
+
+       return result;
+}
+
+/* Search the list starting at 'hba' for the node that has both the
+ * type and the path of 'device'.  Returns that node, or NULL when the
+ * list holds no match (or is empty).
+ */
+static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device)
+{
+       while (hba) {
+               if (hba->type == device->type &&
+                   strcmp(hba->path, device->path) == 0)
+                       return hba;
+               hba = hba->next;
+       }
+       return NULL;
+}
+
+
+/* Record 'device' in the list of HBAs used by 'super'.
+ *
+ * Returns 1 when the HBA was already on the list or has been appended,
+ * and 2 when its type differs from the type of the first HBA already
+ * on the list.
+ * NOTE(review): the result of alloc_intel_hba() is not checked, so an
+ * allocation failure still returns 1 without anything being added.
+ */
+static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device)
+{
+       struct intel_hba *hba;
+
+       /* nothing to do if this HBA is already recorded for the super */
+       hba = find_intel_hba(super->hba, device);
+       if (hba != NULL)
+               return 1;
+       /* first HBA seen for this super: it starts the list */
+       if (super->hba == NULL) {
+               super->hba = alloc_intel_hba(device);
+               return 1;
+       }
+
+       hba = super->hba;
+       /* Intel metadata requires all disks to be attached to HBAs of
+        * the same type.  Mixing different HBA types is not supported.
+        */
+       if (device->type != hba->type)
+               return 2;
+
+       /* append at the tail so the existing order is kept */
+       while (hba->next)
+               hba = hba->next;
+
+       hba->next = alloc_intel_hba(device);
+       return 1;
+}
+
+/* Find the Intel device (HBA) that a disk is attached to.
+ * The disk is identified by 'fd' when fd >= 0 (its path is obtained
+ * with diskfd_to_devpath()), otherwise 'devname' itself is used as the
+ * path.
+ *
+ * Returns the matching entry, unlinked from the list produced by
+ * find_intel_devices() and with its 'next' cleared; it is not freed
+ * here, so it is left for the caller to release.  Returns NULL when
+ * there are no Intel devices, the disk path cannot be determined, or
+ * no device matches.
+ */
+static struct sys_dev* find_disk_attached_hba(int fd, const char *devname)
+{
+       struct sys_dev *list, *elem, *prev;
+       char *disk_path;
+
+       if ((list = find_intel_devices()) == NULL)
+               return 0;
+
+       if (fd < 0)
+               disk_path  = (char *) devname;
+       else
+               disk_path = diskfd_to_devpath(fd);
+
+       if (!disk_path) {
+               free_sys_dev(&list);
+               return 0;
+       }
+
+       for (prev = NULL, elem = list; elem; prev = elem, elem = elem->next) {
+               if (path_attached_to_hba(disk_path, elem->path)) {
+                       /* take the match out of the list so that freeing
+                        * the rest of the list does not free it too
+                        */
+                       if (prev == NULL)
+                               list = list->next;
+                       else
+                               prev->next = elem->next;
+                       elem->next = NULL;
+                       /* only free the path if it was obtained here */
+                       if (disk_path != devname)
+                               free(disk_path);
+                       free_sys_dev(&list);
+                       return elem;
+               }
+       }
+       /* only free the path if it was obtained here */
+       if (disk_path != devname)
+               free(disk_path);
+       free_sys_dev(&list);
+
+       return NULL;
+}
+#endif /* MDASSEMBLE */
+
+
+static int find_intel_hba_capability(int fd, struct intel_super *super,
+                                    char *devname);
+
 static struct supertype *match_metadata_desc_imsm(char *arg)
 {
        struct supertype *st;
@@ -333,6 +492,7 @@ static struct supertype *match_metadata_desc_imsm(char *arg)
        if (!st)
                return NULL;
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super_imsm;
        st->max_devs = IMSM_MAX_DEVICES;
        st->minor_version = 0;
@@ -357,15 +517,28 @@ static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
        return &mpb->disk[index];
 }
 
-/* retrieve a disk from the parsed metadata */
-static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+/* retrieve the disk description based on a index of the disk
+ * in the sub-array
+ */
+static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index)
 {
        struct dl *d;
 
        for (d = super->disks; d; d = d->next)
                if (d->index == index)
-                       return &d->disk;
-       
+                       return d;
+
+       return NULL;
+}
+/* retrieve a disk from the parsed metadata */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+       struct dl *dl;
+
+       dl = get_imsm_dl_disk(super, index);
+       if (dl)
+               return &dl->disk;
+
        return NULL;
 }
 
@@ -393,17 +566,24 @@ static size_t sizeof_imsm_map(struct imsm_map *map)
 
 struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
 {
+       /* A device can have 2 maps if it is in the middle of a migration.
+        * If second_map is:
+        *    0   - we return the first map
+        *    1   - we return the second map if it exists, else NULL
+        *   -1   - we return the second map if it exists, else the first
+        */
        struct imsm_map *map = &dev->vol.map[0];
 
-       if (second_map && !dev->vol.migr_state)
+       if (second_map == 1 && !dev->vol.migr_state)
                return NULL;
-       else if (second_map) {
+       else if (second_map == 1 ||
+                (second_map < 0 && dev->vol.migr_state)) {
                void *ptr = map;
 
                return ptr + sizeof_imsm_map(map);
        } else
                return map;
-               
+
 }
 
 /* return the size of the device.
@@ -470,23 +650,28 @@ static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
        return NULL;
 }
 
-static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot)
+/*
+ * for second_map:
+ *  == 0 get first map
+ *  == 1 get second map
+ *  == -1 than get map according to the current migr_state
+ */
+static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev,
+                                 int slot,
+                                 int second_map)
 {
        struct imsm_map *map;
 
-       if (dev->vol.migr_state)
-               map = get_imsm_map(dev, 1);
-       else
-               map = get_imsm_map(dev, 0);
+       map = get_imsm_map(dev, second_map);
 
        /* top byte identifies disk under rebuild */
        return __le32_to_cpu(map->disk_ord_tbl[slot]);
 }
 
 #define ord_to_idx(ord) (((ord) << 8) >> 8)
-static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot)
+static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map)
 {
-       __u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+       __u32 ord = get_imsm_ord_tbl_ent(dev, slot, second_map);
 
        return ord_to_idx(ord);
 }
@@ -646,6 +831,37 @@ static int is_failed(struct imsm_disk *disk)
        return (disk->status & FAILED_DISK) == FAILED_DISK;
 }
 
+/* Smallest spare, in bytes, that could serve this container.
+ * The end of the last extent on the first usable member disk (not
+ * failed and with an index other than -1) is taken, the metadata
+ * sectors (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS) are added, and the
+ * sector total is multiplied by 512.
+ * Returns 0 when no metadata is loaded, no usable disk exists or the
+ * extent list cannot be obtained.
+ */
+static unsigned long long min_acceptable_spare_size_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct dl *disk;
+       struct extent *extents;
+       unsigned long long sectors = 0;
+       int count = 0;
+
+       if (super == NULL)
+               return 0;
+
+       /* pick the first disk that is an active member */
+       for (disk = super->disks; disk; disk = disk->next)
+               if (!is_failed(&disk->disk) && disk->index != -1)
+                       break;
+       if (disk == NULL)
+               return 0;
+
+       extents = get_extents(super, disk);
+       if (extents == NULL)
+               return 0;
+
+       /* the list is terminated by an entry whose size is 0 */
+       while (extents[count].size)
+               count++;
+       if (count > 0)
+               sectors = extents[count-1].start + extents[count-1].size;
+       free(extents);
+
+       /* leave room for the metadata behind the data */
+       sectors = sectors + MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+       return sectors * 512;
+}
+
 #ifndef MDASSEMBLE
 static __u64 blocks_per_migr_unit(struct imsm_dev *dev);
 
@@ -654,22 +870,44 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
        __u64 sz;
        int slot, i;
        struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_map *map2 = get_imsm_map(dev, 1);
        __u32 ord;
 
        printf("\n");
        printf("[%.16s]:\n", dev->volume);
        printf("           UUID : %s\n", uuid);
-       printf("     RAID Level : %d\n", get_imsm_raid_level(map));
-       printf("        Members : %d\n", map->num_members);
+       printf("     RAID Level : %d", get_imsm_raid_level(map));
+       if (map2)
+               printf(" <-- %d", get_imsm_raid_level(map2));
+       printf("\n");
+       printf("        Members : %d", map->num_members);
+       if (map2)
+               printf(" <-- %d", map2->num_members);
+       printf("\n");
        printf("          Slots : [");
        for (i = 0; i < map->num_members; i++) {
-               ord = get_imsm_ord_tbl_ent(dev, i);
+               ord = get_imsm_ord_tbl_ent(dev, i, 0);
                printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
        }
-       printf("]\n");
+       printf("]");
+       if (map2) {
+               printf(" <-- [");
+               for (i = 0; i < map2->num_members; i++) {
+                       ord = get_imsm_ord_tbl_ent(dev, i, 1);
+                       printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
+               }
+               printf("]");
+       }
+       printf("\n");
+       printf("    Failed disk : ");
+       if (map->failed_disk_num == 0xff)
+               printf("none");
+       else
+               printf("%i", map->failed_disk_num);
+       printf("\n");
        slot = get_imsm_disk_slot(map, disk_idx);
        if (slot >= 0) {
-               ord = get_imsm_ord_tbl_ent(dev, slot);
+               ord = get_imsm_ord_tbl_ent(dev, slot, -1);
                printf("      This Slot : %d%s\n", slot,
                       ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
        } else
@@ -686,8 +924,12 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
                __le32_to_cpu(map->pba_of_lba0));
        printf("    Num Stripes : %u\n",
                __le32_to_cpu(map->num_data_stripes));
-       printf("     Chunk Size : %u KiB\n",
+       printf("     Chunk Size : %u KiB",
                __le16_to_cpu(map->blocks_per_strip) / 2);
+       if (map2)
+               printf(" <-- %u KiB",
+                       __le16_to_cpu(map2->blocks_per_strip) / 2);
+       printf("\n");
        printf("       Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
        printf("  Migrate State : ");
        if (dev->vol.migr_state) {
@@ -741,7 +983,7 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved)
               human_size(sz * 512));
 }
 
-static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map);
 
 static void examine_super_imsm(struct supertype *st, char *homehost)
 {
@@ -753,7 +995,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
        char nbuf[64];
        __u32 sum;
        __u32 reserved = imsm_reserved_sectors(super, super->disks);
-
+       struct dl *dl;
 
        snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
        printf("          Magic : %s\n", str);
@@ -762,7 +1004,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
        printf("    Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num));
        printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
        printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("           UUID : %s\n", nbuf + 5);
        sum = __le32_to_cpu(mpb->check_sum);
@@ -789,7 +1031,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
                struct imsm_dev *dev = __get_imsm_dev(mpb, i);
 
                super->current_vol = i;
-               getinfo_super_imsm(st, &info);
+               getinfo_super_imsm(st, &info, NULL);
                fname_from_uuid(st, &info, nbuf, ':');
                print_imsm_dev(dev, nbuf + 5, super->disks->index);
        }
@@ -798,6 +1040,26 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
                        continue;
                print_imsm_disk(mpb, i, reserved);
        }
+       for (dl = super->disks ; dl; dl = dl->next) {
+               struct imsm_disk *disk;
+               char str[MAX_RAID_SERIAL_LEN + 1];
+               __u64 sz;
+
+               if (dl->index >= 0)
+                       continue;
+
+               disk = &dl->disk;
+               printf("\n");
+               snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial);
+               printf("    Disk Serial : %s\n", str);
+               printf("          State :%s%s%s\n", is_spare(disk) ? " spare" : "",
+                      is_configured(disk) ? " active" : "",
+                      is_failed(disk) ? " failed" : "");
+               printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+               sz = __le32_to_cpu(disk->total_blocks) - reserved;
+               printf("    Usable Size : %llu%s\n", (unsigned long long)sz,
+                      human_size(sz * 512));
+       }
 }
 
 static void brief_examine_super_imsm(struct supertype *st, int verbose)
@@ -812,7 +1074,7 @@ static void brief_examine_super_imsm(struct supertype *st, int verbose)
                return;
        }
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5);
 }
@@ -829,13 +1091,13 @@ static void brief_examine_subarrays_imsm(struct supertype *st, int verbose)
        if (!super->anchor->num_raid_devs)
                return;
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        for (i = 0; i < super->anchor->num_raid_devs; i++) {
                struct imsm_dev *dev = get_imsm_dev(super, i);
 
                super->current_vol = i;
-               getinfo_super_imsm(st, &info);
+               getinfo_super_imsm(st, &info, NULL);
                fname_from_uuid(st, &info, nbuf1, ':');
                printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n",
                       dev->volume, nbuf + 5, i, nbuf1 + 5);
@@ -849,7 +1111,7 @@ static void export_examine_super_imsm(struct supertype *st)
        struct mdinfo info;
        char nbuf[64];
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("MD_METADATA=imsm\n");
        printf("MD_LEVEL=container\n");
@@ -862,7 +1124,7 @@ static void detail_super_imsm(struct supertype *st, char *homehost)
        struct mdinfo info;
        char nbuf[64];
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("\n           UUID : %s\n", nbuf + 5);
 }
@@ -871,7 +1133,7 @@ static void brief_detail_super_imsm(struct supertype *st)
 {
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf(" UUID=%s", nbuf + 5);
 }
@@ -879,10 +1141,10 @@ static void brief_detail_super_imsm(struct supertype *st)
 static int imsm_read_serial(int fd, char *devname, __u8 *serial);
 static void fd2devname(int fd, char *name);
 
-static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
+static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
 {
-       /* dump an unsorted list of devices attached to ahci, as well as
-        * non-connected ports
+       /* dump an unsorted list of devices attached to AHCI Intel storage
+        * controller, as well as non-connected ports
         */
        int hba_len = strlen(hba_path) + 1;
        struct dirent *ent;
@@ -1042,56 +1304,53 @@ static int imsm_enumerate_ports(const char *hba_path, int port_count, int host_b
        return err;
 }
 
-static int detail_platform_imsm(int verbose, int enumerate_only)
+
+
+static void print_found_intel_controllers(struct sys_dev *elem)
+{
+       for (; elem; elem = elem->next) {
+               fprintf(stderr, Name ": found Intel(R) ");
+               if (elem->type == SYS_DEV_SATA)
+                       fprintf(stderr, "SATA ");
+               else if (elem->type == SYS_DEV_SAS)
+                       fprintf(stderr, "SAS ");
+               fprintf(stderr, "RAID controller");
+               if (elem->pci_id)
+                       fprintf(stderr, " at %s", elem->pci_id);
+               fprintf(stderr, ".\n");
+       }
+       fflush(stderr);
+}
+
+static int ahci_get_port_count(const char *hba_path, int *port_count)
 {
-       /* There are two components to imsm platform support, the ahci SATA
-        * controller and the option-rom.  To find the SATA controller we
-        * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
-        * controller with the Intel vendor id is present.  This approach
-        * allows mdadm to leverage the kernel's ahci detection logic, with the
-        * caveat that if ahci.ko is not loaded mdadm will not be able to
-        * detect platform raid capabilities.  The option-rom resides in a
-        * platform "Adapter ROM".  We scan for its signature to retrieve the
-        * platform capabilities.  If raid support is disabled in the BIOS the
-        * option-rom capability structure will not be available.
-        */
-       const struct imsm_orom *orom;
-       struct sys_dev *list, *hba;
-       DIR *dir;
        struct dirent *ent;
-       const char *hba_path;
-       int host_base = 0;
-       int port_count = 0;
+       DIR *dir;
+       int host_base = -1;
 
-       if (enumerate_only) {
-               if (check_env("IMSM_NO_PLATFORM") || find_imsm_orom())
-                       return 0;
-               return 2;
-       }
+       *port_count = 0;
+       if ((dir = opendir(hba_path)) == NULL)
+               return -1;
 
-       list = find_driver_devices("pci", "ahci");
-       for (hba = list; hba; hba = hba->next)
-               if (devpath_to_vendor(hba->path) == 0x8086)
-                       break;
+       for (ent = readdir(dir); ent; ent = readdir(dir)) {
+               int host;
 
-       if (!hba) {
-               if (verbose)
-                       fprintf(stderr, Name ": unable to find active ahci controller\n");
-               free_sys_dev(&list);
-               return 2;
-       } else if (verbose)
-               fprintf(stderr, Name ": found Intel SATA AHCI Controller\n");
-       hba_path = hba->path;
-       hba->path = NULL;
-       free_sys_dev(&list);
+               if (sscanf(ent->d_name, "host%d", &host) != 1)
+                       continue;
+               if (*port_count == 0)
+                       host_base = host;
+               else if (host < host_base)
+                       host_base = host;
 
-       orom = find_imsm_orom();
-       if (!orom) {
-               if (verbose)
-                       fprintf(stderr, Name ": imsm option-rom not found\n");
-               return 2;
+               if (host + 1 > *port_count + host_base)
+                       *port_count = host + 1 - host_base;
        }
+       closedir(dir);
+       return host_base;
+}
 
+static void print_imsm_capability(const struct imsm_orom *orom)
+{
        printf("       Platform : Intel(R) Matrix Storage Manager\n");
        printf("        Version : %d.%d.%d.%d\n", orom->major_ver, orom->minor_ver,
               orom->hotfix_ver, orom->build);
@@ -1120,35 +1379,81 @@ static int detail_platform_imsm(int verbose, int enumerate_only)
               imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : "");
        printf("      Max Disks : %d\n", orom->tds);
        printf("    Max Volumes : %d\n", orom->vpa);
-       printf(" I/O Controller : %s\n", hba_path);
-
-       /* find the smallest scsi host number to determine a port number base */
-       dir = opendir(hba_path);
-       for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) {
-               int host;
-
-               if (sscanf(ent->d_name, "host%d", &host) != 1)
-                       continue;
-               if (port_count == 0)
-                       host_base = host;
-               else if (host < host_base)
-                       host_base = host;
+       return;
+}
 
-               if (host + 1 > port_count + host_base)
-                       port_count = host + 1 - host_base;
+static int detail_platform_imsm(int verbose, int enumerate_only)
+{
+       /* There are two components to imsm platform support, the ahci SATA
+        * controller and the option-rom.  To find the SATA controller we
+        * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
+        * controller with the Intel vendor id is present.  This approach
+        * allows mdadm to leverage the kernel's ahci detection logic, with the
+        * caveat that if ahci.ko is not loaded mdadm will not be able to
+        * detect platform raid capabilities.  The option-rom resides in a
+        * platform "Adapter ROM".  We scan for its signature to retrieve the
+        * platform capabilities.  If raid support is disabled in the BIOS the
+        * option-rom capability structure will not be available.
+        */
+       const struct imsm_orom *orom;
+       struct sys_dev *list, *hba;
+       int host_base = 0;
+       int port_count = 0;
+       int result=0;
 
+       if (enumerate_only) {
+               if (check_env("IMSM_NO_PLATFORM"))
+                       return 0;
+               list = find_intel_devices();
+               if (!list)
+                       return 2;
+               for (hba = list; hba; hba = hba->next) {
+                       orom = find_imsm_capability(hba->type);
+                       if (!orom) {
+                               result = 2;
+                               break;
+                       }
+               }
+               free_sys_dev(&list);
+               return result;
        }
-       if (dir)
-               closedir(dir);
 
-       if (!port_count || imsm_enumerate_ports(hba_path, port_count,
-                                               host_base, verbose) != 0) {
+       list = find_intel_devices();
+       if (!list) {
                if (verbose)
-                       fprintf(stderr, Name ": failed to enumerate ports\n");
+                       fprintf(stderr, Name ": no active Intel(R) RAID "
+                               "controller found.\n");
+               free_sys_dev(&list);
                return 2;
+       } else if (verbose)
+               print_found_intel_controllers(list);
+
+       for (hba = list; hba; hba = hba->next) {
+               orom = find_imsm_capability(hba->type);
+               if (!orom)
+                       fprintf(stderr, Name ": imsm capabilities not found for controller: %s (type %s)\n",
+                               hba->path, get_sys_dev_type(hba->type));
+               else
+                       print_imsm_capability(orom);
        }
 
-       return 0;
+       for (hba = list; hba; hba = hba->next) {
+               printf(" I/O Controller : %s (%s)\n",
+                       hba->path, get_sys_dev_type(hba->type));
+
+               if (hba->type == SYS_DEV_SATA) {
+                       host_base = ahci_get_port_count(hba->path, &port_count);
+                       if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) {
+                               if (verbose)
+                                       fprintf(stderr, Name ": failed to enumerate "
+                                               "ports on SATA controller at %s.\n", hba->pci_id);
+                               result |= 2;
+                       }
+               }
+       }
+
+       free_sys_dev(&list);
+       return result;
 }
 #endif
 
@@ -1307,12 +1612,12 @@ static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev)
                return num_stripes_per_unit_resync(dev);
 }
 
-static __u8 imsm_num_data_members(struct imsm_dev *dev)
+static __u8 imsm_num_data_members(struct imsm_dev *dev, int second_map)
 {
        /* named 'imsm_' because raid0, raid1 and raid10
         * counter-intuitively have the same number of data disks
         */
-       struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_map *map = get_imsm_map(dev, second_map);
 
        switch (get_imsm_raid_level(map)) {
        case 0:
@@ -1376,6 +1681,7 @@ static __u64 blocks_per_migr_unit(struct imsm_dev *dev)
                return 0;
 
        switch (migr_type(dev)) {
+       case MIGR_GEN_MIGR:
        case MIGR_VERIFY:
        case MIGR_REPAIR:
        case MIGR_INIT: {
@@ -1395,7 +1701,7 @@ static __u64 blocks_per_migr_unit(struct imsm_dev *dev)
                 */
                stripes_per_unit = num_stripes_per_unit_resync(dev);
                migr_chunk = migr_strip_blocks_resync(dev);
-               disks = imsm_num_data_members(dev);
+               disks = imsm_num_data_members(dev, 0);
                blocks_per_unit = stripes_per_unit * migr_chunk * disks;
                stripe = __le32_to_cpu(map->blocks_per_strip) * disks;
                segment = blocks_per_unit / stripe;
@@ -1412,7 +1718,6 @@ static __u64 blocks_per_migr_unit(struct imsm_dev *dev)
                migr_chunk = migr_strip_blocks_rebuild(dev);
                return migr_chunk * stripes_per_unit;
        }
-       case MIGR_GEN_MIGR:
        case MIGR_STATE_CHANGE:
        default:
                return 0;
@@ -1434,30 +1739,84 @@ static int imsm_level_to_layout(int level)
        return UnSet;
 }
 
-static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap)
 {
        struct intel_super *super = st->sb;
        struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
        struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_map *prev_map = get_imsm_map(dev, 1);
+       struct imsm_map *map_to_analyse = map;
        struct dl *dl;
        char *devname;
+       int map_disks = info->array.raid_disks;
+
+       if (prev_map)
+               map_to_analyse = prev_map;
 
        for (dl = super->disks; dl; dl = dl->next)
                if (dl->raiddisk == info->disk.raid_disk)
                        break;
        info->container_member    = super->current_vol;
-       info->array.raid_disks    = map->num_members;
-       info->array.level         = get_imsm_raid_level(map);
+       info->array.raid_disks    = map_to_analyse->num_members;
+       info->array.level         = get_imsm_raid_level(map_to_analyse);
        info->array.layout        = imsm_level_to_layout(info->array.level);
        info->array.md_minor      = -1;
        info->array.ctime         = 0;
        info->array.utime         = 0;
-       info->array.chunk_size    = __le16_to_cpu(map->blocks_per_strip) << 9;
+       info->array.chunk_size    =
+               __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9;
        info->array.state         = !dev->vol.dirty;
        info->custom_array_size   = __le32_to_cpu(dev->size_high);
        info->custom_array_size   <<= 32;
        info->custom_array_size   |= __le32_to_cpu(dev->size_low);
-
+       if (prev_map && map->map_state == prev_map->map_state) {
+               info->reshape_active = 1;
+               info->new_level = get_imsm_raid_level(map);
+               info->new_layout = imsm_level_to_layout(info->new_level);
+               info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9;
+               info->delta_disks = map->num_members - prev_map->num_members;
+               if (info->delta_disks) {
+                       /* this needs to be applied to every array
+                        * in the container.
+                        */
+                       info->reshape_active = 2;
+               }
+               /* The shape of the information that we give to md might have to
+                * be modified to cope with md's requirement for reshaping arrays.
+                * For example, when reshaping a RAID0, md requires it to be
+                * presented as a degraded RAID4.
+                * Also if a RAID0 is migrating to a RAID5 we need to specify
+                * the array as already being RAID5, but the 'before' layout
+                * is a RAID4-like layout.
+                */
+               switch (info->array.level) {
+               case 0:
+                       switch(info->new_level) {
+                       case 0:
+                               /* conversion is happening as RAID4 */
+                               info->array.level = 4;
+                               info->array.raid_disks += 1;
+                               break;
+                       case 5:
+                               /* conversion is happening as RAID5 */
+                               info->array.level = 5;
+                               info->array.layout = ALGORITHM_PARITY_N;
+                               info->array.raid_disks += 1;
+                               info->delta_disks -= 1;
+                               break;
+                       default:
+                               /* FIXME error message */
+                               info->array.level = UnSet;
+                               break;
+                       }
+                       break;
+               }
+       } else {
+               info->new_level = UnSet;
+               info->new_layout = UnSet;
+               info->new_chunk = info->array.chunk_size;
+               info->delta_disks = 0;
+       }
        info->disk.major = 0;
        info->disk.minor = 0;
        if (dl) {
@@ -1465,15 +1824,19 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
                info->disk.minor = dl->minor;
        }
 
-       info->data_offset         = __le32_to_cpu(map->pba_of_lba0);
-       info->component_size      = __le32_to_cpu(map->blocks_per_member);
+       info->data_offset         = __le32_to_cpu(map_to_analyse->pba_of_lba0);
+       info->component_size      =
+               __le32_to_cpu(map_to_analyse->blocks_per_member);
        memset(info->uuid, 0, sizeof(info->uuid));
        info->recovery_start = MaxSector;
-       info->reshape_active = 0;
 
-       if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) {
+       info->reshape_progress = 0;
+       info->resync_start = MaxSector;
+       if (map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED ||
+           dev->vol.dirty) {
                info->resync_start = 0;
-       } else if (dev->vol.migr_state) {
+       }
+       if (dev->vol.migr_state) {
                switch (migr_type(dev)) {
                case MIGR_REPAIR:
                case MIGR_INIT: {
@@ -1483,6 +1846,34 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
                        info->resync_start = blocks_per_unit * units;
                        break;
                }
+               case MIGR_GEN_MIGR: {
+                       __u64 blocks_per_unit = blocks_per_migr_unit(dev);
+                       __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit);
+                       unsigned long long array_blocks;
+                       int used_disks;
+
+                       info->reshape_progress = blocks_per_unit * units;
+
+                       /* checkpoint is written per disks unit
+                        * recalculate it to reshape position
+                        */
+                       used_disks = imsm_num_data_members(dev, 0);
+                       info->reshape_progress *= used_disks;
+                       dprintf("IMSM: General Migration checkpoint : %llu "
+                              "(%llu) -> read reshape progress : %llu\n",
+                               units, blocks_per_unit, info->reshape_progress);
+
+                       used_disks = imsm_num_data_members(dev, 1);
+                       if (used_disks > 0) {
+                               array_blocks = used_disks * (unsigned long long)
+                                       __le32_to_cpu(map->blocks_per_member);
+                               /* round array size down to closest MB
+                                */
+                               info->custom_array_size = (array_blocks
+                                               >> SECT_PER_MB_SHIFT)
+                                               << SECT_PER_MB_SHIFT;
+                       }
+               } /* fall through */
                case MIGR_VERIFY:
                        /* we could emulate the checkpointing of
                         * 'sync_action=check' migrations, but for now
@@ -1490,15 +1881,13 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
                         */
                case MIGR_REBUILD:
                        /* this is handled by container_content_imsm() */
-               case MIGR_GEN_MIGR:
                case MIGR_STATE_CHANGE:
                        /* FIXME handle other migrations */
                default:
                        /* we are not dirty, so... */
                        info->resync_start = MaxSector;
                }
-       } else
-               info->resync_start = MaxSector;
+       }
 
        strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
        info->name[MAX_RAID_SERIAL_LEN] = 0;
@@ -1512,46 +1901,28 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
        free(devname);
        info->safe_mode_delay = 4000;  /* 4 secs like the Matrix driver */
        uuid_from_super_imsm(st, info->uuid);
+
+       if (dmap) {
+               int i, j;
+               for (i=0; i<map_disks; i++) {
+                       dmap[i] = 0;
+                       if (i < info->array.raid_disks) {
+                               struct imsm_disk *dsk;
+                               j = get_imsm_disk_idx(dev, i, -1);
+                               dsk = get_imsm_disk(super, j);
+                               if (dsk && (dsk->status & CONFIGURED_DISK))
+                                       dmap[i] = 1;
+                       }
+               }
+       }
 }
 
-/* check the config file to see if we can return a real uuid for this spare */
-static void fixup_container_spare_uuid(struct mdinfo *inf)
+static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed);
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev);
+
+static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
 {
-       struct mddev_ident_s *array_list;
-
-       if (inf->array.level != LEVEL_CONTAINER ||
-           memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0)
-               return;
-
-       array_list = conf_get_ident(NULL);
-
-       for (; array_list; array_list = array_list->next) {
-               if (array_list->uuid_set) {
-                       struct supertype *_sst; /* spare supertype */
-                       struct supertype *_cst; /* container supertype */
-
-                       _cst = array_list->st;
-                       if (_cst)
-                               _sst = _cst->ss->match_metadata_desc(inf->text_version);
-                       else
-                               _sst = NULL;
-
-                       if (_sst) {
-                               memcpy(inf->uuid, array_list->uuid, sizeof(int[4]));
-                               free(_sst);
-                               break;
-                       }
-               }
-       }
-}
-
-
-static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed);
-static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev);
-
-static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
-{
-       struct dl *d;
+       struct dl *d;
 
        for (d = super->missing; d; d = d->next)
                if (d->index == index)
@@ -1559,13 +1930,17 @@ static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
        return NULL;
 }
 
-static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct intel_super *super = st->sb;
        struct imsm_disk *disk;
+       int map_disks = info->array.raid_disks;
+       int max_enough = -1;
+       int i;
+       struct imsm_super *mpb;
 
        if (super->current_vol >= 0) {
-               getinfo_super_imsm_volume(st, info);
+               getinfo_super_imsm_volume(st, info, map);
                return;
        }
 
@@ -1594,51 +1969,47 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
        info->recovery_start = MaxSector;
 
        /* do we have the all the insync disks that we expect? */
-       if (st->loaded_container) {
-               struct imsm_super *mpb = super->anchor;
-               int max_enough = -1, i;
+       mpb = super->anchor;
 
-               for (i = 0; i < mpb->num_raid_devs; i++) {
-                       struct imsm_dev *dev = get_imsm_dev(super, i);
-                       int failed, enough, j, missing = 0;
-                       struct imsm_map *map;
-                       __u8 state;
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               int failed, enough, j, missing = 0;
+               struct imsm_map *map;
+               __u8 state;
 
-                       failed = imsm_count_failed(super, dev);
-                       state = imsm_check_degraded(super, dev, failed);
-                       map = get_imsm_map(dev, dev->vol.migr_state);
+               failed = imsm_count_failed(super, dev);
+               state = imsm_check_degraded(super, dev, failed);
+               map = get_imsm_map(dev, dev->vol.migr_state);
 
-                       /* any newly missing disks?
-                        * (catches single-degraded vs double-degraded)
-                        */
-                       for (j = 0; j < map->num_members; j++) {
-                               __u32 ord = get_imsm_ord_tbl_ent(dev, i);
-                               __u32 idx = ord_to_idx(ord);
+               /* any newly missing disks?
+                * (catches single-degraded vs double-degraded)
+                */
+               for (j = 0; j < map->num_members; j++) {
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, j, -1);
+                       __u32 idx = ord_to_idx(ord);
 
-                               if (!(ord & IMSM_ORD_REBUILD) &&
-                                   get_imsm_missing(super, idx)) {
-                                       missing = 1;
-                                       break;
-                               }
+                       if (!(ord & IMSM_ORD_REBUILD) &&
+                           get_imsm_missing(super, idx)) {
+                               missing = 1;
+                               break;
                        }
+               }
 
-                       if (state == IMSM_T_STATE_FAILED)
-                               enough = -1;
-                       else if (state == IMSM_T_STATE_DEGRADED &&
-                                (state != map->map_state || missing))
-                               enough = 0;
-                       else /* we're normal, or already degraded */
-                               enough = 1;
+               if (state == IMSM_T_STATE_FAILED)
+                       enough = -1;
+               else if (state == IMSM_T_STATE_DEGRADED &&
+                        (state != map->map_state || missing))
+                       enough = 0;
+               else /* we're normal, or already degraded */
+                       enough = 1;
 
-                       /* in the missing/failed disk case check to see
-                        * if at least one array is runnable
-                        */
-                       max_enough = max(max_enough, enough);
-               }
-               dprintf("%s: enough: %d\n", __func__, max_enough);
-               info->container_enough = max_enough;
-       } else
-               info->container_enough = -1;
+               /* in the missing/failed disk case check to see
+                * if at least one array is runnable
+                */
+               max_enough = max(max_enough, enough);
+       }
+       dprintf("%s: enough: %d\n", __func__, max_enough);
+       info->container_enough = max_enough;
 
        if (super->disks) {
                __u32 reserved = imsm_reserved_sectors(super, super->disks);
@@ -1660,10 +2031,61 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
         */
        if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs)
                uuid_from_super_imsm(st, info->uuid);
-       else {
-               memcpy(info->uuid, uuid_match_any, sizeof(int[4]));
-               fixup_container_spare_uuid(info);
+       else
+               memcpy(info->uuid, uuid_zero, sizeof(uuid_zero));
+
+       /* I don't know how to compute 'map' on imsm, so use safe default */
+       if (map) {
+               int i;
+               for (i = 0; i < map_disks; i++)
+                       map[i] = 1;
+       }
+
+}
+
+/* Allocate an mdinfo and prepend to its ->devs one zero-filled, then populated,
+ * entry per element of super->disks; NULL if sb/disks is NULL or malloc fails */
+struct mdinfo *getinfo_super_disks_imsm(struct supertype *st)
+{
+       struct mdinfo *mddev = NULL;
+       struct intel_super *super = st->sb;
+       struct imsm_disk *disk;
+       int count = 0;
+       struct dl *dl;
+       if (!super || !super->disks)
+               return NULL;
+       dl = super->disks;
+       mddev = malloc(sizeof(*mddev));
+       if (!mddev) {
+               fprintf(stderr, Name ": Failed to allocate memory.\n");
+               return NULL;
        }
+       memset(mddev, 0, sizeof(*mddev));
+       while (dl) {
+               struct mdinfo *tmp;
+               disk = &dl->disk;
+               tmp = malloc(sizeof(*tmp));
+               if (!tmp) {
+                       fprintf(stderr, Name ": Failed to allocate memory.\n");
+                       if (mddev) /* always true here: NULL was rejected above */
+                               sysfs_free(mddev); /* hand back the partial result */
+                       return NULL;
+               }
+               memset(tmp, 0, sizeof(*tmp));
+               if (mddev->devs)
+                       tmp->next = mddev->devs;
+               mddev->devs = tmp; /* new entry becomes the list head */
+               tmp->disk.number = count++; /* position in the super->disks walk */
+               tmp->disk.major = dl->major;
+               tmp->disk.minor = dl->minor;
+               tmp->disk.state = is_configured(disk) ?
+                                 (1 << MD_DISK_ACTIVE) : 0;
+               tmp->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0;
+               tmp->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); /* non-spares only */
+               tmp->disk.raid_disk = -1; /* no slot is assigned by this function */
+               dl = dl->next;
+       }
+       return mddev;
 }
 
 static int update_super_imsm(struct supertype *st, struct mdinfo *info,
@@ -1705,8 +2127,7 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info,
        mpb = super->anchor;
 
        if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private)
-               fprintf(stderr,
-                       Name ": '--uuid' not supported for imsm metadata\n");
+               rv = -1;
        else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) {
                mpb->orig_family_num = *((__u32 *) info->update_private);
                rv = 0;
@@ -1727,9 +2148,7 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info,
        } else if (strcmp(update, "assemble") == 0)
                rv = 0;
        else
-               fprintf(stderr,
-                       Name ": '--update=%s' not supported for imsm metadata\n",
-                       update);
+               rv = -1;
 
        /* successful update? recompute checksum */
        if (rv == 0)
@@ -1795,6 +2214,18 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst)
                 tst->sb = NULL;
                 return 0;
         }
+       /* in platform dependent environment test if the disks
+        * use the same Intel hba
+        */
+       if (!check_env("IMSM_NO_PLATFORM")) {
+               if (first->hba->type != sec->hba->type)  {
+                       fprintf(stderr,
+                               "HBAs of devices does not match %s != %s\n",
+                               get_sys_dev_type(first->hba->type),
+                               get_sys_dev_type(sec->hba->type));
+                       return 3;
+               }
+       }
 
        /* if an anchor does not have num_raid_devs set then it is a free
         * floating spare
@@ -2086,7 +2517,8 @@ static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type)
 
        /* duplicate and then set the target end state in map[0] */
        memcpy(dest, src, sizeof_imsm_map(src));
-       if (migr_type == MIGR_REBUILD) {
+       if ((migr_type == MIGR_REBUILD) ||
+           (migr_type ==  MIGR_GEN_MIGR)) {
                __u32 ord;
                int i;
 
@@ -2103,18 +2535,26 @@ static void end_migration(struct imsm_dev *dev, __u8 map_state)
 {
        struct imsm_map *map = get_imsm_map(dev, 0);
        struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state);
-       int i;
+       int i, j;
 
        /* merge any IMSM_ORD_REBUILD bits that were not successfully
         * completed in the last migration.
         *
-        * FIXME add support for online capacity expansion and
-        * raid-level-migration
+        * FIXME add support for raid-level-migration
         */
        for (i = 0; i < prev->num_members; i++)
-               map->disk_ord_tbl[i] |= prev->disk_ord_tbl[i];
+               for (j = 0; j < map->num_members; j++)
+                       /* during online capacity expansion
+                        * disks position can be changed if takeover is used
+                        */
+                       if (ord_to_idx(map->disk_ord_tbl[j]) ==
+                           ord_to_idx(prev->disk_ord_tbl[i])) {
+                               map->disk_ord_tbl[j] |= prev->disk_ord_tbl[i];
+                               break;
+                       }
 
        dev->vol.migr_state = 0;
+       dev->vol.migr_type = 0;
        dev->vol.curr_migr_unit = 0;
        map->map_state = map_state;
 }
@@ -2125,6 +2565,7 @@ static int parse_raid_devices(struct intel_super *super)
        int i;
        struct imsm_dev *dev_new;
        size_t len, len_migr;
+       size_t max_len = 0;
        size_t space_needed = 0;
        struct imsm_super *mpb = super->anchor;
 
@@ -2140,7 +2581,11 @@ static int parse_raid_devices(struct intel_super *super)
                dv = malloc(sizeof(*dv));
                if (!dv)
                        return 1;
-               dev_new = malloc(len_migr);
+               if (max_len < len_migr)
+                       max_len = len_migr;
+               if (max_len > len_migr)
+                       space_needed += max_len - len_migr;
+               dev_new = malloc(max_len);
                if (!dev_new) {
                        free(dv);
                        return 1;
@@ -2188,7 +2633,7 @@ struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
 static void __free_imsm(struct intel_super *super, int free_disks);
 
 /* load_imsm_mpb - read matrix metadata
- * allocates super->mpb to be freed by free_super
+ * allocates super->mpb to be freed by free_imsm
  */
 static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 {
@@ -2199,6 +2644,13 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
        __u32 check_sum;
 
        get_dev_size(fd, NULL, &dsize);
+       if (dsize < 1024) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s: device to small for imsm\n",
+                               devname);
+               return 1;
+       }
 
        if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
                if (devname)
@@ -2233,6 +2685,10 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
        }
 
        __free_imsm(super, 0);
+       /*  reload capability and hba */
+
+       /* capability and hba must be updated with new super allocation */
+       find_intel_hba_capability(fd, super, devname);
        super->len = ROUND_UP(anchor->mpb_size, 512);
        if (posix_memalign(&super->buf, 512, super->len) != 0) {
                if (devname)
@@ -2324,6 +2780,7 @@ static void __free_imsm_disk(struct dl *d)
        free(d);
 
 }
+
 static void free_imsm_disks(struct intel_super *super)
 {
        struct dl *d;
@@ -2333,6 +2790,11 @@ static void free_imsm_disks(struct intel_super *super)
                super->disks = d->next;
                __free_imsm_disk(d);
        }
+       while (super->disk_mgmt_list) {
+               d = super->disk_mgmt_list;
+               super->disk_mgmt_list = d->next;
+               __free_imsm_disk(d);
+       }
        while (super->missing) {
                d = super->missing;
                super->missing = d->next;
@@ -2344,17 +2806,26 @@ static void free_imsm_disks(struct intel_super *super)
 /* free all the pieces hanging off of a super pointer */
 static void __free_imsm(struct intel_super *super, int free_disks)
 {
+       struct intel_hba *elem, *next;
+
        if (super->buf) {
                free(super->buf);
                super->buf = NULL;
        }
+       /* unlink capability description: the pointer is cleared, not freed */
+       super->orom = NULL;
        if (free_disks)
                free_imsm_disks(super);
        free_devlist(super);
-       if (super->hba) {
-               free((void *) super->hba);
-               super->hba = NULL;
+       elem = super->hba; /* walk the hba list, freeing each node and its path */
+       while (elem) {
+               if (elem->path)
+                       free((void *)elem->path);
+               next = elem->next; /* read the link before the node is freed */
+               free(elem);
+               elem = next;
        }
+       super->hba = NULL;
 }
 
 static void free_imsm(struct intel_super *super)
@@ -2382,25 +2853,64 @@ static struct intel_super *alloc_super(void)
                memset(super, 0, sizeof(*super));
                super->current_vol = -1;
                super->create_offset = ~((__u32 ) 0);
-               if (!check_env("IMSM_NO_PLATFORM"))
-                       super->orom = find_imsm_orom();
-               if (super->orom && !check_env("IMSM_TEST_OROM")) {
-                       struct sys_dev *list, *ent;
-
-                       /* find the first intel ahci controller */
-                       list = find_driver_devices("pci", "ahci");
-                       for (ent = list; ent; ent = ent->next)
-                               if (devpath_to_vendor(ent->path) == 0x8086)
-                                       break;
-                       if (ent) {
-                               super->hba = ent->path;
-                               ent->path = NULL;
+       }
+       return super;
+}
+
+/*
+ * find and allocate hba and OROM/EFI based on valid fd of RAID component device
+ * Returns 0 on success or if skipped (fd < 0 or IMSM_NO_PLATFORM), 1 if no HBA
+ * is found, 2 if attach_hba_to_super() returns 2, 3 if no capability is found.
+ */
+static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname)
+{
+       struct sys_dev *hba_name;
+       int rv = 0;
+
+       if ((fd < 0) || check_env("IMSM_NO_PLATFORM")) {
+               super->orom = NULL;
+               super->hba = NULL;
+               return 0;
+       }
+       hba_name = find_disk_attached_hba(fd, NULL);
+       if (!hba_name) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is not attached to Intel(R) RAID controller.\n",
+                               devname);
+               return 1;
+       }
+       rv = attach_hba_to_super(super, hba_name);
+       if (rv == 2) {
+               if (devname) {
+                       struct intel_hba *hba = super->hba;
+
+                       fprintf(stderr, Name ": %s is attached to Intel(R) %s RAID "
+                               "controller (%s),\n"
+                               "    but the container is assigned to Intel(R) "
+                               "%s RAID controller (",
+                               devname, get_sys_dev_type(hba_name->type),
+                               hba_name->pci_id ? : "Err!",
+                               get_sys_dev_type(hba ? hba->type : hba_name->type));
+                       while (hba) {
+                               fprintf(stderr, "%s", hba->pci_id ? : "Err!");
+                               if (hba->next)
+                                       fprintf(stderr, ", ");
+                               hba = hba->next;
                        }
-                       free_sys_dev(&list);
+
+                       fprintf(stderr, ").\n"
+                               "    Mixing devices attached to different controllers "
+                               "is not allowed.\n");
                }
+               free_sys_dev(&hba_name);
+               return 2;
        }
-
-       return super;
+       super->orom = find_imsm_capability(hba_name->type);
+       free_sys_dev(&hba_name);
+       if (!super->orom)
+               return 3;
+       return 0;
 }
 
 #ifndef MDASSEMBLE
@@ -2756,7 +3266,7 @@ imsm_thunderdome(struct intel_super **super_list, int len)
 }
 
 static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
-                              char *devname, int keep_fd)
+                              char *devname)
 {
        struct mdinfo *sra;
        struct intel_super *super_list = NULL;
@@ -2783,6 +3293,7 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
                struct intel_super *s = alloc_super();
                char nm[32];
                int dfd;
+               int rv;
 
                err = 1;
                if (!s)
@@ -2792,22 +3303,25 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
 
                err = 2;
                sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
-               dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+               dfd = dev_open(nm, O_RDWR);
                if (dfd < 0)
                        goto error;
 
-               err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+               rv = find_intel_hba_capability(dfd, s, devname);
+               /* no orom/efi or non-intel hba of the disk */
+               if (rv != 0)
+                       goto error;
+
+               err = load_and_parse_mpb(dfd, s, NULL, 1);
 
                /* retry the load if we might have raced against mdmon */
                if (err == 3 && mdmon_running(devnum))
                        for (retry = 0; retry < 3; retry++) {
                                usleep(3000);
-                               err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+                               err = load_and_parse_mpb(dfd, s, NULL, 1);
                                if (err != 3)
                                        break;
                        }
-               if (!keep_fd)
-                       close(dfd);
                if (err)
                        goto error;
        }
@@ -2824,25 +3338,6 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
                err = 2;
                goto error;
        }
-
-       if (st->subarray[0]) {
-               unsigned long val;
-               char *ep;
-
-               err = 1;
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free_imsm(super);
-                       goto error;
-               }
-
-               if (val < super->anchor->num_raid_devs)
-                       super->current_vol = val;
-               else {
-                       free_imsm(super);
-                       goto error;
-               }
-       }
        err = 0;
 
  error:
@@ -2864,10 +3359,13 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
                st->minor_version = 0;
                st->max_devs = IMSM_MAX_DEVICES;
        }
-       st->loaded_container = 1;
-
        return 0;
 }
+
+static int load_container_imsm(struct supertype *st, int fd, char *devname)
+{
+       return load_super_imsm_all(st, fd, &st->sb, devname); /* &st->sb is the 'sbp' slot */
+}
 #endif
 
 static int load_super_imsm(struct supertype *st, int fd, char *devname)
@@ -2875,11 +3373,6 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
        struct intel_super *super;
        int rv;
 
-#ifndef MDASSEMBLE
-       if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0)
-               return 0;
-#endif
-
        if (test_partition(fd))
                /* IMSM not allowed on partitions */
                return 1;
@@ -2893,6 +3386,15 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
                        sizeof(*super));
                return 1;
        }
+       rv = find_intel_hba_capability(fd, super, devname);
+       /* no orom/efi or non-intel hba of the disk */
+       if (rv != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": No OROM/EFI properties for %s\n", devname);
+               free_imsm(super);
+               return 2;
+       }
 
        rv = load_and_parse_mpb(fd, super, devname, 0);
 
@@ -2905,32 +3407,12 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
                return rv;
        }
 
-       if (st->subarray[0]) {
-               unsigned long val;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free_imsm(super);
-                       return 1;
-               }
-
-               if (val < super->anchor->num_raid_devs)
-                       super->current_vol = val;
-               else {
-                       free_imsm(super);
-                       return 1;
-               }
-       }
-
        st->sb = super;
        if (st->ss == NULL) {
                st->ss = &super_imsm;
                st->minor_version = 0;
                st->max_devs = IMSM_MAX_DEVICES;
        }
-       st->loaded_container = 0;
-
        return 0;
 }
 
@@ -3083,18 +3565,18 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 
        if (!check_name(super, name, 0))
                return 0;
-       sprintf(st->subarray, "%d", idx);
        dv = malloc(sizeof(*dv));
        if (!dv) {
                fprintf(stderr, Name ": failed to allocate device list entry\n");
                return 0;
        }
-       dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
+       dev = calloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
        if (!dev) {
                free(dv);
                fprintf(stderr, Name": could not allocate raid device\n");
                return 0;
        }
+
        strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
        if (info->level == 1)
                array_blocks = info_to_blocks_per_member(info);
@@ -3107,8 +3589,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 
        dev->size_low = __cpu_to_le32((__u32) array_blocks);
        dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32));
-       dev->status = __cpu_to_le32(0);
-       dev->reserved_blocks = __cpu_to_le32(0);
+       dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING);
        vol = &dev->vol;
        vol->migr_state = 0;
        set_migr_type(dev, MIGR_INIT);
@@ -3262,7 +3743,7 @@ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
        /* Check the device has not already been added */
        slot = get_imsm_disk_slot(map, dl->index);
        if (slot >= 0 &&
-           (get_imsm_ord_tbl_ent(dev, slot) & IMSM_ORD_REBUILD) == 0) {
+           (get_imsm_ord_tbl_ent(dev, slot, -1) & IMSM_ORD_REBUILD) == 0) {
                fprintf(stderr, Name ": %s has been included in this array twice\n",
                        devname);
                return 1;
@@ -3291,8 +3772,9 @@ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
        return 0;
 }
 
+
 static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
-                             int fd, char *devname)
+                            int fd, char *devname)
 {
        struct intel_super *super = st->sb;
        struct dl *dd;
@@ -3301,13 +3783,16 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
        int rv;
        struct stat stb;
 
-       /* if we are on an RAID enabled platform check that the disk is
-        * attached to the raid controller
+       /* If we are on an RAID enabled platform check that the disk is
+        * attached to the raid controller.
+        * We do not need to test disks attachment for container based additions,
+        * they shall be already tested when container was created/assembled.
         */
-       if (super->hba && !disk_attached_to_hba(fd, super->hba)) {
-               fprintf(stderr,
-                       Name ": %s is not attached to the raid controller: %s\n",
-                       devname ? : "disk", super->hba);
+       rv = find_intel_hba_capability(fd, super, devname);
+       /* no orom/efi or non-intel hba of the disk */
+       if (rv != 0) {
+               dprintf("capability: %p fd: %d ret: %d\n",
+                       super->orom, fd, rv);
                return 1;
        }
 
@@ -3328,6 +3813,7 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
        dd->devname = devname ? strdup(devname) : NULL;
        dd->fd = fd;
        dd->e = NULL;
+       dd->action = DISK_ADD;
        rv = imsm_read_serial(fd, devname, dd->serial);
        if (rv) {
                fprintf(stderr,
@@ -3347,8 +3833,8 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
                dd->disk.scsi_id = __cpu_to_le32(0);
 
        if (st->update_tail) {
-               dd->next = super->add;
-               super->add = dd;
+               dd->next = super->disk_mgmt_list;
+               super->disk_mgmt_list = dd;
        } else {
                dd->next = super->disks;
                super->disks = dd;
@@ -3357,6 +3843,43 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
        return 0;
 }
 
+
+/* Queue a DISK_REMOVE entry for dk->major:dk->minor on super->disk_mgmt_list.
+ * Returns 0 once queued, 1 if st->update_tail is unset or malloc fails. */
+static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk)
+{
+       struct intel_super *super = st->sb;
+       struct dl *dd;
+
+       /* remove from super works only in mdmon (manager - monitor
+        * communication): check that the update buffer is prepared */
+       if (!st->update_tail) {
+               fprintf(stderr,
+                       Name ": %s shall be used in mdmon context only"
+                       " (line %d).\n", __func__, __LINE__);
+               return 1;
+       }
+       dd = malloc(sizeof(*dd));
+       if (!dd) {
+               fprintf(stderr,
+                       Name ": malloc failed %s:%d.\n", __func__, __LINE__);
+               return 1;
+       }
+       memset(dd, 0, sizeof(*dd));
+       dd->major = dk->major;
+       dd->minor = dk->minor;
+       dd->index = -1;
+       dd->fd = -1;
+       dd->disk.status = SPARE_DISK;
+       dd->action = DISK_REMOVE;
+
+       dd->next = super->disk_mgmt_list;
+       super->disk_mgmt_list = dd;
+
+
+       return 0;
+}
+
 static int store_imsm_mpb(int fd, struct imsm_super *mpb);
 
 static union {
@@ -3410,8 +3933,9 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose)
        return 0;
 }
 
-static int write_super_imsm(struct intel_super *super, int doclose)
+static int write_super_imsm(struct supertype *st, int doclose)
 {
+       struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
        struct dl *d;
        __u32 generation;
@@ -3419,6 +3943,7 @@ static int write_super_imsm(struct intel_super *super, int doclose)
        int spares = 0;
        int i;
        __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+       int num_disks = 0;
 
        /* 'generation' is incremented everytime the metadata is written */
        generation = __le32_to_cpu(mpb->generation_num);
@@ -3431,21 +3956,28 @@ static int write_super_imsm(struct intel_super *super, int doclose)
        if (mpb->orig_family_num == 0)
                mpb->orig_family_num = mpb->family_num;
 
-       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
        for (d = super->disks; d; d = d->next) {
                if (d->index == -1)
                        spares++;
-               else
+               else {
                        mpb->disk[d->index] = d->disk;
+                       num_disks++;
+               }
        }
-       for (d = super->missing; d; d = d->next)
+       for (d = super->missing; d; d = d->next) {
                mpb->disk[d->index] = d->disk;
+               num_disks++;
+       }
+       mpb->num_disks = num_disks;
+       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
 
        for (i = 0; i < mpb->num_raid_devs; i++) {
                struct imsm_dev *dev = __get_imsm_dev(mpb, i);
-
-               imsm_copy_dev(dev, get_imsm_dev(super, i));
-               mpb_size += sizeof_imsm_dev(dev, 0);
+               struct imsm_dev *dev2 = get_imsm_dev(super, i);
+               if (dev && dev2) {
+                       imsm_copy_dev(dev, dev2);
+                       mpb_size += sizeof_imsm_dev(dev, 0);
+               }
        }
        mpb_size += __le32_to_cpu(mpb->bbm_log_size);
        mpb->mpb_size = __cpu_to_le32(mpb_size);
@@ -3499,7 +4031,7 @@ static int create_array(struct supertype *st, int dev_idx)
        imsm_copy_dev(&u->dev, dev);
        inf = get_disk_info(u);
        for (i = 0; i < map->num_members; i++) {
-               int idx = get_imsm_disk_idx(dev, i);
+               int idx = get_imsm_disk_idx(dev, i, -1);
 
                disk = get_imsm_disk(super, idx);
                serialcpy(inf[i].serial, disk->serial);
@@ -3509,13 +4041,13 @@ static int create_array(struct supertype *st, int dev_idx)
        return 0;
 }
 
-static int _add_disk(struct supertype *st)
+static int mgmt_disk(struct supertype *st)
 {
        struct intel_super *super = st->sb;
        size_t len;
-       struct imsm_update_add_disk *u;
+       struct imsm_update_add_remove_disk *u;
 
-       if (!super->add)
+       if (!super->disk_mgmt_list)
                return 0;
 
        len = sizeof(*u);
@@ -3526,7 +4058,7 @@ static int _add_disk(struct supertype *st)
                return 1;
        }
 
-       u->type = update_add_disk;
+       u->type = update_add_remove_disk;
        append_metadata_update(st, u, len);
 
        return 0;
@@ -3543,29 +4075,23 @@ static int write_init_super_imsm(struct supertype *st)
        if (st->update_tail) {
                /* queue the recently created array / added disk
                 * as a metadata update */
-               struct dl *d;
                int rv;
 
                /* determine if we are creating a volume or adding a disk */
                if (current_vol < 0) {
-                       /* in the add disk case we are running in mdmon
-                        * context, so don't close fd's
+                       /* in the mgmt (add/remove) disk case we are running
+                        * in mdmon context, so don't close fd's
                         */
-                       return _add_disk(st);
+                       return mgmt_disk(st);
                } else
                        rv = create_array(st, current_vol);
 
-               for (d = super->disks; d ; d = d->next) {
-                       close(d->fd);
-                       d->fd = -1;
-               }
-
                return rv;
        } else {
                struct dl *d;
                for (d = super->disks; d; d = d->next)
                        Kill(d->devname, NULL, 0, 1, 1);
-               return write_super_imsm(st->sb, 1);
+               return write_super_imsm(st, 1);
        }
 }
 #endif
@@ -3599,25 +4125,14 @@ static int validate_geometry_imsm_container(struct supertype *st, int level,
 {
        int fd;
        unsigned long long ldsize;
-       const struct imsm_orom *orom;
+       struct intel_super *super=NULL;
+       int rv = 0;
 
        if (level != LEVEL_CONTAINER)
                return 0;
        if (!dev)
                return 1;
 
-       if (check_env("IMSM_NO_PLATFORM"))
-               orom = NULL;
-       else
-               orom = find_imsm_orom();
-       if (orom && raiddisks > orom->tds) {
-               if (verbose)
-                       fprintf(stderr, Name ": %d exceeds maximum number of"
-                               " platform supported disks: %d\n",
-                               raiddisks, orom->tds);
-               return 0;
-       }
-
        fd = open(dev, O_RDONLY|O_EXCL, 0);
        if (fd < 0) {
                if (verbose)
@@ -3629,9 +4144,45 @@ static int validate_geometry_imsm_container(struct supertype *st, int level,
                close(fd);
                return 0;
        }
+
+       /* capabilities retrieve could be possible
+        * note that there is no fd for the disks in array.
+        */
+       super = alloc_super();
+       if (!super) {
+               fprintf(stderr,
+                       Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               close(fd);
+               return 0;
+       }
+
+       rv = find_intel_hba_capability(fd, super, verbose ? dev : NULL);
+       if (rv != 0) {
+#if DEBUG
+               char str[256];
+               fd2devname(fd, str);
+               dprintf("validate_geometry_imsm_container: fd: %d %s orom: %p rv: %d raiddisk: %d\n",
+                       fd, str, super->orom, rv, raiddisks);
+#endif
+               /* no orom/efi or non-intel hba of the disk */
+               close(fd);
+               free_imsm(super);
+               return 0;
+       }
        close(fd);
+       if (super->orom && raiddisks > super->orom->tds) {
+               if (verbose)
+                       fprintf(stderr, Name ": %d exceeds maximum number of"
+                               " platform supported disks: %d\n",
+                               raiddisks, super->orom->tds);
+
+               free_imsm(super);
+               return 0;
+       }
 
        *freesize = avail_size_imsm(st, ldsize >> 9);
+       free_imsm(super);
 
        return 1;
 }
@@ -3766,20 +4317,42 @@ static int is_raid_level_supported(const struct imsm_orom *orom, int level, int
        return 0;
 }
 
+
 #define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg))
+/*
+ * validate volume parameters with OROM/EFI capabilities
+ */
 static int
 validate_geometry_imsm_orom(struct intel_super *super, int level, int layout,
-                           int raiddisks, int chunk, int verbose)
+                           int raiddisks, int *chunk, int verbose)
 {
-       if (!is_raid_level_supported(super->orom, level, raiddisks)) {
+#if DEBUG
+       verbose = 1;
+#endif
+       /* validate container capabilities */
+       if (super->orom && raiddisks > super->orom->tds) {
+               if (verbose)
+                       fprintf(stderr, Name ": %d exceeds maximum number of"
+                               " platform supported disks: %d\n",
+                               raiddisks, super->orom->tds);
+               return 0;
+       }
+
+        /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */
+       if (super->orom && (!is_raid_level_supported(super->orom, level,
+                                                    raiddisks))) {
                pr_vrb(": platform does not support raid%d with %d disk%s\n",
                        level, raiddisks, raiddisks > 1 ? "s" : "");
                return 0;
        }
-       if (super->orom && level != 1 &&
-           !imsm_orom_has_chunk(super->orom, chunk)) {
-               pr_vrb(": platform does not support a chunk size of: %d\n", chunk);
-               return 0;
+       if (super->orom && level != 1) {
+               if (chunk && (*chunk == 0 || *chunk == UnSet))
+                       *chunk = imsm_orom_default_chunk(super->orom);
+               else if (chunk && !imsm_orom_has_chunk(super->orom, *chunk)) {
+                       pr_vrb(": platform does not support a chunk size of: "
+                              "%d\n", *chunk);
+                       return 0;
+               }
        }
        if (layout != imsm_level_to_layout(level)) {
                if (level == 5)
@@ -3791,7 +4364,6 @@ validate_geometry_imsm_orom(struct intel_super *super, int level, int layout,
                                layout, level);
                return 0;
        }
-
        return 1;
 }
 
@@ -3799,7 +4371,7 @@ validate_geometry_imsm_orom(struct intel_super *super, int level, int layout,
  * FIX ME add ahci details
  */
 static int validate_geometry_imsm_volume(struct supertype *st, int level,
-                                        int layout, int raiddisks, int chunk,
+                                        int layout, int raiddisks, int *chunk,
                                         unsigned long long size, char *dev,
                                         unsigned long long *freesize,
                                         int verbose)
@@ -3817,9 +4389,11 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level,
        if (!super)
                return 0;
 
-       if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, verbose))
+       if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, verbose)) {
+               fprintf(stderr, Name ": RAID gemetry validation failed. "
+                       "Cannot proceed with the action(s).\n");
                return 0;
-
+       }
        if (!dev) {
                /* General test:  make sure there is space for
                 * 'raiddisks' device extents of size 'size' at a given
@@ -3988,7 +4562,8 @@ static int reserve_space(struct supertype *st, int raiddisks,
        maxsize = merge_extents(super, extent_cnt);
        minsize = size;
        if (size == 0)
-               minsize = chunk;
+               /* chunk is in K */
+               minsize = chunk * 2;
 
        if (cnt < raiddisks ||
            (super->orom && used && used != raiddisks) ||
@@ -4001,8 +4576,8 @@ static int reserve_space(struct supertype *st, int raiddisks,
        if (size == 0) {
                size = maxsize;
                if (chunk) {
-                       size /= chunk;
-                       size *= chunk;
+                       size /= 2 * chunk;
+                       size *= 2 * chunk;
                }
        }
 
@@ -4017,7 +4592,7 @@ static int reserve_space(struct supertype *st, int raiddisks,
 }
 
 static int validate_geometry_imsm(struct supertype *st, int level, int layout,
-                                 int raiddisks, int chunk, unsigned long long size,
+                                 int raiddisks, int *chunk, unsigned long long size,
                                  char *dev, unsigned long long *freesize,
                                  int verbose)
 {
@@ -4025,13 +4600,15 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
        struct mdinfo *sra;
        int is_member = 0;
 
-       /* if given unused devices create a container 
+       /* load capability
+        * if given unused devices create a container
         * if given given devices in a container create a member volume
         */
        if (level == LEVEL_CONTAINER) {
                /* Must be a fresh device to add to a container */
                return validate_geometry_imsm_container(st, level, layout,
-                                                       raiddisks, chunk, size,
+                                                       raiddisks,
+                                                       chunk?*chunk:0, size,
                                                        dev, freesize,
                                                        verbose);
        }
@@ -4050,7 +4627,8 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
                                                         raiddisks, chunk,
                                                         verbose))
                                return 0;
-                       return reserve_space(st, raiddisks, size, chunk, freesize);
+                       return reserve_space(st, raiddisks, size,
+                                            chunk?*chunk:0, freesize);
                }
                return 1;
        }
@@ -4097,7 +4675,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
                 */
                struct intel_super *super;
 
-               if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) {
+               if (load_super_imsm_all(st, cfd, (void **) &super, NULL) == 0) {
                        st->sb = super;
                        st->container_dev = fd2devnum(cfd);
                        close(cfd);
@@ -4115,14 +4693,19 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
        return 0;
 }
 
-static int default_chunk_imsm(struct supertype *st)
+static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk)
 {
        struct intel_super *super = st->sb;
 
-       if (!super || !super->orom)
-               return 0;
+       if (level && *level == UnSet)
+               *level = LEVEL_CONTAINER;
+
+       if (level && layout && *layout == UnSet)
+               *layout = imsm_level_to_layout(*level);
 
-       return imsm_orom_default_chunk(super->orom);
+       if (chunk && (*chunk == UnSet || *chunk == 0) && 
+           super && super->orom)
+               *chunk = imsm_orom_default_chunk(super->orom);
 }
 
 static void handle_missing(struct intel_super *super, struct imsm_dev *dev);
@@ -4201,19 +4784,19 @@ static int kill_subarray_imsm(struct supertype *st)
        return 0;
 }
 
-static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_t ident)
+static int update_subarray_imsm(struct supertype *st, char *subarray,
+                               char *update, struct mddev_ident *ident)
 {
        /* update the subarray currently referenced by ->current_vol */
        struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
 
-       if (super->current_vol < 0)
-               return 2;
-
        if (strcmp(update, "name") == 0) {
                char *name = ident->name;
+               char *ep;
+               int vol;
 
-               if (is_subarray_active(st->subarray, st->devname)) {
+               if (is_subarray_active(subarray, st->devname)) {
                        fprintf(stderr,
                                Name ": Unable to update name of active subarray\n");
                        return 2;
@@ -4222,20 +4805,24 @@ static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_
                if (!check_name(super, name, 0))
                        return 2;
 
+               vol = strtoul(subarray, &ep, 10);
+               if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
+                       return 2;
+
                if (st->update_tail) {
                        struct imsm_update_rename_array *u = malloc(sizeof(*u));
 
                        if (!u)
                                return 2;
                        u->type = update_rename_array;
-                       u->dev_idx = super->current_vol;
+                       u->dev_idx = vol;
                        snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name);
                        append_metadata_update(st, u, sizeof(*u));
                } else {
                        struct imsm_dev *dev;
                        int i;
 
-                       dev = get_imsm_dev(super, super->current_vol);
+                       dev = get_imsm_dev(super, vol);
                        snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name);
                        for (i = 0; i < mpb->num_raid_devs; i++) {
                                dev = get_imsm_dev(super, i);
@@ -4248,6 +4835,17 @@ static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_
 
        return 0;
 }
+
+/* Tell whether 'dev' is in the middle of a general migration: returns 1
+ * when vol.migr_state is set and migr_type() is MIGR_GEN_MIGR, else 0.
+ */
+static int is_gen_migration(struct imsm_dev *dev)
+{
+       int migrating = dev->vol.migr_state != 0;
+
+       /* migr_type() is only consulted while a migration is flagged */
+       return migrating && migr_type(dev) == MIGR_GEN_MIGR;
+}
 #endif /* MDASSEMBLE */
 
 static int is_rebuilding(struct imsm_dev *dev)
@@ -4299,11 +4897,12 @@ static void update_recovery_start(struct imsm_dev *dev, struct mdinfo *array)
 }
 
 
-static struct mdinfo *container_content_imsm(struct supertype *st)
+static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray)
 {
        /* Given a container loaded by load_super_imsm_all,
         * extract information about all the arrays into
         * an mdinfo tree.
+        * If 'subarray' is given, just extract info about that array.
         *
         * For each imsm_dev create an mdinfo, fill it in,
         *  then look for matching devices in super->disks
@@ -4312,33 +4911,62 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
        struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
        struct mdinfo *rest = NULL;
-       int i;
+       unsigned int i;
+       int bbm_errors = 0;
+       struct dl *d;
+       int spare_disks = 0;
 
-       /* do not assemble arrays that might have bad blocks */
-       if (imsm_bbm_log_size(super->anchor)) {
-               fprintf(stderr, Name ": BBM log found in metadata. "
-                               "Cannot activate array(s).\n");
-               return NULL;
-       }
+       /* check for bad blocks */
+       if (imsm_bbm_log_size(super->anchor))
+               bbm_errors = 1;
+
+       /* count spare devices, not used in maps
+        */
+       for (d = super->disks; d; d = d->next)
+               if (d->index == -1)
+                       spare_disks++;
 
        for (i = 0; i < mpb->num_raid_devs; i++) {
-               struct imsm_dev *dev = get_imsm_dev(super, i);
-               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct imsm_dev *dev;
+               struct imsm_map *map;
+               struct imsm_map *map2;
                struct mdinfo *this;
-               int slot;
+               int slot, chunk;
+               char *ep;
+
+               if (subarray &&
+                   (i != strtoul(subarray, &ep, 10) || *ep != '\0'))
+                       continue;
+
+               dev = get_imsm_dev(super, i);
+               map = get_imsm_map(dev, 0);
+               map2 = get_imsm_map(dev, 1);
 
                /* do not publish arrays that are in the middle of an
                 * unsupported migration
                 */
                if (dev->vol.migr_state &&
-                   (migr_type(dev) == MIGR_GEN_MIGR ||
-                    migr_type(dev) == MIGR_STATE_CHANGE)) {
+                   (migr_type(dev) == MIGR_STATE_CHANGE)) {
                        fprintf(stderr, Name ": cannot assemble volume '%.16s':"
                                " unsupported migration in progress\n",
                                dev->volume);
                        continue;
                }
+               /* do not publish arrays that are not supported by the
+                * controller's OROM/EFI
+                */
 
+               chunk = __le16_to_cpu(map->blocks_per_strip) >> 1;
+               if (!validate_geometry_imsm_orom(super,
+                                                get_imsm_raid_level(map), /* RAID level */
+                                                imsm_level_to_layout(get_imsm_raid_level(map)),
+                                                map->num_members, /* raid disks */
+                                                &chunk,
+                                                1 /* verbose */)) {
+                       fprintf(stderr, Name ": RAID gemetry validation failed. "
+                               "Cannot proceed with the action(s).\n");
+                       continue;
+               }
                this = malloc(sizeof(*this));
                if (!this) {
                        fprintf(stderr, Name ": failed to allocate %zu bytes\n",
@@ -4349,7 +4977,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
                this->next = rest;
 
                super->current_vol = i;
-               getinfo_super_imsm_volume(st, this);
+               getinfo_super_imsm_volume(st, this, NULL);
                for (slot = 0 ; slot <  map->num_members; slot++) {
                        unsigned long long recovery_start;
                        struct mdinfo *info_d;
@@ -4359,8 +4987,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
                        __u32 ord;
 
                        skip = 0;
-                       idx = get_imsm_disk_idx(dev, slot);
-                       ord = get_imsm_ord_tbl_ent(dev, slot); 
+                       idx = get_imsm_disk_idx(dev, slot, 0);
+                       ord = get_imsm_ord_tbl_ent(dev, slot, -1);
                        for (d = super->disks; d ; d = d->next)
                                if (d->index == idx)
                                        break;
@@ -4408,7 +5036,17 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
                        info_d->disk.minor = d->minor;
                        info_d->disk.raid_disk = slot;
                        info_d->recovery_start = recovery_start;
-
+                       if (map2) {
+                               if (slot < map2->num_members)
+                                       info_d->disk.state = (1 << MD_DISK_ACTIVE);
+                               else
+                                       this->array.spare_disks++;
+                       } else {
+                               if (slot < map->num_members)
+                                       info_d->disk.state = (1 << MD_DISK_ACTIVE);
+                               else
+                                       this->array.spare_disks++;
+                       }
                        if (info_d->recovery_start == MaxSector)
                                this->array.working_disks++;
 
@@ -4418,9 +5056,14 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
                }
                /* now that the disk list is up-to-date fixup recovery_start */
                update_recovery_start(dev, this);
+               this->array.spare_disks += spare_disks;
                rest = this;
        }
 
+       /* if array has bad blocks, set suitable bit in array status */
+       if (bbm_errors)
+               rest->array.state |= (1<<MD_SB_BBM_ERRORS);
+
        return rest;
 }
 
@@ -4455,7 +5098,7 @@ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
                int insync = insync;
 
                for (i = 0; i < map->num_members; i++) {
-                       __u32 ord = get_imsm_ord_tbl_ent(dev, i);
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, i, -1);
                        int idx = ord_to_idx(ord);
                        struct imsm_disk *disk;
 
@@ -4552,9 +5195,13 @@ static int is_resyncing(struct imsm_dev *dev)
            migr_type(dev) == MIGR_REPAIR)
                return 1;
 
+       if (migr_type(dev) == MIGR_GEN_MIGR)
+               return 0;
+
        migr_map = get_imsm_map(dev, 1);
 
-       if (migr_map->map_state == IMSM_T_STATE_NORMAL)
+       if ((migr_map->map_state == IMSM_T_STATE_NORMAL) &&
+           (dev->vol.migr_type != MIGR_GEN_MIGR))
                return 1;
        else
                return 0;
@@ -4566,6 +5213,8 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
        __u32 ord;
        int slot;
        struct imsm_map *map;
+       char buf[MAX_RAID_SERIAL_LEN+3];
+       unsigned int len, shift = 0;
 
        /* new failures are always set in map[0] */
        map = get_imsm_map(dev, 0);
@@ -4578,8 +5227,12 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
        if (is_failed(disk) && (ord & IMSM_ORD_REBUILD))
                return 0;
 
+       sprintf(buf, "%s:0", disk->serial);
+       if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN)
+               shift = len - MAX_RAID_SERIAL_LEN + 1;
+       strncpy((char *)disk->serial, &buf[shift], MAX_RAID_SERIAL_LEN);
+
        disk->status |= FAILED_DISK;
-       disk->status &= ~CONFIGURED_DISK;
        set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD);
        if (map->failed_disk_num == 0xff)
                map->failed_disk_num = slot;
@@ -4615,7 +5268,89 @@ static void handle_missing(struct intel_super *super, struct imsm_dev *dev)
        super->updates_pending++;
 }
 
-/* Handle dirty -> clean transititions and resync.  Degraded and rebuild
+/* Recompute the volume size of 'dev' from map 0 (blocks_per_member times
+ * the number of data members, rounded down to a whole MB), store it in
+ * dev->size_low/size_high and return it.  If no data members are reported
+ * the metadata is left alone and the currently recorded size is returned.
+ */
+static unsigned long long imsm_set_array_size(struct imsm_dev *dev)
+{
+       int used_disks = imsm_num_data_members(dev, 0);
+       unsigned long long array_blocks;
+       struct imsm_map *map;
+
+       if (used_disks == 0) {
+               /* when problems occur return the current array_blocks value */
+               array_blocks = __le32_to_cpu(dev->size_high);
+               array_blocks <<= 32;
+               array_blocks += __le32_to_cpu(dev->size_low);
+               return array_blocks;
+       }
+
+       /* widen before multiplying so large arrays do not wrap at 32 bits */
+       map = get_imsm_map(dev, 0);
+       array_blocks = (unsigned long long)__le32_to_cpu(map->blocks_per_member);
+       array_blocks *= used_disks;
+
+       /* round array size down to closest MB and set it in the metadata */
+       array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
+       dev->size_low = __cpu_to_le32((__u32)array_blocks);
+       dev->size_high = __cpu_to_le32((__u32)(array_blocks >> 32));
+       return array_blocks;
+}
+
+static void imsm_set_disk(struct active_array *a, int n, int state);
+
+static void imsm_progress_container_reshape(struct intel_super *super)
+{
+       /* if no device has a migr_state, but some device has a
+        * different number of members than the previous device, start
+        * changing the number of devices in this device to match
+        * previous.  Only the first such device is started per call and
+        * the slot loop has its own index, so 'i' is never clobbered.
+        */
+       struct imsm_super *mpb = super->anchor;
+       int prev_disks = -1;
+       int i;
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct imsm_map *map2;
+               int prev_num_members;
+               int copy_map_size;
+               int slot;
+
+               if (dev->vol.migr_state)
+                       return;
+
+               if (prev_disks == -1)
+                       prev_disks = map->num_members;
+               if (prev_disks == map->num_members)
+                       continue;
+
+               /* OK, this array needs to enter reshape mode.
+                * i.e it needs a migr_state
+                */
+               copy_map_size = sizeof_imsm_map(map);
+               prev_num_members = map->num_members;
+               map->num_members = prev_disks;
+               dev->vol.migr_state = 1;
+               dev->vol.curr_migr_unit = 0;
+               dev->vol.migr_type = MIGR_GEN_MIGR;
+               for (slot = prev_num_members; slot < map->num_members; slot++)
+                       set_imsm_ord_tbl_ent(map, slot, slot);
+               map2 = get_imsm_map(dev, 1);
+               /* Copy the current map, keeping the old member count in it */
+               memcpy(map2, map, copy_map_size);
+               map2->num_members = prev_num_members;
+               imsm_set_array_size(dev);
+               super->updates_pending++;
+               return;
+       }
+}
+
+/* Handle dirty -> clean transitions, resync and reshape.  Degraded and rebuild
  * states are handled in imsm_set_disk() with one exception, when a
  * resync is stopped due to a new failure this routine will set the
  * 'degraded' state for the array.
@@ -4630,6 +5365,62 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
        __u8 map_state = imsm_check_degraded(super, dev, failed);
        __u32 blocks_per_unit;
 
+       if (dev->vol.migr_state &&
+           dev->vol.migr_type  == MIGR_GEN_MIGR) {
+               /* array state change is blocked due to reshape action
+                * We might need to
+                * - abort the reshape (if last_checkpoint is 0 and action!= reshape)
+                * - finish the reshape (if last_checkpoint is big and action != reshape)
+                * - update curr_migr_unit
+                */
+               if (a->curr_action == reshape) {
+                       /* still reshaping, maybe update curr_migr_unit */
+                       goto mark_checkpoint;
+               } else {
+                       if (a->last_checkpoint == 0 && a->prev_action == reshape) {
+                               /* for some reason we aborted the reshape.
+                                * Better clean up
+                                */
+                               struct imsm_map *map2 = get_imsm_map(dev, 1);
+                               dev->vol.migr_state = 0;
+                               dev->vol.migr_type = 0;
+                               dev->vol.curr_migr_unit = 0;
+                               memcpy(map, map2, sizeof_imsm_map(map2));
+                               super->updates_pending++;
+                       }
+                       if (a->last_checkpoint >= a->info.component_size) {
+                               unsigned long long array_blocks;
+                               int used_disks;
+                               struct mdinfo *mdi;
+
+                               used_disks = imsm_num_data_members(dev, 0);
+                               if (used_disks > 0) {
+                                       array_blocks =
+                                               map->blocks_per_member *
+                                               used_disks;
+                                       /* round array size down to closest MB
+                                        */
+                                       array_blocks = (array_blocks
+                                                       >> SECT_PER_MB_SHIFT)
+                                               << SECT_PER_MB_SHIFT;
+                                       a->info.custom_array_size = array_blocks;
+                                       /* encourage manager to update array
+                                        * size
+                                        */
+
+                                       a->check_reshape = 1;
+                               }
+                               /* finalize online capacity expansion/reshape */
+                               for (mdi = a->info.devs; mdi; mdi = mdi->next)
+                                       imsm_set_disk(a,
+                                                     mdi->disk.raid_disk,
+                                                     mdi->curr_state);
+
+                               imsm_progress_container_reshape(super);
+                       }
+               }
+       }
+
        /* before we activate this array handle any missing disks */
        if (consistent == 2)
                handle_missing(super, dev);
@@ -4661,6 +5452,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
                super->updates_pending++;
        }
 
+mark_checkpoint:
        /* check if we can update curr_migr_unit from resync_start, recovery_start */
        blocks_per_unit = blocks_per_migr_unit(dev);
        if (blocks_per_unit) {
@@ -4690,6 +5482,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
                        dev->vol.dirty = 1;
                super->updates_pending++;
        }
+
        return consistent;
 }
 
@@ -4713,7 +5506,7 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
 
        dprintf("imsm: set_disk %d:%x\n", n, state);
 
-       ord = get_imsm_ord_tbl_ent(dev, n);
+       ord = get_imsm_ord_tbl_ent(dev, n, -1);
        disk = get_imsm_disk(super, ord_to_idx(ord));
 
        /* check for new failures */
@@ -4753,6 +5546,23 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
                end_migration(dev, map_state);
                super->updates_pending++;
                a->last_checkpoint = 0;
+       } else if (is_gen_migration(dev)) {
+               dprintf("imsm: Detected General Migration in state: ");
+               if (map_state == IMSM_T_STATE_NORMAL) {
+                       end_migration(dev, map_state);
+                       map = get_imsm_map(dev, 0);
+                       map->failed_disk_num = ~0;
+                       dprintf("normal\n");
+               } else {
+                       if (map_state == IMSM_T_STATE_DEGRADED) {
+                               printf("degraded\n");
+                               end_migration(dev, map_state);
+                       } else {
+                               dprintf("failed\n");
+                       }
+                       map->map_state = map_state;
+               }
+               super->updates_pending++;
        }
 }
 
@@ -4792,10 +5602,11 @@ static void imsm_sync_metadata(struct supertype *container)
 {
        struct intel_super *super = container->sb;
 
+       dprintf("sync metadata: %d\n", super->updates_pending);
        if (!super->updates_pending)
                return;
 
-       write_super_imsm(super, 0);
+       write_super_imsm(container, 0);
 
        super->updates_pending = 0;
 }
@@ -4803,7 +5614,7 @@ static void imsm_sync_metadata(struct supertype *container)
 static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
 {
        struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
-       int i = get_imsm_disk_idx(dev, idx);
+       int i = get_imsm_disk_idx(dev, idx, -1);
        struct dl *dl;
 
        for (dl = super->disks; dl; dl = dl->next)
@@ -4820,10 +5631,11 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a
 }
 
 static struct dl *imsm_add_spare(struct intel_super *super, int slot,
-                                struct active_array *a, int activate_new)
+                                struct active_array *a, int activate_new,
+                                struct mdinfo *additional_test_list)
 {
        struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
-       int idx = get_imsm_disk_idx(dev, slot);
+       int idx = get_imsm_disk_idx(dev, slot, -1);
        struct imsm_super *mpb = super->anchor;
        struct imsm_map *map;
        unsigned long long pos;
@@ -4834,6 +5646,7 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot,
        __u32 array_start = 0;
        __u32 array_end = 0;
        struct dl *dl;
+       struct mdinfo *test_list;
 
        for (dl = super->disks; dl; dl = dl->next) {
                /* If in this array, skip */
@@ -4841,11 +5654,24 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot,
                        if (d->state_fd >= 0 &&
                            d->disk.major == dl->major &&
                            d->disk.minor == dl->minor) {
-                               dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                               dprintf("%x:%x already in array\n",
+                                       dl->major, dl->minor);
                                break;
                        }
                if (d)
                        continue;
+               test_list = additional_test_list;
+               while (test_list) {
+                       if (test_list->disk.major == dl->major &&
+                           test_list->disk.minor == dl->minor) {
+                               dprintf("%x:%x already in additional test list\n",
+                                       dl->major, dl->minor);
+                               break;
+                       }
+                       test_list = test_list->next;
+               }
+               if (test_list)
+                       continue;
 
                /* skip in use or failed drives */
                if (is_failed(&dl->disk) || idx == dl->index ||
@@ -4915,6 +5741,45 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot,
        return dl;
 }
 
+
+/* Decide whether a rebuild may start given the state of volume 'dev_idx'
+ * of container 'cont'.  Returns 0 only when that volume is in the FAILED
+ * state and one of its member disks is still listed, marked failed and
+ * not scheduled for removal; returns 1 in every other case.
+ */
+static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed)
+{
+       struct imsm_dev *other = get_imsm_dev(cont->sb, dev_idx);
+       struct imsm_map *map;
+       int slot;
+
+       if (!other)
+               return 1;
+       if (imsm_check_degraded(cont->sb, other, failed) != IMSM_T_STATE_FAILED)
+               return 1;
+
+       map = get_imsm_map(other, 0);
+       if (!map)
+               return 1;
+
+       for (slot = 0; slot < map->num_members; slot++) {
+               /* Look the member up in the intel disk list; it may have
+                * been deleted from it or be marked to be deleted.
+                */
+               int idx = get_imsm_disk_idx(other, slot, -1);
+               struct dl *idisk = get_imsm_dl_disk(cont->sb, idx);
+
+               /* Do not rebuild the array if failed disks from the failed
+                * sub-array are not removed from the container.
+                */
+               if (idisk && is_failed(&idisk->disk) &&
+                   idisk->action != DISK_REMOVE)
+                       return 0;
+       }
+
+       return 1;
+}
+
 static struct mdinfo *imsm_activate_spare(struct active_array *a,
                                          struct metadata_update **updates)
 {
@@ -4942,6 +5807,7 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
        struct imsm_update_activate_spare *u;
        int num_spares = 0;
        int i;
+       int allowed;
 
        for (d = a->info.devs ; d ; d = d->next) {
                if ((d->curr_state & DS_FAULTY) &&
@@ -4954,9 +5820,41 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 
        dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
                inst, failed, a->info.array.raid_disks, a->info.array.level);
+
+       if (dev->vol.migr_state &&
+           dev->vol.migr_type == MIGR_GEN_MIGR)
+               /* No repair during migration */
+               return NULL;
+
+       if (a->info.array.level == 4)
+               /* No repair for an array that has been taken over:
+                * imsm doesn't support raid4
+                */
+               return NULL;
+
        if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED)
                return NULL;
 
+       /*
+        * If there are any failed disks, check the state of the other
+        * volumes.  Block the rebuild while another volume is failed,
+        * until its failed disks are removed from the container.
+        */
+       if (failed) {
+               dprintf("found failed disks in %s, check if there another"
+                       "failed sub-array.\n",
+                       dev->volume);
+               /* check if states of the other volumes allow for rebuild */
+               for (i = 0; i <  super->anchor->num_raid_devs; i++) {
+                       if (i != inst) {
+                               allowed = imsm_rebuild_allowed(a->container,
+                                                              i, failed);
+                               if (!allowed)
+                                       return NULL;
+                       }
+               }
+       }
+
        /* For each slot, if it is not working, find a spare */
        for (i = 0; i < a->info.array.raid_disks; i++) {
                for (d = a->info.devs ; d ; d = d->next)
@@ -4975,9 +5873,9 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
                 */
                dl = imsm_readd(super, i, a);
                if (!dl)
-                       dl = imsm_add_spare(super, i, a, 0);
+                       dl = imsm_add_spare(super, i, a, 0, NULL);
                if (!dl)
-                       dl = imsm_add_spare(super, i, a, 1);
+                       dl = imsm_add_spare(super, i, a, 1, NULL);
                if (!dl)
                        continue;
  
@@ -5044,6 +5942,7 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
        }
                        
        mu->space = NULL;
+       mu->space_list = NULL;
        mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
        mu->next = *updates;
        u = (struct imsm_update_activate_spare *) mu->buf;
@@ -5074,7 +5973,7 @@ static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_
        int j;
 
        for (i = 0; i < map->num_members; i++) {
-               disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+               disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i, -1));
                for (j = 0; j < new_map->num_members; j++)
                        if (serialcmp(disk->serial, inf[j].serial) == 0)
                                return 1;
@@ -5083,18 +5982,308 @@ static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_
        return 0;
 }
 
+
+/* Look up the entry of super->disks whose device numbers are major:minor.
+ * Returns the matching 'struct dl', or NULL when no entry matches.
+ */
+static struct dl *get_disk_super(struct intel_super *super, int major, int minor)
+{
+       struct dl *cur = super->disks;
+
+       while (cur) {
+               if (cur->major == major && cur->minor == minor)
+                       break;
+               cur = cur->next;
+       }
+       return cur;
+}
+
+/* Unlink the first entry of super->disks that matches major:minor and
+ * release it with __free_imsm_disk().  Nothing is changed when no entry
+ * matches.  Always returns 0.
+ */
+static int remove_disk_super(struct intel_super *super, int major, int minor)
+{
+       struct dl **link = &super->disks;
+       struct dl *victim;
+
+       while ((victim = *link) != NULL) {
+               if (victim->major != major || victim->minor != minor) {
+                       link = &victim->next;
+                       continue;
+               }
+               /* splice the entry out of the list before releasing it */
+               *link = victim->next;
+               victim->next = NULL;
+               __free_imsm_disk(victim);
+               dprintf("%s: removed %x:%x\n",
+                       __func__, major, minor);
+               break;
+       }
+       return 0;
+}
+
 static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index);
 
+/* Drain super->disk_mgmt_list, applying each queued request in order:
+ *  - DISK_ADD:    the request entry itself is pushed onto the head of
+ *                 super->disks.
+ *  - DISK_REMOVE: the entry of super->disks with the same major:minor
+ *                 (if any) is tagged with DISK_REMOVE and, when it is a
+ *                 spare (index == -1), removed from super->disks; the
+ *                 request entry is then freed.
+ * Returns 1 if at least one disk was added, 0 otherwise.
+ */
+static int add_remove_disk_update(struct intel_super *super)
+{
+       int added = 0;
+       struct dl *req;
+
+       /* add/remove some spares to/from the metadata/container */
+       while ((req = super->disk_mgmt_list) != NULL) {
+               struct dl *known;
+
+               /* detach the request from the pending list */
+               super->disk_mgmt_list = req->next;
+               req->next = NULL;
+
+               if (req->action == DISK_ADD) {
+                       req->next = super->disks;
+                       super->disks = req;
+                       added = 1;
+                       dprintf("%s: added %x:%x\n",
+                               __func__, req->major,
+                               req->minor);
+                       continue;
+               }
+               if (req->action != DISK_REMOVE)
+                       continue;
+
+               dprintf("Disk remove action processed: %x.%x\n",
+                       req->major, req->minor);
+               known = get_disk_super(super, req->major, req->minor);
+               if (known) {
+                       /* store action status */
+                       known->action = DISK_REMOVE;
+                       /* remove spare disks only */
+                       if (known->index == -1)
+                               remove_disk_super(super, req->major,
+                                                 req->minor);
+               }
+               /* release the request's own disk structure */
+               __free_imsm_disk(req);
+       }
+       return added;
+}
+
+/* Apply an update_reshape_container_disks record to the local metadata.
+ *
+ * The (new_raid_disks - old_raid_disks) devices listed in u->new_disks
+ * are looked up in super->disks and turned from spares into configured
+ * members occupying the slots that follow the old ones.  Then, for each
+ * raid device, a buffer is taken from *space_list, the imsm_dev is copied
+ * into it and installed in place of the old one; only the first device
+ * processed is put into the MIGR_GEN_MIGR migration state with the
+ * enlarged map.  The replaced imsm_dev buffers are chained together and
+ * handed back through *space_list.
+ *
+ * Returns 1 when the update was applied, 0 when one of the new disks is
+ * not in super->disks or already occupies one of the old slots.
+ */
+static int apply_reshape_container_disks_update(struct imsm_update_reshape *u,
+                                               struct intel_super *super,
+                                               void ***space_list)
+{
+       struct dl *new_disk;
+       struct intel_dev *id;
+       int i;
+       int delta_disks = u->new_raid_disks - u->old_raid_disks;
+       int disk_count = u->old_raid_disks;
+       void **tofree = NULL;
+       int devices_to_reshape = 1;
+       struct imsm_super *mpb = super->anchor;
+       int ret_val = 0;
+       unsigned int dev_id;
+
+       dprintf("imsm: apply_reshape_container_disks_update()\n");
+
+       /* enable spares to use in array */
+       for (i = 0; i < delta_disks; i++) {
+               new_disk = get_disk_super(super,
+                                         major(u->new_disks[i]),
+                                         minor(u->new_disks[i]));
+               /* the lookup can fail: check it before the debug print
+                * below dereferences new_disk
+                */
+               if (new_disk == NULL) {
+                       dprintf("imsm: new disk for reshape %i:%i "
+                               "not found\n",
+                               major(u->new_disks[i]),
+                               minor(u->new_disks[i]));
+                       goto update_reshape_exit;
+               }
+               dprintf("imsm: new disk for reshape is: %i:%i "
+                       "(%p, index = %i)\n",
+                       major(u->new_disks[i]), minor(u->new_disks[i]),
+                       new_disk, new_disk->index);
+               /* a disk already sitting in one of the old slots cannot
+                * be used as a new member
+                */
+               if ((new_disk->index >= 0) &&
+                   (new_disk->index < u->old_raid_disks))
+                       goto update_reshape_exit;
+               new_disk->index = disk_count++;
+               /* slot to fill in autolayout
+                */
+               new_disk->raiddisk = new_disk->index;
+               new_disk->disk.status |=
+                       CONFIGURED_DISK;
+               new_disk->disk.status &= ~SPARE_DISK;
+       }
+
+       dprintf("imsm: volume set mpb->num_raid_devs = %i\n",
+               mpb->num_raid_devs);
+       /* manage changes in volume
+        */
+       for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) {
+               void **sp = *space_list;
+               struct imsm_dev *newdev;
+               struct imsm_map *newmap, *oldmap;
+
+               for (id = super->devlist ; id; id = id->next) {
+                       if (id->index == dev_id)
+                               break;
+               }
+               if (id == NULL)
+                       break;
+               /* no pre-allocated buffer left for this device */
+               if (!sp)
+                       continue;
+               /* pop the buffer off the head of the space list */
+               *space_list = *sp;
+               newdev = (void*)sp;
+               /* Copy the dev, but not (all of) the map */
+               memcpy(newdev, id->dev, sizeof(*newdev));
+               oldmap = get_imsm_map(id->dev, 0);
+               newmap = get_imsm_map(newdev, 0);
+               /* Copy the current map */
+               memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+               /* update one device only
+                */
+               if (devices_to_reshape) {
+                       dprintf("imsm: modifying subdev: %i\n",
+                               id->index);
+                       devices_to_reshape--;
+                       newdev->vol.migr_state = 1;
+                       newdev->vol.curr_migr_unit = 0;
+                       newdev->vol.migr_type = MIGR_GEN_MIGR;
+                       newmap->num_members = u->new_raid_disks;
+                       for (i = 0; i < delta_disks; i++) {
+                               set_imsm_ord_tbl_ent(newmap,
+                                                    u->old_raid_disks + i,
+                                                    u->old_raid_disks + i);
+                       }
+                       /* New map is correct, now need to save old map
+                        */
+                       newmap = get_imsm_map(newdev, 1);
+                       memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+
+                       imsm_set_array_size(newdev);
+               }
+
+               /* install the new dev and chain the old buffer onto the
+                * list of buffers to hand back
+                */
+               sp = (void **)id->dev;
+               id->dev = newdev;
+               *sp = tofree;
+               tofree = sp;
+       }
+       if (tofree)
+               *space_list = tofree;
+       ret_val = 1;
+
+update_reshape_exit:
+
+       return ret_val;
+}
+
+/* Apply an update_takeover record to the sub-array u->subarray.
+ *
+ * R10_TO_R0: requires exactly half of the members to be failed; failed
+ *   disks become spares (index -1), the remaining indexes are shifted
+ *   down and the map is rewritten in place as a raid0 map with half the
+ *   members.
+ * R0_TO_R10: existing disk indexes are doubled, one "MISSING_n" entry per
+ *   original member is created from *space_list and pushed onto
+ *   super->missing, and a new imsm_dev (also taken from *space_list)
+ *   with twice the members replaces the old one in the device list.
+ *
+ * In both cases the disk order table of the resulting map is rebuilt
+ * from super->disks and super->missing.
+ *
+ * Returns 1 when the update was applied, 0 when the sub-array does not
+ * exist, the failed-disk count is wrong for R10_TO_R0, or no buffer is
+ * left in *space_list for the new device.
+ */
+static int apply_takeover_update(struct imsm_update_takeover *u,
+                                struct intel_super *super,
+                                void ***space_list)
+{
+       struct imsm_dev *dev = NULL;
+       struct intel_dev *dv;
+       struct imsm_map *map;
+       struct dl *dm, *du;
+       int i;
+
+       for (dv = super->devlist; dv; dv = dv->next)
+               if (dv->index == (unsigned int)u->subarray) {
+                       dev = dv->dev;
+                       break;
+               }
+
+       /* dv is non-NULL from here on: dev is only set together with it */
+       if (dev == NULL)
+               return 0;
+
+       map = get_imsm_map(dev, 0);
+
+       if (u->direction == R10_TO_R0) {
+               /* Number of failed disks must be half of initial disk number */
+               if (imsm_count_failed(super, dev) != (map->num_members / 2))
+                       return 0;
+
+               /* iterate through devices to mark removed disks as spare */
+               for (dm = super->disks; dm; dm = dm->next) {
+                       if (dm->disk.status & FAILED_DISK) {
+                               int idx = dm->index;
+                               /* update indexes on the disk list */
+/* FIXME this loop-with-the-loop looks wrong,  I'm not convinced
+   the index values will end up being correct.... NB */
+                               for (du = super->disks; du; du = du->next)
+                                       if (du->index > idx)
+                                               du->index--;
+                               /* mark as spare disk */
+                               dm->disk.status = SPARE_DISK;
+                               dm->index = -1;
+                       }
+               }
+               /* update map */
+               map->num_members = map->num_members / 2;
+               map->map_state = IMSM_T_STATE_NORMAL;
+               map->num_domains = 1;
+               map->raid_level = 0;
+               map->failed_disk_num = -1;
+       }
+
+       if (u->direction == R0_TO_R10) {
+               struct imsm_dev *dev_new;
+               void **space;
+               /* update slots in current disk list */
+               for (dm = super->disks; dm; dm = dm->next) {
+                       if (dm->index >= 0)
+                               dm->index *= 2;
+               }
+               /* create new *missing* disks */
+               for (i = 0; i < map->num_members; i++) {
+                       space = *space_list;
+                       if (!space)
+                               continue;
+                       *space_list = *space;
+                       du = (void *)space;
+                       memcpy(du, super->disks, sizeof(*du));
+                       du->fd = -1;
+                       du->minor = 0;
+                       du->major = 0;
+                       du->index = (i * 2) + 1;
+                       sprintf((char *)du->disk.serial,
+                               " MISSING_%d", du->index);
+                       sprintf((char *)du->serial,
+                               "MISSING_%d", du->index);
+                       du->next = super->missing;
+                       super->missing = du;
+               }
+               /* create new dev and map */
+               space = *space_list;
+               if (!space)
+                       return 0;
+               *space_list = *space;
+               dev_new = (void *)space;
+               memcpy(dev_new, dev, sizeof(*dev));
+               /* update new map */
+               map = get_imsm_map(dev_new, 0);
+               map->num_members = map->num_members * 2;
+               map->map_state = IMSM_T_STATE_DEGRADED;
+               map->num_domains = 2;
+               map->raid_level = 1;
+               /* replace dev<->dev_new */
+               dv->dev = dev_new;
+       }
+       /* update disk order table */
+       for (du = super->disks; du; du = du->next)
+               if (du->index >= 0)
+                       set_imsm_ord_tbl_ent(map, du->index, du->index);
+       for (du = super->missing; du; du = du->next)
+               if (du->index >= 0) {
+                       set_imsm_ord_tbl_ent(map, du->index, du->index);
+                       /* dv->dev is the device that owns 'map' in both
+                        * directions: the original dev for R10_TO_R0 and
+                        * the freshly installed one for R0_TO_R10
+                        */
+                       mark_missing(dv->dev, &du->disk, du->index);
+               }
+
+       return 1;
+}
+
 static void imsm_process_update(struct supertype *st,
                                struct metadata_update *update)
 {
        /**
         * crack open the metadata_update envelope to find the update record
         * update can be one of:
-        *      update_activate_spare - a spare device has replaced a failed
+        *    update_reshape_container_disks - all the arrays in the container
+        *      are being reshaped to have more devices.  We need to mark
+        *      the arrays for general migration and convert selected spares
+        *      into active devices.
+        *    update_activate_spare - a spare device has replaced a failed
         *      device in an array, update the disk_ord_tbl.  If this disk is
         *      present in all member arrays then also clear the SPARE_DISK
         *      flag
+        *    update_create_array
+        *    update_kill_array
+        *    update_rename_array
+        *    update_add_remove_disk
         */
        struct intel_super *super = st->sb;
        struct imsm_super *mpb;
@@ -5119,6 +6308,22 @@ static void imsm_process_update(struct supertype *st,
        mpb = super->anchor;
 
        switch (type) {
+       case update_takeover: {
+               struct imsm_update_takeover *u = (void *)update->buf;
+               if (apply_takeover_update(u, super, &update->space_list)) {
+                       imsm_update_version_info(super);
+                       super->updates_pending++;
+               }
+               break;
+       }
+
+       case update_reshape_container_disks: {
+               struct imsm_update_reshape *u = (void *)update->buf;
+               if (apply_reshape_container_disks_update(
+                           u, super, &update->space_list))
+                       super->updates_pending++;
+               break;
+       }
        case update_activate_spare: {
                struct imsm_update_activate_spare *u = (void *) update->buf; 
                struct imsm_dev *dev = get_imsm_dev(super, u->array);
@@ -5130,7 +6335,7 @@ static void imsm_process_update(struct supertype *st,
                struct dl *dl;
                unsigned int found;
                int failed;
-               int victim = get_imsm_disk_idx(dev, u->slot);
+               int victim = get_imsm_disk_idx(dev, u->slot, -1);
                int i;
 
                for (dl = super->disks; dl; dl = dl->next)
@@ -5153,7 +6358,8 @@ static void imsm_process_update(struct supertype *st,
                for (i = 0; i < map->num_members; i++) {
                        if (i == u->slot)
                                continue;
-                       disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+                       disk = get_imsm_disk(super,
+                                            get_imsm_disk_idx(dev, i, -1));
                        if (!disk || is_failed(disk))
                                failed++;
                }
@@ -5394,31 +6600,24 @@ static void imsm_process_update(struct supertype *st,
                super->updates_pending++;
                break;
        }
-       case update_add_disk:
-
+       case update_add_remove_disk: {
                /* we may be able to repair some arrays if disks are
-                * being added */
-               if (super->add) {
+                * being added; check the status of add_remove_disk
+                * to see whether disks have been added.
+                */
+               if (add_remove_disk_update(super)) {
                        struct active_array *a;
 
                        super->updates_pending++;
-                       for (a = st->arrays; a; a = a->next)
+                       for (a = st->arrays; a; a = a->next)
                                a->check_degraded = 1;
                }
-               /* add some spares to the metadata */
-               while (super->add) {
-                       struct dl *al;
-
-                       al = super->add;
-                       super->add = al->next;
-                       al->next = super->disks;
-                       super->disks = al;
-                       dprintf("%s: added %x:%x\n",
-                               __func__, al->major, al->minor);
-               }
-
                break;
        }
+       default:
+               fprintf(stderr, "error: unsuported process update type:"
+                       "(type: %d)\n", type);
+       }
 }
 
 static void imsm_prepare_update(struct supertype *st,
@@ -5438,6 +6637,86 @@ static void imsm_prepare_update(struct supertype *st,
        size_t len = 0;
 
        switch (type) {
+       case update_takeover: {
+               struct imsm_update_takeover *u = (void *)update->buf;
+               if (u->direction == R0_TO_R10) {
+                       void **tail = (void **)&update->space_list;
+                       struct imsm_dev *dev = get_imsm_dev(super, u->subarray);
+                       struct imsm_map *map = get_imsm_map(dev, 0);
+                       int num_members = map->num_members;
+                       void *space;
+                       int size, i;
+                       int err = 0;
+                       /* allocate memory for added disks */
+                       for (i = 0; i < num_members; i++) {
+                               size = sizeof(struct dl);
+                               space = malloc(size);
+                               if (!space) {
+                                       err++;
+                                       break;
+                               }
+                               *tail = space;
+                               tail = space;
+                               *tail = NULL;
+                       }
+                       /* allocate memory for new device */
+                       size = sizeof_imsm_dev(super->devlist->dev, 0) +
+                               (num_members * sizeof(__u32));
+                       space = malloc(size);
+                       if (!space)
+                               err++;
+                       else {
+                               *tail = space;
+                               tail = space;
+                               *tail = NULL;
+                       }
+                       if (!err) {
+                               len = disks_to_mpb_size(num_members * 2);
+                       } else {
+                               /* if allocation didn't success, free buffer */
+                               while (update->space_list) {
+                                       void **sp = update->space_list;
+                                       update->space_list = *sp;
+                                       free(sp);
+                               }
+                       }
+               }
+
+               break;
+       }
+       case update_reshape_container_disks: {
+               /* Every raid device in the container is about to
+                * gain some more devices, and we will enter a
+                * reconfiguration.
+                * So each 'imsm_map' will be bigger, and the imsm_vol
+                * will now hold 2 of them.
+                * Thus we need new 'struct imsm_dev' allocations sized
+                * as sizeof_imsm_dev but with more devices in both maps.
+                */
+               struct imsm_update_reshape *u = (void *)update->buf;
+               struct intel_dev *dl;
+               void **space_tail = (void**)&update->space_list;
+
+               dprintf("imsm: imsm_prepare_update() for update_reshape\n");
+
+               for (dl = super->devlist; dl; dl = dl->next) {
+                       int size = sizeof_imsm_dev(dl->dev, 1);
+                       void *s;
+                       if (u->new_raid_disks > u->old_raid_disks)
+                               size += sizeof(__u32)*2*
+                                       (u->new_raid_disks - u->old_raid_disks);
+                       s = malloc(size);
+                       if (!s)
+                               break;
+                       *space_tail = s;
+                       space_tail = s;
+                       *space_tail = NULL;
+               }
+
+               len = disks_to_mpb_size(u->new_raid_disks);
+               dprintf("New anchor length is %llu\n", (unsigned long long)len);
+               break;
+       }
        case update_create_array: {
                struct imsm_update_create_array *u = (void *) update->buf;
                struct intel_dev *dv;
@@ -5533,7 +6812,7 @@ static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned ind
                        /* update ord entries being careful not to propagate
                         * ord-flags to the first map
                         */
-                       ord = get_imsm_ord_tbl_ent(dev, j);
+                       ord = get_imsm_ord_tbl_ent(dev, j, -1);
 
                        if (ord_to_idx(ord) <= index)
                                continue;
@@ -5555,6 +6834,536 @@ static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned ind
                __free_imsm_disk(dl);
        }
 }
+
+/* directory prefix prepended to the name passed to
+ * imsm_get_disk_controller_domain()
+ */
+static char disk_by_path[] = "/dev/disk/by-path/";
+
+/* Name the controller driver behind the disk "/dev/disk/by-path/<path>".
+ * Returns "isci" for a SAS hba, "ahci" for a SATA hba, "unknown" when the
+ * device path or hba type cannot be determined, and NULL when stat() on
+ * the by-path name fails.
+ */
+static const char *imsm_get_disk_controller_domain(const char *path)
+{
+       char disk_path[PATH_MAX];
+       struct stat st;
+       struct sys_dev *hba;
+       char *devpath;
+       char *drv;
+
+       strncpy(disk_path, disk_by_path, PATH_MAX - 1);
+       strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1);
+       if (stat(disk_path, &st) != 0)
+               return NULL;
+
+       devpath = devt_to_devpath(st.st_rdev);
+       if (devpath == NULL)
+               return "unknown";
+
+       hba = find_disk_attached_hba(-1, devpath);
+       if (hba == NULL)
+               drv = "unknown";
+       else if (hba->type == SYS_DEV_SAS)
+               drv = "isci";
+       else if (hba->type == SYS_DEV_SATA)
+               drv = "ahci";
+       else
+               drv = "unknown";
+       dprintf("path: %s hba: %s attached: %s\n",
+               devpath, (hba) ? hba->path : "NULL", drv);
+       free(devpath);
+       if (hba)
+               free_sys_dev(&hba);
+       return drv;
+}
+
+/* Find the device number of member array 'subdev' of 'container' using
+ * mdstat_by_subdev().
+ * On success the number is stored in *minor and 0 is returned; when no
+ * mdstat entry is found, *minor is left untouched and -1 is returned.
+ */
+static int imsm_find_array_minor_by_subdev(int subdev, int container, int *minor)
+{
+       char name[20];
+       struct mdstat_ent *ent;
+
+       /* mdstat_by_subdev() takes the sub-array number as a string */
+       sprintf(name, "%d", subdev);
+       ent = mdstat_by_subdev(name, container);
+       if (ent == NULL)
+               return -1;
+
+       *minor = ent->devnum;
+       free_mdstat(ent);
+       return 0;
+}
+
+/* Decide whether a container-wide reshape described by 'geo' is allowed.
+ *
+ * Only a change of the number of raid disks is accepted (size, level,
+ * layout and chunksize must be unset).  Every member array of the
+ * container must then:
+ *  - have fewer raid disks than geo->raid_disks,
+ *  - be RAID0 or RAID5, with the platform supporting that level at the
+ *    new disk count,
+ *  - have the same number of raid disks as the other members,
+ *  - be found by imsm_find_array_minor_by_subdev().
+ *
+ * *old_raid_disks is updated with the current raid disk count of the
+ * members examined; if it is non-zero on entry, members must match it.
+ *
+ * Returns 1 when every member passed and there is at least one member,
+ * 0 otherwise.
+ */
+static int imsm_reshape_is_allowed_on_container(struct supertype *st,
+                                               struct geo_params *geo,
+                                               int *old_raid_disks)
+{
+       /* currently we only support increasing the number of devices
+        * for a container.  This increases the number of device for each
+        * member array.  They must all be RAID0 or RAID5.
+        */
+       int ret_val = 0;
+       struct mdinfo *info, *member;
+       int devices_that_can_grow = 0;
+
+       dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): "
+               "st->devnum = (%i)\n",
+               st->devnum);
+
+       if (geo->size != -1 ||
+           geo->level != UnSet ||
+           geo->layout != UnSet ||
+           geo->chunksize != 0 ||
+           geo->raid_disks == UnSet) {
+               dprintf("imsm: Container operation is allowed for "
+                       "raid disks number change only.\n");
+               return ret_val;
+       }
+
+       info = container_content_imsm(st, NULL);
+       for (member = info; member; member = member->next) {
+               int result;
+               int minor;
+
+               dprintf("imsm: checking device_num: %i\n",
+                       member->container_member);
+
+               if (geo->raid_disks <= member->array.raid_disks) {
+                       /* we work on container for Online Capacity Expansion
+                        * only so raid_disks has to grow
+                        */
+                       dprintf("imsm: for container operation raid disks "
+                               "increase is required\n");
+                       break;
+               }
+
+               /* validate this member, not just the head of the list */
+               if ((member->array.level != 0) &&
+                   (member->array.level != 5)) {
+                       /* we cannot use this container with other raid level
+                        */
+                       dprintf("imsm: for container operation wrong"
+                               " raid level (%i) detected\n",
+                               member->array.level);
+                       break;
+               } else {
+                       /* check for platform support
+                        * for this raid level configuration
+                        */
+                       struct intel_super *super = st->sb;
+                       if (!is_raid_level_supported(super->orom,
+                                                    member->array.level,
+                                                    geo->raid_disks)) {
+                               dprintf("platform does not support raid%d with"
+                                       " %d disk%s\n",
+                                        member->array.level,
+                                        geo->raid_disks,
+                                        geo->raid_disks > 1 ? "s" : "");
+                               break;
+                       }
+               }
+
+               /* all members must currently have the same disk count */
+               if (*old_raid_disks &&
+                   member->array.raid_disks != *old_raid_disks)
+                       break;
+               *old_raid_disks = member->array.raid_disks;
+
+               /* All raid5 and raid0 volumes in container
+                * have to be ready for Online Capacity Expansion
+                * so they need to be assembled.  We have already
+                * checked that no recovery etc is happening.
+                */
+               result = imsm_find_array_minor_by_subdev(member->container_member,
+                                                        st->container_dev,
+                                                        &minor);
+               if (result < 0) {
+                       dprintf("imsm: cannot find array\n");
+                       break;
+               }
+               devices_that_can_grow++;
+       }
+       sysfs_free(info);
+       /* member is NULL only when the loop ran to the end of the list,
+        * i.e. no member was rejected; only its NULL-ness is tested here
+        */
+       if (!member && devices_that_can_grow)
+               ret_val = 1;
+
+       if (ret_val)
+               dprintf("\tContainer operation allowed\n");
+       else
+               dprintf("\tError: %i\n", ret_val);
+
+       return ret_val;
+}
+
+/* Function: get_spares_for_grow
+ * Description: Asks container_choose_spares() for the spare devices
+ *             available in the container, passing the value of
+ *             min_acceptable_spare_size_imsm() as the minimum size.
+ * Parameters: Pointer to the supertype structure
+ * Returns: Pointer to the list of spare devices (mdinfo structure) on success,
+ *             NULL if fail
+ */
+static struct mdinfo *get_spares_for_grow(struct supertype *st)
+{
+       return container_choose_spares(st, min_acceptable_spare_size_imsm(st),
+                                      NULL, NULL, NULL, 0);
+}
+
+/******************************************************************************
+ * function: imsm_create_metadata_update_for_reshape
+ * Function creates update for whole IMSM container.
+ *
+ * Allocates a struct imsm_update_reshape sized for
+ * (geo->raid_disks - old_raid_disks) entries in new_disks[], fills it
+ * with the device numbers of spares returned by get_spares_for_grow()
+ * and gives each chosen spare the next free index in the anchor
+ * (mpb->num_disks is incremented for each one).
+ *
+ * Returns the size in bytes of the update stored in *updatep on success.
+ * On failure (no memory, not enough spares, or a spare that is not in
+ * super->disks) the update is freed, *updatep is left untouched and 0 is
+ * returned.
+ ******************************************************************************/
+static int imsm_create_metadata_update_for_reshape(
+       struct supertype *st,
+       struct geo_params *geo,
+       int old_raid_disks,
+       struct imsm_update_reshape **updatep)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       int update_memory_size = 0;
+       struct imsm_update_reshape *u = NULL;
+       struct mdinfo *spares = NULL;
+       int i;
+       int delta_disks = 0;
+       int complete = 0;       /* set once every new_disks[] slot is filled */
+       struct mdinfo *dev;
+
+       dprintf("imsm_update_metadata_for_reshape(enter) raid_disks = %i\n",
+               geo->raid_disks);
+
+       delta_disks = geo->raid_disks - old_raid_disks;
+
+       /* size of all update data without anchor */
+       update_memory_size = sizeof(struct imsm_update_reshape);
+
+       /* now add space for spare disks that we need to add. */
+       update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1);
+
+       u = calloc(1, update_memory_size);
+       if (u == NULL) {
+               dprintf("error: "
+                       "cannot get memory for imsm_update_reshape update\n");
+               return 0;
+       }
+       u->type = update_reshape_container_disks;
+       u->old_raid_disks = old_raid_disks;
+       u->new_raid_disks = geo->raid_disks;
+
+       /* now get spare disks list
+        */
+       spares = get_spares_for_grow(st);
+
+       if (spares == NULL
+           || delta_disks > spares->array.spare_disks) {
+               fprintf(stderr, Name ": imsm: ERROR: Cannot get spare devices "
+                       "for %s.\n", geo->dev_name);
+               goto abort;
+       }
+
+       /* we have got spares
+        * update disk list in imsm_disk list table in anchor
+        */
+       dprintf("imsm: %i spares are available.\n\n",
+               spares->array.spare_disks);
+
+       dev = spares->devs;
+       for (i = 0; i < delta_disks; i++) {
+               struct dl *dl;
+
+               if (dev == NULL)
+                       break;
+               /* a spare unknown to this super cannot be used */
+               dl = get_disk_super(super, dev->disk.major, dev->disk.minor);
+               if (dl == NULL)
+                       break;
+               u->new_disks[i] = makedev(dev->disk.major,
+                                         dev->disk.minor);
+               dl->index = mpb->num_disks;
+               mpb->num_disks++;
+               dev = dev->next;
+       }
+       /* recorded here because 'i' is not set when the spare check above
+        * jumps straight to abort
+        */
+       complete = (i == delta_disks);
+
+abort:
+       /* free spares
+        */
+       sysfs_free(spares);
+
+       dprintf("imsm: reshape update preparation :");
+       if (complete) {
+               dprintf(" OK\n");
+               *updatep = u;
+               return update_memory_size;
+       }
+       free(u);
+       dprintf(" Error\n");
+
+       return 0;
+}
+
+/* Apply the update record in buf (len bytes) to st directly: the record
+ * is wrapped in a temporary metadata_update, passed to
+ * imsm_prepare_update() and then imsm_process_update(), and every buffer
+ * still chained on its space_list afterwards is freed.
+ */
+static void imsm_update_metadata_locally(struct supertype *st,
+                                        void *buf, int len)
+{
+       struct metadata_update mu;
+       void **item, **following;
+
+       mu.next = NULL;
+       mu.space_list = NULL;
+       mu.space = NULL;
+       mu.len = len;
+       mu.buf = buf;
+
+       imsm_prepare_update(st, &mu);
+       imsm_process_update(st, &mu);
+
+       /* each buffer's first word links to the next buffer */
+       for (item = mu.space_list; item; item = following) {
+               following = *item;
+               free(item);
+       }
+}
+
+/***************************************************************************
+* Function:    imsm_analyze_change
+* Description: Classifies the geometry change requested for a single volume
+*              and validates that the transition is supported.
+*              Fields of geo that do not request a change (level, layout,
+*              chunksize) are overwritten with the volume's current values.
+* Parameters:  st  - supertype structure of the volume being examined
+*              geo - requested geometry parameters, updated in place
+* Returns:     Operation type code (CH_TAKEOVER or CH_MIGRATION) on success,
+*              -1 if the transition is not supported, if geometry validation
+*              fails, or if neither level, layout nor chunk size changes
+****************************************************************************/
+enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
+                                          struct geo_params *geo)
+{
+       struct mdinfo info;
+       int change = -1;
+       int check_devs = 0;
+       int chunk;
+
+       /* info is compared against geo below as the current geometry */
+       getinfo_super_imsm_volume(st, &info, NULL);
+
+       /* a different, explicitly given level was requested */
+       if ((geo->level != info.array.level) &&
+           (geo->level >= 0) &&
+           (geo->level != UnSet)) {
+               /* check_devs marks transitions that are only allowed when
+                * the container holds a single array (tested further down)
+                */
+               switch (info.array.level) {
+               case 0:
+                       if (geo->level == 5) {
+                               change = CH_MIGRATION;
+                               check_devs = 1;
+                       }
+                       if (geo->level == 10) {
+                               change = CH_TAKEOVER;
+                               check_devs = 1;
+                       }
+                       break;
+               case 1:
+                       if (geo->level == 0) {
+                               change = CH_TAKEOVER;
+                               check_devs = 1;
+                       }
+                       break;
+               case 5:
+                       if (geo->level == 0)
+                               change = CH_MIGRATION;
+                       break;
+               case 10:
+                       if (geo->level == 0) {
+                               change = CH_TAKEOVER;
+                               check_devs = 1;
+                       }
+                       break;
+               }
+               if (change == -1) {
+                       /* unsupported level pair: leave without running
+                        * the remaining checks
+                        */
+                       fprintf(stderr,
+                               Name " Error. Level Migration from %d to %d "
+                               "not supported!\n",
+                               info.array.level, geo->level);
+                       goto analyse_change_exit;
+               }
+       } else
+               geo->level = info.array.level;
+
+       /* a different, explicitly given layout was requested; only the
+        * two layout values 0 and 5 on a level-5 volume are accepted
+        */
+       if ((geo->layout != info.array.layout)
+           && ((geo->layout != UnSet) && (geo->layout != -1))) {
+               change = CH_MIGRATION;
+               if ((info.array.layout == 0)
+                   && (info.array.level == 5)
+                   && (geo->layout == 5)) {
+                       /* reshape 5 -> 4 */
+               } else if ((info.array.layout == 5)
+                          && (info.array.level == 5)
+                          && (geo->layout == 0)) {
+                       /* reshape 4 -> 5 */
+                       geo->layout = 0;
+                       geo->level = 5;
+               } else {
+                       fprintf(stderr,
+                               Name " Error. Layout Migration from %d to %d "
+                               "not supported!\n",
+                               info.array.layout, geo->layout);
+                       change = -1;
+                       goto analyse_change_exit;
+               }
+       } else
+               geo->layout = info.array.layout;
+
+       /* a different, explicitly given chunk size is always a migration */
+       if ((geo->chunksize > 0) && (geo->chunksize != UnSet)
+           && (geo->chunksize != info.array.chunk_size))
+               change = CH_MIGRATION;
+       else
+               geo->chunksize = info.array.chunk_size;
+
+       /* NOTE(review): the chunk is divided by 1024 before validation,
+        * presumably converting bytes to KiB - confirm against
+        * validate_geometry_imsm
+        */
+       chunk = geo->chunksize / 1024;
+       if (!validate_geometry_imsm(st,
+                                   geo->level,
+                                   geo->layout,
+                                   geo->raid_disks,
+                                   &chunk,
+                                   geo->size,
+                                   0, 0, 1))
+               change = -1;
+
+       if (check_devs) {
+               struct intel_super *super = st->sb;
+               struct imsm_super *mpb = super->anchor;
+
+               /* refuse when the container holds more than one array */
+               if (mpb->num_raid_devs > 1) {
+                       fprintf(stderr,
+                               Name " Error. Cannot perform operation on %s"
+                               "- for this operation it MUST be single "
+                               "array in container\n",
+                               geo->dev_name);
+                       change = -1;
+               }
+       }
+
+analyse_change_exit:
+
+       return change;
+}
+
+/*
+ * Build an update_takeover record for the currently selected volume
+ * (super->current_vol), apply it to the local metadata and, when updates
+ * are being queued (st->update_tail is set), pass the record on through
+ * append_metadata_update().  The record is freed here only when it is
+ * not queued.
+ *
+ * geo->level selects the direction: 0 requests the 10->0 transition and
+ * 10 requests the 0->10 transition.  Any other level is rejected, so
+ * that a record with an uninitialised direction is never applied.
+ *
+ * Returns 0 on success, 1 if the record cannot be allocated or the
+ * requested level has no takeover direction.
+ */
+int imsm_takeover(struct supertype *st, struct geo_params *geo)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_update_takeover *u;
+
+       u = malloc(sizeof(*u));
+       if (u == NULL)
+               return 1;
+
+       u->type = update_takeover;
+       u->subarray = super->current_vol;
+
+       switch (geo->level) {
+       case 0:
+               /* 10->0 transition */
+               u->direction = R10_TO_R0;
+               break;
+       case 10:
+               /* 0->10 transition */
+               u->direction = R0_TO_R10;
+               break;
+       default:
+               /* no direction matches this level; do not touch metadata */
+               free(u);
+               return 1;
+       }
+
+       /* update metadata locally */
+       imsm_update_metadata_locally(st, u, sizeof(*u));
+       /* and possibly remotely */
+       if (st->update_tail)
+               append_metadata_update(st, u, sizeof(*u));
+       else
+               free(u);
+
+       return 0;
+}
+
+/*
+ * reshape_super hook for imsm.
+ *
+ * Collects the requested geometry into a struct geo_params (raid_disks
+ * is increased by delta_disks when that is set) and then takes one of
+ * two paths:
+ *  - st is the container itself (st->container_dev == st->devnum):
+ *    a reshape update is prepared, applied to the local metadata and,
+ *    when st->update_tail is set, handed to append_metadata_update();
+ *  - otherwise st is a volume: the matching subarray is located, the
+ *    change is classified by imsm_analyze_change() and a takeover is
+ *    carried out via imsm_takeover().
+ *
+ * backup and verbose are not used by this function.
+ *
+ * Returns 0 on success and 1 on failure; for a takeover the result of
+ * imsm_takeover() is returned.
+ */
+static int imsm_reshape_super(struct supertype *st, long long size, int level,
+                             int layout, int chunksize, int raid_disks,
+                             int delta_disks, char *backup, char *dev,
+                             int verbose)
+{
+       int ret_val = 1;
+       struct geo_params geo;
+
+       dprintf("imsm: reshape_super called.\n");
+
+       memset(&geo, 0, sizeof(struct geo_params));
+
+       geo.dev_name = dev;
+       geo.dev_id = st->devnum;
+       geo.size = size;
+       geo.level = level;
+       geo.layout = layout;
+       geo.chunksize = chunksize;
+       geo.raid_disks = raid_disks;
+       if (delta_disks != UnSet)
+               geo.raid_disks += delta_disks;
+
+       dprintf("\tfor level      : %i\n", geo.level);
+       dprintf("\tfor raid_disks : %i\n", geo.raid_disks);
+
+       /* nothing is attempted unless experimental() is non-zero; this
+        * early return bypasses the exit-code trace at the end
+        */
+       if (experimental() == 0)
+               return ret_val;
+
+       if (st->container_dev == st->devnum) {
+               /* On container level we can only increase number of devices. */
+               dprintf("imsm: info: Container operation\n");
+               int old_raid_disks = 0;
+               if (imsm_reshape_is_allowed_on_container(
+                           st, &geo, &old_raid_disks)) {
+                       struct imsm_update_reshape *u = NULL;
+                       int len;
+
+                       len = imsm_create_metadata_update_for_reshape(
+                               st, &geo, old_raid_disks, &u);
+
+                       if (len <= 0) {
+                               dprintf("imsm: Cannot prepare update\n");
+                               goto exit_imsm_reshape_super;
+                       }
+
+                       ret_val = 0;
+                       /* update metadata locally */
+                       imsm_update_metadata_locally(st, u, len);
+                       /* and possibly remotely */
+                       if (st->update_tail)
+                               append_metadata_update(st, u, len);
+                       else
+                               free(u);
+
+               } else {
+                       fprintf(stderr, Name ": (imsm) Operation "
+                               "is not allowed on this container\n");
+               }
+       } else {
+               /* On volume level we support following operations
+                * - takeover: raid10 -> raid0; raid0 -> raid10
+                * - chunk size migration
+                * - migration: raid5 -> raid0; raid0 -> raid5
+                */
+               struct intel_super *super = st->sb;
+               struct intel_dev *dev = super->devlist;
+               int change, devnum;
+               dprintf("imsm: info: Volume operation\n");
+               /* find requested device */
+               /* NOTE(review): devnum is only written through the helper
+                * below and its result is ignored; if the helper can
+                * return without storing a minor, the comparison reads an
+                * indeterminate value - confirm against the helper
+                */
+               while (dev) {
+                       imsm_find_array_minor_by_subdev(dev->index, st->container_dev, &devnum);
+                       if (devnum == geo.dev_id)
+                               break;
+                       dev = dev->next;
+               }
+               if (dev == NULL) {
+                       fprintf(stderr, Name " Cannot find %s (%i) subarray\n",
+                               geo.dev_name, geo.dev_id);
+                       goto exit_imsm_reshape_super;
+               }
+               /* later steps operate on the volume selected here */
+               super->current_vol = dev->index;
+               change = imsm_analyze_change(st, &geo);
+               switch (change) {
+               case CH_TAKEOVER:
+                       ret_val = imsm_takeover(st, &geo);
+                       break;
+               case CH_MIGRATION:
+                       /* reported as success; no update is built here */
+                       ret_val = 0;
+                       break;
+               default:
+                       ret_val = 1;
+               }
+       }
+
+exit_imsm_reshape_super:
+       dprintf("imsm: reshape_super Exit code = %i\n", ret_val);
+       return ret_val;
+}
+
+/*
+ * manage_reshape hook for imsm.  There is no imsm-specific handling
+ * yet: every argument is forwarded unchanged to child_monitor() and
+ * its result is returned to the caller.
+ */
+static int imsm_manage_reshape(
+       int afd, struct mdinfo *sra, struct reshape *reshape,
+       struct supertype *st, unsigned long stripes,
+       int *fds, unsigned long long *offsets,
+       int dests, int *destfd, unsigned long long *destoffsets)
+{
+       int rv;
+
+       rv = child_monitor(afd, sra, reshape, st, stripes,
+                          fds, offsets, dests, destfd, destoffsets);
+       return rv;
+}
 #endif /* MDASSEMBLE */
 
 struct superswitch super_imsm = {
@@ -5567,18 +7376,25 @@ struct superswitch super_imsm = {
        .brief_detail_super = brief_detail_super_imsm,
        .write_init_super = write_init_super_imsm,
        .validate_geometry = validate_geometry_imsm,
-       .default_chunk  = default_chunk_imsm,
        .add_to_super   = add_to_super_imsm,
+       .remove_from_super = remove_from_super_imsm,
        .detail_platform = detail_platform_imsm,
        .kill_subarray = kill_subarray_imsm,
        .update_subarray = update_subarray_imsm,
+       .load_container = load_container_imsm,
+       .default_geometry = default_geometry_imsm,
+       .get_disk_controller_domain = imsm_get_disk_controller_domain,
+       .reshape_super  = imsm_reshape_super,
+       .manage_reshape = imsm_manage_reshape,
 #endif
        .match_home     = match_home_imsm,
        .uuid_from_super= uuid_from_super_imsm,
        .getinfo_super  = getinfo_super_imsm,
+       .getinfo_super_disks = getinfo_super_disks_imsm,
        .update_super   = update_super_imsm,
 
        .avail_size     = avail_size_imsm,
+       .min_acceptable_spare_size = min_acceptable_spare_size_imsm,
 
        .compare_super  = compare_super_imsm,
 
@@ -5588,7 +7404,6 @@ struct superswitch super_imsm = {
        .free_super     = free_super_imsm,
        .match_metadata_desc = match_metadata_desc_imsm,
        .container_content = container_content_imsm,
-       .default_layout = imsm_level_to_layout,
 
        .external       = 1,
        .name = "imsm",
diff --git a/super-mbr.c b/super-mbr.c
new file mode 100644 (file)
index 0000000..5eefdf6
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'mbr' is a pseudo metadata type for devices which have a
+ * partition table in the Master Boot Record (mbr) also known
+ * as a dos partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the buffer attached to st->sb (if any) and clear the pointer. */
+static void free_mbr(struct supertype *st)
+{
+       void *stale = st->sb;
+
+       st->sb = NULL;
+       free(stale);
+}
+
+#ifndef MDASSEMBLE
+
+/*
+ * Print the MBR magic followed by one line for every partition slot
+ * with a non-zero sector count: the count, the starting sector and the
+ * partition type.  homehost is not used.
+ */
+static void examine_mbr(struct supertype *st, char *homehost)
+{
+       struct MBR *table = st->sb;
+       int slot;
+
+       printf("   MBR Magic : %04x\n", table->magic);
+
+       for (slot = 0; slot < MBR_PARTITIONS; slot++) {
+               unsigned long nsect, start;
+
+               /* slots with a zero sector count are not reported */
+               if (!table->parts[slot].blocks_num)
+                       continue;
+
+               nsect = (unsigned long)__le32_to_cpu(table->parts[slot].blocks_num);
+               start = (unsigned long)__le32_to_cpu(table->parts[slot].first_sect_lba);
+               printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n",
+                      slot, nsect, start, table->parts[slot].part_type);
+       }
+}
+
+#endif /*MDASSEMBLE */
+
+static int load_super_mbr(struct supertype *st, int fd, char *devname)
+{
+       /* Read the start of fd and accept it as an mbr when it carries
+        * MBR_SIGNATURE_MAGIC.  Any record previously held by st is
+        * released first.
+        * Return
+        *  0 on success; st->sb then owns the record
+        *  1 on allocation failure, short read, or missing signature
+        * The read and signature failures are reported only when
+        * devname is given.
+        */
+       struct MBR *table;
+
+       free_mbr(st);
+
+       if (posix_memalign((void**)&table, 512, 512) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, 0);
+       if (read(fd, table, sizeof(*table)) != sizeof(*table)) {
+               if (devname)
+                       fprintf(stderr, Name ": Cannot read partition table on %s\n",
+                               devname);
+               goto reject;
+       }
+       if (table->magic != MBR_SIGNATURE_MAGIC) {
+               if (devname)
+                       fprintf(stderr, Name ": No partition table found on %s\n",
+                               devname);
+               goto reject;
+       }
+
+       st->sb = table;
+       if (st->ss != NULL)
+               return 0;
+
+       /* no metadata handler chosen yet: set st up for mbr */
+       st->ss = &mbr;
+       st->minor_version = 0;
+       st->max_devs = 1;
+       st->info = NULL;
+       return 0;
+
+reject:
+       free(table);
+       return 1;
+}
+
+/*
+ * Write the record held in st->sb to the start of fd.
+ *
+ * The current on-disk record is read first and its pad area is copied
+ * into the new record, so that region is carried over unchanged.
+ * NOTE(review): pad is presumably the area ahead of the partition
+ * entries (boot code) - confirm against struct MBR.
+ * After a successful write the data is synced and BLKRRPART is issued
+ * so the kernel re-reads the partition table.
+ *
+ * Returns 0 on success, 1 if the buffer cannot be allocated or the old
+ * record cannot be read, 4 if the write is short.
+ */
+static int store_mbr(struct supertype *st, int fd)
+{
+       struct MBR *ondisk;
+       struct MBR *pending;
+       int short_read;
+
+       if (posix_memalign((void**)&ondisk, 512, 512) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, 0);
+       short_read = (read(fd, ondisk, sizeof(*ondisk)) != sizeof(*ondisk));
+       if (short_read) {
+               free(ondisk);
+               return 1;
+       }
+
+       pending = st->sb;
+       memcpy(pending->pad, ondisk->pad, sizeof(pending->pad));
+       free(ondisk);
+
+       lseek(fd, 0, 0);
+       if (write(fd, pending, sizeof(*pending)) != sizeof(*pending))
+               return 4;
+
+       fsync(fd);
+       ioctl(fd, BLKRRPART, 0);
+       return 0;
+}
+
+/*
+ * Fill in *info for an mbr record: the array and disk descriptions are
+ * zeroed, text_version and name are set to "mbr", and component_size
+ * is set to the highest end sector (start + length) over all partition
+ * slots with a non-zero sector count, or 0 if there is none.
+ * map is not used.
+ */
+static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map)
+{
+       struct MBR *sb = st->sb;
+       int i;
+
+       memset(&info->array, 0, sizeof(info->array));
+       memset(&info->disk, 0, sizeof(info->disk));
+       strcpy(info->text_version, "mbr");
+       strcpy(info->name, "mbr");
+       info->component_size = 0;
+
+       for (i = 0; i < MBR_PARTITIONS ; i++) {
+               unsigned long long last;
+
+               if (!sb->parts[i].blocks_num)
+                       continue;
+
+               /* start and length are both 32-bit values, so their sum
+                * can need 33 bits; add them in a 64-bit type so the end
+                * sector does not wrap where unsigned long is 32 bits
+                */
+               last = (unsigned long long)__le32_to_cpu(sb->parts[i].blocks_num)
+                       + (unsigned long long)__le32_to_cpu(sb->parts[i].first_sect_lba);
+               if (last > info->component_size)
+                       info->component_size = last;
+       }
+}
+
+/*
+ * Return a newly allocated supertype bound to the mbr handler when arg
+ * is exactly "mbr"; return NULL for any other name or when allocation
+ * fails.  The caller owns the returned structure.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+       struct supertype *st;
+
+       /* test the name before allocating, so that a mismatch does not
+        * leak a supertype
+        */
+       if (strcmp(arg, "mbr") != 0)
+               return NULL;
+
+       /* zero-filled, so fields not set below are not left indeterminate */
+       st = calloc(1, sizeof(*st));
+       if (!st)
+               return st;
+
+       st->ss = &mbr;
+       st->info = NULL;
+       st->minor_version = 0;
+       st->max_devs = 1;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * validate_geometry hook for the mbr pseudo-metadata.  Every request is
+ * refused: a diagnostic is printed to stderr and 0 is returned, whatever
+ * the arguments are.
+ */
+static int validate_geometry(struct supertype *st, int level,
+                            int layout, int raiddisks,
+                            int *chunk, unsigned long long size,
+                            char *subdev, unsigned long long *freesize,
+                            int verbose)
+{
+       const int refused = 0;
+
+       fprintf(stderr, Name ": mbr metadata cannot be used this way\n");
+       return refused;
+}
+#endif
+
+/*
+ * Handler table for the "mbr" pseudo-metadata.  Only the hooks listed
+ * here are provided; examine_super and validate_geometry are compiled
+ * out when MDASSEMBLE is defined.
+ */
+struct superswitch mbr = {
+       .name = "mbr",
+
+       /* identification and lifetime of the in-memory record */
+       .match_metadata_desc = match_metadata_desc,
+       .free_super = free_mbr,
+
+       /* reading, writing and describing the record */
+       .load_super = load_super_mbr,
+       .store_super = store_mbr,
+       .getinfo_super = getinfo_mbr,
+
+#ifndef MDASSEMBLE
+       .examine_super = examine_mbr,
+       .validate_geometry = validate_geometry,
+#endif
+};
index e855541369e797db7a420c2000bdf27c65a71229..3ae236a82d57b9d8fd7efff4a96b7e5d1d6e3ba8 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -339,11 +339,12 @@ static void uuid_from_super0(struct supertype *st, int uuid[4])
        }
 }
 
-static void getinfo_super0(struct supertype *st, struct mdinfo *info)
+static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map)
 {
        mdp_super_t *sb = st->sb;
        int working = 0;
        int i;
+       int map_disks = info->array.raid_disks;
 
        info->array.major_version = sb->major_version;
        info->array.minor_version = sb->minor_version;
@@ -391,11 +392,26 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info)
                if ((sb->disks[i].state & (1<<MD_DISK_SYNC)) &&
                    (sb->disks[i].raid_disk < (unsigned)info->array.raid_disks) &&
                    (sb->disks[i].state & (1<<MD_DISK_ACTIVE)) &&
-                   !(sb->disks[i].state & (1<<MD_DISK_FAULTY)))
+                   !(sb->disks[i].state & (1<<MD_DISK_FAULTY))) {
                        working ++;
+                       if (map && i < map_disks)
+                               map[i] = 1;
+               } else if (map && i < map_disks)
+                       map[i] = 0;
        info->array.working_disks = working;
 }
 
+/*
+ * container_content hook for 0.90 metadata.  Only the whole array can
+ * be described: a subarray request yields NULL.  Otherwise a newly
+ * allocated mdinfo filled in by getinfo_super0() is returned, or NULL
+ * if it cannot be allocated.  The caller owns the result.
+ */
+static struct mdinfo *container_content0(struct supertype *st, char *subarray)
+{
+       struct mdinfo *info;
+
+       if (subarray)
+               return NULL;
+
+       /* zero-filled: getinfo_super0() reads info->array.raid_disks on
+        * entry, and fields it does not set must not be left
+        * indeterminate
+        */
+       info = calloc(1, sizeof(*info));
+       if (info == NULL)
+               return NULL;
+
+       getinfo_super0(st, info, NULL);
+       return info;
+}
 
 static int update_super0(struct supertype *st, struct mdinfo *info,
                         char *update,
@@ -420,14 +436,12 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                if (verbose >= 0)
                        fprintf (stderr, Name ": adjusting superblock of %s for 2.2/sparc compatability.\n",
                                 devname);
-       }
-       if (strcmp(update, "super-minor") ==0) {
+       } else if (strcmp(update, "super-minor") ==0) {
                sb->md_minor = info->array.md_minor;
                if (verbose > 0)
                        fprintf(stderr, Name ": updating superblock of %s with minor number %d\n",
                                devname, info->array.md_minor);
-       }
-       if (strcmp(update, "summaries") == 0) {
+       } else if (strcmp(update, "summaries") == 0) {
                unsigned int i;
                /* set nr_disks, active_disks, working_disks,
                 * failed_disks, spare_disks based on disks[]
@@ -454,8 +468,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                                        sb->spare_disks++;
                        } else if (i >= sb->raid_disks && sb->disks[i].number == 0)
                                sb->disks[i].state = 0;
-       }
-       if (strcmp(update, "force-one")==0) {
+       } else if (strcmp(update, "force-one")==0) {
                /* Not enough devices for a working array, so
                 * bring this one up-to-date.
                 */
@@ -465,8 +478,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                if (sb->events_hi != ehi ||
                    sb->events_lo != elo)
                        rv = 1;
-       }
-       if (strcmp(update, "force-array")==0) {
+       } else if (strcmp(update, "force-array")==0) {
                /* degraded array and 'force' requested, so
                 * maybe need to mark it 'clean'
                 */
@@ -476,8 +488,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                        sb->state |= (1 << MD_SB_CLEAN);
                        rv = 1;
                }
-       }
-       if (strcmp(update, "assemble")==0) {
+       } else if (strcmp(update, "assemble")==0) {
                int d = info->disk.number;
                int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
                int mask = (1<<MD_DISK_WRITEMOSTLY);
@@ -506,8 +517,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                        sb->reshape_position = info->reshape_progress;
                        rv = 1;
                }
-       }
-       if (strcmp(update, "linear-grow-new") == 0) {
+       } else if (strcmp(update, "linear-grow-new") == 0) {
                memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0]));
                sb->disks[info->disk.number].number = info->disk.number;
                sb->disks[info->disk.number].major = info->disk.major;
@@ -515,8 +525,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
                sb->disks[info->disk.number].state = info->disk.state;
                sb->this_disk = sb->disks[info->disk.number];
-       }
-       if (strcmp(update, "linear-grow-update") == 0) {
+       } else if (strcmp(update, "linear-grow-update") == 0) {
                sb->raid_disks = info->array.raid_disks;
                sb->nr_disks = info->array.nr_disks;
                sb->active_disks = info->array.active_disks;
@@ -527,20 +536,17 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                sb->disks[info->disk.number].minor = info->disk.minor;
                sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
                sb->disks[info->disk.number].state = info->disk.state;
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
                /* make sure resync happens */
                sb->state &= ~(1<<MD_SB_CLEAN);
                sb->recovery_cp = 0;
-       }
-       if (strcmp(update, "homehost") == 0 &&
+       } else if (strcmp(update, "homehost") == 0 &&
            homehost) {
                uuid_set = 0;
                update = "uuid";
                info->uuid[0] = sb->set_uuid0;
                info->uuid[1] = sb->set_uuid1;
-       }
-       if (strcmp(update, "uuid") == 0) {
+       } else if (strcmp(update, "uuid") == 0) {
                if (!uuid_set && homehost) {
                        char buf[20];
                        char *hash = sha1_buffer(homehost,
@@ -559,9 +565,10 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                }
        } else if (strcmp(update, "no-bitmap") == 0) {
                sb->state &= ~(1<<MD_SB_BITMAP_PRESENT);
-       }
-       if (strcmp(update, "_reshape_progress")==0)
+       } else if (strcmp(update, "_reshape_progress")==0)
                sb->reshape_position = info->reshape_progress;
+       else
+               rv = -1;
 
        sb->sb_csum = calc_sb0_csum(sb);
        return rv;
@@ -759,8 +766,6 @@ static int write_init_super0(struct supertype *st)
                        fprintf(stderr,
                                Name ": failed to write superblock to %s\n",
                                di->devname);
-               close(di->fd);
-               di->fd = -1;
        }
        return rv;
 }
@@ -830,9 +835,6 @@ static int load_super0(struct supertype *st, int fd, char *devname)
 
        free_super0(st);
 
-       if (st->subarray[0])
-               return 1;
-
        if (!get_dev_size(fd, devname, &dsize))
                return 1;
 
@@ -930,6 +932,7 @@ static struct supertype *match_metadata_desc0(char *arg)
        if (!st) return st;
 
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super0;
        st->info = NULL;
        st->minor_version = 90;
@@ -1088,13 +1091,20 @@ static void free_super0(struct supertype *st)
 {
        if (st->sb)
                free(st->sb);
+       while (st->info) {
+               struct devinfo *di = st->info;
+               st->info = di->next;
+               if (di->fd >= 0)
+                       close(di->fd);
+               free(di);
+       }
        st->sb = NULL;
 }
 
 #ifndef MDASSEMBLE
 static int validate_geometry0(struct supertype *st, int level,
                              int layout, int raiddisks,
-                             int chunk, unsigned long long size,
+                             int *chunk, unsigned long long size,
                              char *subdev, unsigned long long *freesize,
                              int verbose)
 {
@@ -1117,6 +1127,9 @@ static int validate_geometry0(struct supertype *st, int level,
                        fprintf(stderr, Name ": 0.90 metadata supports at most 2 terrabytes per device\n");
                return 0;
        }
+       if (chunk && *chunk == UnSet)
+               *chunk = DEFAULT_CHUNK;
+
        if (!subdev)
                return 1;
 
@@ -1157,6 +1170,7 @@ struct superswitch super0 = {
        .match_home = match_home0,
        .uuid_from_super = uuid_from_super0,
        .getinfo_super = getinfo_super0,
+       .container_content = container_content0,
        .update_super = update_super0,
        .init_super = init_super0,
        .store_super = store_super0,
index 457e2d6e0f034b45deca157486ca40f7c878160b..79bb4d0cdcaf10c2dd95840e283e4f06eb7983af 100644 (file)
--- a/super1.c
+++ b/super1.c
@@ -558,12 +558,13 @@ static void uuid_from_super1(struct supertype *st, int uuid[4])
                cuuid[i] = super->set_uuid[i];
 }
 
-static void getinfo_super1(struct supertype *st, struct mdinfo *info)
+static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct mdp_superblock_1 *sb = st->sb;
        int working = 0;
        unsigned int i;
-       int role;
+       unsigned int role;
+       unsigned int map_disks = info->array.raid_disks;
 
        info->array.major_version = 1;
        info->array.minor_version = st->minor_version;
@@ -629,15 +630,33 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
        } else
                info->reshape_active = 0;
 
+       if (map)
+               for (i=0; i<map_disks; i++)
+                       map[i] = 0;
        for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
                role = __le16_to_cpu(sb->dev_roles[i]);
-               if (/*role == 0xFFFF || */role < info->array.raid_disks)
+               if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) {
                        working++;
+                       if (map && role < map_disks)
+                               map[role] = 1;
+               }
        }
 
        info->array.working_disks = working;
 }
 
+/*
+ * container_content hook for version-1 metadata.  Only the whole array
+ * can be described: a subarray request yields NULL.  Otherwise a newly
+ * allocated mdinfo filled in by getinfo_super1() is returned, or NULL
+ * if it cannot be allocated.  The caller owns the result.
+ */
+static struct mdinfo *container_content1(struct supertype *st, char *subarray)
+{
+       struct mdinfo *info;
+
+       if (subarray)
+               return NULL;
+
+       /* zero-filled: getinfo_super1() reads info->array.raid_disks on
+        * entry, and fields it does not set must not be left
+        * indeterminate
+        */
+       info = calloc(1, sizeof(*info));
+       if (info == NULL)
+               return NULL;
+
+       getinfo_super1(st, info, NULL);
+       return info;
+}
+
 static int update_super1(struct supertype *st, struct mdinfo *info,
                         char *update,
                         char *devname, int verbose,
@@ -657,8 +676,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                if (sb->events != __cpu_to_le64(info->events))
                        rv = 1;
                sb->events = __cpu_to_le64(info->events);
-       }
-       if (strcmp(update, "force-array")==0) {
+       } else if (strcmp(update, "force-array")==0) {
                /* Degraded array and 'force' requests to
                 * maybe need to mark it 'clean'.
                 */
@@ -669,8 +687,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                                rv = 1;
                        sb->resync_offset = MaxSector;
                }
-       }
-       if (strcmp(update, "assemble")==0) {
+       } else if (strcmp(update, "assemble")==0) {
                int d = info->disk.number;
                int want;
                if (info->disk.state == 6)
@@ -695,8 +712,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        sb->reshape_position = __cpu_to_le64(info->reshape_progress);
                        rv = 1;
                }
-       }
-       if (strcmp(update, "linear-grow-new") == 0) {
+       } else if (strcmp(update, "linear-grow-new") == 0) {
                unsigned int i;
                int rfd, fd;
                unsigned int max = __le32_to_cpu(sb->max_dev);
@@ -738,17 +754,14 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                                        ds - __le64_to_cpu(sb->data_offset));
                        }
                }
-       }
-       if (strcmp(update, "linear-grow-update") == 0) {
+       } else if (strcmp(update, "linear-grow-update") == 0) {
                sb->raid_disks = __cpu_to_le32(info->array.raid_disks);
                sb->dev_roles[info->disk.number] =
                        __cpu_to_le16(info->disk.raid_disk);
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
                /* make sure resync happens */
                sb->resync_offset = 0ULL;
-       }
-       if (strcmp(update, "uuid") == 0) {
+       } else if (strcmp(update, "uuid") == 0) {
                copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid);
 
                if (__le32_to_cpu(sb->feature_map)&MD_FEATURE_BITMAP_OFFSET) {
@@ -758,8 +771,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                }
        } else if (strcmp(update, "no-bitmap") == 0) {
                sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
-       }
-       if (strcmp(update, "homehost") == 0 &&
+       } else if (strcmp(update, "homehost") == 0 &&
            homehost) {
                char *c;
                update = "name";
@@ -769,8 +781,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                else
                        strncpy(info->name, sb->set_name, 32);
                info->name[32] = 0;
-       }
-       if (strcmp(update, "name") == 0) {
+       } else if (strcmp(update, "name") == 0) {
                if (info->name[0] == 0)
                        sprintf(info->name, "%d", info->array.md_minor);
                memset(sb->set_name, 0, sizeof(sb->set_name));
@@ -782,8 +793,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        strcat(sb->set_name, info->name);
                } else
                        strcpy(sb->set_name, info->name);
-       }
-       if (strcmp(update, "devicesize") == 0 &&
+       } else if (strcmp(update, "devicesize") == 0 &&
            __le64_to_cpu(sb->super_offset) <
            __le64_to_cpu(sb->data_offset)) {
                /* set data_size to device size less data_offset */
@@ -795,9 +805,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        misc->device_size - __le64_to_cpu(sb->data_offset));
                printf("Size is %llu\n", (unsigned long long)
                       __le64_to_cpu(sb->data_size));
-       }
-       if (strcmp(update, "_reshape_progress")==0)
+       } else if (strcmp(update, "_reshape_progress")==0)
                sb->reshape_position = __cpu_to_le64(info->reshape_progress);
+       else
+               rv = -1;
 
        sb->sb_csum = calc_sb_1_csum(sb);
        return rv;
@@ -1022,11 +1033,13 @@ static unsigned long choose_bm_space(unsigned long devsize)
        return 4*2;
 }
 
+static void free_super1(struct supertype *st);
+
 #ifndef MDASSEMBLE
 static int write_init_super1(struct supertype *st)
 {
        struct mdp_superblock_1 *sb = st->sb;
-       struct supertype refst;
+       struct supertype *refst;
        int rfd;
        int rv = 0;
        unsigned long long bm_space;
@@ -1058,10 +1071,9 @@ static int write_init_super1(struct supertype *st)
 
                sb->events = 0;
 
-               refst =*st;
-               refst.sb = NULL;
-               if (load_super1(&refst, di->fd, NULL)==0) {
-                       struct mdp_superblock_1 *refsb = refst.sb;
+               refst = dup_super(st);
+               if (load_super1(refst, di->fd, NULL)==0) {
+                       struct mdp_superblock_1 *refsb = refst->sb;
 
                        memcpy(sb->device_uuid, refsb->device_uuid, 16);
                        if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
@@ -1074,8 +1086,9 @@ static int write_init_super1(struct supertype *st)
                                if (get_linux_version() >= 2006018)
                                        sb->dev_number = refsb->dev_number;
                        }
-                       free(refsb);
+                       free_super1(refst);
                }
+               free(refst);
 
                if (!get_dev_size(di->fd, NULL, &dsize))
                        return 1;
@@ -1210,8 +1223,6 @@ static int compare_super1(struct supertype *st, struct supertype *tst)
        return 0;
 }
 
-static void free_super1(struct supertype *st);
-
 static int load_super1(struct supertype *st, int fd, char *devname)
 {
        unsigned long long dsize;
@@ -1223,9 +1234,6 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        free_super1(st);
 
-       if (st->subarray[0])
-               return 1;
-
        if (st->ss == NULL || st->minor_version == -1) {
                int bestvers = -1;
                struct supertype tst;
@@ -1380,6 +1388,7 @@ static struct supertype *match_metadata_desc1(char *arg)
        if (!st) return st;
 
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super1;
        st->max_devs = 384;
        st->sb = NULL;
@@ -1646,13 +1655,20 @@ static void free_super1(struct supertype *st)
 {
        if (st->sb)
                free(st->sb);
+       while (st->info) {
+               struct devinfo *di = st->info;
+               st->info = di->next;
+               if (di->fd >= 0)
+                       close(di->fd);
+               free(di);
+       }
        st->sb = NULL;
 }
 
 #ifndef MDASSEMBLE
 static int validate_geometry1(struct supertype *st, int level,
                              int layout, int raiddisks,
-                             int chunk, unsigned long long size,
+                             int *chunk, unsigned long long size,
                              char *subdev, unsigned long long *freesize,
                              int verbose)
 {
@@ -1664,6 +1680,9 @@ static int validate_geometry1(struct supertype *st, int level,
                        fprintf(stderr, Name ": 1.x metadata does not support containers\n");
                return 0;
        }
+       if (chunk && *chunk == UnSet)
+               *chunk = DEFAULT_CHUNK;
+
        if (!subdev)
                return 1;
 
@@ -1701,6 +1720,7 @@ struct superswitch super1 = {
        .match_home = match_home1,
        .uuid_from_super = uuid_from_super1,
        .getinfo_super = getinfo_super1,
+       .container_content = container_content1,
        .update_super = update_super1,
        .init_super = init_super1,
        .store_super = store_super1,
diff --git a/sysfs.c b/sysfs.c
index 6e1d77b313daf28189cfa38e753c022f92bdfd71..a7dfcc2966b5eca585d504338a267fc521ad6486 100644 (file)
--- a/sysfs.c
+++ b/sysfs.c
@@ -90,11 +90,7 @@ void sysfs_init(struct mdinfo *mdi, int fd, int devnum)
        }
        if (devnum == NoMdDev)
                return;
-       if (devnum >= 0)
-               sprintf(mdi->sys_name, "md%d", devnum);
-       else
-               sprintf(mdi->sys_name, "md_d%d",
-                       -1-devnum);
+       fmt_devname(mdi->sys_name, devnum);
 }
 
 
@@ -435,6 +431,17 @@ int sysfs_uevent(struct mdinfo *sra, char *event)
        return 0;
 }      
 
+/* Report whether a sysfs attribute file exists for an array or for one
+ * of its member devices.
+ *
+ * The path checked is /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>;
+ * the device component is left empty when 'dev' is NULL.
+ *
+ * Returns 1 if stat() succeeds on that path, 0 otherwise (including the
+ * case where the path does not fit in the local buffer).
+ */
+int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name)
+{
+       char fname[128];
+       struct stat st;
+       int len;
+
+       /* 'name' is caller supplied, so format with an explicit bound
+        * rather than trusting the path to fit.
+        */
+       len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                      sra->sys_name, dev?dev->sys_name:"", name);
+       if (len < 0 || (size_t)len >= sizeof(fname))
+               return 0; /* truncated path cannot name a real attribute */
+
+       return stat(fname, &st) == 0;
+}
+
 int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
                       char *name)
 {
@@ -562,6 +569,18 @@ int sysfs_set_array(struct mdinfo *info, int vers)
 
        if (info->array.level > 0)
                rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start);
+
+       if (info->reshape_active) {
+               rv |= sysfs_set_num(info, NULL, "reshape_position",
+                                   info->reshape_progress);
+               rv |= sysfs_set_num(info, NULL, "chunk_size", info->new_chunk);
+               rv |= sysfs_set_num(info, NULL, "layout", info->new_layout);
+               rv |= sysfs_set_num(info, NULL, "raid_disks",
+                                   info->array.raid_disks + info->delta_disks);
+               /* We don't set 'new_level' here.  That can only happen
+                * once the reshape completes.
+                */
+       }
        return rv;
 }
 
@@ -603,7 +622,8 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
                         * yet, so just ignore status for now.
                         */
                        sysfs_set_str(sra, sd, "state", "insync");
-               rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+               if (sd->disk.raid_disk >= 0)
+                       rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
                if (resume)
                        sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
        }
@@ -688,7 +708,7 @@ int sysfs_disk_to_scsi_id(int fd, __u32 *id)
        if (fstat(fd, &st))
                return 1;
 
-       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device/scsi_device",
                 major(st.st_rdev), minor(st.st_rdev));
 
        dir = opendir(path);
@@ -697,8 +717,7 @@ int sysfs_disk_to_scsi_id(int fd, __u32 *id)
 
        de = readdir(dir);
        while (de) {
-               if (strncmp("scsi_disk:", de->d_name,
-                           strlen("scsi_disk:")) == 0)
+               if (strchr(de->d_name, ':'))
                        break;
                de = readdir(dir);
        }
@@ -707,21 +726,20 @@ int sysfs_disk_to_scsi_id(int fd, __u32 *id)
        if (!de)
                return 1;
 
-       c1 = strchr(de->d_name, ':');
-       c1++;
+       c1 = de->d_name;
        c2 = strchr(c1, ':');
        *c2 = '\0';
        *id = strtol(c1, NULL, 10) << 24; /* host */
        c1 = c2 + 1;
        c2 = strchr(c1, ':');
        *c2 = '\0';
-       *id |= strtol(c1, NULL, 10) << 16; /* channel */
+       *id |= strtol(c1, NULL, 10) << 16; /* bus */
        c1 = c2 + 1;
        c2 = strchr(c1, ':');
        *c2 = '\0';
-       *id |= strtol(c1, NULL, 10) << 8; /* lun */
+       *id |= strtol(c1, NULL, 10) << 8; /* target */
        c1 = c2 + 1;
-       *id |= strtol(c1, NULL, 10); /* id */
+       *id |= strtol(c1, NULL, 10); /* lun */
 
        return 0;
 }
@@ -789,6 +807,28 @@ int sysfs_unique_holder(int devnum, long rdev)
                return found;
 }
 
+int sysfs_freeze_array(struct mdinfo *sra)
+{
+       /* Try to freeze resync/rebuild on this array/container.
+        * Return -1 if the array is busy, i.e. sync_action is neither
+        *   "idle" nor "frozen",
+        * return 0 if sync_action could not be read, or if writing
+        *   "frozen" failed (e.g. this kernel doesn't support 'frozen'),
+        * return 1 if it worked, or if there is no sync_action
+        *   attribute at all (treated as already frozen).
+        */
+       char buf[20];
+
+       if (!sysfs_attribute_available(sra, NULL, "sync_action"))
+               return 1; /* no sync_action == frozen */
+       /* Tie the length to the buffer so the two cannot drift apart. */
+       if (sysfs_get_str(sra, NULL, "sync_action", buf, sizeof(buf)) <= 0)
+               return 0;
+       /* The value read from sysfs carries its trailing newline. */
+       if (strcmp(buf, "idle\n") != 0 &&
+           strcmp(buf, "frozen\n") != 0)
+               return -1;
+       if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0)
+               return 0;
+       return 1;
+}
+
 #ifndef MDASSEMBLE
 
 static char *clean_states[] = {
diff --git a/test b/test
old mode 100644 (file)
new mode 100755 (executable)
index a31ad40..cb1398b
--- a/test
+++ b/test
@@ -50,11 +50,12 @@ ddfsize=65536
 
 cleanup() {
        udevadm settle
-       $mdadm -Ssq
+       $mdadm -Ssq 2> /dev/null
        for d in 0 1 2 3 4 5 6 7  8 9 10 11 12
        do
-           losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
-        done
+           losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
+           rm -f /dev/disk/by-path/loop*
+       done
 }
 
 trap cleanup 0 1 2 3 15
@@ -116,45 +117,45 @@ check() {
       ;;
     raid* | linear )
       grep -s "active $1 " /proc/mdstat > /dev/null || {
-               echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;}
+               echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;}
      ;;
     resync | recovery | reshape)
       sleep 0.5
       grep -s $1 /proc/mdstat > /dev/null || {
-               echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; }
+               echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; }
      ;;
-   
+
      nosync )
        sleep 0.5
-       if grep -s -E '(resync|recovery|reshape) =' > /dev/null /proc/mdstat ; then
-          echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; 
+       if grep -s -E '(resync|recovery|reshape) *=' > /dev/null /proc/mdstat ; then
+               echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1;
        fi
      ;;
-   
+
     wait )
       sleep 0.1
-      while grep -E '(resync|recovery|reshape|check|repair) =' > /dev/null /proc/mdstat
+      while grep -E '(resync|recovery|reshape|check|repair) *=' > /dev/null /proc/mdstat
       do sleep 2;
       done
       ;;
-   
+
     state )
        grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || {
-               echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; }
+               echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; }
        sleep 0.5
       ;;
 
     bitmap )
        grep -s bitmap > /dev/null /proc/mdstat || {
-          echo >&2 ERROR no bitmap ; cat /proc/mdstat ; exit 1; }
+               echo >&2 ERROR no bitmap ; cat /proc/mdstat ; exit 1; }
       ;;
     nobitmap )
-       if grep -s "bitmap" > /dev/null /proc/mdstat 
+       if grep -s "bitmap" > /dev/null /proc/mdstat
        then
-          echo >&2 ERROR bitmap present ; cat /proc/mdstat ; exit 1;
+               echo >&2 ERROR bitmap present ; cat /proc/mdstat ; exit 1;
        fi
       ;;
-   
+
     * ) echo >&2 ERROR unknown check $1 ; exit 1;
    esac
 }
@@ -196,42 +197,23 @@ rotest() {
   fsck -fn $dev >&2
 }
 
-setup_environment() {
-   if [ -f $1 ]; then
-      . $environment
-      setup_env
-   fi
-}
-
-reset_environment() {
-   if [ -f $1 ]; then
-      reset_env
-      unset setup_env
-      unset reset_env
-   fi
-}
-
 for script in tests/$prefix tests/$prefix*[^~]
 do
   if [ -f "$script" ]
   then
    rm -f $targetdir/stderr
    # stop all arrays, just incase some script left an array active.
-   mdadm -Ssq
+   $mdadm -Ssq 2> /dev/null
    mdadm --zero $devlist 2> /dev/null
    mdadm --zero $devlist 2> /dev/null
-   environment="tests/env-`basename $script`"
-   setup_environment $environment
    # source script in a subshell, so it has access to our
    # namespace, but cannot change it.
+   echo -ne "$script... "
    if ( set -ex ; . $script )  2> $targetdir/log
-   then echo "$script succeeded" 
-   else cat $targetdir/log ; cat $targetdir/stderr
-        echo "$script failed"
-       reset_environment $environment
+   then echo "succeeded"
+   else echo "FAILED - see $targetdir/log for details"
        exit 1
    fi
-   reset_environment $environment
   fi
 done
 exit 0
index 1a071efa8d41cefb5e64071bc5a78b65dd13fd15..30abd64ea0a45621c33c734ef93380779bd43893 100644 (file)
@@ -1,3 +1,59 @@
+imsm_check() {
+   case $1 in
+    container )
+      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+      ;;
+    member )
+      member=$2
+      num_disks=$3
+      level=$4
+      size=$5
+      offset=$6
+      err=0
+
+      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+      sysfs=/sys/dev/block/${major}:${minor}
+      if [ ! -f ${sysfs}/md/array_state ]; then
+           echo "member array $member not found" >&2
+           cat /proc/mdstat >&2
+           exit 1
+      fi
+      for i in `seq 0 $((num_disks-1))`
+      do
+         _offset=`cat ${sysfs}/md/rd${i}/offset`
+         if [ $offset -ne $((_offset/2)) ]; then
+           echo "offset mismatch expected $offset got $_offset" >&2
+            err=$((err+1))
+         fi
+         _size=`cat ${sysfs}/md/rd${i}/size`
+         if [ $size -ne $_size ]; then
+           echo "offset mismatch expected $size got $_size" >&2
+            err=$((err+1))
+         fi
+      done
+
+      if [ $err -gt 0 ]; then
+          echo "$member failed check" >&2
+          cat /proc/mdstat >&2
+         mdadm -E /dev/loop0 >&2
+          exit 1
+      fi
+      ;;
+    * ) echo >&2 ERROR unknown check $1 ; exit 1;
+   esac
+}
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_NO_PLATFORM=1
+container=/dev/md/container
+member0=/dev/md/vol0
+member1=/dev/md/vol1
+member2=/dev/md/vol2
+member3=/dev/md/vol3
+member4=/dev/md/vol4
+
+
 # create raid arrays with varying degress of overlap
 mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5
 imsm_check container 6
index 46e15bcf648310884d7b997dc95dbe3f898d1d3e..dd9fd6e83336c00a161cbfc00c4673ce3de713b4 100644 (file)
@@ -1,5 +1,32 @@
 # validate the prodigal member disk scenario i.e. a former container
 # member is returned after having been rebuilt on another system
+
+
+imsm_check_hold() {
+   if mdadm --remove $1 $2; then
+       echo "$2 removal from $1 should have been blocked" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2
+       exit 1
+   fi
+}
+
+imsm_check_removal() {
+   if ! mdadm --remove $1 $2 ; then
+       echo "$2 removal from $1 should have succeeded" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2
+       exit 1
+   fi
+}
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+export IMSM_NO_PLATFORM=1
+container=/dev/md/container
+member=/dev/md/vol0
+
+
 num_disks=4
 size=$((10*1024))
 mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3
index 68fdd09680729b26e44d9e231ff7fe39eb27e9e6..2539deb17310101829bd602e8170740d4d1f35a1 100644 (file)
@@ -1,5 +1,91 @@
 # sanity check array creation
 
+imsm_check_hold() {
+   if mdadm --remove $1 $2; then
+       echo "$2 removal from $1 should have been blocked" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2
+       exit 1
+   fi
+}
+
+imsm_check_removal() {
+   if ! mdadm --remove $1 $2 ; then
+       echo "$2 removal from $1 should have succeeded" >&2
+       cat /proc/mdstat >&2
+       mdadm -E $2
+       exit 1
+   fi
+}
+
+imsm_check() {
+   udevadm settle
+   case $1 in
+    container )
+      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
+               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
+      ;;
+    member )
+      member=$2
+      num_disks=$3
+      level=$4
+      size=$5
+      offset=$6
+      chunk=$7
+      err=0
+
+      if [ $level -ne 1 ]; then
+         size=$((size & ~(chunk - 1)))
+      else
+         chunk=64
+      fi
+      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
+      sysfs=/sys/dev/block/${major}:${minor}
+      if [ ! -f ${sysfs}/md/array_state ]; then
+           echo "member array $member not found" >&2
+           cat /proc/mdstat >&2
+           exit 1
+      fi
+      _chunk=`cat ${sysfs}/md/chunk_size`
+      if [ $chunk -ne $((_chunk/1024)) ]; then
+         echo "chunk mismatch expected $chunk got $_chunk" >&2
+         err=$((err+1))
+      fi
+      for i in `seq 0 $((num_disks-1))`
+      do
+         _offset=`cat ${sysfs}/md/rd${i}/offset`
+         if [ $offset -ne $((_offset/2)) ]; then
+           echo "offset mismatch expected $offset got $_offset" >&2
+            err=$((err+1))
+         fi
+         _size=`cat ${sysfs}/md/rd${i}/size`
+         if [ $size -ne $_size ]; then
+           echo "size mismatch expected $size got $_size" >&2
+            err=$((err+1))
+         fi
+      done
+
+      if [ $err -gt 0 ]; then
+          echo "$member failed check" >&2
+          cat /proc/mdstat >&2
+         mdadm -E /dev/loop0 >&2
+          exit 1
+      fi
+      ;;
+    * ) echo >&2 ERROR unknown check $1 ; exit 1;
+   esac
+}
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+export IMSM_NO_PLATFORM=1
+container=/dev/md/container
+member0=/dev/md/vol0
+member1=/dev/md/vol1
+member2=/dev/md/vol2
+member3=/dev/md/vol3
+member4=/dev/md/vol4
+
 # IMSM rounds to multiples of one mebibyte - 1024K
 DEV_ROUND_K=1024
 
diff --git a/tests/11spare-migration b/tests/11spare-migration
new file mode 100644 (file)
index 0000000..3567883
--- /dev/null
@@ -0,0 +1,453 @@
+# Set of tests for autorebuild functionality using mdadm -F
+# To be able to test ddf one must have all loop devices of bigger size, with the ones
+# above number 7 bigger again by any amount (this is not changed for now as it
+# could affect other tests)
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+export IMSM_NO_PLATFORM=1
+
+. tests/utils
+set -ex
+verbose="yes"
+sleeptime=10
+
+# if listfailed=yes then don't exit if test failed due to wrong
+# spare-migration and just print a list at the end. Other errors still
+# stop the test.
+# if listfailed=no then exit on first failure
+listfailed="yes"
+
+# start Monitor, set monitorpid
+# uses global scan variable
+# all parameters are numbers of devices to be monitored. only used when $scan="no"
+# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1
+monitor(){
+       [ -z $monitorpid ] || return
+       if [ "$scan" == "yes" ]; then
+               $mdadm -F -d 1 --scan --mail root@localhost &
+               monitorpid=$!
+               return
+       fi
+       unset mddevs
+       while [ -n "$1" ]
+       do
+               eval container=\$c$1
+               eval volumes=\$v$1
+               mddevs="$mddevs /dev/$container"
+               if [ "$container" != "$volumes" ]; then
+                       for vol in $volumes; do
+                               mddevs="$mddevs /dev/$vol"
+                       done
+               fi
+               shift
+       done
+       if [ -n "$mddevs" ]; then
+               if [ "$verbose" != "yes" ]; then
+                       $mdadm -F -d 1 $mddevs >&2 &
+                       monitorpid=$!
+               else
+                       $mdadm -F -t -d 1 $mddevs &
+                       monitorpid=$!
+               fi
+       fi
+       [ "$verbose" != "yes" ] || echo $mddevs $monitorpid
+}
+
+test0()
+{
+dsc "Test 0: No config file, no spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was not moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2 n
+tidyup
+}
+
+test0a()
+{
+dsc "Test 0a: No domains in config file, no spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was not moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2 n
+tidyup
+}
+
+test1()
+{
+dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with arrays and common domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test1a()
+{
+dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev1
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test2()
+{
+dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev2
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test3()
+{
+dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with 2 domains
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2
+createconfig domain-$platform"2" $platform spare 3 4 5
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+chksparemoved $c1 $c0 $dev5 n
+tidyup
+}
+
+test4()
+{
+dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, move if metadata allows"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+unset shouldmove
+[ "$platform" == "imsm" ] || shouldmove="n"
+chksparemoved $c1 $c0 $dev5 $shouldmove
+tidyup
+}
+
+test5()
+{
+dsc "Test 5: Two domains, two containers in each domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+setupdevs 2 5 6 $platform
+setupdevs 3 7 8 $platform
+# 2 and 9 for spares
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2 3 4
+createconfig domain-$platform"2" $platform spare 5 6 7 8 9
+monitor 0 1 2 3
+test5a
+test5b
+test5c
+tidyup
+}
+
+test5a()
+{
+dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved"
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v2 $dev5
+chksparemoved $c0 $c2 $dev2 n
+}
+
+test5b()
+{
+dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved"
+mdadm --fail /dev/$v1 $dev3
+chksparemoved $c0 $c1 $dev2
+}
+
+test5c()
+{
+dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved"
+mdadm -a /dev/$c3 $dev9
+chksparemoved $c3 $c2 $dev9
+}
+
+test6()
+{
+dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+# all devices in one domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v1 $dev8
+chksparemoved $c0 $c1 $dev2 n
+tidyup
+}
+
+test7()
+{
+dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9 10
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+
+test7a()
+{
+dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+#all $platform devices in one domain
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 8 9 10
+createconfig domain-$platform"2" $platform spare 2
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+chkspare $c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+test8()
+{
+# ddf does not have getinfo_super_disks implemented so skip this test
+return
+dsc "Test 8: imsm and ddf - spare should not be migrated"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 ddf
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12 n
+tidyup
+}
+
+test9()
+{
+dsc "Test 9: imsm and native 1.2 - one domain, no metadata specified, spare should be moved"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 1.2
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+test9a()
+{
+dsc "Test 9a: imsm and native 1.2 - spare in global domain, should be moved"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 1.2
+createconfig a
+createconfig domain-global noplatform spare 8 9 10 11 12
+createconfig domain-1.2 1.2 spare 8 9
+createconfig domain-imsm imsm spare 10 11
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+test10()
+{
+dsc "Test 10: Two arrays on the same devices in container"
+setupdevs 0 0 1 $platform 10000
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/md/sub0_ $dev0
+chksparemoved $c1 $c0 $dev2
+if [ $failed -eq 0 ]; then
+# now fail the spare and see if we get another one
+       mdadm --fail /dev/md/sub0_ $dev2
+       mdadm -a /dev/$c1 $dev5
+       chksparemoved $c1 $c0 $dev5
+fi
+tidyup
+}
+
+test11()
+{
+dsc "Test 11: Failed spare from other container should not be used"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev3
+#wait until recovery finishes so no degraded array in c1
+check wait
+mdadm --fail /dev/$v0 $dev0
+chksparemoved $c1 $c0 $dev3 n
+tidyup
+}
+
+test12()
+{
+dsc "Test 12: Only one spare should be taken for rebuild, second not needed"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm -a /dev/$c1 $dev5
+mdadm --fail /dev/$v0 $dev0
+sleep $sleeptime
+chkarray $dev2 n
+sc1=$c
+chkarray $dev5 n
+sc2=$c
+[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1"
+tidyup
+}
+
+# Test 13: both containers share one domain name, but the devices of the
+# second container are listed with action "include" instead of "spare".
+# A spare added to c1 is expected to migrate to c0 after a disk in v0 fails.
+test13()
+{
+dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should be moved regadless of action"
+setupdevs 0 0 1 $platform
+setupdevs 1 4 5 $platform
+# same domain but different action on 4 5 6
+createconfig a
+createconfig domain-$platform $platform spare 0 1
+createconfig domain-$platform $platform include 4 5 6
+monitor 0 1
+mdadm -a /dev/$c1 $dev6
+mdadm --fail /dev/$v0 $dev0
+# check the spare that was just added ($dev6) moved from c1 to c0
+chksparemoved $c1 $c0 $dev6
+tidyup
+}
+
+# Test 14: c0 is built on devices 8 and 9 with extra size arguments to
+# setupdevs, c1 on devices 0 and 1.  A spare added to c1 is expected to
+# migrate to c0 after a disk in v0 fails.
+test14()
+{
+dsc "Test 14: One domain, small array on big disks, check if small spare is accepted"
+setupdevs 0 8 9 $platform 10000 1
+setupdevs 1 0 1 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev9
+# check the spare that was just added ($dev2) moved from c1 to c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test15()
+{
+dsc "Test 15: spare in global domain for $platform metadata, should be moved"
+# this is like 9a but only one metadata used
+setupdevs 0 10 11 $platform
+setupdevs 1 8 9 $platform
+createconfig a
+createconfig domain-global $platform spare 8 9 10 11 12
+createconfig domain-1 $platform spare 8 9
+createconfig domain-2 $platform spare 10 11
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+try()
+{
+test0
+test0a
+test1
+test1a
+test2
+test3
+test4
+test5
+test6
+if [ "$platform" != "1.2" ]; then
+# this is because we can't have a small spare added to native array
+    test7
+    test7a
+fi
+test8
+test9
+test9a
+if [ "$platform" != "1.2" ]; then
+# we can't create two subarrays on the same devices for native (without
+# partitions)
+    test10
+fi
+test11
+test12
+test13
+test14
+test15
+}
+
+try_failed()
+{
+platform="1.2"
+scan="no"
+test5
+test9
+test13
+scan="yes"
+test9
+}
+
+#try_failed
+
+for scan in no yes; do
+       for platform in 1.2 imsm; do
+               try
+       done
+done
+
+# Print the list of failed tests unless listfailed=no or the list is empty.
+# The expansions are quoted so that a multi-word $flist is tested as one
+# string instead of turning [ -z ... ] into a syntax error.
+[ "$listfailed" == "no" ] || [ -z "$flist" ] || echo -e "\n FAILED TESTS: $flist"
+
+#cat $targetdir/log
+rm -f /dev/disk/by-path/loop*
diff --git a/tests/12imsm-r0_2d-grow-r0_3d b/tests/12imsm-r0_2d-grow-r0_3d
new file mode 100644 (file)
index 0000000..3c6cf74
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 3 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 3 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_2d-grow-r0_4d b/tests/12imsm-r0_2d-grow-r0_4d
new file mode 100644 (file)
index 0000000..e4fccda
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_2d-grow-r0_5d b/tests/12imsm-r0_2d-grow-r0_5d
new file mode 100644 (file)
index 0000000..388a5bb
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 5 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3 $dev4"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 3))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_3d-grow-r0_4d b/tests/12imsm-r0_3d-grow-r0_4d
new file mode 100644 (file)
index 0000000..7065f07
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 3 disks grow to RAID 0 volume, 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r5_3d-grow-r5_4d b/tests/12imsm-r5_3d-grow-r5_4d
new file mode 100644 (file)
index 0000000..097da0a
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$num_disks
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r5_3d-grow-r5_5d b/tests/12imsm-r5_3d-grow-r5_5d
new file mode 100644 (file)
index 0000000..2e5c7d2
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_4d b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d
new file mode 100644 (file)
index 0000000..f85efa5
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 2 disks to 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 128k chunk size
+#         RAID 0 volume in slot #1, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size
+#        RAID 0 volume in slot #1, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_5d b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d
new file mode 100644 (file)
index 0000000..1b851a9
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow both members from 2 disks to 5 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3 $dev4"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size
+#         RAID 0 volume in slot #1, 2 disks, 256k chunk size
+vol0_level=0
+vol0_comp_size=$((4 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((6 * 1024))
+vol1_chunk=256
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID 0 volume in slot #0, 5 disks, 64k chunk size
+#        RAID 0 volume in slot #1, 5 disks, 256k chunk size
+vol0_new_num_comps=$((num_disks + 3))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_3d-grow-r0_r0_4d b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d
new file mode 100644 (file)
index 0000000..27ba83b
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow a container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size
+#         RAID 0 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID0 volume in slot #0, 4 disks, 128k chunk size
+#        RAID0 volume in slot #1, 4 disks, 128k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_4d b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d
new file mode 100644 (file)
index 0000000..b4bde44
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 64k chunk size
+#         RAID 5 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID 0 volume in slot #0, 4 disks, 64k chunk size
+#        RAID 5 volume in slot #1, 4 disks, 128k chunk size
+vol1_new_num_comps=$num_disks
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_5d b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d
new file mode 100644 (file)
index 0000000..d0db9ae
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size
+#         RAID 5 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID 0 volume in slot #0, 5 disks, 128k chunk size
+#        RAID 5 volume in slot #1, 5 disks, 128k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+vol1_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_4d b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d
new file mode 100644 (file)
index 0000000..32ebc92
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size
+#         RAID 0 volume in slot #1, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_offset=$((vol0_comp_size + 2048))
+vol1_num_comps=$num_disks
+
+# After: RAID 5 volume in slot #0, 4 disks, 64k chunk size
+#        RAID 0 volume in slot #1, 4 disks, 64k chunk size
+vol0_new_num_comps=$num_disks
+vol1_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_5d b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d
new file mode 100644 (file)
index 0000000..a97002d
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size
+#         RAID 0 volume in slot #1, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_offset=$((vol0_comp_size + 2048))
+vol1_num_comps=$num_disks
+
+# After: RAID 5 volume in slot #0, 5 disks, 64k chunk size
+#        RAID 0 volume in slot #1, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+vol1_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d
new file mode 100644 (file)
index 0000000..386abee
--- /dev/null
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# RAID 0 and RAID 5 volumes (3 disks) migrate to RAID 5 and RAID 5 volumes (4 disks)
+# NEGATIVE test - migration is not allowed if there is more than one array in a container
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size, as member #0
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# Extra: RAID 5 volume, 3 disks, 64k chunk size, as member #1
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After: RAID 5 volume, 4 disks, 64k chunk size (only member #0)
+vol0_new_level=5
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/14imsm-r0_3d_no_spares-migrate-r5_3d b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d
new file mode 100644 (file)
index 0000000..10bbab6
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (3 disks, no spares) migrate to RAID 5 volume (3 disks)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 3 disks, 64k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/14imsm-r0_r0_2d-takeover-r10_4d b/tests/14imsm-r0_r0_2d-takeover-r10_4d
new file mode 100644 (file)
index 0000000..89d63a6
--- /dev/null
@@ -0,0 +1,31 @@
+. tests/env-imsm-template
+
+
+# Two RAID 0 volumes (2 disks) migrate to RAID 10 volume (4 disks)
+# NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size
+#         RAID 0 volume in slot #1, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# Slot #1: RAID 0 volume, 2 disks, 64k chunk size
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$(( $vol0_comp_size + 2048 ))
+
+# After: RAID 10, 4 disks, 64k chunk size
+vol0_new_level=10
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
+
diff --git a/tests/14imsm-r10_4d-grow-r10_5d b/tests/14imsm-r10_4d-grow-r10_5d
new file mode 100644 (file)
index 0000000..bcbe147
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 10 volume, 4 disks grow to RAID 10 volume, 5 disks
+# NEGATIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+spare_list="$dev4"
+
+# Before: RAID 10 volume, 4 disks, 128k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$((num_disks - 2))
+vol0_offset=0
+
+# After: RAID 10 volume, 5 disks, 128k chunk size (test should fail)
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r10_r5_4d-takeover-r0_2d b/tests/14imsm-r10_r5_4d-takeover-r0_2d
new file mode 100644 (file)
index 0000000..9e5205e
--- /dev/null
@@ -0,0 +1,30 @@
+. tests/env-imsm-template
+
+
+# Two RAID volumes: RAID10 and RAID5 (4 disks) migrate to RAID 0 volume (2 disks)
+# NEGATIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+
+# Before: RAID 10 volume in slot #0, 4 disks, 64k chunk size
+#         RAID 5 volume in slot #1, 4 disks, 64k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$(( $num_disks - 2 ))
+vol0_offset=0
+
+# Slot #1: RAID 5 volume, 4 disks, 64k chunk size
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$(( $num_disks - 1 ))
+vol1_offset=$(( $vol0_comp_size + 2048 ))
+
+# After: RAID 0, 2 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=2
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/14imsm-r1_2d-grow-r1_3d b/tests/14imsm-r1_2d-grow-r1_3d
new file mode 100644 (file)
index 0000000..1edd50e
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks grow to RAID 1 volume, 3 disks
+# NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev4"
+
+# Before: RAID 1 volume, 2 disks, 64k chunk size
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 1 volume, 3 disks, 64k chunk size (test should fail)
+vol0_new_num_comps=$num_disks
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r1_2d-takeover-r0_2d b/tests/14imsm-r1_2d-takeover-r0_2d
new file mode 100644 (file)
index 0000000..d829681
--- /dev/null
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks change to RAID 0 volume, 2 disks
+#
+#NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 1 volume, 2 disks, 64k chunk size
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0 volume, 2 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/14imsm-r5_3d-grow-r5_5d-no-spares b/tests/14imsm-r5_3d-grow-r5_5d-no-spares
new file mode 100644 (file)
index 0000000..ed18e72
--- /dev/null
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 5 disks, 64k chunk size
+add_to_num_disks=2
+vol0_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r5_3d-migrate-r4_3d b/tests/14imsm-r5_3d-migrate-r4_3d
new file mode 100644 (file)
index 0000000..e3b971c
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (3 disks) migrate to RAID 4 volume (3 disks)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 4, 3 disks, 64k chunk size
+vol0_new_level=4
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k
new file mode 100644 (file)
index 0000000..be59e3d
--- /dev/null
@@ -0,0 +1,24 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, Migration from 64k to 256k chunk size.
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# RAID 0, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# RAID 0, 2 disks, 256k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+
+. tests/imsm-grow-template 0 1
+
+
diff --git a/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k
new file mode 100644 (file)
index 0000000..025e9ef
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 4k to 256k chunk size.
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 4k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=4
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 3 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k
new file mode 100644 (file)
index 0000000..37547b7
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 64k to 256k chunk size.
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 3 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k
new file mode 100644 (file)
index 0000000..d2f6c70
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 4k to 256k chunk size.
+# POSITIVE test
+
+num_disks=6
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5"
+
+# RAID 5, 6 disks, 4k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=4
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 6 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k
new file mode 100644 (file)
index 0000000..da218ef
--- /dev/null
@@ -0,0 +1,34 @@
+. tests/env-imsm-template
+
+# Member 0: RAID 5 volume, Member 1: RAID 0 volume
+# Migration from 64k to 256k chunk size (both members)
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After migration parameters
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+# RAID 0, 3 disks, 64k chunk size
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 2048))
+
+# After migration parameters
+vol1_new_level=0
+vol1_new_num_comps=$vol1_num_comps
+vol1_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r0_3d-migrate-r5_4d b/tests/16imsm-r0_3d-migrate-r5_4d
new file mode 100644 (file)
index 0000000..4f45479
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (3 disks) migrate to RAID 5 volume (4 disks)
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 0, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 4 disks, 64k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r0_5d-migrate-r5_6d b/tests/16imsm-r0_5d-migrate-r5_6d
new file mode 100644 (file)
index 0000000..bee505b
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (5 disks) migrate to RAID 5 volume (6 disks)
+# POSITIVE test
+
+num_disks=5
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4"
+
+# Before: RAID 0, 5 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 6 disks, 64k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r5_3d-migrate-r0_3d b/tests/16imsm-r5_3d-migrate-r0_3d
new file mode 100644 (file)
index 0000000..5a83c41
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (3 disks) migrate to RAID 0 volume (3 disks)
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0, 3 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r5_5d-migrate-r0_5d b/tests/16imsm-r5_5d-migrate-r0_5d
new file mode 100644 (file)
index 0000000..ff5a2d8
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (5 disks) migration to RAID 0 volume (5 disks)
+# POSITIVE test
+
+num_disks=5
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4"
+
+# Before: RAID 5 volume, 5 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0 volume, 5 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d
new file mode 100644 (file)
index 0000000..bf462e2
--- /dev/null
@@ -0,0 +1,23 @@
+. tests/env-imsm-template
+
+# Create RAID 0 from a single disk.
+# POSITIVE test
+
+vol0_num_comps=1
+vol0_comp_size=$((10 * 1024))
+
+# Create container
+mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0
+wait
+imsm_check container $vol0_num_comps
+
+# Create RAID 0 volume
+mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0
+wait
+check wait
+
+# Test the member
+imsm_check member $member0 $vol0_num_comps 0 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64
+testdev $member0 $vol0_num_comps $vol0_comp_size 64
+
+exit 0
diff --git a/tests/18imsm-1d-takeover-r1_2d b/tests/18imsm-1d-takeover-r1_2d
new file mode 100644 (file)
index 0000000..fa02b6c
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# Create RAID 1 from a single disk
+# POSITIVE test
+
+vol0_num_comps=1
+vol0_comp_size=$((10 * 1024))
+
+# Create container
+mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0
+wait
+imsm_check container $vol0_num_comps
+
+# Create RAID 1 volume
+mdadm --create --run $member0 --auto=md --level=1 --size=$vol0_comp_size --chunk=64 --raid-disks=$((vol0_num_comps + 1)) $dev0 missing
+wait
+check wait
+
+# Test the member0
+imsm_check member $member0 $((vol_num_comps + 1)) 1 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64
+testdev $member0 $vol0_num_comps $vol0_comp_size 64
diff --git a/tests/18imsm-r0_2d-takeover-r10_4d b/tests/18imsm-r0_2d-takeover-r10_4d
new file mode 100644 (file)
index 0000000..a1c395b
--- /dev/null
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks change to RAID 10 volume, 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 0 volume, 2 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 10 volume, 4 disks, 128k chunk size
+vol0_new_level=10
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=128
+
+. tests/imsm-grow-template 0 1 1
diff --git a/tests/18imsm-r10_4d-takeover-r0_2d b/tests/18imsm-r10_4d-takeover-r0_2d
new file mode 100644 (file)
index 0000000..8a9606b
--- /dev/null
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 10 volume, 4 disks change to RAID 0 volume, 2 disks
+# POSITIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+
+# Before: RAID 10 volume, 4 disks, 128k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$((num_disks - 2))
+vol0_offset=0
+
+# After: RAID 0 volume, 2 disks, 128k chunk size
+vol0_new_level=0
+vol0_new_num_comps=2
+vol0_new_chunk=128
+new_num_disks=2
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/18imsm-r1_2d-takeover-r0_1d b/tests/18imsm-r1_2d-takeover-r0_1d
new file mode 100644 (file)
index 0000000..1697d60
--- /dev/null
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks change to RAID 0 volume, 1 disk
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 1 volume, 2 disks
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$(( $num_disks - 1 )) 
+vol0_offset=0
+
+# After: RAID 0 volume, 1 disk, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=1
+vol0_new_chunk=64
+new_num_disks=0
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/env-08imsm-overlap b/tests/env-08imsm-overlap
deleted file mode 100644 (file)
index 83557d3..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-imsm_check() {
-   case $1 in
-    container )
-      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
-               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
-      ;;
-    member )
-      member=$2
-      num_disks=$3
-      level=$4
-      size=$5
-      offset=$6
-      err=0
-
-      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
-      sysfs=/sys/dev/block/${major}:${minor}
-      if [ ! -f ${sysfs}/md/array_state ]; then
-           echo "member array $member not found" >&2
-           cat /proc/mdstat >&2
-           exit 1
-      fi
-      for i in `seq 0 $((num_disks-1))`
-      do
-         _offset=`cat ${sysfs}/md/rd${i}/offset`
-         if [ $offset -ne $((_offset/2)) ]; then
-           echo "offset mismatch expected $offset got $_offset" >&2
-            err=$((err+1))
-         fi
-         _size=`cat ${sysfs}/md/rd${i}/size`
-         if [ $size -ne $_size ]; then
-           echo "offset mismatch expected $size got $_size" >&2
-            err=$((err+1))
-         fi
-      done
-
-      if [ $err -gt 0 ]; then
-          echo "$member failed check" >&2
-          cat /proc/mdstat >&2
-         mdadm -E /dev/loop0 >&2
-          exit 1
-      fi
-      ;;
-    * ) echo >&2 ERROR unknown check $1 ; exit 1;
-   esac
-}
-
-setup_env() {
-       export IMSM_DEVNAME_AS_SERIAL=1
-       export IMSM_NO_PLATFORM=1
-       container=/dev/md/container
-       member0=/dev/md/vol0
-       member1=/dev/md/vol1
-       member2=/dev/md/vol2
-       member3=/dev/md/vol3
-       member4=/dev/md/vol4
-}
-
-reset_env() {
-       unset IMSM_DEVNAME_AS_SERIAL
-       unset IMSM_NO_PLATFORM
-       unset imsm_check
-       unset container
-       unset member0
-       unset member1
-       unset member2
-       unset member3
-       unset member4
-}
diff --git a/tests/env-09imsm-assemble b/tests/env-09imsm-assemble
deleted file mode 100644 (file)
index b12954b..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-imsm_check_hold() {
-   if mdadm --remove $1 $2; then
-       echo "$2 removal from $1 should have been blocked" >&2
-       cat /proc/mdstat >&2
-       mdadm -E $2
-       exit 1
-   fi
-}
-
-imsm_check_removal() {
-   if ! mdadm --remove $1 $2 ; then
-       echo "$2 removal from $1 should have succeeded" >&2
-       cat /proc/mdstat >&2
-       mdadm -E $2
-       exit 1
-   fi
-}
-
-setup_env() {
-       export IMSM_DEVNAME_AS_SERIAL=1
-       export IMSM_TEST_OROM=1
-       container=/dev/md/container
-       member=/dev/md/vol0
-}
-
-reset_env() {
-       unset IMSM_DEVNAME_AS_SERIAL
-       unset IMSM_TEST_OROM
-       unset imsm_check
-       unset container
-       unset member
-}
diff --git a/tests/env-09imsm-create-fail-rebuild b/tests/env-09imsm-create-fail-rebuild
deleted file mode 100644 (file)
index b44746c..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-imsm_check_hold() {
-   if mdadm --remove $1 $2; then
-       echo "$2 removal from $1 should have been blocked" >&2
-       cat /proc/mdstat >&2
-       mdadm -E $2
-       exit 1
-   fi
-}
-
-imsm_check_removal() {
-   if ! mdadm --remove $1 $2 ; then
-       echo "$2 removal from $1 should have succeeded" >&2
-       cat /proc/mdstat >&2
-       mdadm -E $2
-       exit 1
-   fi
-}
-
-imsm_check() {
-   udevadm settle
-   case $1 in
-    container )
-      grep -s "$(((418 * $2)/2)) blocks super external:imsm" /proc/mdstat > /dev/null || {
-               echo >&2 "ERROR correctly formed container not found"; cat /proc/mdstat; exit 1;}
-      ;;
-    member )
-      member=$2
-      num_disks=$3
-      level=$4
-      size=$5
-      offset=$6
-      chunk=$7
-      err=0
-
-      if [ $level -ne 1 ]; then
-         size=$((size & ~(chunk - 1)))
-      else
-         chunk=64
-      fi
-      eval `stat -L -c "let major=0x%t; let minor=0x%T;" $member`
-      sysfs=/sys/dev/block/${major}:${minor}
-      if [ ! -f ${sysfs}/md/array_state ]; then
-           echo "member array $member not found" >&2
-           cat /proc/mdstat >&2
-           exit 1
-      fi
-      _chunk=`cat ${sysfs}/md/chunk_size`
-      if [ $chunk -ne $((_chunk/1024)) ]; then
-         echo "chunk mismatch expected $chunk got $_chunk" >&2
-         err=$((err+1))
-      fi
-      for i in `seq 0 $((num_disks-1))`
-      do
-         _offset=`cat ${sysfs}/md/rd${i}/offset`
-         if [ $offset -ne $((_offset/2)) ]; then
-           echo "offset mismatch expected $offset got $_offset" >&2
-            err=$((err+1))
-         fi
-         _size=`cat ${sysfs}/md/rd${i}/size`
-         if [ $size -ne $_size ]; then
-           echo "size mismatch expected $size got $_size" >&2
-            err=$((err+1))
-         fi
-      done
-
-      if [ $err -gt 0 ]; then
-          echo "$member failed check" >&2
-          cat /proc/mdstat >&2
-         mdadm -E /dev/loop0 >&2
-          exit 1
-      fi
-      ;;
-    * ) echo >&2 ERROR unknown check $1 ; exit 1;
-   esac
-}
-
-setup_env() {
-       export IMSM_DEVNAME_AS_SERIAL=1
-       export IMSM_TEST_OROM=1
-       container=/dev/md/container
-       member0=/dev/md/vol0
-       member1=/dev/md/vol1
-       member2=/dev/md/vol2
-       member3=/dev/md/vol3
-       member4=/dev/md/vol4
-}
-
-reset_env() {
-       unset IMSM_DEVNAME_AS_SERIAL
-       unset IMSM_TEST_OROM
-       unset imsm_check
-       unset container
-       unset member0
-       unset member1
-       unset member2
-       unset member3
-       unset member4
-}
diff --git a/tests/env-imsm-template b/tests/env-imsm-template
new file mode 100644 (file)
index 0000000..7a2890c
--- /dev/null
@@ -0,0 +1,71 @@
+imsm_check() {  # usage: imsm_check container | imsm_check member DEV NDISKS LEVEL RD_SIZE SIZE OFFSET CHUNK [TAKEOVER10]
+    udevadm settle
+    case $1 in
+    container )  # only $1 is read here; extra arguments are ignored
+        grep -s "blocks super external:imsm" /proc/mdstat > /dev/null || {
+            echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; }
+        ;;
+    member )
+        t_member=$2
+        t_num_disks=$3
+        t_level=$4       # accepted but not checked anywhere below
+        t_rd_size=$5
+        t_size=$6
+        t_offset=$7
+        t_chunk=$8
+       t_takeover10=$9   # optional; non-empty selects raid0 -> raid10 takeover verification
+        # err counts mismatches so that all of them are reported before exiting
+        err=0
+        # find the member's sysfs directory from the major:minor of its device node
+        eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member`
+        sysfs=/sys/dev/block/${major}:${minor}
+        if [ ! -f ${sysfs}/md/array_state ]; then
+            echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1
+        fi
+        _chunk=`cat ${sysfs}/md/chunk_size`
+        if [ $t_chunk -ne $((_chunk/1024)) ]; then
+            echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2
+            err=$((err + 1))
+        fi
+       if [ -n "$t_takeover10" ] ; then
+               t_num_disks=$(( t_num_disks * 2 ))
+       fi
+        for i in `seq 0 $((t_num_disks - 1))`; do
+           if [ -n "$t_takeover10" ] && [ $(( i % 2 )) -ne 0 ] ; then
+               continue  # takeover mode: check even slots only (odd slots presumably hold the absent mirrors - confirm)
+           fi
+           _offset=`cat ${sysfs}/md/rd${i}/offset`
+           if [ $t_offset -ne $((_offset / 2)) ]; then
+               echo "**Error**: Offset mismatch - expected $t_offset, actual $_offset" >&2
+               err=$((err + 1))
+           fi
+           _rd_size=`cat ${sysfs}/md/rd${i}/size`
+           if [ $t_rd_size -ne $_rd_size ]; then
+               echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2
+               err=$((err + 1))
+           fi
+        done
+        _size=`cat ${sysfs}/md/array_size`
+        if [ $t_size -ne $_size ]; then
+            echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2
+            err=$((err + 1))
+        fi
+        if [ $err -gt 0 ]; then
+            echo "$t_member failed check" >&2
+            cat /proc/mdstat >&2
+            mdadm -E /dev/loop0 >&2
+            exit 1
+        fi
+        ;;
+    * )
+        echo >&2 "**Error** unknown check $1"; exit 1;
+    esac
+}
+
+export IMSM_NO_PLATFORM=1        # exported to child processes; its consumer is not visible in this file
+export IMSM_DEVNAME_AS_SERIAL=1  # exported to child processes; its consumer is not visible in this file
+export IMSM_TEST_OROM=1          # exported to child processes; its consumer is not visible in this file
+export MDADM_EXPERIMENTAL=1      # experimental() in util.c returns 0 with an error message unless this is set
+container=/dev/md/container      # device node names used by tests/imsm-grow-template
+member0=/dev/md/vol0
+member1=/dev/md/vol1
diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template
new file mode 100644 (file)
index 0000000..7c212c4
--- /dev/null
@@ -0,0 +1,104 @@
+
+# $1: 0 - POSITIVE test (--grow must succeed), otherwise NEGATIVE test (--grow must fail)
+negative_test=$1
+
+# $2: 0 - On-line Capacity Expansion of the container, otherwise LEVEL or CHUNK size migration of each member
+migration_test=$2
+
+# $3: optional; any non-empty value (e.g. 1) requests raid0 -> raid10 takeover verification
+takeover10_test=$3
+
+function grow_member() {  # args: member disks comps level size offset chunk - reshape one member, then verify it
+       local member=$1
+       local disks=$2
+       local comps=$3
+       local level=$4
+       local size=$5
+       local offset=$6
+       local chunk=$7
+       local array_size=$((comps * size))      # expected array size handed to imsm_check
+       # run mdadm in a subshell so that "set -ex" stays confined to this one command
+       ( set -ex; mdadm --grow $member --chunk=$chunk --level=$level --backup-file=/tmp/backup_imsm )
+       local status=$? # $? is expanded before "local" runs, so this is the subshell's exit status
+       if [ $negative_test -ne 0 ]; then
+               if [ $status -eq 0 ]; then
+                       echo >&2 "**Error**: $member: --grow should failed, but it completed successfuly"
+                       exit 1
+               fi
+               return  # negative test: the failure was expected, nothing left to verify
+       fi
+       check wait      # "check" and "testdev" are not defined in this file; presumably test-harness helpers
+       sleep 5
+       imsm_check member $member $disks $level $size $array_size $offset $chunk $takeover10_test
+       testdev $member $comps $size $chunk $takeover10_test
+}
+
+# Create container on $device_list (num_disks and device_list are not set in this file; the including test supplies them)
+mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list
+wait
+imsm_check container $num_disks  # the container check reads only its first argument; $num_disks is ignored
+
+# Create first volume inside the container
+mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list
+wait
+
+# Create second volume inside the container (only when vol1_chunk is set)
+if [ ! -z $vol1_chunk ]; then
+    mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list
+    wait
+fi
+
+# Wait for any RESYNC to complete ("check" is not defined in this file)
+check wait
+
+# Test first volume: expected array size is component size times number of components
+imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_num_comps)) $vol0_offset $vol0_chunk
+testdev $member0 $vol0_num_comps $vol0_comp_size $vol0_chunk
+
+# Test second volume (only when vol1_chunk is set)
+if [ ! -z $vol1_chunk ]; then
+    imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_num_comps)) $vol1_offset $vol1_chunk
+    testdev $member1 $vol1_num_comps $vol1_comp_size $vol1_chunk
+fi
+
+# Add extra disks to container if operation requires spares in container.
+for i in $spare_list
+do
+    mdadm --add $container $i
+    wait
+    num_disks=$((num_disks + 1))  # every added spare is counted in num_disks
+done
+
+imsm_check container $num_disks
+num_disks=$((num_disks + add_to_num_disks))  # add_to_num_disks is not set in this file; unset evaluates to 0 in $(( ))
+
+# Grow each member or a container depending on the type of an operation
+if [ $migration_test -ne 0 ]; then
+       if [ -z "$new_num_disks" ]; then
+               new_num_disks=$num_disks
+       fi
+       grow_member $member0 $new_num_disks $vol0_new_num_comps $vol0_new_level $vol0_comp_size $vol0_offset $vol0_new_chunk
+       if [[ $vol1_new_chunk -ne 0 ]] ; then
+               grow_member $member1 $new_num_disks $vol1_new_num_comps $vol1_new_level $vol1_comp_size $vol1_offset $vol1_new_chunk
+       fi
+else
+       ( set -x; mdadm --grow $container --raid-disks=$num_disks --backup-file=/tmp/backup_imsm )
+       grow_status=$?
+       if [ $negative_test -ne 0 ]; then
+               if [ $grow_status -eq 0 ]; then
+                       echo >&2 "**Error**: $container: --grow should failed, but it completed successfuly"
+                       exit 1
+               fi
+       else
+               check wait
+               sleep 5
+               imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_new_num_comps)) $vol0_offset $vol0_chunk
+               testdev $member0 $vol0_new_num_comps $vol0_comp_size $vol0_chunk
+               if [ "${vol1_new_num_comps:-0}" -ne 0 ]; then  # unset (no second volume) counts as 0 instead of breaking "["
+                       imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_new_num_comps)) $vol1_offset $vol1_chunk
+                       testdev $member1 $vol1_new_num_comps $vol1_comp_size $vol1_chunk
+               fi
+       fi
+fi
+
+exit 0
diff --git a/tests/utils b/tests/utils
new file mode 100644 (file)
index 0000000..1d45fa8
--- /dev/null
@@ -0,0 +1,192 @@
+# set of functions used to test policy framework with assemble, incremental and Monitor
+
+set +e  # failing commands must not abort the script; err() decides when to exit
+# for each of the 13 test devices: link it under /dev/disk/by-path so path-based domains can match it
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
+do
+       eval ln -s \$dev$d /dev/disk/by-path/loop$d
+       eval d$d="loop$d"
+       eval mdadm --zero-superblock \$dev$d
+done
+
+devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12"  # glob list passed to --zero-superblock in tidyup
+
+# report a failure: print the message, /etc/mdadm.conf, /proc/mdstat and running mdadm processes to stderr
+# uses globals testdsc, platform, verbose, listfailed; listfailed=yes records the failure instead of exiting
+err(){
+       echo >&2 "ERROR: $*"
+       cat /etc/mdadm.conf >&2 || true
+       cat /proc/mdstat >&2
+       [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; }
+       ps -e | grep mdadm >&2 || true
+       if [ "$listfailed" == "yes" ]; then  # quoted: an unset listfailed must compare as "", not break "["
+               [ "$verbose" != "yes" ] || echo ---FAILED---
+               flist="$flist \n $platform $testdsc"
+               failed=1
+       else
+               exit 1
+       fi
+}
+
+# start a test case: clear the failed flag and store the description in testdsc (echoed when verbose=yes)
+dsc(){
+       failed=0
+       testdsc="$*"
+       [ "$verbose" != "yes" ] || echo $testdsc
+}
+
+killmonitor(){  # SIGKILL the process whose pid is in $monitorpid, if set, then forget the pid
+       [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; }
+}
+
+tidyup(){  # cleanup: stop the monitor and all arrays, wipe superblocks on $devices, remove the generated config
+       killmonitor
+       mdadm -Ss || true
+       mdadm -Ss  # second stop pass; the first one may fail and is tolerated
+       mdadm --zero-superblock $devices || true
+       udevadm settle
+       rm -f /etc/mdadm.conf
+}
+
+trap tidyup 0 1 2 3 15  # run tidyup on shell exit (0) and on HUP, INT, QUIT, TERM
+
+# create a RAID 1 array, or (for imsm/ddf) a container plus subarray(s), on 2 disks
+# args: mddevno devno1 devno2 [platform] [subsize] [onearray]; platform defaults to imsm
+# if subsize is given, first subarray is created with given size and second one on remaining space
+ccv(){
+       # mddevno used to name created array
+       local mddevno="$1"
+       # numbers of devices to be used in array
+       local devno1="$2"
+       local devno2="$3"
+       local platform="$4"
+       local subsize="$5"
+       local onearray="$6"  # non-empty: do not create the unsized subarray sub$mddevno
+       [ -n "$platform" ] || platform="imsm"
+       if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then
+               eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2
+               udevadm settle
+               [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize
+               [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno
+       else
+               [ -z "$subsize" ] || sizepar="-z $subsize"  # sizepar is a global, cleared again below
+               eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar
+               unset sizepar
+       fi
+}
+
+# look up the given device in /proc/mdstat and report which arrays list it
+# sets globals: c = first field of the "inactive" line naming the device, v = first field of the " active" line
+getarray(){
+       local devname=`basename $1`
+       local platformtype=`grep -A 1 $devname /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 'END {print $1}'`
+       c=`grep "inactive.*$devname" /proc/mdstat | awk -F " " '{print $1}'`
+       v=`grep " active.*$devname" /proc/mdstat | awk -F " " '{print $1}'`
+       [ "$platformtype" == "external" ] || c=$v  # not external metadata: the array itself is used as c
+}
+
+# call err unless the given device is in a container (c) and, by default, also in a subarray (v)
+# if $2 is non-empty only the container is checked
+chkarray(){
+       local devname="$1"
+       local subcheck="$2"
+       getarray $devname
+       [ -n "$c" ] || err "$devname not in any container"
+       [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray"
+}
+
+# call err unless both devices are in the same container and, by default, the same subarray
+# $1 $2 - devices (only their basenames are used)
+# $3 non-empty: don't check subarrays, only containers
+tst(){
+       local device1=`basename $1`
+       local device2=`basename $2`
+       local subcheck="$3"
+       chkarray $device1 $subcheck
+       local x="$c"  # keep device1's results; the next chkarray overwrites globals c and v
+       local y="$v"
+       chkarray $device2 $subcheck
+       [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container"
+       [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray"
+}
+
+# same as tst, but takes device numbers N and resolves them through the $devN variables
+dtst(){
+       local devno1="$1"
+       local devno2="$2"
+       local subcheck="$3"
+       eval tst \$dev$devno1 \$dev$devno2 $subcheck
+}
+
+# create arrays with ccv, verify with dtst that both devices ended up together (subarrays included),
+# then set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc.
+setupdevs(){
+       local mddevno="$1"
+       local devno1="$2"
+       local devno2="$3"
+       local p="$4"
+       local subsize="$5"
+       local onearray="$6"
+       [ -n "$p" ] || p=$platform  # no platform argument: fall back to the global $platform
+       ccv $mddevno $devno1 $devno2 $p $subsize $onearray
+       dtst $devno1 $devno2
+       eval c$mddevno=\"$c\"
+       eval v$mddevno=\"$v\"
+}
+
+# call err unless the spare's container (c from getarray) matches the expectation
+# usage: chkspare container spare [n]  (n, or any value other than y, if spare shouldn't be in container)
+chkspare(){
+       local container=`basename $1`
+       local spare=$2
+       local expected=$3
+       getarray $spare
+       [ -n "$expected" ] || expected="y"
+       if [ "$expected" == "y" ]; then
+               [ "$c" == "$container" ] || err "$spare not in container $container"
+       else
+               [ "$c" != "$container" ] || err "$spare in container $container"
+       fi
+}
+
+# after sleeping $sleeptime, check if spare was moved from one container to another
+# args: from_container to_container spare [yn]  (default y; the variables below are globals, not local)
+# n when spare should remain in original container
+chksparemoved(){
+       sleep $sleeptime
+       from_container="$1"
+       to_container="$2"
+       spare="$3"
+       expected="$4"
+       [ -n "$expected" ] || expected="y"
+       notexpected="n"; [ "$expected" == "y" ] || notexpected="y"  # the source container gets the opposite expectation
+       chkspare $from_container $spare $notexpected
+       [ $failed -eq 1 ] || chkspare $to_container $spare $expected  # skipped when the first check already failed
+}
+
+
+# write /etc/mdadm.conf: "createconfig a" rewrites it from scratch, "createconfig domain metadata action N..." appends policy lines
+createconfig(){
+conf=/etc/mdadm.conf
+if [ "$1" != "a" ]; then
+{
+       domain=$1
+       metadata=$2
+       action=$3
+       while [ -n "$4" ]; do  # one policy line per remaining argument; shift moves the next one into $4
+               echo="policy domain=$domain"
+               [ "$metadata" == "noplatform" ] ||  echo="$echo metadata=$metadata"  # "noplatform" omits the metadata= key
+               echo="$echo path=loop$4"
+               echo="$echo action=$action"
+               echo "$echo"
+               shift
+       done
+} >> $conf
+else
+{
+       echo "DEVICES $devlist /dev/md1*"  # devlist is not set in this file
+       mdadm -Ebs
+} >  $conf
+fi
+#[ "$verbose" != "yes" ] || cat /etc/mdadm.conf | grep policy || true
+}
index f9607f37cd68e370fcd17702993dca8428f348cd..1d898332e4dee19c8467a077b133b88914c0beb3 100644 (file)
@@ -3,8 +3,10 @@
 SUBSYSTEM!="block", GOTO="md_end"
 
 # handle potential components of arrays
-ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name"
+ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}"
 ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
+ENV{ID_FS_TYPE}=="isw_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}"
+ENV{ID_FS_TYPE}=="isw_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
 
 # handle md arrays
 ACTION!="add|change", GOTO="md_end"
diff --git a/util.c b/util.c
index cbefaba45a0f3066ddc6116a7ef2582a8a0990c7..cc6ccb4d25f99ae737ce65996cd3d1c71d80477a 100644 (file)
--- a/util.c
+++ b/util.c
@@ -65,55 +65,7 @@ struct blkpg_partition {
        char volname[BLKPG_VOLNAMELTH]; /* volume label */
 };
 
-/* partition table structures so we can check metadata position
- * against the end of the last partition.
- * Only handle MBR ant GPT partition tables.
- */
-struct MBR_part_record {
-  __u8 bootable;
-  __u8 first_head;
-  __u8 first_sector;
-  __u8 first_cyl;
-  __u8 part_type;
-  __u8 last_head;
-  __u8 last_sector;
-  __u8 last_cyl;
-  __u32 first_sect_lba;
-  __u32 blocks_num;
-};
-
-struct MBR {
-       __u8 pad[446];
-       struct MBR_part_record parts[4];
-       __u16 magic;
-} __attribute__((packed));
-
-struct GPT_part_entry {
-  unsigned char type_guid[16];
-  unsigned char partition_guid[16];
-  __u64 starting_lba;
-  __u64 ending_lba;
-  unsigned char attr_bits[8];
-  unsigned char name[72];
-} __attribute__((packed));
-
-struct GPT {
-       __u64 magic;
-       __u32 revision;
-       __u32 header_size;
-       __u32 crc;
-       __u32 pad1;
-       __u64 current_lba;
-       __u64 backup_lba;
-       __u64 first_lba;
-       __u64 last_lba;
-       __u8 guid[16];
-       __u64 part_start;
-       __u32 part_cnt;
-       __u32 part_size;
-       __u32 part_crc;
-       __u8 pad2[420];
-} __attribute__((packed));
+#include "part.h"
 
 /* Force a compilation error if condition is true */
 #define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
@@ -124,14 +76,6 @@ struct GPT {
    aren't permitted). */
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
 
-
-/* MBR/GPT magic numbers */
-#define        MBR_SIGNATURE_MAGIC     __cpu_to_le16(0xAA55)
-#define        GPT_SIGNATURE_MAGIC     __cpu_to_le64(0x5452415020494645ULL)
-
-#define MBR_PARTITIONS               4
-#define MBR_GPT_PARTITION_TYPE       0xEE
-
 /*
  * Parse a 128 bit uuid in 4 integers
  * format is 32 hexx nibbles with options :.<space> separator
@@ -217,6 +161,31 @@ int get_linux_version()
 }
 
 #ifndef MDASSEMBLE
+int mdadm_version(char *version) /* parse "... - vA.B[.C]": returns A*1000000 + B*1000 + C, or -1 if malformed */
+{
+       int a, b, c;
+       char *cp;
+       /* a NULL argument means: parse the global Version string */
+       if (!version)
+               version = Version;
+       /* the number must directly follow the first '-', as "- v" */
+       cp = strchr(version, '-');
+       if (!cp || *(cp+1) != ' ' || *(cp+2) != 'v')
+               return -1;
+       cp += 3;
+       a = strtoul(cp, &cp, 10);
+       if (*cp != '.')
+               return -1;
+       b = strtoul(cp+1, &cp, 10);
+       if (*cp == '.')
+               c = strtoul(cp+1, &cp, 10);
+       else
+               c = 0;  /* third component is optional */
+       if (*cp != ' ' && *cp != '-')  /* the number must be followed by ' ' or '-' */
+               return -1;
+       return (a*1000000)+(b*1000)+c;
+}
+
 long long parse_size(char *size)
 {
        /* parse 'size' which should be a number optionally
@@ -326,6 +295,19 @@ int test_partition(int fd)
        return 1;
 }
 
+int test_partition_from_id(dev_t id) /* test_partition() on the device with this id; -1 if it cannot be opened */
+{
+       char buf[20];
+       int fd, rv;
+       /* "major:minor" is handed to dev_open(); snprintf keeps the write inside buf */
+       snprintf(buf, sizeof(buf), "%d:%d", major(id), minor(id));
+       fd = dev_open(buf, O_RDONLY);
+       if (fd < 0)
+               return -1;
+       rv = test_partition(fd);
+       close(fd);
+       return rv;
+}
 
 int enough(int level, int raid_disks, int layout, int clean,
           char *avail, int avail_disks)
@@ -406,13 +388,10 @@ int enough_fd(int fd)
 }
 
 
-const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 };
+const int uuid_zero[4] = { 0, 0, 0, 0 };
+
 int same_uuid(int a[4], int b[4], int swapuuid)
 {
-       if (memcmp(a, uuid_match_any, sizeof(int[4])) == 0 ||
-           memcmp(b, uuid_match_any, sizeof(int[4])) == 0)
-               return 1;
-
        if (swapuuid) {
                /* parse uuids are hostendian.
                 * uuid's from some superblocks are big-ending
@@ -556,7 +535,7 @@ int check_raid(int fd, char *name)
        /* Looks like a raid array .. */
        fprintf(stderr, Name ": %s appears to be part of a raid array:\n",
                name);
-       st->ss->getinfo_super(st, &info);
+       st->ss->getinfo_super(st, &info, NULL);
        st->ss->free_super(st);
        crtime = info.array.ctime;
        level = map_num(pers, info.array.level);
@@ -1079,11 +1058,16 @@ void wait_for(char *dev, int fd)
                dprintf("%s: timeout waiting for %s\n", __func__, dev);
 }
 
-struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
+struct superswitch *superlist[] =
+{
+       &super0, &super1,
+       &super_ddf, &super_imsm,
+       &mbr, &gpt,
+       NULL }; /* NULL ends the list; guess_super_type() iterates until it reaches it */
 
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
 
-struct supertype *super_by_fd(int fd)
+struct supertype *super_by_fd(int fd, char **subarrayp)
 {
        mdu_array_info_t array;
        int vers;
@@ -1094,6 +1078,7 @@ struct supertype *super_by_fd(int fd)
        char version[20];
        int i;
        char *subarray = NULL;
+       int container = NoMdDev;
 
        sra = sysfs_read(fd, 0, GET_VERSION);
 
@@ -1115,15 +1100,15 @@ struct supertype *super_by_fd(int fd)
        }
        if (minor == -2 && is_subarray(verstr)) {
                char *dev = verstr+1;
+
                subarray = strchr(dev, '/');
-               int devnum;
                if (subarray)
                        *subarray++ = '\0';
-               devnum = devname2devnum(dev);
                subarray = strdup(subarray);
+               container = devname2devnum(dev);
                if (sra)
                        sysfs_free(sra);
-               sra = sysfs_read(-1, devnum, GET_VERSION);
+               sra = sysfs_read(-1, container, GET_VERSION);
                if (sra && sra->text_version[0])
                        verstr = sra->text_version;
                else
@@ -1137,17 +1122,33 @@ struct supertype *super_by_fd(int fd)
                sysfs_free(sra);
        if (st) {
                st->sb = NULL;
-               if (subarray) {
-                       strncpy(st->subarray, subarray, 32);
-                       st->subarray[31] = 0;
-                       free(subarray);
-               } else
-                       st->subarray[0] = 0;
-       }
+               if (subarrayp)
+                       *subarrayp = subarray;
+               st->container_dev = container;
+               st->devnum = fd2devnum(fd);
+       } else
+               free(subarray);
+
        return st;
 }
 #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
 
+int dev_size_from_id(dev_t id, unsigned long long *size) /* 1 with *size filled in on success, 0 on any failure */
+{
+       char buf[20];
+       int fd;
+       /* "major:minor" is handed to dev_open(); snprintf keeps the write inside buf */
+       snprintf(buf, sizeof(buf), "%d:%d", major(id), minor(id));
+       fd = dev_open(buf, O_RDONLY);
+       if (fd < 0)
+               return 0;
+       if (get_dev_size(fd, NULL, size)) {
+               close(fd);
+               return 1;
+       }
+       close(fd);
+       return 0;
+}
 
 struct supertype *dup_super(struct supertype *orig)
 {
@@ -1162,13 +1163,12 @@ struct supertype *dup_super(struct supertype *orig)
        st->ss = orig->ss;
        st->max_devs = orig->max_devs;
        st->minor_version = orig->minor_version;
-       strcpy(st->subarray, orig->subarray);
        st->sb = NULL;
        st->info = NULL;
        return st;
 }
 
-struct supertype *guess_super(int fd)
+struct supertype *guess_super_type(int fd, enum guess_types guess_type)
 {
        /* try each load_super to find the best match,
         * and return the best superswitch
@@ -1180,14 +1180,21 @@ struct supertype *guess_super(int fd)
        int i;
 
        st = malloc(sizeof(*st));
+       memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
+
        for (i=0 ; superlist[i]; i++) {
                int rv;
                ss = superlist[i];
+               if (guess_type == guess_array && ss->add_to_super == NULL)
+                       continue;
+               if (guess_type == guess_partitions && ss->add_to_super != NULL)
+                       continue;
                memset(st, 0, sizeof(*st));
                rv = ss->load_super(st, fd, NULL);
                if (rv == 0) {
                        struct mdinfo info;
-                       st->ss->getinfo_super(st, &info);
+                       st->ss->getinfo_super(st, &info, NULL);
                        if (bestsuper == -1 ||
                            besttime < info.array.ctime) {
                                bestsuper = i;
@@ -1237,6 +1244,20 @@ int get_dev_size(int fd, char *dname, unsigned long long *sizep)
        return 1;
 }
 
+/* Return true if this can only be a container, not a member device.
+ * i.e. it is an md device and its size is zero or cannot be read
+ */
+int must_be_container(int fd)
+{
+       unsigned long long size;
+       if (md_get_version(fd) < 0)  /* negative result: treated as "not an md device" */
+               return 0;
+       if (get_dev_size(fd, NULL, &size) == 0)  /* size lookup failed */
+               return 1;
+       if (size == 0)
+               return 1;
+       return 0;
+}
 
 /* Sets endofpart parameter to the last block used by the last GPT partition on the device.
  * Returns: 1 if successful
@@ -1349,7 +1370,8 @@ static int get_last_partition_end(int fd, unsigned long long *endofpart)
        return retval;
 }
 
-int check_partitions(int fd, char *dname, unsigned long long freesize)
+int check_partitions(int fd, char *dname, unsigned long long freesize,
+                       unsigned long long size)
 {
        /*
         * Check where the last partition ends
@@ -1372,6 +1394,12 @@ int check_partitions(int fd, char *dname, unsigned long long freesize)
                                Name ": metadata will over-write last partition on %s.\n",
                                dname);
                        return 1;
+               } else if (size && endofpart > size) {
+                       /* partitions will be truncated in new device */
+                       fprintf(stderr,
+                               Name ": array size is too small to cover all partitions on %s.\n",
+                               dname);
+                       return 1;
                }
        }
        return 0;
@@ -1467,35 +1495,27 @@ int is_subarray_active(char *subarray, char *container)
        struct mdstat_ent *mdstat = mdstat_read(0, 0);
        struct mdstat_ent *ent;
 
-       for (ent = mdstat; ent; ent = ent->next) {
-               if (is_container_member(ent, container)) {
-                       char *inst = &ent->metadata_version[10+strlen(container)+1];
-
-                       if (!subarray || strcmp(inst, subarray) == 0)
+       for (ent = mdstat; ent; ent = ent->next)
+               if (is_container_member(ent, container))
+                       if (strcmp(to_subarray(ent, container), subarray) == 0)
                                break;
-               }
-       }
 
        free_mdstat(mdstat);
 
        return ent != NULL;
 }
 
-int is_container_active(char *container)
-{
-       return is_subarray_active(NULL, container);
-}
-
 /* open_subarray - opens a subarray in a container
  * @dev: container device name
- * @st: supertype with only ->subarray set
+ * @st: empty supertype
  * @quiet: block reporting errors flag
  *
  * On success returns an fd to a container and fills in *st
  */
-int open_subarray(char *dev, struct supertype *st, int quiet)
+int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet)
 {
        struct mdinfo *mdi;
+       struct mdinfo *info;
        int fd, err = 1;
 
        fd = open(dev, O_RDWR|O_EXCL);
@@ -1545,18 +1565,27 @@ int open_subarray(char *dev, struct supertype *st, int quiet)
                goto free_sysfs;
        }
 
-       if (st->ss->load_super(st, fd, NULL)) {
+       if (!st->ss->load_container) {
                if (!quiet)
-                       fprintf(stderr, Name ": Failed to find subarray-%s in %s\n",
-                               st->subarray, dev);
+                       fprintf(stderr, Name ": %s is not a container\n", dev);
                goto free_name;
        }
 
-       if (!st->loaded_container) {
+       if (st->ss->load_container(st, fd, NULL)) {
                if (!quiet)
-                       fprintf(stderr, Name ": %s is not a container\n", dev);
+                       fprintf(stderr, Name ": Failed to load metadata for %s\n",
+                               dev);
+               goto free_name;
+       }
+
+       info = st->ss->container_content(st, subarray);
+       if (!info) {
+               if (!quiet)
+                       fprintf(stderr, Name ": Failed to find subarray-%s in %s\n",
+                               subarray, dev);
                goto free_super;
        }
+       free(info);
 
        err = 0;
 
@@ -1608,6 +1637,21 @@ int add_disk(int mdfd, struct supertype *st,
        return rv;
 }
 
+int remove_disk(int mdfd, struct supertype *st,
+               struct mdinfo *sra, struct mdinfo *info)
+{
+       int rv;
+       /* Remove disk 'info': external metadata sets sysfs "slot" to "none", otherwise HOT_REMOVE_DISK ioctl */
+#ifndef MDASSEMBLE
+       if (st->ss->external)
+               rv = sysfs_set_str(sra, info, "slot", "none");
+       else
+#endif
+               rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major,
+                                                         info->disk.minor));
+       return rv;  /* result of whichever call was made; MDASSEMBLE builds always use the ioctl */
+}
+
 int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
 {
        /* Initialise kernel's knowledge of array.
@@ -1648,13 +1692,18 @@ unsigned long long min_recovery_start(struct mdinfo *array)
        return recovery_start;
 }
 
-char *devnum2devname(int num)
+void fmt_devname(char *name, int num)
 {
-       char name[100];
        if (num >= 0)
                sprintf(name, "md%d", num);
        else
                sprintf(name, "md_d%d", -1-num);
+}
+
+char *devnum2devname(int num)
+{
+       char name[100];
+       fmt_devname(name,num);
        return strdup(name);
 }
 
@@ -1852,6 +1901,7 @@ void append_metadata_update(struct supertype *st, void *buf, int len)
        mu->buf = buf;
        mu->len = len;
        mu->space = NULL;
+       mu->space_list = NULL;
        mu->next = NULL;
        *st->update_tail = mu;
        st->update_tail = &mu->next;
@@ -1863,3 +1913,73 @@ void append_metadata_update(struct supertype *st, void *buf, int len)
 unsigned int __invalid_size_argument_for_IOC = 0;
 #endif
 
+int experimental(void) /* gate for experimental features: 1 if allowed, 0 (after an error message) if not */
+{
+       if (check_env("MDADM_EXPERIMENTAL"))
+               return 1;
+       else {
+               fprintf(stderr, Name ": To use this feature MDADM_EXPERIMENTAL enviroment variable has to defined.\n");
+               return 0;
+       }
+}
+
+/* Return the container's disk list pruned to spares (disk.state == 0) that are
+ * at least min_size (min_size == 0: do not check size) and pass domain_test()
+ * against domlist (NULL: do not check domains; spare_group, if given, is added
+ * to each spare's policy first; metadata is passed on to domain_test()).
+ * get_one != 0 keeps only the first match.  NULL if there is no disk list. */
+struct mdinfo *container_choose_spares(struct supertype *st,
+                                      unsigned long long min_size,
+                                      struct domainlist *domlist,
+                                      char *spare_group,
+                                      const char *metadata, int get_one)
+{
+       struct mdinfo *d, **dp, *disks = NULL;
+
+       /* get list of all disks in container */
+       if (st->ss->getinfo_super_disks)
+               disks = st->ss->getinfo_super_disks(st);
+
+       if (!disks)
+               return disks;
+       /* find spare devices on the list */
+       dp = &disks->devs;
+       disks->array.spare_disks = 0;  /* recounted below: one per kept spare */
+       while (*dp) {
+               int found = 0;
+               d = *dp;
+               if (d->disk.state == 0) {
+                       /* check if size is acceptable */
+                       unsigned long long dev_size;
+                       dev_t dev = makedev(d->disk.major,d->disk.minor);
+
+                       if (!min_size ||
+                          (dev_size_from_id(dev,  &dev_size) &&
+                           dev_size >= min_size))
+                               found = 1;
+                       /* check if domain matches */
+                       if (found && domlist) {
+                               struct dev_policy *pol = devnum_policy(dev);
+                               if (spare_group)
+                                       pol_add(&pol, pol_domain,
+                                               spare_group, NULL);
+                               if (domain_test(domlist, pol, metadata) != 1)
+                                       found = 0;
+                               dev_policy_free(pol);
+                       }
+               }
+               if (found) {
+                       dp = &d->next;  /* keep d and advance to the link after it */
+                       disks->array.spare_disks++;
+                       if (get_one) {
+                               sysfs_free(*dp);  /* drop whatever follows d (presumably frees the chain - confirm) */
+                               d->next = NULL;  /* *dp is now NULL, which ends the loop */
+                       }
+               } else {
+                       *dp = d->next;  /* unlink d from the list ... */
+                       d->next = NULL;  /* ... detach it from its successors before freeing it */
+                       sysfs_free(d);
+               }
+       }
+       return disks;
+}