git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Merge branch 'master' into devel-3.2
author NeilBrown <neilb@suse.de>
Mon, 13 Dec 2010 03:00:05 +0000 (14:00 +1100)
committer NeilBrown <neilb@suse.de>
Mon, 13 Dec 2010 03:00:05 +0000 (14:00 +1100)
Conflicts:
super-intel.c

44 files changed:
Assemble.c
Build.c
Create.c
Detail.c
Examine.c
Grow.c
Incremental.c
Kill.c
Makefile
Manage.c
Monitor.c
Query.c
ReadMe.c
config.c
external-reshape-design.txt [new file with mode: 0644]
managemon.c
mapfile.c
mdadm.8.in
mdadm.c
mdadm.h
mdassemble.c
mdmon.c
mdstat.c
monitor.c
msg.c
msg.h
part.h [new file with mode: 0644]
policy.c [new file with mode: 0644]
restripe.c
super-ddf.c
super-gpt.c [new file with mode: 0644]
super-intel.c
super-mbr.c [new file with mode: 0644]
super0.c
super1.c
sysfs.c
test
tests/01r1fail
tests/10ddf-create
tests/11spare-migration [new file with mode: 0644]
tests/env-11spare-migration [new file with mode: 0644]
tests/utils [new file with mode: 0644]
udev-md-raid.rules
util.c

index afd4e60a283f7cfb641463e200775bda04b701b8..ac489e8741c82af3c3f7fe36361a1cc81a3161ef 100644 (file)
@@ -70,9 +70,71 @@ static int is_member_busy(char *metadata_version)
        return busy;
 }
 
+static int ident_matches(struct mddev_ident *ident,
+                        struct mdinfo *content,
+                        struct supertype *tst,
+                        char *homehost,
+                        char *update, char *devname)
+{
+       /* Decide whether the array described by 'content' is the one
+        * that 'ident' asks for.  Each constraint present in 'ident'
+        * (uuid, name, super-minor, raid level, number of raid disks,
+        * container member) is tested in turn; constraints that are
+        * not set are ignored.
+        *
+        * The uuid test is skipped when 'update' is "uuid" and the
+        * name test is skipped when 'update' is "name" (presumably
+        * because that field is about to be rewritten - confirm
+        * against the caller).
+        * 'homehost' is only handed on to name_matches(), and 'tst' is
+        * only used for tst->ss->swapuuid in the uuid comparison.
+        *
+        * If 'devname' is not NULL, the first failed test is reported
+        * on stderr using that name; if it is NULL nothing is printed.
+        *
+        * Returns 1 when every test passes, 0 at the first mismatch.
+        */
+       if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
+           same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong uuid.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
+           name_matches(content->name, ident->name, homehost)==0) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong name.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->super_minor != UnSet &&
+           ident->super_minor != content->array.md_minor) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong super-minor.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->level != UnSet &&
+           ident->level != content->array.level) {
+               if (devname)
+                       fprintf(stderr, Name ": %s has wrong raid level.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->raid_disks != UnSet &&
+           ident->raid_disks!= content->array.raid_disks) {
+               if (devname)
+                       fprintf(stderr, Name ": %s requires wrong number of drives.\n",
+                               devname);
+               return 0;
+       }
+       if (ident->member && ident->member[0]) {
+               /* A member was requested, so the part of
+                * content->text_version that follows the first '/'
+                * found after its initial character must equal
+                * ident->member.  When there is no such '/', the
+                * device is reported as not being a container.
+                * NOTE(review): presumably text_version has the
+                * form "/<container>/<member>" here - confirm.
+                */
+               char *s = strchr(content->text_version+1, '/');
+               if (s == NULL) {
+                       if (devname)
+                               fprintf(stderr, Name ": %s is not a container and one is required.\n",
+                                       devname);
+                       return 0;
+               } else if (strcmp(ident->member, s+1) != 0) {
+                       if (devname)
+                               fprintf(stderr, Name ": skipping wrong member %s is %s\n",
+                                       content->text_version, devname);
+                       return 0;
+               }
+       }
+       return 1;
+}
+                        
+
 int Assemble(struct supertype *st, char *mddev,
-            mddev_ident_t ident,
-            mddev_dev_t devlist, char *backup_file,
+            struct mddev_ident *ident,
+            struct mddev_dev *devlist,
+            char *backup_file, int invalid_backup,
             int readonly, int runstop,
             char *update, char *homehost, int require_homehost,
             int verbose, int force)
@@ -145,6 +207,7 @@ int Assemble(struct supertype *st, char *mddev,
                               */
                struct mdinfo i;
        } *devices;
+       char *devmap;
        int *best = NULL; /* indexed by raid_disk */
        int bestcnt = 0;
        int devcnt = 0;
@@ -160,7 +223,7 @@ int Assemble(struct supertype *st, char *mddev,
        int start_partial_ok = (runstop >= 0) && 
                (force || devlist==NULL || auto_assem);
        unsigned int num_devs;
-       mddev_dev_t tmpdev;
+       struct mddev_dev *tmpdev;
        struct mdinfo info;
        struct mdinfo *content = NULL;
        char *avail;
@@ -211,7 +274,6 @@ int Assemble(struct supertype *st, char *mddev,
                        num_devs++;
                tmpdev = tmpdev->next;
        }
-       devices = malloc(num_devs * sizeof(*devices));
 
        if (!st && ident->st) st = ident->st;
 
@@ -225,11 +287,13 @@ int Assemble(struct supertype *st, char *mddev,
         */
        for (tmpdev = devlist;
             tmpdev;
-            tmpdev = tmpdev->next) {
+            tmpdev = tmpdev ? tmpdev->next : NULL) {
                char *devname = tmpdev->devname;
                int dfd;
                struct stat stb;
                struct supertype *tst = dup_super(st);
+               struct dev_policy *pol = NULL;
+               int found_container = 0;
 
                if (tmpdev->used > 1) continue;
 
@@ -255,36 +319,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": %s is not a block device.\n",
                                devname);
                        tmpdev->used = 2;
-               } else if (!tst && (tst = guess_super(dfd)) == NULL) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": no recogniseable superblock on %s\n",
-                                       devname);
-                       tmpdev->used = 2;
-               } else if (tst->ss->load_super(tst,dfd, NULL)) {
-                       if (report_missmatch)
-                               fprintf( stderr, Name ": no RAID superblock on %s\n",
-                                        devname);
-               } else if (auto_assem && st == NULL &&
-                          !conf_test_metadata(tst->ss->name,
-                                              tst->ss->match_home(tst, homehost) == 1)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has metadata type %s for which "
-                                       "auto-assembly is disabled\n",
-                                       devname, tst->ss->name);
-                       tst->ss->free_super(tst);
-                       tmpdev->used = 2;
-               } else {
-                       content = &info;
-                       memset(content, 0, sizeof(*content));
-                       tst->ss->getinfo_super(tst, content);
-               }
-               if (dfd >= 0) close(dfd);
-
-               if (tst && tst->sb && tst->ss->container_content
-                   && tst->loaded_container) {
-                       /* tmpdev is a container.  We need to be either
-                        * looking for a member, or auto-assembling
-                        */
+               } else if (must_be_container(dfd)) {
                        if (st) {
                                /* already found some components, this cannot
                                 * be another one.
@@ -292,8 +327,78 @@ int Assemble(struct supertype *st, char *mddev,
                                if (report_missmatch)
                                        fprintf(stderr, Name ": %s is a container, but we are looking for components\n",
                                                devname);
-                               goto loop;
+                               tmpdev->used = 2;
+                       } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": not a recognisable container: %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (!tst->ss->load_container
+                                  || tst->ss->load_container(tst, dfd, NULL)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no correct container type: %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (auto_assem &&
+                                  !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)),
+                                                      tst->ss->match_home(tst, homehost) == 1)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": %s has metadata type %s for which "
+                                               "auto-assembly is disabled\n",
+                                               devname, tst->ss->name);
+                               tmpdev->used = 2;
+                       } else
+                               found_container = 1;
+               } else {
+                       if (!tst && (tst = guess_super(dfd)) == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no recogniseable superblock on %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (tst->ss->load_super(tst,dfd, NULL)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": no RAID superblock on %s\n",
+                                               devname);
+                               tmpdev->used = 2;
+                       } else if (tst->ss->compare_super == NULL) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": Cannot assemble %s metadata on %s\n",
+                                               tst->ss->name, devname);
+                               tmpdev->used = 2;
+                       } else if (auto_assem && st == NULL &&
+                                  !conf_test_metadata(tst->ss->name, (pol = devnum_policy(stb.st_rdev)),
+                                                      tst->ss->match_home(tst, homehost) == 1)) {
+                               if (report_missmatch)
+                                       fprintf(stderr, Name ": %s has metadata type %s for which "
+                                               "auto-assembly is disabled\n",
+                                               devname, tst->ss->name);
+                               tmpdev->used = 2;
                        }
+               }
+               if (dfd >= 0) close(dfd);
+               if (tmpdev->used == 2) {
+                       if (auto_assem)
+                               /* Ignore unrecognised devices during auto-assembly */
+                               goto loop;
+                       if (ident->uuid_set || ident->name[0] ||
+                           ident->super_minor != UnSet)
+                               /* Ignore unrecognised device if looking for
+                                * specific array */
+                               goto loop;
+                           
+
+                       fprintf(stderr, Name ": %s has no superblock - assembly aborted\n",
+                               devname);
+                       if (st)
+                               st->ss->free_super(st);
+                       dev_policy_free(pol);
+                       return 1;
+               }
+
+               if (found_container) {
+                       /* tmpdev is a container.  We need to be either
+                        * looking for a member, or auto-assembling
+                        */
 
                        if (ident->container) {
                                if (ident->container[0] == '/' &&
@@ -306,6 +411,11 @@ int Assemble(struct supertype *st, char *mddev,
                                if (ident->container[0] != '/') {
                                        /* we have a uuid */
                                        int uuid[4];
+
+                                       content = &info;
+                                       memset(content, 0, sizeof(*content));
+                                       tst->ss->getinfo_super(tst, content, NULL);
+
                                        if (!parse_uuid(ident->container, uuid) ||
                                            !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
                                                if (report_missmatch)
@@ -320,184 +430,105 @@ int Assemble(struct supertype *st, char *mddev,
                        if (verbose > 0)
                                fprintf(stderr, Name ": looking in container %s\n",
                                        devname);
-               next_member:
-                       if (tmpdev->content)
-                               content = tmpdev->content;
-                       else
-                               content = tst->ss->container_content(tst);
-                       if (!content)
-                               goto loop; /* empty container */
-
-                       tmpdev->content = content->next;
-                       if (tmpdev->content == NULL)
-                               tmpdev->used = 2;
-
-               } else if (ident->container || ident->member) {
-                       /* No chance of this matching if we don't have
-                        * a container */
-                       if (report_missmatch)
-                               fprintf(stderr, Name "%s is not a container, and one is required.\n",
-                                       devname);
-                       goto loop;
-               }
-
-               if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) &&
-                   (!tst || !tst->sb ||
-                    same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong uuid.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->name[0] && (!update || strcmp(update, "name")!= 0) &&
-                   (!tst || !tst->sb ||
-                    name_matches(content->name, ident->name, homehost)==0)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong name.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->super_minor != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->super_minor != content->array.md_minor)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong super-minor.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->level != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->level != content->array.level)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s has wrong raid level.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (ident->raid_disks != UnSet &&
-                   (!tst || !tst->sb ||
-                    ident->raid_disks!= content->array.raid_disks)) {
-                       if (report_missmatch)
-                               fprintf(stderr, Name ": %s requires wrong number of drives.\n",
-                                       devname);
-                       goto loop;
-               }
-               if (auto_assem) {
-                       if (tst == NULL || tst->sb == NULL)
-                               continue;
-               }
-               /* If we are this far, then we are nearly commited to this device.
-                * If the super_block doesn't exist, or doesn't match others,
-                * then we probably cannot continue
-                * However if one of the arrays is for the homehost, and
-                * the other isn't that can disambiguate.
-                */
 
-               if (!tst || !tst->sb) {
-                       fprintf(stderr, Name ": %s has no superblock - assembly aborted\n",
-                               devname);
-                       if (st)
-                               st->ss->free_super(st);
-                       return 1;
-               }
+                       for (content = tst->ss->container_content(tst, NULL);
+                            content;
+                            content = content->next) {
 
-               if (tst && tst->sb && tst->ss->container_content
-                   && tst->loaded_container) {
-                       /* we have the one container we need, don't keep
-                        * looking.  If the chosen member is active, skip.
-                        */
-                       if (is_member_busy(content->text_version)) {
-                               if (report_missmatch)
-                                       fprintf(stderr, Name ": member %s in %s is already assembled\n",
-                                               content->text_version,
-                                               devname);
-                       skip:
-                               if (tmpdev->content)
-                                       goto next_member;
-                               tst->ss->free_super(tst);
-                               tst = NULL;
-                               content = NULL;
-                               if (auto_assem)
-                                       goto loop;
-                               return 1;
-                       }
-                       if (ident->member && ident->member[0]) {
-                               char *s = strchr(content->text_version+1, '/');
-                               if (s == NULL) {
-                                       fprintf(stderr, Name ": badly formatted version: %s\n",
-                                               content->text_version);
-                                       goto skip;
-                               }
-                               if (strcmp(ident->member, s+1) != 0) {
+                               if (!ident_matches(ident, content, tst,
+                                                  homehost, update,
+                                                  report_missmatch ? devname : NULL))
+                                       /* message already printed */;
+                               else if (is_member_busy(content->text_version)) {
                                        if (report_missmatch)
-                                               fprintf(stderr,
-                                                       Name ": skipping wrong member %s\n",
-                                                       content->text_version);
-                                       goto skip;
-                               }
+                                               fprintf(stderr, Name ": member %s in %s is already assembled\n",
+                                                       content->text_version,
+                                                       devname);
+                               } else
+                                       break;
+                       }
+                       if (!content) {
+                               tmpdev->used = 2;
+                               goto loop; /* empty container */
                        }
+
                        st = tst; tst = NULL;
                        if (!auto_assem && inargv && tmpdev->next != NULL) {
                                fprintf(stderr, Name ": %s is a container, but is not "
                                        "only device given: confused and aborting\n",
                                        devname);
                                st->ss->free_super(st);
+                               dev_policy_free(pol);
                                return 1;
                        }
                        if (verbose > 0)
                                fprintf(stderr, Name ": found match on member %s in %s\n",
                                        content->text_version, devname);
-                       break;
-               }
-               if (st == NULL)
-                       st = dup_super(tst);
-               if (st->minor_version == -1)
-                       st->minor_version = tst->minor_version;
-               if (st->ss != tst->ss ||
-                   st->minor_version != tst->minor_version ||
-                   st->ss->compare_super(st, tst) != 0) {
-                       /* Some mismatch. If exactly one array matches this host,
-                        * we can resolve on that one.
-                        * Or, if we are auto assembling, we just ignore the second
-                        * for now.
-                        */
-                       if (auto_assem)
+
+                       /* make sure we finished the loop */
+                       tmpdev = NULL;
+                       goto loop;
+               } else {
+
+                       content = &info;
+                       memset(content, 0, sizeof(*content));
+                       tst->ss->getinfo_super(tst, content, NULL);
+
+                       if (!ident_matches(ident, content, tst,
+                                          homehost, update,
+                                          report_missmatch ? devname : NULL))
                                goto loop;
-                       if (homehost) {
-                               int first = st->ss->match_home(st, homehost);
-                               int last = tst->ss->match_home(tst, homehost);
-                               if (first != last &&
-                                   (first == 1 || last == 1)) {
-                                       /* We can do something */
-                                       if (first) {/* just ignore this one */
-                                               if (report_missmatch)
-                                                       fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
-                                                               devname);
-                                               goto loop;
-                                       } else { /* reject all those sofar */
-                                               mddev_dev_t td;
-                                               if (report_missmatch)
-                                                       fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
-                                                               devname);
-                                               for (td=devlist; td != tmpdev; td=td->next)
-                                                       if (td->used == 1)
-                                                               td->used = 0;
-                                               tmpdev->used = 1;
-                                               goto loop;
+
+                       if (st == NULL)
+                               st = dup_super(tst);
+                       if (st->minor_version == -1)
+                               st->minor_version = tst->minor_version;
+                       if (st->ss != tst->ss ||
+                           st->minor_version != tst->minor_version ||
+                           st->ss->compare_super(st, tst) != 0) {
+                               /* Some mismatch. If exactly one array matches this host,
+                                * we can resolve on that one.
+                                * Or, if we are auto assembling, we just ignore the second
+                                * for now.
+                                */
+                               if (auto_assem)
+                                       goto loop;
+                               if (homehost) {
+                                       int first = st->ss->match_home(st, homehost);
+                                       int last = tst->ss->match_home(tst, homehost);
+                                       if (first != last &&
+                                           (first == 1 || last == 1)) {
+                                               /* We can do something */
+                                               if (first) {/* just ignore this one */
+                                                       if (report_missmatch)
+                                                               fprintf(stderr, Name ": %s misses out due to wrong homehost\n",
+                                                                       devname);
+                                                       goto loop;
+                                               } else { /* reject all those sofar */
+                                                       struct mddev_dev *td;
+                                                       if (report_missmatch)
+                                                               fprintf(stderr, Name ": %s overrides previous devices due to good homehost\n",
+                                                                       devname);
+                                                       for (td=devlist; td != tmpdev; td=td->next)
+                                                               if (td->used == 1)
+                                                                       td->used = 0;
+                                                       tmpdev->used = 1;
+                                                       goto loop;
+                                               }
                                        }
                                }
+                               fprintf(stderr, Name ": superblock on %s doesn't match others - assembly aborted\n",
+                                       devname);
+                               tst->ss->free_super(tst);
+                               st->ss->free_super(st);
+                               dev_policy_free(pol);
+                               return 1;
                        }
-                       fprintf(stderr, Name ": superblock on %s doesn't match others - assembly aborted\n",
-                               devname);
-                       tst->ss->free_super(tst);
-                       st->ss->free_super(st);
-                       return 1;
+                       tmpdev->used = 1;
                }
-
-               tmpdev->used = 1;
-
        loop:
-               if (tmpdev->content)
-                       goto next_member;
+               dev_policy_free(pol);
+               pol = NULL;
                if (tst)
                        tst->ss->free_super(tst);
        }
@@ -507,7 +538,7 @@ int Assemble(struct supertype *st, char *mddev,
 
        /* Now need to open the array device.  Use create_mddev */
        if (content == &info)
-               st->ss->getinfo_super(st, content);
+               st->ss->getinfo_super(st, content, NULL);
 
        trustworthy = FOREIGN;
        name = content->name;
@@ -549,7 +580,6 @@ int Assemble(struct supertype *st, char *mddev,
                            chosen_name);
        if (mdfd < 0) {
                st->ss->free_super(st);
-               free(devices);
                if (auto_assem)
                        goto try_again;
                return 1;
@@ -575,7 +605,6 @@ int Assemble(struct supertype *st, char *mddev,
                close(mdfd);
                mdfd = -3;
                st->ss->free_super(st);
-               free(devices);
                if (auto_assem)
                        goto try_again;
                return 1;
@@ -592,6 +621,8 @@ int Assemble(struct supertype *st, char *mddev,
        /* Ok, no bad inconsistancy, we can try updating etc */
        bitmap_done = 0;
        content->update_private = NULL;
+       devices = malloc(num_devs * sizeof(*devices));
+       devmap = calloc(num_devs * content->array.raid_disks, 1);
        for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) {
                char *devname = tmpdev->devname;
                struct stat stb;
@@ -602,6 +633,7 @@ int Assemble(struct supertype *st, char *mddev,
                        /* prepare useful information in info structures */
                        struct stat stb2;
                        struct supertype *tst;
+                       int err;
                        fstat(mdfd, &stb2);
 
                        if (strcmp(update, "uuid")==0 &&
@@ -618,8 +650,6 @@ int Assemble(struct supertype *st, char *mddev,
                        }
                        dfd = dev_open(devname, O_RDWR|O_EXCL);
 
-                       remove_partitions(dfd);
-
                        tst = dup_super(st);
                        if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
                                fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n",
@@ -627,30 +657,45 @@ int Assemble(struct supertype *st, char *mddev,
                                if (dfd >= 0)
                                        close(dfd);
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
-                       tst->ss->getinfo_super(tst, content);
+                       tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
 
                        memcpy(content->uuid, ident->uuid, 16);
                        strcpy(content->name, ident->name);
                        content->array.md_minor = minor(stb2.st_rdev);
 
-                       tst->ss->update_super(tst, content, update,
-                                             devname, verbose,
-                                             ident->uuid_set, homehost);
+                       if (strcmp(update, "byteorder") == 0)
+                               err = 0;
+                       else
+                               err = tst->ss->update_super(tst, content, update,
+                                                           devname, verbose,
+                                                           ident->uuid_set,
+                                                           homehost);
+                       if (err < 0) {
+                               fprintf(stderr,
+                                       Name ": --update=%s not understood"
+                                       " for %s metadata\n",
+                                       update, tst->ss->name);
+                               tst->ss->free_super(tst);
+                               free(tst);
+                               close(mdfd);
+                               close(dfd);
+                               free(devices);
+                               free(devmap);
+                               return 1;
+                       }
                        if (strcmp(update, "uuid")==0 &&
                            !ident->uuid_set) {
                                ident->uuid_set = 1;
                                memcpy(ident->uuid, content->uuid, 16);
                        }
-                       if (dfd < 0)
-                               fprintf(stderr, Name ": Cannot open %s for superblock update\n",
-                                       devname);
-                       else if (tst->ss->store_super(tst, dfd))
+                       if (tst->ss->store_super(tst, dfd))
                                fprintf(stderr, Name ": Could not re-write superblock on %s.\n",
                                        devname);
-                       if (dfd >= 0)
-                               close(dfd);
+                       close(dfd);
 
                        if (strcmp(update, "uuid")==0 &&
                            ident->bitmap_fd >= 0 && !bitmap_done) {
@@ -669,17 +714,17 @@ int Assemble(struct supertype *st, char *mddev,
                        int dfd;
                        dfd = dev_open(devname, O_RDWR|O_EXCL);
 
-                       remove_partitions(dfd);
-
                        if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
                                fprintf(stderr, Name ": cannot re-read metadata from %s - aborting\n",
                                        devname);
                                if (dfd >= 0)
                                        close(dfd);
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
-                       tst->ss->getinfo_super(tst, content);
+                       tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
                        tst->ss->free_super(tst);
                        close(dfd);
                }
@@ -749,6 +794,8 @@ int Assemble(struct supertype *st, char *mddev,
                                           "the\n      DEVICE list in mdadm.conf"
                                        );
                                close(mdfd);
+                               free(devices);
+                               free(devmap);
                                return 1;
                        }
                        if (best[i] == -1
@@ -767,13 +814,15 @@ int Assemble(struct supertype *st, char *mddev,
                if (st)
                        st->ss->free_super(st);
                close(mdfd);
+               free(devices);
+               free(devmap);
                return 1;
        }
 
        if (update && strcmp(update, "byteorder")==0)
                st->minor_version = 90;
 
-       st->ss->getinfo_super(st, content);
+       st->ss->getinfo_super(st, content, NULL);
        clean = content->array.state & 1;
 
        /* now we have some devices that might be suitable.
@@ -800,11 +849,26 @@ int Assemble(struct supertype *st, char *mddev,
                                        sparecnt++;
                                continue;
                        }
+               /* If this device thinks that 'most_recent' has failed, then
+                * we must reject this device.
+                */
+               if (j != most_recent &&
+                   content->array.raid_disks > 0 &&
+                   devices[most_recent].i.disk.raid_disk >= 0 &&
+                   devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) {
+                       if (verbose > -1)
+                               fprintf(stderr, Name ": ignoring %s as it reports %s as failed\n",
+                                       devices[j].devname, devices[most_recent].devname);
+                       best[i] = -1;
+                       continue;
+               }
                if (devices[j].i.events+event_margin >=
                    devices[most_recent].i.events) {
                        devices[j].uptodate = 1;
                        if (i < content->array.raid_disks) {
-                               if (devices[j].i.recovery_start == MaxSector) {
+                               if (devices[j].i.recovery_start == MaxSector ||
+                                   (content->reshape_active &&
+                                    j >= content->array.raid_disks - content->delta_disks)) {
                                        okcnt++;
                                        avail[i]=1;
                                } else
@@ -813,6 +877,7 @@ int Assemble(struct supertype *st, char *mddev,
                                sparecnt++;
                }
        }
+       free(devmap);
        while (force && !enough(content->array.level, content->array.raid_disks,
                                content->array.layout, 1,
                                avail, okcnt)) {
@@ -914,6 +979,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Cannot open %s: %s\n",
                                devices[j].devname, strerror(errno));
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (st->ss->load_super(st,fd, NULL)) {
@@ -921,6 +987,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": RAID superblock has disappeared from %s\n",
                                devices[j].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                close(fd);
@@ -928,9 +995,10 @@ int Assemble(struct supertype *st, char *mddev,
        if (st->sb == NULL) {
                fprintf(stderr, Name ": No suitable drives found for %s\n", mddev);
                close(mdfd);
+               free(devices);
                return 1;
        }
-       st->ss->getinfo_super(st, content);
+       st->ss->getinfo_super(st, content, NULL);
 #ifndef MDASSEMBLE
        sysfs_init(content, mdfd, 0);
 #endif
@@ -991,6 +1059,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Could not open %s for write - cannot Assemble array.\n",
                                devices[chosen_drive].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (st->ss->store_super(st, fd)) {
@@ -998,6 +1067,7 @@ int Assemble(struct supertype *st, char *mddev,
                        fprintf(stderr, Name ": Could not re-write superblock on %s\n",
                                devices[chosen_drive].devname);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                close(fd);
@@ -1028,8 +1098,16 @@ int Assemble(struct supertype *st, char *mddev,
                        } else
                                fdlist[i] = -1;
                }
-               if (!err)
-                       err = Grow_restart(st, content, fdlist, bestcnt, backup_file, verbose > 0);
+               if (!err) {
+                       err = Grow_restart(st, content, fdlist, bestcnt,
+                                          backup_file, verbose > 0);
+                       if (err && invalid_backup) {
+                               if (verbose > 0)
+                                       fprintf(stderr, Name ": continuing"
+                                               " without restoring backup\n");
+                               err = 0;
+                       }
+               }
                while (i>0) {
                        i--;
                        if (fdlist[i]>=0) close(fdlist[i]);
@@ -1039,6 +1117,7 @@ int Assemble(struct supertype *st, char *mddev,
                        if (backup_file == NULL)
                                fprintf(stderr,"      Possibly you needed to specify the --backup-file\n");
                        close(mdfd);
+                       free(devices);
                        return err;
                }
        }
@@ -1064,6 +1143,7 @@ int Assemble(struct supertype *st, char *mddev,
                                mddev, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, NULL);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (ident->bitmap_fd >= 0) {
@@ -1071,6 +1151,7 @@ int Assemble(struct supertype *st, char *mddev,
                                fprintf(stderr, Name ": SET_BITMAP_FILE failed.\n");
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                } else if (ident->bitmap_file) {
@@ -1081,6 +1162,7 @@ int Assemble(struct supertype *st, char *mddev,
                                        ident->bitmap_file);
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                        if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
@@ -1088,6 +1170,7 @@ int Assemble(struct supertype *st, char *mddev,
                                close(bmfd);
                                ioctl(mdfd, STOP_ARRAY, NULL);
                                close(mdfd);
+                               free(devices);
                                return 1;
                        }
                        close(bmfd);
@@ -1104,6 +1187,12 @@ int Assemble(struct supertype *st, char *mddev,
                                j = chosen_drive;
 
                        if (j >= 0 /* && devices[j].uptodate */) {
+                               int dfd = dev_open(devices[j].devname,
+                                                  O_RDWR|O_EXCL);
+                               if (dfd >= 0) {
+                                       remove_partitions(dfd);
+                                       close(dfd);
+                               }
                                rv = add_disk(mdfd, st, content, &devices[j].i);
 
                                if (rv) {
@@ -1141,6 +1230,7 @@ int Assemble(struct supertype *st, char *mddev,
                        sysfs_uevent(content, "change");
                        wait_for(chosen_name, mdfd);
                        close(mdfd);
+                       free(devices);
                        return 0;
                }
 
@@ -1189,6 +1279,29 @@ int Assemble(struct supertype *st, char *mddev,
                                                                      (4 * content->array.chunk_size / 4096) + 1);
                                        }
                                }
+                               if (okcnt < (unsigned)content->array.raid_disks) {
+                                       /* If any devices did not get added
+                                        * because the kernel rejected them based
+                                        * on event count, try adding them
+                                        * again providing the action policy is
+                                        * 're-add' or greater.  The bitmap
+                                        * might allow them to be included, or
+                                        * they will become spares.
+                                        */
+                                       for (i = 0; i <= bestcnt; i++) {
+                                               int j = best[i];
+                                               if (j >= 0 && !devices[j].uptodate) {
+                                                       if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add))
+                                                               continue;
+                                                       rv = add_disk(mdfd, st, content,
+                                                                     &devices[j].i);
+                                                       if (rv == 0 && verbose >= 0)
+                                                               fprintf(stderr,
+                                                                       Name ": %s has been re-added.\n",
+                                                                       devices[j].devname);
+                                               }
+                                       }
+                               }
                                wait_for(mddev, mdfd);
                                close(mdfd);
                                if (auto_assem) {
@@ -1217,6 +1330,7 @@ int Assemble(struct supertype *st, char *mddev,
                                                usecs <<= 1;
                                        }
                                }
+                               free(devices);
                                return 0;
                        }
                        fprintf(stderr, Name ": failed to RUN_ARRAY %s: %s\n",
@@ -1237,6 +1351,7 @@ int Assemble(struct supertype *st, char *mddev,
                        if (auto_assem)
                                ioctl(mdfd, STOP_ARRAY, NULL);
                        close(mdfd);
+                       free(devices);
                        return 1;
                }
                if (runstop == -1) {
@@ -1246,6 +1361,7 @@ int Assemble(struct supertype *st, char *mddev,
                                fprintf(stderr, " (out of %d)", content->array.raid_disks);
                        fprintf(stderr, ", but not started.\n");
                        close(mdfd);
+                       free(devices);
                        return 0;
                }
                if (verbose >= -1) {
@@ -1275,6 +1391,7 @@ int Assemble(struct supertype *st, char *mddev,
                if (auto_assem)
                        ioctl(mdfd, STOP_ARRAY, NULL);
                close(mdfd);
+               free(devices);
                return 1;
        } else {
                /* The "chosen_drive" is a good choice, and if necessary, the superblock has
@@ -1291,6 +1408,7 @@ int Assemble(struct supertype *st, char *mddev,
 
        }
        close(mdfd);
+       free(devices);
        return 0;
 }
 
diff --git a/Build.c b/Build.c
index 7f3925864731bf63e0c1c1a10357a3476a95a2fe..cb9f01e33264376e63be43c2efd8da1672c96461 100644 (file)
--- a/Build.c
+++ b/Build.c
@@ -29,7 +29,7 @@
 #define STOP_MD                _IO (MD_MAJOR, 3)
 
 int Build(char *mddev, int chunk, int level, int layout,
-         int raiddisks, mddev_dev_t devlist, int assume_clean,
+         int raiddisks, struct mddev_dev *devlist, int assume_clean,
          char *bitmap_file, int bitmap_chunk, int write_behind,
          int delay, int verbose, int autof, unsigned long long size)
 {
@@ -50,7 +50,7 @@ int Build(char *mddev, int chunk, int level, int layout,
        int vers;
        struct stat stb;
        int subdevs = 0, missing_disks = 0;
-       mddev_dev_t dv;
+       struct mddev_dev *dv;
        int bitmap_fd;
        unsigned long long bitmapsize;
        int mdfd;
index 2bf7ebe2dfac380e2c7abe38ab2c7dce38360d50..7c6979ac2047ad68e55035585aef7c3e8f937143 100644 (file)
--- a/Create.c
+++ b/Create.c
@@ -31,8 +31,8 @@ static int default_layout(struct supertype *st, int level, int verbose)
 {
        int layout = UnSet;
 
-       if (st && st->ss->default_layout)
-               layout = st->ss->default_layout(level);
+       if (st && st->ss->default_geometry)
+               st->ss->default_geometry(st, &level, &layout, NULL);
 
        if (layout == UnSet)
                switch(level) {
@@ -66,11 +66,13 @@ static int default_layout(struct supertype *st, int level, int verbose)
 
 
 int Create(struct supertype *st, char *mddev,
-          int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
+          int chunk, int level, int layout, unsigned long long size,
+          int raiddisks, int sparedisks,
           char *name, char *homehost, int *uuid,
-          int subdevs, mddev_dev_t devlist,
+          int subdevs, struct mddev_dev *devlist,
           int runstop, int verbose, int force, int assume_clean,
-          char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof)
+          char *bitmap_file, int bitmap_chunk, int write_behind,
+          int delay, int autof)
 {
        /*
         * Create a new raid array.
@@ -93,7 +95,7 @@ int Create(struct supertype *st, char *mddev,
        char *mindisc = NULL;
        char *maxdisc = NULL;
        int dnum;
-       mddev_dev_t dv;
+       struct mddev_dev *dv;
        int fail=0, warn=0;
        struct stat stb;
        int first_missing = subdevs * 2;
@@ -120,15 +122,8 @@ int Create(struct supertype *st, char *mddev,
        int major_num = BITMAP_MAJOR_HI;
 
        memset(&info, 0, sizeof(info));
-
-       if (level == UnSet) {
-               /* "ddf" and "imsm" metadata only supports one level - should possibly
-                * push this into metadata handler??
-                */
-               if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
-                       level = LEVEL_CONTAINER;
-       }
-
+       if (level == UnSet && st && st->ss->default_geometry)
+               st->ss->default_geometry(st, &level, NULL, NULL);
        if (level == UnSet) {
                fprintf(stderr,
                        Name ": a RAID level is needed to create an array.\n");
@@ -169,15 +164,15 @@ int Create(struct supertype *st, char *mddev,
                    inf.raid_disks == 0) {
                        /* yep, looks like a container */
                        if (st) {
-                               rv = st->ss->load_super(st, fd,
-                                                       devlist->devname);
+                               rv = st->ss->load_container(st, fd,
+                                                           devlist->devname);
                                if (rv == 0)
                                        have_container = 1;
                        } else {
-                               st = guess_super(fd);
+                               st = super_by_fd(fd, NULL);
                                if (st && !(rv = st->ss->
-                                           load_super(st, fd,
-                                                      devlist->devname)))
+                                           load_container(st, fd,
+                                                          devlist->devname)))
                                        have_container = 1;
                                else
                                        st = NULL;
@@ -235,11 +230,9 @@ int Create(struct supertype *st, char *mddev,
        case 6:
        case 0:
                if (chunk == 0) {
-                       if (st && st->ss->default_chunk)
-                               chunk = st->ss->default_chunk(st);
-
+                       if (st && st->ss->default_geometry)
+                               st->ss->default_geometry(st, NULL, NULL, &chunk);
                        chunk = chunk ? : 512;
-
                        if (verbose > 0)
                                fprintf(stderr, Name ": chunk size defaults to %dK\n", chunk);
                }
@@ -404,6 +397,12 @@ int Create(struct supertype *st, char *mddev,
                        close(fd);
                }
        }
+       if (raiddisks + sparedisks > st->max_devs) {
+               fprintf(stderr, Name ": Too many devices:"
+                       " %s metadata only supports %d\n",
+                       st->ss->name, st->max_devs);
+               return 1;
+       }
        if (have_container)
                info.array.working_disks = raiddisks;
        if (fail) {
@@ -614,7 +613,7 @@ int Create(struct supertype *st, char *mddev,
 
        total_slots = info.array.nr_disks;
        sysfs_init(&info, mdfd, 0);
-       st->ss->getinfo_super(st, &info);
+       st->ss->getinfo_super(st, &info, NULL);
 
        if (did_default && verbose >= 0) {
                if (is_subarray(info.text_version)) {
@@ -672,7 +671,7 @@ int Create(struct supertype *st, char *mddev,
 
        sysfs_init(&info, mdfd, 0);
 
-       if (st->ss->external && st->subarray[0]) {
+       if (st->ss->external && st->container_dev != NoMdDev) {
                /* member */
 
                /* When creating a member, we need to be careful
@@ -735,7 +734,7 @@ int Create(struct supertype *st, char *mddev,
        infos = malloc(sizeof(*infos) * total_slots);
 
        for (pass=1; pass <=2 ; pass++) {
-               mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
+               struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
 
                for (dnum=0, dv = devlist ; dv ;
                     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
@@ -775,7 +774,8 @@ int Create(struct supertype *st, char *mddev,
                                if (have_container)
                                        fd = -1;
                                else {
-                                       if (st->ss->external && st->subarray[0])
+                                       if (st->ss->external &&
+                                           st->container_dev != NoMdDev)
                                                fd = open(dv->devname, O_RDWR);
                                        else
                                                fd = open(dv->devname, O_RDWR|O_EXCL);
@@ -797,7 +797,7 @@ int Create(struct supertype *st, char *mddev,
                                        ioctl(mdfd, STOP_ARRAY, NULL);
                                        goto abort;
                                }
-                               st->ss->getinfo_super(st, inf);
+                               st->ss->getinfo_super(st, inf, NULL);
                                safe_mode_delay = inf->safe_mode_delay;
 
                                if (have_container && verbose > 0)
@@ -842,7 +842,7 @@ int Create(struct supertype *st, char *mddev,
                         * again returns container info.
                         */
                        map_lock(&map);
-                       st->ss->getinfo_super(st, &info_new);
+                       st->ss->getinfo_super(st, &info_new, NULL);
                        if (st->ss->external && level != LEVEL_CONTAINER &&
                            !same_uuid(info_new.uuid, info.uuid, 0)) {
                                map_update(&map, fd2devnum(mdfd),
@@ -857,7 +857,7 @@ int Create(struct supertype *st, char *mddev,
                        if (me) {
                                char *path = strdup(me->path);
 
-                               st->ss->getinfo_super(st, &info_new);
+                               st->ss->getinfo_super(st, &info_new, NULL);
                                map_update(&map, st->container_dev,
                                           info_new.text_version,
                                           info_new.uuid, path);
@@ -880,20 +880,27 @@ int Create(struct supertype *st, char *mddev,
                wait_for(chosen_name, mdfd);
        } else if (runstop == 1 || subdevs >= raiddisks) {
                if (st->ss->external) {
+                       int err;
                        switch(level) {
                        case LEVEL_LINEAR:
                        case LEVEL_MULTIPATH:
                        case 0:
-                               sysfs_set_str(&info, NULL, "array_state",
-                                             "active");
+                               err = sysfs_set_str(&info, NULL, "array_state",
+                                                   "active");
                                need_mdmon = 0;
                                break;
                        default:
-                               sysfs_set_str(&info, NULL, "array_state",
-                                             "readonly");
+                               err = sysfs_set_str(&info, NULL, "array_state",
+                                                   "readonly");
                                break;
                        }
                        sysfs_set_safemode(&info, safe_mode_delay);
+                       if (err) {
+                               fprintf(stderr, Name ": failed to"
+                                       " activate array.\n");
+                               ioctl(mdfd, STOP_ARRAY, NULL);
+                               goto abort;
+                       }
                } else {
                        /* param is not actually used */
                        mdu_param_t param;
@@ -906,7 +913,7 @@ int Create(struct supertype *st, char *mddev,
                }
                if (verbose >= 0)
                        fprintf(stderr, Name ": array %s started.\n", mddev);
-               if (st->ss->external && st->subarray[0]) {
+               if (st->ss->external && st->container_dev != NoMdDev) {
                        if (need_mdmon)
                                start_mdmon(st->container_dev);
 
index e0817aac20298c442ea32f59b33ad7298d92171e..94156288a19fc50be64fd1c881cf6371c64bfdb7 100644 (file)
--- a/Detail.c
+++ b/Detail.c
@@ -49,8 +49,9 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        int is_rebuilding = 0;
        int failed = 0;
        struct supertype *st;
+       char *subarray = NULL;
        int max_disks = MD_SB_DISKS; /* just a default */
-       struct mdinfo info;
+       struct mdinfo *info = NULL;
        struct mdinfo *sra;
        char *member = NULL;
        char *container = NULL;
@@ -88,7 +89,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                return rv;
        }
        sra = sysfs_read(fd, 0, GET_VERSION);
-       st = super_by_fd(fd);
+       st = super_by_fd(fd, &subarray);
 
        if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode))
                stb.st_rdev = 0;
@@ -97,16 +98,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        if (st)
                max_disks = st->max_devs;
 
-       if (sra && is_subarray(sra->text_version) &&
-               strchr(sra->text_version+1, '/')) {
+       if (subarray) {
                /* This is a subarray of some container.
                 * We want the name of the container, and the member
                 */
-               char *s = strchr(sra->text_version+1, '/');
-               int dn;
-               *s++ = '\0';
-               member = s;
-               dn = devname2devnum(sra->text_version+1);
+               int dn = st->container_dev;
+
+               member = subarray;
                container = map_dev(dev2major(dn), dev2minor(dn), 1);
        }
 
@@ -143,11 +141,20 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                close(fd2);
                if (err)
                        continue;
-               st->ss->getinfo_super(st, &info);
+               if (info)
+                       free(info);
+               if (subarray)
+                       info = st->ss->container_content(st, subarray);
+               else {
+                       info = malloc(sizeof(*info));
+                       st->ss->getinfo_super(st, info, NULL);
+               }
+               if (!info)
+                       continue;
 
                if (array.raid_disks != 0 && /* container */
-                   (info.array.ctime != array.ctime ||
-                    info.array.level != array.level)) {
+                   (info->array.ctime != array.ctime ||
+                    info->array.level != array.level)) {
                        st->ss->free_super(st);
                        continue;
                }
@@ -160,7 +167,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                 * ->load_super.
                 */
                if (memcmp(uuid_match_any,
-                          info.uuid,
+                          info->uuid,
                           sizeof(uuid_match_any)) == 0) {
                        st->ss->free_super(st);
                        continue;
@@ -191,13 +198,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                       array.major_version, array.minor_version);
                }
                
-               if (st && st->sb) {
+               if (st && st->sb && info) {
                        char nbuf[64];
                        struct map_ent *mp, *map = NULL;
 
-                       fname_from_uuid(st, &info, nbuf, ':');
+                       fname_from_uuid(st, info, nbuf, ':');
                        printf("MD_UUID=%s\n", nbuf+5);
-                       mp = map_by_uuid(&map, info.uuid);
+                       mp = map_by_uuid(&map, info->uuid);
                        if (mp && mp->path &&
                            strncmp(mp->path, "/dev/md/", 8) == 0)
                                printf("MD_DEVNAME=%s\n", mp->path+8);
@@ -355,6 +362,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                if (atime)
                        printf("    Update Time : %.24s\n", ctime(&atime));
                if (array.raid_disks) {
+                       static char *sync_action[] = {", recovering",", resyncing",", reshaping",", checking"};
                        char *st;
                        if (avail_disks == array.raid_disks)
                                st = "";
@@ -367,8 +375,7 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                        printf("          State : %s%s%s%s\n",
                               (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
                               st,
-                              (!e || e->percent < 0) ? "" :
-                              (e->resync) ? ", resyncing": ", recovering",
+                              (!e || e->percent < 0) ? "" : sync_action[e->resync],
                               larray_size ? "": ", Not Started");
                }
                if (array.raid_disks)
@@ -410,50 +417,50 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
 
                if (e && e->percent >= 0) {
                        printf(" Re%s Status : %d%% complete\n",
-                              (st && st->sb && info.reshape_active)?
+                              (st && st->sb && info->reshape_active)?
                                  "shape":"build",
                               e->percent);
                        is_rebuilding = 1;
                }
                free_mdstat(ms);
 
-               if (st->sb && info.reshape_active) {
+               if (st->sb && info->reshape_active) {
 #if 0
 This is pretty boring
-                       printf("  Reshape pos'n : %llu%s\n", (unsigned long long) info.reshape_progress<<9,
-                              human_size((unsigned long long)info.reshape_progress<<9));
+                       printf("  Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9,
+                              human_size((unsigned long long)info->reshape_progress<<9));
 #endif
-                       if (info.delta_disks > 0)
+                       if (info->delta_disks > 0)
                                printf("  Delta Devices : %d, (%d->%d)\n",
-                                      info.delta_disks, array.raid_disks - info.delta_disks, array.raid_disks);
-                       if (info.delta_disks < 0)
+                                      info->delta_disks, array.raid_disks - info->delta_disks, array.raid_disks);
+                       if (info->delta_disks < 0)
                                printf("  Delta Devices : %d, (%d->%d)\n",
-                                      info.delta_disks, array.raid_disks, array.raid_disks + info.delta_disks);
-                       if (info.new_level != array.level) {
-                               char *c = map_num(pers, info.new_level);
+                                      info->delta_disks, array.raid_disks, array.raid_disks + info->delta_disks);
+                       if (info->new_level != array.level) {
+                               char *c = map_num(pers, info->new_level);
                                printf("      New Level : %s\n", c?c:"-unknown-");
                        }
-                       if (info.new_level != array.level ||
-                           info.new_layout != array.layout) {
-                               if (info.new_level == 5) {
-                                       char *c = map_num(r5layout, info.new_layout);
+                       if (info->new_level != array.level ||
+                           info->new_layout != array.layout) {
+                               if (info->new_level == 5) {
+                                       char *c = map_num(r5layout, info->new_layout);
                                        printf("     New Layout : %s\n",
                                               c?c:"-unknown-");
                                }
-                               if (info.new_level == 6) {
-                                       char *c = map_num(r6layout, info.new_layout);
+                               if (info->new_level == 6) {
+                                       char *c = map_num(r6layout, info->new_layout);
                                        printf("     New Layout : %s\n",
                                               c?c:"-unknown-");
                                }
-                               if (info.new_level == 10) {
+                               if (info->new_level == 10) {
                                        printf("     New Layout : near=%d, %s=%d\n",
-                                              info.new_layout&255,
-                                              (info.new_layout&0x10000)?"offset":"far",
-                                              (info.new_layout>>8)&255);
+                                              info->new_layout&255,
+                                              (info->new_layout&0x10000)?"offset":"far",
+                                              (info->new_layout>>8)&255);
                                }
                        }
-                       if (info.new_chunk != array.chunk_size)
-                               printf("  New Chunksize : %dK\n", info.new_chunk/1024);
+                       if (info->new_chunk != array.chunk_size)
+                               printf("  New Chunksize : %dK\n", info->new_chunk/1024);
                        printf("\n");
                } else if (e && e->percent >= 0)
                        printf("\n");
@@ -500,6 +507,7 @@ This is pretty boring
                else
                        printf("    Number   Major   Minor   RaidDevice\n");
        }
+       free(info);
 
        for (d= 0; d < max_disks; d++) {
                char *dv;
@@ -581,6 +589,7 @@ This is pretty boring
        free(disks);
 out:
        close(fd);
+       free(subarray);
        return rv;
 }
 
index 7fbd4ae2cf932e5decd61b5a32c97177d1fe8dbb..ffca9ca5f9d615721b68db3a9af0a1d2eda57788 100644 (file)
--- a/Examine.c
+++ b/Examine.c
@@ -30,7 +30,7 @@
 #endif
 #include       "md_u.h"
 #include       "md_p.h"
-int Examine(mddev_dev_t devlist, int brief, int export, int scan,
+int Examine(struct mddev_dev *devlist, int brief, int export, int scan,
            int SparcAdjust, struct supertype *forcest,
            char *homehost)
 {
@@ -64,6 +64,7 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
 
        for (; devlist ; devlist=devlist->next) {
                struct supertype *st;
+               int have_container = 0;
 
                fd = dev_open(devlist->devname, O_RDONLY);
                if (fd < 0) {
@@ -75,15 +76,29 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                        err = 1;
                }
                else {
+                       int container = 0;
                        if (forcest)
                                st = dup_super(forcest);
-                       else
+                       else if (must_be_container(fd)) {
+                               /* might be a container */
+                               st = super_by_fd(fd, NULL);
+                               container = 1;
+                       } else
                                st = guess_super(fd);
-                       if (st)
-                               err = st->ss->load_super(st, fd,
-                                                        (brief||scan) ? NULL
-                                                          :devlist->devname);
-                       else {
+                       if (st) {
+                               err = 1;
+                               if (!container)
+                                       err = st->ss->load_super(st, fd,
+                                                                (brief||scan) ? NULL
+                                                                :devlist->devname);
+                               if (err && st->ss->load_container) {
+                                       err = st->ss->load_container(st, fd,
+                                                                (brief||scan) ? NULL
+                                                                :devlist->devname);
+                                       if (!err)
+                                               have_container = 1;
+                               }
+                       } else {
                                if (!brief) {
                                        fprintf(stderr, Name ": No md superblock detected on %s.\n", devlist->devname);
                                        rv = 1;
@@ -100,7 +115,11 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                             devlist->devname, 0, 0, NULL);
                /* Ok, its good enough to try, though the checksum could be wrong */
 
-               if (brief) {
+               if (brief && st->ss->brief_examine_super == NULL) {
+                       if (!scan)
+                               fprintf(stderr, Name ": No brief listing for %s on %s\n",
+                                       st->ss->name, devlist->devname);
+               } else if (brief) {
                        struct array *ap;
                        char *d;
                        for (ap=arrays; ap; ap=ap->next) {
@@ -115,10 +134,10 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                ap->spares = 0;
                                ap->st = st;
                                arrays = ap;
-                               st->ss->getinfo_super(st, &ap->info);
+                               st->ss->getinfo_super(st, &ap->info, NULL);
                        } else
-                               st->ss->getinfo_super(st, &ap->info);
-                       if (!st->loaded_container &&
+                               st->ss->getinfo_super(st, &ap->info, NULL);
+                       if (!have_container &&
                            !(ap->info.disk.state & (1<<MD_DISK_SYNC)))
                                ap->spares++;
                        d = dl_strdup(devlist->devname);
diff --git a/Grow.c b/Grow.c
index 0571f5b0e16d170e5bb9671d7f62df5f0cefa65c..222d755c81d387de82339a6adc95bf7c67daafa6 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -51,33 +51,41 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
        int nfd, fd2;
        int d, nd;
        struct supertype *st = NULL;
-
+       char *subarray = NULL;
 
        if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
                fprintf(stderr, Name ": cannot get array info for %s\n", devname);
                return 1;
        }
 
-       st = super_by_fd(fd);
+       if (info.array.level != -1) {
+               fprintf(stderr, Name ": can only add devices to linear arrays\n");
+               return 1;
+       }
+
+       st = super_by_fd(fd, &subarray);
        if (!st) {
                fprintf(stderr, Name ": cannot handle arrays with superblock version %d\n", info.array.major_version);
                return 1;
        }
 
-       if (info.array.level != -1) {
-               fprintf(stderr, Name ": can only add devices to linear arrays\n");
-               return 1;
+       if (subarray) {
+               fprintf(stderr, Name ": Cannot grow linear sub-arrays yet\n");
+               free(subarray);
+               free(st);
        }
 
        nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
        if (nfd < 0) {
                fprintf(stderr, Name ": cannot open %s\n", newdev);
+               free(st);
                return 1;
        }
        fstat(nfd, &stb);
        if ((stb.st_mode & S_IFMT) != S_IFBLK) {
                fprintf(stderr, Name ": %s is not a block device!\n", newdev);
                close(nfd);
+               free(st);
                return 1;
        }
        /* now check out all the devices and make sure we can read the superblock */
@@ -85,28 +93,37 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
                mdu_disk_info_t disk;
                char *dv;
 
+               st->ss->free_super(st);
+
                disk.number = d;
                if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
                        fprintf(stderr, Name ": cannot get device detail for device %d\n",
                                d);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
                dv = map_dev(disk.major, disk.minor, 1);
                if (!dv) {
                        fprintf(stderr, Name ": cannot find device file for device %d\n",
                                d);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
                fd2 = dev_open(dv, O_RDWR);
                if (!fd2) {
                        fprintf(stderr, Name ": cannot open device file %s\n", dv);
+                       close(nfd);
+                       free(st);
                        return 1;
                }
-               st->ss->free_super(st);
 
                if (st->ss->load_super(st, fd2, NULL)) {
                        fprintf(stderr, Name ": cannot find super block on %s\n", dv);
+                       close(nfd);
                        close(fd2);
+                       free(st);
                        return 1;
                }
                close(fd2);
@@ -204,16 +221,17 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
        mdu_bitmap_file_t bmf;
        mdu_array_info_t array;
        struct supertype *st;
+       char *subarray = NULL;
        int major = BITMAP_MAJOR_HI;
        int vers = md_get_version(fd);
        unsigned long long bitmapsize, array_size;
 
        if (vers < 9003) {
                major = BITMAP_MAJOR_HOSTENDIAN;
-#ifdef __BIG_ENDIAN
-               fprintf(stderr, Name ": Warning - bitmaps created on this kernel are not portable\n"
-                       "  between different architectured.  Consider upgrading the Linux kernel.\n");
-#endif
+               fprintf(stderr, Name ": Warning - bitmaps created on this kernel"
+                       " are not portable\n"
+                       "  between different architectures.  Consider upgrading"
+                       " the Linux kernel.\n");
        }
 
        if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
@@ -253,6 +271,11 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                        devname);
                return 1;
        }
+
+       if (strcmp(file, "none") == 0) {
+               fprintf(stderr, Name ": no bitmap found on %s\n", devname);
+               return 1;
+       }
        if (array.level <= 0) {
                fprintf(stderr, Name ": Bitmaps not meaningful with level %s\n",
                        map_num(pers, array.level)?:"of this array");
@@ -277,16 +300,19 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                bitmapsize = bitmapsize * array.raid_disks / ncopies;
        }
 
-       st = super_by_fd(fd);
+       st = super_by_fd(fd, &subarray);
        if (!st) {
                fprintf(stderr, Name ": Cannot understand version %d.%d\n",
                        array.major_version, array.minor_version);
                return 1;
        }
-       if (strcmp(file, "none") == 0) {
-               fprintf(stderr, Name ": no bitmap found on %s\n", devname);
+       if (subarray) {
+               fprintf(stderr, Name ": Cannot add bitmaps to sub-arrays yet\n");
+               free(subarray);
+               free(st);
                return 1;
-       } else if (strcmp(file, "internal") == 0) {
+       }
+       if (strcmp(file, "internal") == 0) {
                int d;
                if (st->ss->add_internal_bitmap == NULL) {
                        fprintf(stderr, Name ": Internal bitmaps not supported "
@@ -327,6 +353,10 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                }
                array.state |= (1<<MD_SB_BITMAP_PRESENT);
                if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
+                       if (errno == EBUSY)
+                               fprintf(stderr, Name
+                                       ": Cannot add bitmap while array is"
+                                       " resyncing or reshaping etc.\n");
                        fprintf(stderr, Name ": failed to set internal bitmap.\n");
                        return 1;
                }
@@ -373,8 +403,13 @@ int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int
                        return 1;
                }
                if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+                       int err = errno;
+                       if (errno == EBUSY)
+                               fprintf(stderr, Name
+                                       ": Cannot add bitmap while array is"
+                                       " resyncing or reshaping etc.\n");
                        fprintf(stderr, Name ": Cannot set bitmap file for %s: %s\n",
-                               devname, strerror(errno));
+                               devname, strerror(err));
                        return 1;
                }
        }
@@ -409,7 +444,7 @@ static struct mdp_backup_super {
        __u8 pad[512-68-32];
 } __attribute__((aligned(512))) bsb, bsb2;
 
-__u32 bsb_csum(char *buf, int len)
+static __u32 bsb_csum(char *buf, int len)
 {
        int i;
        int csum = 0;
@@ -432,53 +467,537 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
                           int disks, int chunk, int level, int layout, int data,
                           int dests, int *destfd, unsigned long long *destoffsets);
 
-int freeze_array(struct mdinfo *sra)
+static int freeze_container(struct supertype *st)
+{
+       /* Block the monitor of the container that 'st' belongs to.
+        * The container is st->container_dev unless that is NoMdDev,
+        * in which case st->devnum is used.
+        * Return 1 if block_monitor() succeeded,
+        * return -2 (after reporting why) if the container name could
+        * not be determined or the monitor could not be blocked.
+        * NOTE(review): the string returned by devnum2devname() is
+        * never released here - confirm whether it is heap allocated.
+        */
+       int devnum = st->devnum;
+       char *name;
+
+       if (st->container_dev != NoMdDev)
+               devnum = st->container_dev;
+       name = devnum2devname(devnum);
+
+       if (name == NULL) {
+               fprintf(stderr, Name
+                       ": could not determine container name, freeze aborted\n");
+               return -2;
+       }
+       if (block_monitor(name, 1) != 0) {
+               fprintf(stderr, Name ": failed to freeze container\n");
+               return -2;
+       }
+       return 1;
+}
+
+static void unfreeze_container(struct supertype *st)
+{
+       /* Undo freeze_container(): unblock the monitor of the container
+        * that 'st' belongs to (st->container_dev, or st->devnum when
+        * container_dev is NoMdDev).  If the container name cannot be
+        * determined a message is printed and nothing else is done.
+        */
+       int devnum = st->devnum;
+       char *name;
+
+       if (st->container_dev != NoMdDev)
+               devnum = st->container_dev;
+       name = devnum2devname(devnum);
+
+       if (name != NULL) {
+               unblock_monitor(name, 1);
+               return;
+       }
+       fprintf(stderr, Name
+               ": could not determine container name, unfreeze aborted\n");
+}
+
+static int freeze(struct supertype *st)
 {
-       /* Try to freeze resync on this array.
+       /* Try to freeze resync/rebuild on this array/container.
         * Return -1 if the array is busy,
+        * return -2 container cannot be frozen,
         * return 0 if this kernel doesn't support 'frozen'
         * return 1 if it worked.
         */
-       char buf[20];
-       if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0)
-               return 0;
-       if (strcmp(buf, "idle\n") != 0 &&
-           strcmp(buf, "frozen\n") != 0)
-               return -1;
-       if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0)
-               return 0;
-       return 1;
+       /* External metadata is frozen via freeze_container(); native
+        * metadata goes through sysfs_freeze_array() on a freshly read
+        * sysfs description.  If that description cannot be read the
+        * result is -1, the same value used for a busy array.
+        */
+       if (st->ss->external)
+               return freeze_container(st);
+       else {
+               struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
+               int err;
+
+               if (!sra)
+                       return -1;
+               err = sysfs_freeze_array(sra);
+               sysfs_free(sra);
+               return err;
+       }
 }
 
-void unfreeze_array(struct mdinfo *sra, int frozen)
+static void unfreeze(struct supertype *st, int frozen)
 {
        /* If 'frozen' is 1, unfreeze the array */
-       if (frozen > 0)
-               sysfs_set_str(sra, NULL, "sync_action", "idle");
+       /* 'frozen' is the value returned by freeze(): any positive
+        * value triggers the unfreeze, zero or negative does nothing.
+        */
+       if (frozen <= 0)
+               return;
+
+       /* external metadata: unblock the container's monitor;
+        * native metadata: write "idle" to sync_action via sysfs.
+        */
+       if (st->ss->external)
+               return unfreeze_container(st);
+       else {
+               struct mdinfo *sra = sysfs_read(-1, st->devnum, GET_VERSION);
+
+               if (sra)
+                       sysfs_set_str(sra, NULL, "sync_action", "idle");
+               else
+                       fprintf(stderr, Name ": failed to unfreeze array\n");
+               /* reached with sra == NULL when the read failed */
+               sysfs_free(sra);
+       }
 }
 
-void wait_reshape(struct mdinfo *sra)
+static void wait_reshape(struct mdinfo *sra)
 {
        int fd = sysfs_get_fd(sra, NULL, "sync_action");
        char action[20];
 
-       do {
+       /* Block until sync_action no longer starts with "reshape" or
+        * can no longer be read.  Nothing is done if the attribute
+        * could not be opened.
+        */
+       if (fd < 0)
+               return;
+
+       while  (sysfs_fd_get_str(fd, action, 20) > 0 &&
+               strncmp(action, "reshape", 7) == 0) {
                fd_set rfds;
                FD_ZERO(&rfds);
                FD_SET(fd, &rfds);
+               /* the set is handed to select() as its exceptfds
+                * (4th) argument, with no timeout
+                */
                select(fd+1, NULL, NULL, &rfds, NULL);
-               
-               if (sysfs_fd_get_str(fd, action, 20) < 0) {
-                       close(fd);
-                       return;
+       }
+       close(fd);
+}
+
+static int reshape_super(struct supertype *st, long long size, int level,
+                        int layout, int chunksize, int raid_disks,
+                        char *backup_file, char *dev, int verbose)
+{
+       /* Let external metadata vet a requested reshape.
+        * Return 0 for native metadata (nothing extra to check),
+        * return 1 (with a message) if the external handler lacks
+        * either reshape_super or manage_reshape,
+        * otherwise return whatever the handler's reshape_super says.
+        */
+       if (st->ss->external) {
+               if (st->ss->reshape_super && st->ss->manage_reshape)
+                       return st->ss->reshape_super(st, size, level,
+                                                    layout, chunksize,
+                                                    raid_disks, backup_file,
+                                                    dev, verbose);
+
+               fprintf(stderr, Name ": %s metadata does not support reshape\n",
+                       st->ss->name);
+               return 1;
+       }
+
+       return 0;
+}
+
+static void sync_metadata(struct supertype *st)
+{
+       /* Only external metadata needs an explicit sync: pending
+        * updates are flushed when st->update_tail is set, otherwise
+        * the handler's sync_metadata method is called directly.
+        */
+       if (!st->ss->external)
+               return;
+
+       if (!st->update_tail) {
+               st->ss->sync_metadata(st);
+               return;
+       }
+       flush_metadata_updates(st);
+}
+
+static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
+{
+       /* Set a numeric sysfs attribute on an external-metadata
+        * subarray, coping with EAGAIN.  The kernel may need to wait
+        * for mdmon to mark the array active so that it can handle
+        * allocations/writeback when preparing the reshape action
+        * (md_allow_write()).  safe_mode_delay is disabled for the
+        * duration of the write to close a race with array_state going
+        * clean before the next write to raid_disks /
+        * stripe_cache_size, and is put back afterwards.
+        * Return -1 if safe_mode_delay cannot be read, otherwise the
+        * result of the (possibly retried) sysfs_set_num().
+        */
+       char saved_delay[50];
+       int triggers_allow_write;
+       int ret;
+
+       /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
+       triggers_allow_write = (strcmp(name, "raid_disks") == 0 ||
+                               strcmp(name, "stripe_cache_size") == 0);
+       if (!triggers_allow_write)
+               return sysfs_set_num(sra, NULL, name, n);
+
+       if (sysfs_get_str(sra, NULL, "safe_mode_delay",
+                         saved_delay, sizeof(saved_delay)) <= 0)
+               return -1;
+
+       sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
+       ret = sysfs_set_num(sra, NULL, name, n);
+       if (ret < 0 && errno == EAGAIN) {
+               /* wake the monitor and try once more; a second EAGAIN
+                * means the monitor is not active, so give up
+                */
+               ping_monitor(container);
+               ret = sysfs_set_num(sra, NULL, name, n);
+       }
+       sysfs_set_str(sra, NULL, "safe_mode_delay", saved_delay);
+       return ret;
+}
+
+static int reshape_container_raid_disks(char *container, int raid_disks)
+{
+       /* for each subarray switch to a raid level that can
+        * support the reshape, and set raid disks.
+        * Level-0 members are first switched to level 4 and are asked
+        * for one extra raid disk.  Levels 4/5/6 also get their
+        * stripe cache enlarged when it is smaller than 4 pages per
+        * 4K of chunk.
+        * Returns the number of subarrays whose raid_disks was set, or
+        * a non-zero failure value (after printing a message) when
+        * /proc/mdstat cannot be read, a member has an unsupported
+        * level, or a sysfs update fails.
+        */
+       struct mdstat_ent *ent, *e;
+       int changed = 0, rv = 0, err = 0;
+
+       ent = mdstat_read(1, 0);
+       if (!ent) {
+               fprintf(stderr, Name ": unable to read /proc/mdstat\n");
+               return -1;
+       }
+
+       changed = 0;
+       for (e = ent; e; e = e->next) {
+               struct mdinfo *sub;
+               unsigned int cache;
+               int level, takeover_delta = 0;
+
+               if (!is_container_member(e, container))
+                       continue;
+
+               level = map_name(pers, e->level);
+               if (level == 0) {
+                       sub = sysfs_read(-1, e->devnum, GET_VERSION);
+                       if (!sub)
+                               break;
+                       /* metadata records 'orig_level' */
+                       rv = sysfs_set_num(sub, NULL, "level", 4);
+                       if (rv < 0) {
+                               /* save errno before releasing 'sub',
+                                * which was previously leaked here
+                                */
+                               err = errno;
+                               sysfs_free(sub);
+                               break;
+                       }
+                       /* we want spares to be used for capacity
+                        * expansion, not rebuild
+                        */
+                       takeover_delta = 1;
+
+                       sysfs_free(sub);
+                       level = 4;
                 }
-       } while  (strncmp(action, "reshape", 7) == 0);
+
+               sub = NULL;
+               switch (level) {
+               default:
+                       rv = -1;
+                       break;
+               case 4:
+               case 5:
+               case 6:
+                       sub = sysfs_read(-1, e->devnum, GET_CHUNK|GET_CACHE);
+                       if (!sub)
+                               break;
+                       cache = (sub->array.chunk_size / 4096) * 4;
+                       if (cache > sub->cache_size)
+                               rv = subarray_set_num(container, sub,
+                                                     "stripe_cache_size", cache);
+                       if (rv) {
+                               err = errno;
+                               break;
+                       }
+                       /* fall through */
+               case 1:
+                       if (!sub)
+                               sub = sysfs_read(-1, e->devnum, GET_VERSION);
+                       if (!sub)
+                               break;
+
+                       rv = subarray_set_num(container, sub, "raid_disks",
+                                             raid_disks + takeover_delta);
+                       if (rv)
+                               err = errno;
+                       else
+                               changed++;
+                       break;
+               }
+               /* 'sub' may still be NULL here */
+               sysfs_free(sub);
+               if (rv)
+                       break;
+       }
+       free_mdstat(ent);
+       if (rv) {
+               fprintf(stderr, Name
+                       ": failed to initiate container reshape%s%s\n",
+                       err ? ": " : "", err ? strerror(err) : "");
+               return rv;
+       }
+
+       return changed;
 }
-                       
-               
+
+static void revert_container_raid_disks(struct supertype *st, int fd, char *container)
+{
+       /* we failed to prepare all subarrays in the container for
+        * reshape, so cancel the changes and restore the nominal raid
+        * level.
+        * The metadata is reloaded from 'fd' and, for every member of
+        * 'container' listed in /proc/mdstat, the level and raid_disks
+        * recorded in the metadata are written back through sysfs.
+        * Failures are reported on stderr; nothing is returned.
+        */
+       struct mdstat_ent *ent, *e;
+
+       ent = mdstat_read(0, 0);
+       if (!ent) {
+               fprintf(stderr, Name
+                       ": failed to read /proc/mdstat while aborting reshape\n");
+               return;
+       }
+
+       if (st->ss->load_container(st, fd, NULL)) {
+               fprintf(stderr, Name
+                       ": failed read metadata while aborting reshape\n");
+               /* the mdstat list used to be leaked on this path */
+               free_mdstat(ent);
+               return ;
+       }
+
+
+       for (e = ent; e; e = e->next) {
+               int level_fixed = 0, disks_fixed = 0;
+               struct mdinfo *sub, *prev;
+               char *subarray;
+
+               if (!is_container_member(e, container))
+                       continue;
+
+               subarray = to_subarray(e, container);
+               prev = st->ss->container_content(st, subarray);
+               if (!prev) {
+                       /* without the recorded geometry there is
+                        * nothing to restore to; don't dereference NULL
+                        */
+                       fprintf(stderr, Name
+                               ": failed to restore %s: no metadata found\n",
+                               e->dev);
+                       continue;
+               }
+
+               /* changing level might change raid_disks so we do it
+                * first and then check if raid_disks still needs fixing
+                */
+               if (map_name(pers, e->level) != prev->array.level) {
+                       sub = sysfs_read(-1, e->devnum, GET_VERSION);
+                       if (sub &&
+                           !sysfs_set_num(sub, NULL, "level", prev->array.level))
+                               level_fixed = 1;
+                       sysfs_free(sub);
+               } else
+                       level_fixed = 1;
+
+               sub = sysfs_read(-1, e->devnum, GET_DISKS);
+               if (sub && sub->array.raid_disks != prev->array.raid_disks) {
+                       if (!subarray_set_num(container, sub, "raid_disks",
+                                             prev->array.raid_disks))
+                               disks_fixed = 1;
+               } else if (sub)
+                       disks_fixed = 1;
+               sysfs_free(sub);
+
+               if (!disks_fixed || !level_fixed)
+                       fprintf(stderr, Name
+                               ": failed to restore %s to a %d-disk %s array\n",
+                               e->dev, prev->array.raid_disks,
+                               map_num(pers, prev->array.level));
+               free(prev);
+       }
+       st->ss->free_super(st);
+       free_mdstat(ent);
+}
+
+/* Keep one working device from each group of 'nr_of_copies' consecutive
+ * raid10 slots and remove every other device from the array.
+ * nr_of_copies is the low byte of 'layout'.  'st' is not used.
+ * Returns 0 when a device was kept for every group and the rest were
+ * removed; returns 1, with all devices put back on sra->devs and nothing
+ * written to sysfs, when some group has no usable device.
+ * NOTE(review): a layout whose low byte is 0 would never advance 'slot';
+ * presumably callers only pass valid raid10 layouts - confirm.
+ */
+int remove_disks_on_raid10_to_raid0_takeover(struct supertype *st,
+                                            struct mdinfo *sra,
+                                            int layout)
+{
+       int nr_of_copies;
+       struct mdinfo *remaining;
+       int slot;
+
+       nr_of_copies = layout & 0xff;
+
+       /* move the whole device list aside; chosen devices are moved
+        * back onto sra->devs one at a time
+        */
+       remaining = sra->devs;
+       sra->devs = NULL;
+       /* for each 'copy', select one device and remove from the list. */
+       for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
+               struct mdinfo **diskp;
+               int found = 0;
+
+               /* Find a working device to keep */
+               for (diskp =  &remaining; *diskp ; diskp = &(*diskp)->next) {
+                       struct mdinfo *disk = *diskp;
+
+                       /* must sit in [slot, slot + nr_of_copies) and be
+                        * in-sync, not faulty and not removed
+                        */
+                       if (disk->disk.raid_disk < slot)
+                               continue;
+                       if (disk->disk.raid_disk >= slot + nr_of_copies)
+                               continue;
+                       if (disk->disk.state & (1<<MD_DISK_REMOVED))
+                               continue;
+                       if (disk->disk.state & (1<<MD_DISK_FAULTY))
+                               continue;
+                       if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
+                               continue;
+
+                       /* We have found a good disk to use! */
+                       /* unlink from 'remaining', push onto sra->devs */
+                       *diskp = disk->next;
+                       disk->next = sra->devs;
+                       sra->devs = disk;
+                       found = 1;
+                       break;
+               }
+               if (!found)
+                       break;
+       }
+
+       if (slot < sra->array.raid_disks) {
+               /* didn't find all slots */
+               /* append the already chosen devices to the end of
+                * 'remaining' and hand the complete list back
+                */
+               struct mdinfo **e;
+               e = &remaining;
+               while (*e)
+                       e = &(*e)->next;
+               *e = sra->devs;
+               sra->devs = remaining;
+               return 1;
+       }
+
+       /* Remove all 'remaining' devices from the array */
+       while (remaining) {
+               struct mdinfo *sd = remaining;
+               remaining = sd->next;
+
+               sysfs_set_str(sra, sd, "state", "faulty");
+               sysfs_set_str(sra, sd, "slot", "none");
+               sysfs_set_str(sra, sd, "state", "remove");
+               /* record the removal locally and keep the entry on
+                * sra->devs; the sysfs results are not checked
+                */
+               sd->disk.state |= (1<<MD_DISK_REMOVED);
+               sd->disk.state &= ~(1<<MD_DISK_SYNC);
+               sd->next = sra->devs;
+               sra->devs = sd;
+       }
+       return 0;
+}
+
+void reshape_free_fdlist(int *fdlist,
+                        unsigned long long *offsets,
+                        int size)
+{
+       /* Close every open descriptor (value >= 0) among the first
+        * 'size' entries of fdlist, then free both arrays.
+        */
+       int idx = 0;
+
+       while (idx < size) {
+               int one_fd = fdlist[idx];
+
+               if (one_fd >= 0)
+                       close(one_fd);
+               idx++;
+       }
+
+       free(fdlist);
+       free(offsets);
+}
+
+int reshape_prepare_fdlist(char *devname,
+                          struct mdinfo *sra,
+                          int raid_disks,
+                          int nrdisks,
+                          unsigned long blocks,
+                          char *backup_file,
+                          int *fdlist,
+                          unsigned long long *offsets)
+{
+       /* Open the component devices of 'sra' for a reshape.
+        * fdlist[0..nrdisks] is first set to -1.  In-sync devices are
+        * opened read-only into the entry for their raid_disk, with
+        * the data offset in bytes stored in 'offsets'.  When there is
+        * no backup_file, the other non-faulty devices are opened
+        * read-write into consecutive entries starting at raid_disks,
+        * with an offset 'blocks' + 8 sectors before the end of the
+        * component.
+        * Return the index after the last spare entry used, or -1
+        * (with a message) if a device could not be opened.
+        */
+       struct mdinfo *dev;
+       int next_spare;
+       int k;
+
+       for (k = 0; k <= nrdisks; k++)
+               fdlist[k] = -1;
+
+       next_spare = raid_disks;
+       for (dev = sra->devs; dev; dev = dev->next) {
+               char *path;
+
+               if (dev->disk.state & (1<<MD_DISK_FAULTY))
+                       continue;
+
+               if (dev->disk.state & (1<<MD_DISK_SYNC)) {
+                       int *fdp = &fdlist[dev->disk.raid_disk];
+
+                       path = map_dev(dev->disk.major, dev->disk.minor, 1);
+                       *fdp = dev_open(path, O_RDONLY);
+                       offsets[dev->disk.raid_disk] = dev->data_offset*512;
+                       if (*fdp < 0) {
+                               fprintf(stderr,
+                                       Name ": %s: cannot open component %s\n",
+                                       devname, path ? path : "-unknown-");
+                               return -1;
+                       }
+                       continue;
+               }
+
+               if (backup_file != NULL)
+                       continue;
+
+               /* spare */
+               path = map_dev(dev->disk.major, dev->disk.minor, 1);
+               fdlist[next_spare] = dev_open(path, O_RDWR);
+               offsets[next_spare] = (dev->data_offset + sra->component_size - blocks - 8)*512;
+               if (fdlist[next_spare] < 0) {
+                       fprintf(stderr, Name ": %s: cannot open component %s\n",
+                               devname, path ? path : "-unknown-");
+                       return -1;
+               }
+               next_spare++;
+       }
+       return next_spare;
+}
+
+int reshape_open_backup_file(char *backup_file,
+                            int fd,
+                            char *devname,
+                            long blocks,
+                            int *fdlist,
+                            unsigned long long *offsets)
+{
+       /* Create 'backup_file' (it must not already exist), store its
+        * descriptor in *fdlist and the 8-sector data offset in
+        * *offsets, then fill it with blocks + 1 zeroed sectors and
+        * fsync it.
+        * Return 1 on success, 0 on any form of failure.  On failure
+        * after the file was created the descriptor is closed and
+        * *fdlist is reset to -1 so that it is neither leaked nor
+        * closed a second time by later cleanup.
+        */
+       /* need to check backup file is large enough */
+       char buf[512];
+       struct stat stb;
+       dev_t dev;      /* full width, so device numbers are not truncated */
+       long i;         /* same type as 'blocks' */
+
+       *fdlist = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
+                      S_IRUSR | S_IWUSR);
+       *offsets = 8 * 512;
+       if (*fdlist < 0) {
+               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+                       devname, backup_file, strerror(errno));
+               return 0;
+       }
+       /* Guard against backup file being on array device.
+        * If array is partitioned or if LVM etc is in the
+        * way this will not notice, but it is better than
+        * nothing.
+        */
+       fstat(*fdlist, &stb);
+       dev = stb.st_dev;
+       fstat(fd, &stb);
+       if (stb.st_rdev == dev) {
+               fprintf(stderr, Name ": backup file must NOT be"
+                       " on the array being reshaped.\n");
+               goto fail;
+       }
+
+       memset(buf, 0, 512);
+       for (i = 0; i < blocks + 1 ; i++) {
+               if (write(*fdlist, buf, 512) != 512) {
+                       fprintf(stderr, Name ": %s: cannot create"
+                               " backup file %s: %s\n",
+                               devname, backup_file, strerror(errno));
+                       goto fail;
+               }
+       }
+       if (fsync(*fdlist) != 0) {
+               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
+                       devname, backup_file, strerror(errno));
+               goto fail;
+       }
+
+       return 1;
+
+fail:
+       /* messages above already consumed errno */
+       close(*fdlist);
+       *fdlist = -1;
+       return 0;
+}
+
+unsigned long compute_backup_blocks(int nchunk, int ochunk,
+                                   unsigned int ndata, unsigned int odata)
+{
+       /* So how much do we need to backup.
+        * We need an amount of data which is both a whole number of
+        * old stripes and a whole number of new stripes.
+        * So LCM for (chunksize*datadisks).
+        * Chunk sizes are in bytes, the result is in 512-byte sectors.
+        * Returns 0 if either stripe size works out as zero sectors
+        * (chunk below 512 bytes or no data disks) rather than
+        * spinning forever looking for a GCD.
+        */
+       unsigned long a, b, gcd, rem;
+
+       /* widen before multiplying so large chunks cannot overflow int */
+       a = (unsigned long)(ochunk/512) * odata;
+       b = (unsigned long)(nchunk/512) * ndata;
+       if (a == 0 || b == 0)
+               return 0;
+
+       /* Find GCD */
+       gcd = a;
+       rem = b;
+       while (rem != 0) {
+               unsigned long t = gcd % rem;
+               gcd = rem;
+               rem = t;
+       }
+       /* LCM == product / GCD; divide first to keep the
+        * intermediate value no larger than the result
+        */
+       return (a / gcd) * b;
+}
+
+
 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                 long long size,
-                int level, char *layout_str, int chunksize, int raid_disks)
+                int level, char *layout_str, int chunksize, int raid_disks,
+                int force)
 {
        /* Make some changes in the shape of an array.
         * The kernel must support the change.
@@ -501,6 +1020,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
        char *c;
        int rv = 0;
        struct supertype *st;
+       char *subarray = NULL;
 
        int nchunk, ochunk;
        int nlayout, olayout;
@@ -510,14 +1030,16 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
        char alt_layout[40];
        int *fdlist;
        unsigned long long *offsets;
-       int d, i;
+       int d;
        int nrdisks;
        int err;
        int frozen;
-       unsigned long a,b, blocks, stripes;
+       unsigned long blocks, stripes;
        unsigned long cache;
        unsigned long long array_size;
        int changed = 0;
+       char *container = NULL;
+       int cfd = -1;
        int done;
 
        struct mdinfo *sra;
@@ -545,15 +1067,99 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        "       Please use a newer kernel\n");
                return 1;
        }
-       sra = sysfs_read(fd, 0, GET_LEVEL);
-       if (sra)
-               frozen = freeze_array(sra);
-       else {
+
+       st = super_by_fd(fd, &subarray);
+       if (!st) {
+               fprintf(stderr, Name ": Unable to determine metadata format for %s\n", devname);
+               return 1;
+       }
+       if (raid_disks > st->max_devs) {
+               fprintf(stderr, Name ": Cannot increase raid-disks on this array"
+                       " beyond %d\n", st->max_devs);
+               return 1;
+       }
+
+       /* in the external case we need to check that the requested reshape is
+        * supported, and perform an initial check that the container holds the
+        * pre-requisite spare devices (mdmon owns final validation)
+        */
+       if (st->ss->external) {
+               int container_dev;
+               int rv;
+
+               if (subarray) {
+                       container_dev = st->container_dev;
+                       cfd = open_dev_excl(st->container_dev);
+               } else if (size >= 0 || layout_str != NULL || chunksize != 0 ||
+                          level != UnSet) {
+                       fprintf(stderr,
+                               Name ": %s is a container, only 'raid-devices' can be changed\n",
+                               devname);
+                       return 1;
+               } else {
+                       container_dev = st->devnum;
+                       close(fd);
+                       cfd = open_dev_excl(st->devnum);
+                       fd = cfd;
+               }
+               if (cfd < 0) {
+                       fprintf(stderr, Name ": Unable to open container for %s\n",
+                               devname);
+                       free(subarray);
+                       return 1;
+               }
+
+               container = devnum2devname(st->devnum);
+               if (!container) {
+                       fprintf(stderr, Name ": Could not determine container name\n");
+                       free(subarray);
+                       return 1;
+               }
+
+               if (subarray)
+                       rv = st->ss->load_container(st, cfd, NULL);
+               else
+                       rv = st->ss->load_super(st, cfd, NULL);
+               if (rv) {
+                       fprintf(stderr, Name ": Cannot read superblock for %s\n",
+                               devname);
+                       free(subarray);
+                       return 1;
+               }
+
+               if (mdmon_running(container_dev))
+                       st->update_tail = &st->updates;
+       } 
+
+       if (raid_disks > array.raid_disks &&
+           array.spare_disks < (raid_disks - array.raid_disks) &&
+           !force) {
+               fprintf(stderr,
+                       Name ": Need %d spare%s to avoid degraded array,"
+                       " and only have %d.\n"
+                       "       Use --force to over-ride this check.\n",
+                       raid_disks - array.raid_disks, 
+                       raid_disks - array.raid_disks == 1 ? "" : "s", 
+                       array.spare_disks);
+               return 1;
+       }
+
+       sra = sysfs_read(fd, 0, GET_LEVEL | GET_DISKS | GET_DEVS | GET_STATE);
+       if (sra) {
+               if (st->ss->external && subarray == NULL) {
+                       array.level = LEVEL_CONTAINER;
+                       sra->array.level = LEVEL_CONTAINER;
+               }
+       } else {
                fprintf(stderr, Name ": failed to read sysfs parameters for %s\n",
                        devname);
                return 1;
        }
-       if (frozen < 0) {
+       frozen = freeze(st);
+       if (frozen < -1) {
+               /* freeze() already spewed the reason */
+               return 1;
+       } else if (frozen < 0) {
                fprintf(stderr, Name ": %s is performing resync/recovery and cannot"
                        " be reshaped\n", devname);
                return 1;
@@ -561,6 +1167,13 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
 
        /* ========= set size =============== */
        if (size >= 0 && (size == 0 || size != array.size)) {
+               long long orig_size = array.size;
+
+               if (reshape_super(st, size, UnSet, UnSet, 0, 0, NULL, devname, !quiet)) {
+                       rv = 1;
+                       goto release;
+               }
+               sync_metadata(st);
                array.size = size;
                if (array.size != size) {
                        /* got truncated to 32bit, write to
@@ -575,6 +1188,11 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        rv = ioctl(fd, SET_ARRAY_INFO, &array);
                if (rv != 0) {
                        int err = errno;
+
+                       /* restore metadata */
+                       if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
+                                         NULL, devname, !quiet) == 0)
+                               sync_metadata(st);
                        fprintf(stderr, Name ": Cannot set device size for %s: %s\n",
                                devname, strerror(err));
                        if (err == EBUSY && 
@@ -591,12 +1209,31 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        fprintf(stderr, Name ": component size of %s has been set to %lluK\n",
                                devname, size);
                changed = 1;
-       } else {
+       } else if (array.level != LEVEL_CONTAINER) {
                size = get_component_size(fd)/2;
                if (size == 0)
                        size = array.size;
        }
 
+       /* ========= check for Raid10 -> Raid0 conversion ===============
+        * current implementation assumes that the following conditions must be met:
+        * - far_copies == 1
+        * - near_copies == 2
+        */
+       if (level == 0 && array.level == 10 &&
+           array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) {
+               int err;
+               err = remove_disks_on_raid10_to_raid0_takeover(st, sra, array.layout);
+               if (err) {
+                       dprintf(Name": Array cannot be reshaped\n");
+                       if (container)
+                               free(container);
+                       if (cfd > -1)
+                               close(cfd);
+                       return 1;
+               }
+       }
+
        /* ======= set level =========== */
        if (level != UnSet && level != array.level) {
                /* Trying to change the level.
@@ -674,11 +1311,48 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        } else
                                layout_str = "parity-last";
                } else {
+                       /* Level change is a simple takeover.  In the external
+                        * case we don't check with the metadata handler until
+                        * we establish what the final layout will be.  If the
+                        * level change is disallowed we will revert to
+                        * orig_level without disturbing the metadata, otherwise
+                        * we will send an update.
+                        */
                        c = map_num(pers, level);
                        if (c == NULL) {
                                rv = 1;/* not possible */
                                goto release;
                        }
+                       if (!force) {
+                               /* Need to check there are enough spares */
+                               int spares_needed = 0;
+                               switch (array.level * 16 + level) {
+                               case 0x05:
+                                       spares_needed = 1; break;
+                               case 0x06:
+                                       spares_needed = 2; break;
+                               case 0x15:
+                                       spares_needed = 1; break;
+                               case 0x16:
+                                       spares_needed = 2; break;
+                               case 0x56:
+                                       spares_needed = 1; break;
+                               }
+                               if (raid_disks > array.raid_disks)
+                                       spares_needed += raid_disks-array.raid_disks;
+                               if (spares_needed > array.spare_disks) {
+                                       fprintf(stderr,
+                                               Name ": Need %d spare%s to avoid"
+                                               " degraded array, and only have %d.\n"
+                                               "       Use --force to over-ride"
+                                               " this check.\n",
+                                               spares_needed,
+                                               spares_needed == 1 ? "" : "s", 
+                                               array.spare_disks);
+                                       rv = 1;
+                                       goto release;
+                               }
+                       }
                        err = sysfs_set_str(sra, NULL, "level", c);
                        if (err) {
                                err = errno;
@@ -706,9 +1380,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
 
        /* ========= set shape (chunk_size / layout / ndisks)  ============== */
        /* Check if layout change is a no-op */
-       if (layout_str) switch(array.level) {
+       switch (array.level) {
        case 5:
-               if (array.layout == map_name(r5layout, layout_str))
+               if (layout_str && array.layout == map_name(r5layout, layout_str))
                        layout_str = NULL;
                break;
        case 6:
@@ -724,8 +1398,9 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        rv = 1;
                        goto release;
                }
-               if (strcmp(layout_str, "normalise") == 0 ||
-                   strcmp(layout_str, "normalize") == 0) {
+               if (layout_str &&
+                   (strcmp(layout_str, "normalise") == 0 ||
+                    strcmp(layout_str, "normalize") == 0)) {
                        char *hyphen;
                        strcpy(alt_layout, map_num(r6layout, array.layout));
                        hyphen = strrchr(alt_layout, '-');
@@ -735,7 +1410,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        }
                }
 
-               if (array.layout == map_name(r6layout, layout_str))
+               if (layout_str && array.layout == map_name(r6layout, layout_str))
                        layout_str = NULL;
                if (layout_str && strcmp(layout_str, "preserve") == 0)
                        layout_str = NULL;
@@ -744,6 +1419,11 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
        if (layout_str == NULL
            && (chunksize == 0 || chunksize*1024 == array.chunk_size)
            && (raid_disks == 0 || raid_disks == array.raid_disks)) {
+               if (reshape_super(st, -1, level, UnSet, 0, 0, NULL, devname, !quiet)) {
+                       rv = 1;
+                       goto release;
+               }
+               sync_metadata(st);
                rv = 0;
                if (level != UnSet && level != array.level) {
                        /* Looks like this level change doesn't need
@@ -765,18 +1445,69 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                } else if (!changed && !quiet)
                        fprintf(stderr, Name ": %s: no change requested\n",
                                devname);
+
+               if (st->ss->external && !mdmon_running(st->container_dev) &&
+                   level > 0) {
+                       start_mdmon(st->container_dev);
+                       ping_monitor(container);
+               }
                goto release;
        }
 
        c = map_num(pers, array.level);
        if (c == NULL) c = "-unknown-";
-       switch(array.level) {
+       switch (array.level) {
        default: /* raid0, linear, multipath cannot be reconfigured */
                fprintf(stderr, Name ": %s array %s cannot be reshaped.\n",
                        c, devname);
+               /* TODO raid0 raiddisks can be reshaped via raid4 */
                rv = 1;
                break;
+       case LEVEL_CONTAINER: {
+               int count;
 
+               /* double check that we are not changing anything but raid_disks */
+               if (size >= 0 || layout_str != NULL || chunksize != 0 || level != UnSet) {
+                       fprintf(stderr,
+                               Name ": %s is a container, only 'raid-devices' can be changed\n",
+                               devname);
+                       rv = 1;
+                       goto release;
+               }
+
+               st->update_tail = &st->updates;
+               if (reshape_super(st, -1, UnSet, UnSet, 0, raid_disks,
+                                 backup_file, devname, !quiet)) {
+                       rv = 1;
+                       goto release;
+               }
+
+               count = reshape_container_raid_disks(container, raid_disks);
+               if (count < 0) {
+                       revert_container_raid_disks(st, fd, container);
+                       rv = 1;
+                       goto release;
+               } else if (count == 0) {
+                       if (!quiet)
+                               fprintf(stderr, Name
+                                       ": no active subarrays to reshape\n");
+                       goto release;
+               }
+
+               if (!mdmon_running(st->devnum)) {
+                       start_mdmon(st->devnum);
+                       ping_monitor(container);
+               }
+               sync_metadata(st);
+
+               /* give mdmon a chance to allocate spares */
+               ping_manager(container);
+
+               /* manage_reshape takes care of releasing the array(s) */
+               st->ss->manage_reshape(st, backup_file);
+               frozen = 0;
+               goto release;
+       }
        case LEVEL_FAULTY: /* only 'layout' change is permitted */
 
                if (chunksize  || raid_disks) {
@@ -812,6 +1543,12 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        break;
                }
                if (raid_disks > 0) {
+                       if (reshape_super(st, -1, UnSet, UnSet, 0, raid_disks,
+                                         NULL, devname, !quiet)) {
+                               rv = 1;
+                               goto release;
+                       }
+                       sync_metadata(st);
                        array.raid_disks = raid_disks;
                        if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
                                fprintf(stderr, Name ": Cannot set raid-devices for %s: %s\n",
@@ -829,7 +1566,10 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                 * layout/chunksize/raid_disks can be changed
                 * though the kernel may not support it all.
                 */
-               st = super_by_fd(fd);
+               if (subarray) {
+                       fprintf(stderr, Name ": Cannot reshape subarrays yet\n");
+                       break;
+               }
 
                /*
                 * There are three possibilities.
@@ -932,22 +1672,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        break;
                }
 
-               /* So how much do we need to backup.
-                * We need an amount of data which is both a whole number of
-                * old stripes and a whole number of new stripes.
-                * So LCM for (chunksize*datadisks).
-                */
-               a = (ochunk/512) * odata;
-               b = (nchunk/512) * ndata;
-               /* Find GCD */
-               while (a != b) {
-                       if (a < b)
-                               b -= a;
-                       if (b < a)
-                               a -= b;
-               }
-               /* LCM == product / GCD */
-               blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
+               blocks = compute_backup_blocks(nchunk, ochunk, ndata, odata);
 
                sysfs_free(sra);
                sra = sysfs_read(fd, 0,
@@ -989,40 +1714,21 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        rv = 1;
                        break;
                }
-               for (d=0; d <= nrdisks; d++)
-                       fdlist[d] = -1;
-               d = array.raid_disks;
-               for (sd = sra->devs; sd; sd=sd->next) {
-                       if (sd->disk.state & (1<<MD_DISK_FAULTY))
-                               continue;
-                       if (sd->disk.state & (1<<MD_DISK_SYNC)) {
-                               char *dn = map_dev(sd->disk.major,
-                                                  sd->disk.minor, 1);
-                               fdlist[sd->disk.raid_disk]
-                                       = dev_open(dn, O_RDONLY);
-                               offsets[sd->disk.raid_disk] = sd->data_offset*512;
-                               if (fdlist[sd->disk.raid_disk] < 0) {
-                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
-                                               devname, dn?dn:"-unknown-");
-                                       rv = 1;
-                                       goto release;
-                               }
-                       } else if (backup_file == NULL) {
-                               /* spare */
-                               char *dn = map_dev(sd->disk.major,
-                                                  sd->disk.minor, 1);
-                               fdlist[d] = dev_open(dn, O_RDWR);
-                               offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512;
-                               if (fdlist[d]<0) {
-                                       fprintf(stderr, Name ": %s: cannot open component %s\n",
-                                               devname, dn?dn:"-unknown");
-                                       rv = 1;
-                                       goto release;
-                               }
-                               d++;
-                       }
+
+               d = reshape_prepare_fdlist(devname, sra, array.raid_disks,
+                                          nrdisks, blocks, backup_file,
+                                          fdlist, offsets);
+               if (d < 0) {
+                       rv = 1;
+                       goto release;
                }
                if (backup_file == NULL) {
+                       if (st->ss->external && !st->ss->manage_reshape) {
+                               fprintf(stderr, Name ": %s Grow operation not supported by %s metadata\n",
+                                       devname, st->ss->name);
+                               rv = 1;
+                               break;
+                       }
                        if (ndata <= odata) {
                                fprintf(stderr, Name ": %s: Cannot grow - need backup-file\n",
                                        devname);
@@ -1042,35 +1748,22 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                break;
                        }
                } else {
-                       /* need to check backup file is large enough */
-                       char buf[512];
-                       fdlist[d] = open(backup_file, O_RDWR|O_CREAT|O_EXCL,
-                                    S_IRUSR | S_IWUSR);
-                       offsets[d] = 8 * 512;
-                       if (fdlist[d] < 0) {
-                               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                       devname, backup_file, strerror(errno));
-                               rv = 1;
-                               break;
-                       }
-                       memset(buf, 0, 512);
-                       for (i=0; i < (signed)blocks + 1 ; i++) {
-                               if (write(fdlist[d], buf, 512) != 512) {
-                                       fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                               devname, backup_file, strerror(errno));
-                                       rv = 1;
-                                       break;
-                               }
-                       }
-                       if (fsync(fdlist[d]) != 0) {
-                               fprintf(stderr, Name ": %s: cannot create backup file %s: %s\n",
-                                       devname, backup_file, strerror(errno));
+                       if (!reshape_open_backup_file(backup_file, fd, devname,
+                                                     (signed)blocks,
+                                                     fdlist+d, offsets+d)) {
                                rv = 1;
                                break;
                        }
                        d++;
                }
 
+               /* check that the operation is supported by the metadata */
+               if (reshape_super(st, -1, level, nlayout, nchunk, ndisks,
+                                 backup_file, devname, !quiet)) {
+                       rv = 1;
+                       break;
+               }
+
                /* lastly, check that the internal stripe cache is
                 * large enough, or it won't work.
                 */
@@ -1087,6 +1780,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                 * If only changing raid_disks, use ioctl, else use
                 * sysfs.
                 */
+               sync_metadata(st);
                if (ochunk == nchunk && olayout == nlayout) {
                        array.raid_disks = ndisks;
                        if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
@@ -1135,6 +1829,14 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        break;
                }
 
+               if (st->ss->external) {
+                       /* metadata handler takes it from here */
+                       ping_manager(container);
+                       st->ss->manage_reshape(st, backup_file);
+                       frozen = 0;
+                       break;
+               }
+
                /* set up the backup-super-block.  This requires the
                 * uuid from the array.
                 */
@@ -1236,8 +1938,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
                        fprintf(stderr, Name ": aborting level change\n");
        }
-       if (sra)
-               unfreeze_array(sra, frozen);
+       unfreeze(st, frozen);
        return rv;
 }
 
@@ -1266,7 +1967,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
  */
 
 /* FIXME return status is never checked */
-int grow_backup(struct mdinfo *sra,
+static int grow_backup(struct mdinfo *sra,
                unsigned long long offset, /* per device */
                unsigned long stripes, /* per device */
                int *sources, unsigned long long *offsets,
@@ -1381,7 +2082,7 @@ int grow_backup(struct mdinfo *sra,
  * every works.
  */
 /* FIXME return value is often ignored */
-int wait_backup(struct mdinfo *sra,
+static int wait_backup(struct mdinfo *sra,
                unsigned long long offset, /* per device */
                unsigned long long blocks, /* per device */
                unsigned long long blocks2, /* per device - hack */
@@ -1401,7 +2102,12 @@ int wait_backup(struct mdinfo *sra,
        sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
        if (offset == 0)
                sysfs_set_str(sra, NULL, "sync_action", "reshape");
-       do {
+
+       if (sysfs_fd_get_ll(fd, &completed) < 0) {
+               close(fd);
+               return -1;
+       }
+       while (completed < offset + blocks) {
                char action[20];
                fd_set rfds;
                FD_ZERO(&rfds);
@@ -1415,7 +2121,7 @@ int wait_backup(struct mdinfo *sra,
                                  action, 20) > 0 &&
                    strncmp(action, "reshape", 7) != 0)
                        break;
-       } while (completed < offset + blocks);
+       }
        close(fd);
 
        if (part) {
@@ -1728,7 +2434,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (st->ss->load_super(st, fd, NULL))
                                continue;
 
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        st->ss->free_super(st);
 
                        if (lseek64(fd,
@@ -1846,7 +2552,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                 /* FIXME should this be an error */
                                continue;
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        st->ss->free_super(st);
                        offsets[j] = dinfo.data_offset * 512;
                }
@@ -1908,7 +2614,7 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
                        if (fdlist[j] < 0) continue;
                        if (st->ss->load_super(st, fdlist[j], NULL))
                                continue;
-                       st->ss->getinfo_super(st, &dinfo);
+                       st->ss->getinfo_super(st, &dinfo, NULL);
                        dinfo.reshape_progress = info->reshape_progress;
                        st->ss->update_super(st, &dinfo,
                                             "_reshape_progress",
@@ -2048,6 +2754,11 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
        bsb.devstart2 = blocks;
 
        backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
+       if (backup_fd < 0) {
+               fprintf(stderr, Name ": Cannot open backup file %s\n",
+                       backup_file ?: "- no backup-file given");
+               return 1;
+       }
        backup_list[0] = backup_fd;
        backup_offsets[0] = 8 * 512;
        fds = malloc(odisks * sizeof(fds[0]));
index 4d3d181b10fbac9cce3e0dc119120925830fc5d4..bc4531a58faff5bea2ec2334ae8b8c4815290891 100644 (file)
  */
 
 #include       "mdadm.h"
+#include       <dirent.h>
+#include       <ctype.h>
 
-static int count_active(struct supertype *st, int mdfd, char **availp,
+static int count_active(struct supertype *st, struct mdinfo *sra,
+                       int mdfd, char **availp,
                        struct mdinfo *info);
 static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        int number, __u64 events, int verbose,
                        char *array_name);
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                    struct map_ent *target,
+                    struct supertype *st, int verbose);
+
+static int Incremental_container(struct supertype *st, char *devname,
+                                char *homehost,
+                                int verbose, int runstop, int autof);
+
+static struct mddev_ident *search_mdstat(struct supertype *st,
+                                          struct mdinfo *info,
+                                          char *devname,
+                                          int verbose, int *rvp);
 
 int Incremental(char *devname, int verbose, int runstop,
                struct supertype *st, char *homehost, int require_homehost,
@@ -78,20 +93,59 @@ int Incremental(char *devname, int verbose, int runstop,
         *   start the array (auto-readonly).
         */
        struct stat stb;
-       struct mdinfo info;
-       struct mddev_ident_s *array_list, *match;
+       struct mdinfo info, dinfo;
+       struct mdinfo *sra = NULL, *d;
+       struct mddev_ident *match;
        char chosen_name[1024];
-       int rv;
+       int rv = 1;
        struct map_ent *mp, *map = NULL;
-       int dfd, mdfd;
-       char *avail;
+       int dfd = -1, mdfd = -1;
+       char *avail = NULL;
        int active_disks;
-       int trustworthy = FOREIGN;
+       int trustworthy;
        char *name_to_use;
        mdu_array_info_t ainf;
+       struct dev_policy *policy = NULL;
+       struct map_ent target_array;
+       int have_target;
 
        struct createinfo *ci = conf_get_create_info();
 
+       if (stat(devname, &stb) < 0) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": stat failed for %s: %s.\n",
+                               devname, strerror(errno));
+               return rv;
+       }
+       if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": %s is not a block device.\n",
+                               devname);
+               return rv;
+       }
+       dfd = dev_open(devname, O_RDONLY|O_EXCL);
+       if (dfd < 0) {
+               if (verbose >= 0)
+                       fprintf(stderr, Name ": cannot open %s: %s.\n",
+                               devname, strerror(errno));
+               return rv;
+       }
+       /* If the device is a container, we do something very different */
+       if (must_be_container(dfd)) {
+               if (!st)
+                       st = super_by_fd(dfd, NULL);
+               if (st && st->ss->load_container)
+                       rv = st->ss->load_container(st, dfd, NULL);
+
+               close(dfd);
+               if (!rv && st->ss->container_content)
+                       return Incremental_container(st, devname, homehost,
+                                                    verbose, runstop, autof);
+
+               fprintf(stderr, Name ": %s is not part of an md array.\n",
+                       devname);
+               return rv;
+       }
 
        /* 1/ Check if device is permitted by mdadm.conf */
 
@@ -100,117 +154,61 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                                ": %s not permitted by mdadm.conf.\n",
                                devname);
-               return 1;
+               goto out;
        }
 
        /* 2/ Find metadata, reject if none appropriate (check
         *            version/name from args) */
 
-       dfd = dev_open(devname, O_RDONLY|O_EXCL);
-       if (dfd < 0) {
-               if (verbose >= 0)
-                       fprintf(stderr, Name ": cannot open %s: %s.\n",
-                               devname, strerror(errno));
-               return 1;
-       }
        if (fstat(dfd, &stb) < 0) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": fstat failed for %s: %s.\n",
                                devname, strerror(errno));
-               close(dfd);
-               return 1;
+               goto out;
        }
        if ((stb.st_mode & S_IFMT) != S_IFBLK) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": %s is not a block device.\n",
                                devname);
-               close(dfd);
-               return 1;
+               goto out;
        }
 
+       dinfo.disk.major = major(stb.st_rdev);
+       dinfo.disk.minor = minor(stb.st_rdev);
+
+       policy = disk_policy(&dinfo);
+       have_target = policy_check_path(&dinfo, &target_array);
+
        if (st == NULL && (st = guess_super(dfd)) == NULL) {
                if (verbose >= 0)
                        fprintf(stderr, Name
                                ": no recognisable superblock on %s.\n",
                                devname);
-               close(dfd);
-               return 1;
+               rv = try_spare(devname, &dfd, policy,
+                              have_target ? &target_array : NULL,
+                              st, verbose);
+               goto out;
        }
-       if (st->ss->load_super(st, dfd, NULL)) {
+       if (st->ss->compare_super == NULL ||
+           st->ss->load_super(st, dfd, NULL)) {
                if (verbose >= 0)
                        fprintf(stderr, Name ": no RAID superblock on %s.\n",
                                devname);
-               close(dfd);
-               return 1;
+               rv = try_spare(devname, &dfd, policy,
+                              have_target ? &target_array : NULL,
+                              st, verbose);
+               free(st);
+               goto out;
        }
-       close (dfd);
+       close (dfd); dfd = -1;
 
        memset(&info, 0, sizeof(info));
-       st->ss->getinfo_super(st, &info);
-       /* 3/ Check if there is a match in mdadm.conf */
-
-       array_list = conf_get_ident(NULL);
-       match = NULL;
-       for (; array_list; array_list = array_list->next) {
-               if (array_list->uuid_set &&
-                   same_uuid(array_list->uuid, info.uuid, st->ss->swapuuid)
-                   == 0) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": UUID differs from %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->name[0] &&
-                   strcasecmp(array_list->name, info.name) != 0) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Name differs from %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->devices &&
-                   !match_oneof(array_list->devices, devname)) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Not a listed device for %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (array_list->super_minor != UnSet &&
-                   array_list->super_minor != info.array.md_minor) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                                       ": Different super-minor to %s.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               if (!array_list->uuid_set &&
-                   !array_list->name[0] &&
-                   !array_list->devices &&
-                   array_list->super_minor == UnSet) {
-                       if (verbose >= 2 && array_list->devname)
-                               fprintf(stderr, Name
-                            ": %s doesn't have any identifying information.\n",
-                                       array_list->devname);
-                       continue;
-               }
-               /* FIXME, should I check raid_disks and level too?? */
+       st->ss->getinfo_super(st, &info, NULL);
 
-               if (match) {
-                       if (verbose >= 0) {
-                               if (match->devname && array_list->devname)
-                                       fprintf(stderr, Name
-                  ": we match both %s and %s - cannot decide which to use.\n",
-                                               match->devname, array_list->devname);
-                               else
-                                       fprintf(stderr, Name
-                                               ": multiple lines in mdadm.conf match\n");
-                       }
-                       return 2;
-               }
-               match = array_list;
-       }
+       /* 3/ Check if there is a match in mdadm.conf */
+       match = search_mdstat(st, &info, devname, verbose, &rv);
+       if (!match && rv == 2)
+               goto out;
 
        if (match && match->devname
            && strcasecmp(match->devname, "<ignore>") == 0) {
@@ -218,7 +216,7 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name ": array containing %s is explicitly"
                                " ignored by mdadm.conf\n",
                                devname);
-               return 1;
+               goto out;
        }
 
        /* 3a/ if not, check for homehost match.  If no match, continue
@@ -235,14 +233,14 @@ int Incremental(char *devname, int verbose, int runstop,
                trustworthy = FOREIGN;
 
 
-       if (!match && !conf_test_metadata(st->ss->name,
+       if (!match && !conf_test_metadata(st->ss->name, policy,
                                          (trustworthy == LOCAL))) {
                if (verbose >= 1)
                        fprintf(stderr, Name
                                ": %s has metadata type %s for which "
                                "auto-assembly is disabled\n",
                                devname, st->ss->name);
-               return 1;
+               goto out;
        }
        if (trustworthy == LOCAL_ANY)
                trustworthy = LOCAL;
@@ -257,23 +255,6 @@ int Incremental(char *devname, int verbose, int runstop,
        if (autof == 0)
                autof = ci->autof;
 
-       if (st->ss->container_content && st->loaded_container) {
-               if ((runstop > 0 && info.container_enough >= 0) ||
-                   info.container_enough > 0)
-                       /* pass */;
-               else {
-                       if (verbose)
-                               fprintf(stderr, Name ": not enough devices to start the container\n");
-                       return 0;
-               }
-
-               /* This is a pre-built container array, so we do something
-                * rather different.
-                */
-               return Incremental_container(st, devname, verbose, runstop,
-                                            autof, trustworthy);
-       }
-
        name_to_use = info.name;
        if (name_to_use[0] == 0 &&
            info.array.level == LEVEL_CONTAINER &&
@@ -304,23 +285,21 @@ int Incremental(char *devname, int verbose, int runstop,
                mdfd = -1;
 
        if (mdfd < 0) {
-               struct mdinfo *sra;
-               struct mdinfo dinfo;
 
                /* Couldn't find an existing array, maybe make a new one */
                mdfd = create_mddev(match ? match->devname : NULL,
                                    name_to_use, autof, trustworthy, chosen_name);
 
                if (mdfd < 0)
-                       return 1;
+                       goto out;
 
                sysfs_init(&info, mdfd, 0);
 
                if (set_array_info(mdfd, st, &info) != 0) {
                        fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                chosen_name, strerror(errno));
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
 
                dinfo = info;
@@ -330,10 +309,12 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, 0);
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), GET_DEVS);
+               sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                           GET_OFFSET | GET_SIZE));
+       
                if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
                        /* It really should be 'none' - must be old buggy
                         * kernel, and mdadm -I may not be able to complete.
@@ -343,12 +324,11 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                      ": You have an old buggy kernel which cannot support\n"
                                "      --incremental reliably.  Aborting.\n");
-                       close(mdfd);
                        sysfs_free(sra);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
                info.array.working_disks = 1;
-               sysfs_free(sra);
                /* 6/ Make sure /var/run/mdadm.map contains this array. */
                map_update(&map, fd2devnum(mdfd),
                           info.text_version,
@@ -361,10 +341,12 @@ int Incremental(char *devname, int verbose, int runstop,
                char dn[20];
                int dfd2;
                int err;
-               struct mdinfo *sra;
                struct supertype *st2;
                struct mdinfo info2, *d;
 
+               sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                           GET_OFFSET | GET_SIZE));
+       
                if (mp->path)
                        strcpy(chosen_name, mp->path);
                else
@@ -373,11 +355,14 @@ int Incremental(char *devname, int verbose, int runstop,
                /* It is generally not OK to add non-spare drives to a
                 * running array as they are probably missing because
                 * they failed.  However if runstop is 1, then the
-                * array was possibly started early and our best be is
-                * to add this anyway.  It would probably be good to
-                * allow explicit policy statement about this.
+                * array was possibly started early and our best bet is
+                * to add this anyway.
+                * Also if action policy is re-add or better we allow
+                * re-add
                 */
                if ((info.disk.state & (1<<MD_DISK_SYNC)) != 0
+                   && ! policy_action_allows(policy, st->ss->name,
+                                             act_re_add)
                    && runstop < 1) {
                        int active = 0;
                        
@@ -392,14 +377,14 @@ int Incremental(char *devname, int verbose, int runstop,
                                fprintf(stderr, Name
                                        ": not adding %s to active array (without --run) %s\n",
                                        devname, chosen_name);
-                               close(mdfd);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), (GET_DEVS | GET_STATE));
-               if (!sra)
-                       return 2;
-
+               if (!sra) {
+                       rv = 2;
+                       goto out;
+               }
                if (sra->devs) {
                        sprintf(dn, "%d:%d", sra->devs->disk.major,
                                sra->devs->disk.minor);
@@ -411,13 +396,13 @@ int Incremental(char *devname, int verbose, int runstop,
                                        ": metadata mismatch between %s and "
                                        "chosen array %s\n",
                                        devname, chosen_name);
-                               close(mdfd);
                                close(dfd2);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                        close(dfd2);
                        memset(&info2, 0, sizeof(info2));
-                       st2->ss->getinfo_super(st2, &info2);
+                       st2->ss->getinfo_super(st2, &info2, NULL);
                        st2->ss->free_super(st2);
                        if (info.array.level != info2.array.level ||
                            memcmp(info.uuid, info2.uuid, 16) != 0 ||
@@ -425,8 +410,8 @@ int Incremental(char *devname, int verbose, int runstop,
                                fprintf(stderr, Name
                                        ": unexpected difference between %s and %s.\n",
                                        chosen_name, devname);
-                               close(mdfd);
-                               return 2;
+                               rv = 2;
+                               goto out;
                        }
                }
                info2.disk.major = major(stb.st_rdev);
@@ -446,8 +431,8 @@ int Incremental(char *devname, int verbose, int runstop,
                if (err < 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
-                       close(mdfd);
-                       return 2;
+                       rv = 2;
+                       goto out;
                }
                info.array.working_disks = 0;
                for (d = sra->devs; d; d=d->next)
@@ -467,6 +452,7 @@ int Incremental(char *devname, int verbose, int runstop,
                                chosen_name, info.array.working_disks);
                wait_for(chosen_name, mdfd);
                close(mdfd);
+               sysfs_free(sra);
                rv = Incremental(chosen_name, verbose, runstop,
                                 NULL, homehost, require_homehost, autof);
                if (rv == 1)
@@ -476,21 +462,26 @@ int Incremental(char *devname, int verbose, int runstop,
                        rv = 0;
                return rv;
        }
-       avail = NULL;
-       active_disks = count_active(st, mdfd, &avail, &info);
+
+       /* We have added something to the array, so need to re-read the
+        * state.  Eventually this state should be kept up-to-date as
+        * things change.
+        */
+       sysfs_free(sra);
+       sra = sysfs_read(mdfd, -1, (GET_DEVS | GET_STATE |
+                                   GET_OFFSET | GET_SIZE));
+       active_disks = count_active(st, sra, mdfd, &avail, &info);
        if (enough(info.array.level, info.array.raid_disks,
                   info.array.layout, info.array.state & 1,
                   avail, active_disks) == 0) {
-               free(avail);
                if (verbose >= 0)
                        fprintf(stderr, Name
                             ": %s attached to %s, not enough to start (%d).\n",
                                devname, chosen_name, active_disks);
                map_unlock(&map);
-               close(mdfd);
-               return 0;
+               rv = 0;
+               goto out;
        }
-       free(avail);
 
        /* 7b/ if yes, */
        /* - if number of OK devices match expected, or -R and there */
@@ -503,14 +494,14 @@ int Incremental(char *devname, int verbose, int runstop,
                        fprintf(stderr, Name
                           ": %s attached to %s which is already active.\n",
                                devname, chosen_name);
-               close(mdfd);
                map_unlock(&map);
-               return 0;
+               rv = 0;
+               goto out;
        }
 
        map_unlock(&map);
        if (runstop > 0 || active_disks >= info.array.working_disks) {
-               struct mdinfo *sra;
+               struct mdinfo *dsk;
                /* Let's try to start it */
                if (match && match->bitmap_file) {
                        int bmfd = open(match->bitmap_file, O_RDWR);
@@ -518,20 +509,24 @@ int Incremental(char *devname, int verbose, int runstop,
                                fprintf(stderr, Name
                                        ": Could not open bitmap file %s.\n",
                                        match->bitmap_file);
-                               close(mdfd);
-                               return 1;
+                               goto out;
                        }
                        if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
                                close(bmfd);
                                fprintf(stderr, Name
                                        ": Failed to set bitmapfile for %s.\n",
                                        chosen_name);
-                               close(mdfd);
-                               return 1;
+                               goto out;
                        }
                        close(bmfd);
                }
-               sra = sysfs_read(mdfd, fd2devnum(mdfd), 0);
+               /* Need to remove from the array any devices which
+                * 'count_active' discerned were too old or inappropriate
+                */
+               for (d = sra ? sra->devs : NULL ; d ; d = d->next)
+                       if (d->disk.state & (1<<MD_DISK_REMOVED))
+                               remove_disk(mdfd, st, sra, d);
+
                if ((sra == NULL || active_disks >= info.array.working_disks)
                    && trustworthy != FOREIGN)
                        rv = ioctl(mdfd, RUN_ARRAY, NULL);
@@ -541,10 +536,23 @@ int Incremental(char *devname, int verbose, int runstop,
                if (rv == 0) {
                        if (verbose >= 0)
                                fprintf(stderr, Name
-                          ": %s attached to %s, which has been started.\n",
+                                       ": %s attached to %s, which has been started.\n",
                                        devname, chosen_name);
                        rv = 0;
                        wait_for(chosen_name, mdfd);
+                       /* We just started the array, so some devices
+                        * might have been evicted from the array
+                        * because their event counts were too old.
+                        * If the action=re-add policy is in-force for
+                        * those devices we should re-add them now.
+                        */
+                       for (dsk = sra->devs; dsk ; dsk = dsk->next) {
+                               if (disk_action_allows(dsk, st->ss->name, act_re_add) &&
+                                   add_disk(mdfd, st, sra, dsk) == 0)
+                                       fprintf(stderr, Name
+                                               ": %s re-added to %s\n",
+                                               dsk->sys_name, chosen_name);
+                       }
                } else {
                        fprintf(stderr, Name
                              ": %s attached to %s, but failed to start: %s.\n",
@@ -558,10 +566,92 @@ int Incremental(char *devname, int verbose, int runstop,
                                devname, chosen_name);
                rv = 0;
        }
-       close(mdfd);
+out:
+       free(avail);
+       if (dfd >= 0)
+               close(dfd);
+       if (mdfd >= 0)
+               close(mdfd);
+       if (policy)
+               dev_policy_free(policy);
+       if (sra)
+               sysfs_free(sra);
        return rv;
 }
 
+static struct mddev_ident *search_mdstat(struct supertype *st,
+                                          struct mdinfo *info,
+                                          char *devname,
+                                          int verbose, int *rvp)
+{      /* Return the one conf_get_ident() entry that fits this device, or NULL (none, or ambiguous). */
+       struct mddev_ident *array_list, *match;
+       array_list = conf_get_ident(NULL);      /* head of the identity list walked below */
+       match = NULL;   /* no acceptable entry seen yet */
+       for (; array_list; array_list = array_list->next) {
+               if (array_list->uuid_set &&
+                   same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid)
+                   == 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": UUID differs from %s.\n",
+                                       array_list->devname);
+                       continue;       /* entry specifies a UUID and it differs: skip it */
+               }
+               if (array_list->name[0] &&
+                   strcasecmp(array_list->name, info->name) != 0) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Name differs from %s.\n",
+                                       array_list->devname);
+                       continue;       /* entry specifies a name and it differs (case-insensitive): skip it */
+               }
+               if (array_list->devices &&
+                   !match_oneof(array_list->devices, devname)) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Not a listed device for %s.\n",
+                                       array_list->devname);
+                       continue;       /* entry lists devices and devname is not accepted by match_oneof(): skip it */
+               }
+               if (array_list->super_minor != UnSet &&
+                   array_list->super_minor != info->array.md_minor) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": Different super-minor to %s.\n",
+                                       array_list->devname);
+                       continue;       /* entry specifies a super-minor and it differs: skip it */
+               }
+               if (!array_list->uuid_set &&
+                   !array_list->name[0] &&
+                   !array_list->devices &&
+                   array_list->super_minor == UnSet) {
+                       if (verbose >= 2 && array_list->devname)
+                               fprintf(stderr, Name
+                                       ": %s doesn't have any identifying information.\n",
+                                       array_list->devname);
+                       continue;       /* none of the four criteria above is set, so this entry is never accepted */
+               }
+               /* FIXME, should I check raid_disks and level too?? */
+
+               if (match) {    /* an earlier entry was already accepted: the result is ambiguous */
+                       if (verbose >= 0) {
+                               if (match->devname && array_list->devname)
+                                       fprintf(stderr, Name
+                                               ": we match both %s and %s - cannot decide which to use.\n",
+                                               match->devname, array_list->devname);
+                               else
+                                       fprintf(stderr, Name
+                                               ": multiple lines in mdadm.conf match\n");
+                       }
+                       *rvp = 2;       /* the only write to *rvp in this function; left untouched on every other path */
+                       match = NULL;   /* hand back neither candidate */
+                       break;
+               }
+               match = array_list;     /* first acceptable entry; keep scanning to detect a second one */
+       }
+       return match;   /* NULL when nothing was accepted or when two entries were */
+}
+
 static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        int number, __u64 events, int verbose,
                        char *array_name)
@@ -588,7 +678,7 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
                        close(dfd);
                        continue;
                }
-               st->ss->getinfo_super(st, &info);
+               st->ss->getinfo_super(st, &info, NULL);
                st->ss->free_super(st);
                close(dfd);
 
@@ -606,20 +696,28 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
        }
 }
 
-static int count_active(struct supertype *st, int mdfd, char **availp,
+static int count_active(struct supertype *st, struct mdinfo *sra,
+                       int mdfd, char **availp,
                        struct mdinfo *bestinfo)
 {
        /* count how many devices in sra think they are active */
        struct mdinfo *d;
-       int cnt = 0, cnt1 = 0;
+       int cnt = 0;
        __u64 max_events = 0;
-       struct mdinfo *sra = sysfs_read(mdfd, -1, GET_DEVS | GET_STATE);
        char *avail = NULL;
+       int *best;
+       char *devmap = NULL;
+       int numdevs = 0;
+       int devnum;
+       int b, i;
+       int raid_disks = 0;
 
        if (!sra)
                return 0;
 
-       for (d = sra->devs ; d ; d = d->next) {
+       for (d = sra->devs ; d ; d = d->next)
+               numdevs++;
+       for (d = sra->devs, devnum=0 ; d ; d = d->next, devnum++) {
                char dn[30];
                int dfd;
                int ok;
@@ -633,15 +731,21 @@ static int count_active(struct supertype *st, int mdfd, char **availp,
                close(dfd);
                if (ok != 0)
                        continue;
-               st->ss->getinfo_super(st, &info);
+               info.array.raid_disks = raid_disks;
+               st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum);
                if (!avail) {
-                       avail = malloc(info.array.raid_disks);
+                       raid_disks = info.array.raid_disks;
+                       avail = calloc(raid_disks, 1);
                        if (!avail) {
                                fprintf(stderr, Name ": out of memory.\n");
                                exit(1);
                        }
-                       memset(avail, 0, info.array.raid_disks);
                        *availp = avail;
+
+                       best = calloc(raid_disks, sizeof(int));
+                       devmap = calloc(raid_disks * numdevs, 1);
+
+                       st->ss->getinfo_super(st, &info, devmap);
                }
 
                if (info.disk.state & (1<<MD_DISK_SYNC))
@@ -650,35 +754,502 @@ static int count_active(struct supertype *st, int mdfd, char **availp,
                                cnt++;
                                max_events = info.events;
                                avail[info.disk.raid_disk] = 2;
-                               st->ss->getinfo_super(st, bestinfo);
+                               best[info.disk.raid_disk] = devnum;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        } else if (info.events == max_events) {
-                               cnt++;
                                avail[info.disk.raid_disk] = 2;
+                               best[info.disk.raid_disk] = devnum;
                        } else if (info.events == max_events-1) {
-                               cnt1++;
-                               avail[info.disk.raid_disk] = 1;
+                               if (avail[info.disk.raid_disk] == 0) {
+                                       avail[info.disk.raid_disk] = 1;
+                                       best[info.disk.raid_disk] = devnum;
+                               }
                        } else if (info.events < max_events - 1)
                                ;
                        else if (info.events == max_events+1) {
                                int i;
-                               cnt1 = cnt;
-                               cnt = 1;
                                max_events = info.events;
-                               for (i=0; i<info.array.raid_disks; i++)
+                               for (i=0; i < raid_disks; i++)
                                        if (avail[i])
                                                avail[i]--;
                                avail[info.disk.raid_disk] = 2;
-                               st->ss->getinfo_super(st, bestinfo);
+                               best[info.disk.raid_disk] = devnum;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        } else { /* info.events much bigger */
-                               cnt = 1; cnt1 = 0;
                                memset(avail, 0, info.disk.raid_disk);
                                max_events = info.events;
-                               st->ss->getinfo_super(st, bestinfo);
+                               avail[info.disk.raid_disk] = 2;
+                               st->ss->getinfo_super(st, bestinfo, NULL);
                        }
                }
                st->ss->free_super(st);
        }
-       return cnt + cnt1;
+       if (!avail)
+               return 0;
+       /* We need to reject any device that thinks the best device is
+        * failed or missing */
+       for (b = 0; b < raid_disks; b++)
+               if (avail[b] == 2)
+                       break;
+       cnt = 0;
+       for (i = 0 ; i < raid_disks ; i++) {
+               if (i != b && avail[i])
+                       if (devmap[raid_disks * best[i] + b] == 0) {
+                               /* This device thinks 'b' is failed -
+                                * don't use it */
+                               devnum = best[i];
+                               for (d=sra->devs ; devnum; d = d->next)
+                                       devnum--;
+                               d->disk.state |= (1 << MD_DISK_REMOVED);
+                               avail[i] = 0;
+                       }
+               if (avail[i])
+                       cnt++;
+       }
+       free(best);
+       free(devmap);
+       return cnt;
+}
+
+static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                          struct map_ent *target, int bare,
+                          struct supertype *st, int verbose)
+{
+       /* This device doesn't have any md metadata.
+        * The device policy allows 'spare' and if !bare, it allows spare-same-slot.
+        * If 'st' is not set, then we only know that some metadata allows this,
+        * others possibly don't.
+        * So look for a container or array to attach the device to.
+        * Prefer 'target' if that is set and the array is found.
+        * Once an array is chosen and opened, dfd is closed and *dfdp set to -1.
+        * If st is set, then only arrays of that type are considered.
+        * Return 0 if the device was added as a spare, non-zero otherwise.
+        */
+       int rv = 1;     /* assume failure until Manage_subdevs reports success */
+       struct stat stb;
+       struct map_ent *mp, *map = NULL;
+       struct mdinfo *chosen = NULL;
+       int dfd = *dfdp;
+
+       if (fstat(dfd, &stb) != 0)
+               return 1;
+
+       /*
+        * Now we need to find a suitable array to add this to.
+        * We only accept arrays that:
+        *  - match 'st'
+        *  - are in the same domains as the device
+        *  - are of a size for which the device will be useful
+        * and we choose the one that is the most degraded
+        */
+
+       if (map_lock(&map)) {
+               fprintf(stderr, Name ": failed to get exclusive lock on "
+                       "mapfile\n");
+               return 1;
+       }
+       for (mp = map ; mp ; mp = mp->next) {
+               struct supertype *st2;
+               struct domainlist *dl = NULL;
+               struct mdinfo *sra;
+               unsigned long long devsize;
+
+               if (is_subarray(mp->metadata))
+                       continue;
+               if (st) {
+                       st2 = st->ss->match_metadata_desc(mp->metadata);
+                       if (!st2 ||
+                           (st->minor_version >= 0 &&
+                            st->minor_version != st2->minor_version)) {
+                               if (verbose > 1)
+                                       fprintf(stderr, Name ": not adding %s to %s as metadata type doesn't match\n",
+                                               devname, mp->path);
+                               free(st2);
+                               continue;
+                       }
+                       free(st2);
+               }
+               sra = sysfs_read(-1, mp->devnum,
+                                GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+                                GET_DEGRADED|GET_COMPONENT|GET_VERSION);
+               if (!sra) {
+                       /* Probably a container - no degraded info */
+                       sra = sysfs_read(-1, mp->devnum,
+                                        GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+                                        GET_COMPONENT|GET_VERSION);
+                       if (sra)
+                               sra->array.failed_disks = 0;
+               }
+               if (!sra)
+                       continue;
+               if (st == NULL) {
+                       int i;
+                       st2 = NULL;
+                       for(i=0; !st2 && superlist[i]; i++)
+                               st2 = superlist[i]->match_metadata_desc(
+                                       sra->text_version);
+                       if (!st2) {
+                               if (verbose > 1)
+                                       fprintf(stderr, Name ": not adding %s to %s"
+                                               " as metadata not recognised.\n",
+                                               devname, mp->path);
+                               goto next;
+                       }
+                       /* Need to double check that the 'act_spare' permission
+                        * applies to this metadata.
+                        */
+                       if (!policy_action_allows(pol, st2->ss->name, act_spare))
+                               goto next;
+                       if (!bare && !policy_action_allows(pol, st2->ss->name,
+                                                          act_spare_same_slot))
+                               goto next;
+               } else
+                       st2 = st;
+               get_dev_size(dfd, NULL, &devsize);
+               if (st2->ss->avail_size(st2, devsize) < sra->component_size) {
+                       if (verbose > 1)
+                               fprintf(stderr, Name ": not adding %s to %s as it is too small\n",
+                                       devname, mp->path);
+                       goto next;
+               }
+               dl = domain_from_array(sra, st2->ss->name);
+               if (!domain_test(dl, pol, st2->ss->name)) {
+                       /* domain test fails */
+                       if (verbose > 1)
+                               fprintf(stderr, Name ": not adding %s to %s as it is not in a compatible domain\n",
+                                       devname, mp->path);
+
+                       goto next;
+               }
+               /* test against target.
+                * If 'target' is set and 'bare' is false, we only accept
+                * arrays/containers that match 'target'.
+                * If 'target' is set and 'bare' is true, we prefer the
+                * array which matches 'target'.
+                */
+               if (target) {
+                       if (strcmp(target->metadata, mp->metadata) == 0 &&
+                           memcmp(target->uuid, mp->uuid,
+                                  sizeof(target->uuid)) == 0) {
+                               /* This is our target!! */
+                               if (chosen)
+                                       sysfs_free(chosen);
+                               chosen = sra;
+                               sra = NULL;
+                               /* skip to end so we don't check any more */
+                               while (mp->next)
+                                       mp = mp->next;
+                               goto next;
+                       }
+                       /* not our target */
+                       if (!bare)
+                               goto next;
+               }
+
+               /* all tests passed, OK to add to this array */
+               if (!chosen) {
+                       chosen = sra;
+                       sra = NULL;
+               } else if (chosen->array.failed_disks < sra->array.failed_disks) {
+                       sysfs_free(chosen);
+                       chosen = sra;
+                       sra = NULL;
+               }
+       next:
+               if (sra)
+                       sysfs_free(sra);
+               if (st != st2)
+                       free(st2);
+               if (dl)
+                       domain_free(dl);
+       }
+       if (chosen) {
+               /* add current device to chosen array as a spare */
+               int mdfd = open_dev(devname2devnum(chosen->sys_name));
+               if (mdfd >= 0) {
+                       struct mddev_dev devlist;
+                       char devname[20];
+                       devlist.next = NULL;
+                       devlist.used = 0;
+                       devlist.re_add = 0;
+                       devlist.writemostly = 0;
+                       devlist.devname = devname;
+                       sprintf(devname, "%d:%d", major(stb.st_rdev),
+                               minor(stb.st_rdev));
+                       devlist.disposition = 'a';
+                       close(dfd);
+                       *dfdp = -1;
+                       rv =  Manage_subdevs(chosen->sys_name, mdfd, &devlist,
+                                            -1, 0, NULL);
+                       close(mdfd);
+               }
+               if (verbose > 0) {
+                       if (rv == 0)
+                               fprintf(stderr, Name ": added %s as spare for %s\n",
+                                       devname, chosen->sys_name);
+                       else
+                               fprintf(stderr, Name ": failed to add %s as spare for %s\n",
+                                       devname, chosen->sys_name);
+               }
+               sysfs_free(chosen);
+       }
+       return rv;      /* NOTE(review): the map_lock() above is not released here - confirm */
+}
+
+static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                              struct supertype *st, int verbose)
+{
+       /* we know that at least one partition virtual-metadata is
+        * allowed to incorporate spares like this device.  We need to
+        * find a suitable device to copy partition information from.
+        *
+        * Getting a list of all disk (not partition) devices is
+        * slightly non-trivial.  We could look at /sys/block, but
+        * that is theoretically due to be removed.  Maybe best to use
+        * /dev/disk/by-path/?* and ignore names ending '-partNN' as
+        * we depend on this directory of 'path' info.  But that fails
+        * to find loop devices and probably others.  Maybe don't
+        * worry about that, they aren't the real target.
+        *
+        * So: check things in /dev/disk/by-path to see if they are in
+        * a compatible domain, then load the partition table and see
+        * if it is OK for the new device, and choose the largest
+        * partition table that fits.  Return 0 if one was chosen, else 1.
+        */
+       DIR *dir;
+       struct dirent *de;
+       char *chosen = NULL;
+       unsigned long long chosen_size;
+       struct supertype *chosen_st = NULL;
+       int fd;
+
+       dir = opendir("/dev/disk/by-path");
+       if (!dir)
+               return 1;
+       while ((de = readdir(dir)) != NULL) {
+               char *ep;
+               struct dev_policy *pol2 = NULL;
+               struct domainlist *domlist = NULL;
+               int fd = -1;
+               struct mdinfo info;
+               struct supertype *st2 = NULL;
+               char *devname = NULL;
+               unsigned long long devsectors;
+
+               if (de->d_ino == 0 ||
+                   de->d_name[0] == '.' ||
+                   (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN))
+                       goto next;
+
+               ep = de->d_name + strlen(de->d_name);
+               while (ep > de->d_name &&
+                      isdigit(ep[-1]))
+                       ep--;
+               if (ep > de->d_name + 5 &&
+                   strncmp(ep-5, "-part", 5) == 0)
+                       /* This is a partition - skip it */
+                       goto next;
+
+               pol2 = path_policy(de->d_name, type_disk);
+
+               domain_merge(&domlist, pol2, st ? st->ss->name : NULL);
+               if (domain_test(domlist, pol, st ? st->ss->name : NULL) == 0)
+                       /* new device is incompatible with this device. */
+                       goto next;
+
+               domain_free(domlist);
+               domlist = NULL;
+
+               asprintf(&devname, "/dev/disk/by-path/%s", de->d_name);
+               fd = open(devname, O_RDONLY);
+               if (fd < 0)
+                       goto next;
+               if (get_dev_size(fd, devname, &devsectors) == 0)
+                       goto next;
+               devsectors >>= 9;
+
+               if (st)
+                       st2 = dup_super(st);
+               else
+                       st2 = guess_super_type(fd, guess_partitions);
+               if (st2 == NULL ||
+                   st2->ss->load_super(st2, fd, NULL) < 0)
+                       goto next;
+
+               if (!st) {
+                       /* Check domain policy again, this time referring to metadata */
+                       domain_merge(&domlist, pol2, st2->ss->name);
+                       if (domain_test(domlist, pol, st2->ss->name) == 0)
+                               /* Incompatible devices for this metadata type */
+                               goto next;
+                       if (!policy_action_allows(pol, st2->ss->name, act_spare))
+                               /* Some partition types allow sparing, but not
+                                * this one.
+                                */
+                               goto next;
+               }
+
+               st2->ss->getinfo_super(st2, &info, NULL);
+               if (info.component_size > devsectors)
+                       /* This partitioning doesn't fit in the device */
+                       goto next;
+
+               /* This is an acceptable device to copy partition
+                * metadata from.  We could just stop here, but I
+                * think I want to keep looking in case a larger
+                * metadata which makes better use of the device can
+                * be found.
+                */
+               if (chosen == NULL ||
+                   chosen_size < info.component_size) {
+                       chosen_size = info.component_size;
+                       free(chosen);
+                       chosen = devname;
+                       devname = NULL;
+                       if (chosen_st) {
+                               chosen_st->ss->free_super(chosen_st);
+                               free(chosen_st);
+                       }
+                       chosen_st = st2;
+                       st2 = NULL;
+               }
+
+       next:
+               free(devname);
+               domain_free(domlist);
+               dev_policy_free(pol2);
+               if (st2)
+                       st2->ss->free_super(st2);
+               free(st2);
+
+               if (fd >= 0)
+                       close(fd);
+       }
+       closedir(dir);  /* scan finished: release the directory stream on every exit path */
+       if (!chosen)
+               return 1;
+
+       /* 'chosen' is the best device we can find.  Let's write its
+        * metadata to devname.  dfd is read-only so don't use that.
+        */
+       fd = open(devname, O_RDWR);
+       if (fd >= 0) {
+               chosen_st->ss->store_super(chosen_st, fd);
+               close(fd);
+       }
+       free(chosen);
+       chosen_st->ss->free_super(chosen_st);
+       free(chosen_st);
+       return 0;
+}
+
+static int is_bare(int dfd)
+{
+       unsigned long long size = 0;
+       char bufpad[4096 + 4096];
+       char *buf = (char*)(((long)bufpad + 4096) & ~4095); /* 4096-aligned address inside bufpad */
+       /* Return 1 if the first and the last 4K of dfd each hold one repeated byte (0x00, 0x5a or 0xff), else 0 */
+       if (lseek(dfd, 0, SEEK_SET) != 0 ||
+           read(dfd, buf, 4096) != 4096)
+               return 0;
+
+       if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff')
+               return 0;
+       if (memcmp(buf, buf+1, 4095) != 0) /* each byte must equal its successor, i.e. all 4096 are identical */
+               return 0;
+
+       /* OK, first 4K appear blank, try the end. */
+       get_dev_size(dfd, NULL, &size);
+       if (lseek(dfd, size-4096, SEEK_SET) < 0 ||
+           read(dfd, buf, 4096) != 4096)
+               return 0;
+
+       if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff')
+               return 0;
+       if (memcmp(buf, buf+1, 4095) != 0) /* same uniformity test for the final 4K */
+               return 0;
+
+       return 1;
+}
+
+/* Adding a spare to a regular array is quite different from adding one to
+ * a set-of-partitions virtual array.
+ * This function determines which is worth trying and tries as appropriate.
+ * Arrays are given priority over partitions.  Returns 1 if the device is
+ * not eligible as a spare, otherwise the result of the helper(s) tried. */
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+                    struct map_ent *target,
+                    struct supertype *st, int verbose)
+{
+       int i;
+       int rv;
+       int arrays_ok = 0;
+       int partitions_ok = 0;
+       int dfd = *dfdp;
+       int bare;
+
+       /* Can only add a spare if device has at least one domain */
+       if (pol_find(pol, pol_domain) == NULL)
+               return 1;
+       /* And only if some action allows spares */
+       if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare))
+               return 1;
+
+       /* Now check if the device is bare.
+        * bare devices can always be added as a spare
+        * non-bare devices can only be added if spare-same-slot is permitted,
+        * and this device is replacing a previous device - in which case 'target'
+        * will be set.
+        */
+       if (!is_bare(dfd)) {
+               /* Must have a target and allow same_slot */
+               /* Later - may allow force_spare without target */
+               if (!target ||
+                   !policy_action_allows(pol, st?st->ss->name:NULL,
+                                         act_spare_same_slot)) {
+                       if (verbose > 1)
+                               fprintf(stderr, Name ": %s is not bare, so not "
+                                       "considering as a spare\n",
+                                       devname);
+                       return 1;
+               }
+               bare = 0;
+       } else
+               bare = 1;
+
+       /* It might be OK to add this device to an array - need to see
+        * what arrays might be candidates.
+        */
+       if (st) {
+               /* just try 'array' or 'partition' based on this metadata */
+               if (st->ss->add_to_super)
+                       return array_try_spare(devname, dfdp, pol, target, bare,
+                                              st, verbose);
+               else
+                       return partition_try_spare(devname, dfdp, pol,
+                                                  st, verbose);
+       }
+       /* No metadata was specified or found so options are open.
+        * Check for whether any array metadata, or any partition metadata
+        * might allow adding the spare.  This check just helps to avoid
+        * a more costly scan of all arrays when we can be sure that will
+        * fail.
+        */
+       for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) {
+               if (superlist[i]->add_to_super && !arrays_ok &&
+                   policy_action_allows(pol, superlist[i]->name, act_spare))
+                       arrays_ok = 1;
+               if (superlist[i]->add_to_super == NULL && !partitions_ok &&
+                   policy_action_allows(pol, superlist[i]->name, act_spare))
+                       partitions_ok = 1;
+       }
+       rv = 1;
+       if (arrays_ok)
+               rv = array_try_spare(devname, dfdp, pol, target, bare,
+                                    st, verbose);
+       if (rv != 0 && partitions_ok) /* fall back to partitions only if the array attempt did not succeed */
+               rv = partition_try_spare(devname, dfdp, pol, st, verbose);
+       return rv;
 }
 
 int IncrementalScan(int verbose)
@@ -691,7 +1262,7 @@ int IncrementalScan(int verbose)
         */
        struct map_ent *mapl = NULL;
        struct map_ent *me;
-       mddev_ident_t devs, mddev;
+       struct mddev_ident *devs, *mddev;
        int rv = 0;
 
        map_read(&mapl);
@@ -785,17 +1356,49 @@ static char *container2devname(char *devname)
        return mdname;
 }
 
-int Incremental_container(struct supertype *st, char *devname, int verbose,
-                         int runstop, int autof, int trustworthy)
+static int Incremental_container(struct supertype *st, char *devname,
+                                char *homehost, int verbose,
+                                int runstop, int autof)
 {
        /* Collect the contents of this container and for each
         * array, choose a device name and assemble the array.
         */
 
-       struct mdinfo *list = st->ss->container_content(st);
+       struct mdinfo *list;
        struct mdinfo *ra;
        struct map_ent *map = NULL;
+       struct mdinfo info;
+       int trustworthy;
+       struct mddev_ident *match;
+       int rv = 0;
 
+       memset(&info, 0, sizeof(info));
+       st->ss->getinfo_super(st, &info, NULL);
+
+       if ((runstop > 0 && info.container_enough >= 0) ||
+           info.container_enough > 0)
+               /* pass */;
+       else {
+               if (verbose)
+                       fprintf(stderr, Name ": not enough devices to start the container\n");
+               return 0;
+       }
+
+       match = search_mdstat(st, &info, devname, verbose, &rv);
+       if (match == NULL && rv == 2)
+               return rv;
+
+       /* Need to compute 'trustworthy' */
+       if (match)
+               trustworthy = LOCAL;
+       else if (st->ss->match_home(st, homehost) == 1)
+               trustworthy = LOCAL;
+       else if (st->ss->match_home(st, "any") == 1)
+               trustworthy = LOCAL;
+       else
+               trustworthy = FOREIGN;
+
+       list = st->ss->container_content(st, NULL);
        if (map_lock(&map))
                fprintf(stderr, Name ": failed to get exclusive lock on "
                        "mapfile\n");
@@ -804,7 +1407,7 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
                int mdfd;
                char chosen_name[1024];
                struct map_ent *mp;
-               struct mddev_ident_s *match = NULL;
+               struct mddev_ident *match = NULL;
 
                mp = map_by_uuid(&map, ra->uuid);
 
@@ -820,7 +1423,7 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
                         * member == ra->text_version after second slash.
                         */
                        char *sub = strchr(ra->text_version+1, '/');
-                       struct mddev_ident_s *array_list;
+                       struct mddev_ident *array_list;
                        if (sub) {
                                sub++;
                                array_list = conf_get_ident(NULL);
@@ -890,16 +1493,22 @@ int Incremental_container(struct supertype *st, char *devname, int verbose,
  * raid arrays, and if so first fail (if needed) and then remove the device.
  *
  * @devname - The device we want to remove
+ * @id_path - name as found in /dev/disk/by-path for this device
  *
  * Note: the device name must be a kernel name like "sda", so
  * that we can find it in /proc/mdstat
  */
-int IncrementalRemove(char *devname, int verbose)
+int IncrementalRemove(char *devname, char *id_path, int verbose)
 {
        int mdfd;
        int rv;
        struct mdstat_ent *ent;
-       struct mddev_dev_s devlist;
+       struct mddev_dev devlist;
+
+       if (!id_path)
+               dprintf(Name ": incremental removal without --path <id_path> "
+                       "lacks the possibility to re-add new device in this "
+                       "port\n");
 
        if (strchr(devname, '/')) {
                fprintf(stderr, Name ": incremental removal requires a "
@@ -915,14 +1524,42 @@ int IncrementalRemove(char *devname, int verbose)
        mdfd = open_dev(ent->devnum);
        if (mdfd < 0) {
                fprintf(stderr, Name ": Cannot open array %s!!\n", ent->dev);
+               free_mdstat(ent);
                return 1;
        }
+
+       if (id_path) {
+               struct map_ent *map = NULL, *me;
+               me = map_by_devnum(&map, ent->devnum);
+               if (me)
+                       policy_save_path(id_path, me);
+               map_free(map);
+       }
+
        memset(&devlist, 0, sizeof(devlist));
        devlist.devname = devname;
        devlist.disposition = 'f';
-       Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0);
+       /* for a container, we must fail each member array */
+       if (ent->metadata_version &&
+           strncmp(ent->metadata_version, "external:", 9) == 0) {
+               struct mdstat_ent *mdstat = mdstat_read(0, 0);
+               struct mdstat_ent *memb;
+               for (memb = mdstat ; memb ; memb = memb->next)
+                       if (is_container_member(memb, ent->dev)) {
+                               int subfd = open_dev(memb->devnum);
+                               if (subfd >= 0) {
+                                       Manage_subdevs(memb->dev, subfd,
+                                                      &devlist, verbose, 0,
+                                                      NULL);
+                                       close(subfd);
+                               }
+                       }
+               free_mdstat(mdstat);
+       } else
+               Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0, NULL);
        devlist.disposition = 'r';
-       rv = Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0);
+       rv = Manage_subdevs(ent->dev, mdfd, &devlist, verbose, 0, NULL);
        close(mdfd);
+       free_mdstat(ent);
        return rv;
 }
diff --git a/Kill.c b/Kill.c
index 3d1810f04fbcc820f4f140caee8abf98411bb6db..29a43ea6bf20fe10071d43f4f9262a766e197a88 100644 (file)
--- a/Kill.c
+++ b/Kill.c
@@ -53,7 +53,7 @@ int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl)
        }
        if (st == NULL)
                st = guess_super(fd);
-       if (st == NULL) {
+       if (st == NULL || st->ss->init_super == NULL) {
                if (!quiet)
                        fprintf(stderr, Name ": Unrecognised md component device - %s\n", dev);
                close(fd);
@@ -96,16 +96,7 @@ int Kill_subarray(char *dev, char *subarray, int quiet)
 
        memset(st, 0, sizeof(*st));
 
-       if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >=
-           (int)sizeof(st->subarray)) {
-               if (!quiet)
-                       fprintf(stderr,
-                               Name ": Input overflow for subarray '%s' > %zu bytes\n",
-                               subarray, sizeof(st->subarray) - 1);
-               return 2;
-       }
-
-       fd = open_subarray(dev, st, quiet);
+       fd = open_subarray(dev, subarray, st, quiet);
        if (fd < 0)
                return 2;
 
index 0cc9a87c2b2de604f4794a27636de00d377e219f..2b888188cbb8497303ce4c1382580050fe18f22c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -71,8 +71,11 @@ CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
 MAP_DIR=/dev/.mdadm
 MAP_FILE = map
 MDMON_DIR = /dev/.mdadm
+# place for autoreplace cookies
+FAILED_SLOTS_DIR = /dev/.mdadm/failed-slots
 DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
 DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
+DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
 CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS)
 
 # The glibc TLS ABI requires applications that call clone(2) to set up
@@ -95,36 +98,42 @@ MAN4DIR = $(MANDIR)/man4
 MAN5DIR = $(MANDIR)/man5
 MAN8DIR = $(MANDIR)/man8
 
-OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
+OBJS =  mdadm.o config.o policy.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
        Incremental.o \
        mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+       super-mbr.o super-gpt.o \
        restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o \
        platform-intel.o probe_roms.o
 
-SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
+SRCS =  mdadm.c config.c policy.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
        Incremental.c \
        mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+       super-mbr.c super-gpt.c \
        restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c \
        platform-intel.c probe_roms.c
 
-MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+INCL = mdadm.h part.h bitmap.h
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o policy.o \
        Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+       super-mbr.o super-gpt.o \
        super-ddf.o sha1.o crc32.o msg.o bitmap.o \
        platform-intel.o probe_roms.o
 
-MON_SRCS = mdmon.c monitor.c managemon.c util.c mdstat.c sysfs.c config.c \
+MON_SRCS = mdmon.c monitor.c managemon.c util.c mdstat.c sysfs.c config.c policy.c \
        Kill.c sg_io.c dlink.c ReadMe.c super0.c super1.c super-intel.c \
+       super-mbr.c super-gpt.c \
        super-ddf.c sha1.c crc32.c msg.c bitmap.c \
        platform-intel.c probe_roms.c
 
 STATICSRC = pwgr.c
 STATICOBJS = pwgr.o
 
-ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
+ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \
        super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
-       platform-intel.c probe_roms.c sysfs.c
+       platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c
 ASSEMBLE_AUTO_SRCS := mdopen.c
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
 ifdef MDASSEMBLE_AUTO
@@ -149,20 +158,20 @@ mdadm : $(OBJS)
 mdadm.static : $(OBJS) $(STATICOBJS)
        $(CC) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS)
 
-mdadm.tcc : $(SRCS) mdadm.h
+mdadm.tcc : $(SRCS) $(INCL)
        $(TCC) -o mdadm.tcc $(SRCS)
 
-mdadm.klibc : $(SRCS) mdadm.h
+mdadm.klibc : $(SRCS) $(INCL)
        rm -f $(OBJS) 
        $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
 
-mdadm.Os : $(SRCS) mdadm.h
+mdadm.Os : $(SRCS) $(INCL)
        $(CC) -o mdadm.Os $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
 
-mdadm.O2 : $(SRCS) mdadm.h mdmon.O2
+mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
        $(CC) -o mdadm.O2 $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
 
-mdmon.O2 : $(MON_SRCS) mdadm.h mdmon.h
+mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
        $(CC) -o mdmon.O2 $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
 
 # use '-z now' to guarantee no dynamic linker interactions with the monitor thread
@@ -173,25 +182,25 @@ msg.o: msg.c msg.h
 test_stripe : restripe.c mdadm.h
        $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
 
-mdassemble : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)  $(STATICSRC)
 
-mdassemble.static : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.static : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(CC) $(LDFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC)
 
-mdassemble.auto : $(ASSEMBLE_SRCS) mdadm.h $(ASSEMBLE_AUTO_SRCS)
+mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS)
        rm -f mdassemble.static
        $(MAKE) MDASSEMBLE_AUTO=1 mdassemble.static
        mv mdassemble.static mdassemble.auto
 
-mdassemble.uclibc : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.uclibc : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OJS)
        $(UCLIBC_GCC) $(ASSEMBLE_FLAGS) -DUCLIBC -DHAVE_STDINT_H -static -o mdassemble.uclibc $(ASSEMBLE_SRCS) $(STATICSRC)
 
 # This doesn't work
-mdassemble.klibc : $(ASSEMBLE_SRCS) mdadm.h
+mdassemble.klibc : $(ASSEMBLE_SRCS) $(INCL)
        rm -f $(OBJS)
        $(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)
 
@@ -213,8 +222,8 @@ mdadm.conf.man : mdadm.conf.5
 mdassemble.man : mdassemble.8
        nroff -man mdassemble.8 > mdassemble.man
 
-$(OBJS) : mdadm.h mdmon.h bitmap.h
-$(MON_OBJS) : mdadm.h mdmon.h bitmap.h
+$(OBJS) : $(INCL) mdmon.h
+$(MON_OBJS) : $(INCL) mdmon.h
 
 sha1.o : sha1.c sha1.h md5.h
        $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
index 6e9d4a03b45e76c7e3cfb71c816ade60a6a06b0d..81fa986d770c2f8ee201b61b90944c62390db624 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -56,7 +56,6 @@ int Manage_ro(char *devname, int fd, int readonly)
        mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
        if (mdi &&
            mdi->array.major_version == -1 &&
-           mdi->array.level > 0 &&
            is_subarray(mdi->text_version)) {
                char vers[64];
                strcpy(vers, "external:");
@@ -88,6 +87,8 @@ int Manage_ro(char *devname, int fd, int readonly)
                        if (*cp)
                                *cp = 0;
                        ping_monitor(vers+10);
+                       if (mdi->array.level <= 0)
+                               sysfs_set_str(mdi, NULL, "array_state", "active");
                }
                return 0;
        }
@@ -324,7 +325,8 @@ int Manage_resize(char *devname, int fd, long long size, int raid_disks)
 }
 
 int Manage_subdevs(char *devname, int fd,
-                  mddev_dev_t devlist, int verbose, int test)
+                  struct mddev_dev *devlist, int verbose, int test,
+                  char *update)
 {
        /* do something to each dev.
         * devmode can be
@@ -340,15 +342,16 @@ int Manage_subdevs(char *devname, int fd,
         * For 'f' and 'r', the device can also be a kernel-internal
         * name such as 'sdb'.
         */
-       mddev_dev_t add_devlist = NULL;
+       struct mddev_dev *add_devlist = NULL;
        mdu_array_info_t array;
        mdu_disk_info_t disc;
        unsigned long long array_size;
-       mddev_dev_t dv, next = NULL;
+       struct mddev_dev *dv, *next = NULL;
        struct stat stb;
        int j, jnext = 0;
        int tfd = -1;
        struct supertype *st, *tst;
+       char *subarray = NULL;
        int duuid[4];
        int ouuid[4];
        int lfd = -1;
@@ -369,7 +372,7 @@ int Manage_subdevs(char *devname, int fd,
        if (array_size <= 0)
                array_size = array.size * 2;
 
-       tst = super_by_fd(fd);
+       tst = super_by_fd(fd, &subarray);
        if (!tst) {
                fprintf(stderr, Name ": unsupport array - version %d.%d\n",
                        array.major_version, array.minor_version);
@@ -383,6 +386,7 @@ int Manage_subdevs(char *devname, int fd,
                char *dnprintable = dv->devname;
                char *add_dev = dv->devname;
                int err;
+               int re_add_failed = 0;
 
                next = dv->next;
                jnext = 0;
@@ -547,7 +551,7 @@ int Manage_subdevs(char *devname, int fd,
                        return 1;
                case 'a':
                        /* add the device */
-                       if (tst->subarray[0]) {
+                       if (subarray) {
                                fprintf(stderr, Name ": Cannot add disks to a"
                                        " \'member\' array, perform this"
                                        " operation on the parent container\n");
@@ -607,7 +611,7 @@ int Manage_subdevs(char *devname, int fd,
                                if (tst->sb)
                                        /* already loaded */;
                                else if (tst->ss->external) {
-                                       tst->ss->load_super(tst, fd, NULL);
+                                       tst->ss->load_container(tst, fd, NULL);
                                } else for (j = 0; j < tst->max_devs; j++) {
                                        char *dev;
                                        int dfd;
@@ -662,14 +666,20 @@ int Manage_subdevs(char *devname, int fd,
                                    get_linux_version() <= 2006018)
                                        ;
                                else if (st->sb) {
+                                       struct mdinfo mdi;
+                                       st->ss->getinfo_super(st, &mdi, NULL);
                                        st->ss->uuid_from_super(st, ouuid);
-                                       if (memcmp(duuid, ouuid, sizeof(ouuid))==0) {
-                                               /* looks close enough for now.  Kernel
-                                                * will worry about whether a bitmap
-                                                * based reconstruction is possible.
+                                       if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
+                                           !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
+                                           memcmp(duuid, ouuid, sizeof(ouuid))==0) {
+                                               /* look like it is worth a try.  Need to
+                                                * make sure kernel will accept it though.
                                                 */
-                                               struct mdinfo mdi;
-                                               st->ss->getinfo_super(st, &mdi);
+                                               disc.number = mdi.disk.number;
+                                               if (ioctl(fd, GET_DISK_INFO, &disc) != 0
+                                                   || disc.major != 0 || disc.minor != 0
+                                                   || !enough_fd(fd))
+                                                       goto skip_re_add;
                                                disc.major = major(stb.st_rdev);
                                                disc.minor = minor(stb.st_rdev);
                                                disc.number = mdi.disk.number;
@@ -682,10 +692,27 @@ int Manage_subdevs(char *devname, int fd,
                                                remove_partitions(tfd);
                                                close(tfd);
                                                tfd = -1;
+                                               if (update) {
+                                                       int rv = -1;
+                                                       tfd = dev_open(dv->devname, O_RDWR);
+
+                                                       if (tfd >= 0)
+                                                               rv = st->ss->update_super(
+                                                                       st, NULL, update,
+                                                                       devname, verbose, 0, NULL);
+                                                       if (rv == 0)
+                                                               rv = tst->ss->store_super(st, tfd);
+                                                       close(tfd);
+                                                       tfd = -1;
+                                                       if (rv != 0) {
+                                                               fprintf(stderr, Name ": failed to update"
+                                                                       " superblock during re-add\n");
+                                                               return 1;
+                                                       }
+                                               }
                                                /* don't even try if disk is marked as faulty */
                                                errno = 0;
-                                               if ((disc.state & 1) == 0 &&
-                                                   ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
+                                               if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
                                                        if (verbose >= 0)
                                                                fprintf(stderr, Name ": re-added %s\n", add_dev);
                                                        count++;
@@ -698,7 +725,8 @@ int Manage_subdevs(char *devname, int fd,
                                                                continue;
                                                        return 1;
                                                }
-                                               /* fall back on normal-add */
+                                       skip_re_add:
+                                               re_add_failed = 1;
                                        }
                                }
                                if (add_dev != dv->devname) {
@@ -720,6 +748,17 @@ int Manage_subdevs(char *devname, int fd,
                                                dv->devname, devname);
                                        return 1;
                                }
+                               if (re_add_failed) {
+                                       fprintf(stderr, Name ": %s reports being an active member for %s, but a --re-add fails.\n",
+                                               dv->devname, devname);
+                                       fprintf(stderr, Name ": not performing --add as that would convert %s in to a spare.\n",
+                                               dv->devname);
+                                       fprintf(stderr, Name ": To make this a spare, use \"mdadm --zero-superblock %s\" first.\n",     
+                                               dv->devname);
+                                       if (tfd >= 0)
+                                               close(tfd);
+                                       return 1;
+                               }
                        } else {
                                /* non-persistent. Must ensure that new drive
                                 * is at least array.size big.
@@ -837,7 +876,7 @@ int Manage_subdevs(char *devname, int fd,
                                }
                                sra->array.level = LEVEL_CONTAINER;
                                /* Need to set data_offset and component_size */
-                               tst->ss->getinfo_super(tst, &new_mdi);
+                               tst->ss->getinfo_super(tst, &new_mdi, NULL);
                                new_mdi.disk.major = disc.major;
                                new_mdi.disk.minor = disc.minor;
                                new_mdi.recovery_start = 0;
@@ -861,7 +900,7 @@ int Manage_subdevs(char *devname, int fd,
 
                case 'r':
                        /* hot remove */
-                       if (tst->subarray[0]) {
+                       if (subarray) {
                                fprintf(stderr, Name ": Cannot remove disks from a"
                                        " \'member\' array, perform this"
                                        " operation on the parent container\n");
@@ -1013,22 +1052,14 @@ int autodetect(void)
        return rv;
 }
 
-int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet)
+int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet)
 {
        struct supertype supertype, *st = &supertype;
        int fd, rv = 2;
 
        memset(st, 0, sizeof(*st));
-       if (snprintf(st->subarray, sizeof(st->subarray), "%s", subarray) >=
-           (signed)sizeof(st->subarray)) {
-               if (!quiet)
-                       fprintf(stderr,
-                               Name ": Input overflow for subarray '%s' > %zu bytes\n",
-                               subarray, sizeof(st->subarray) - 1);
-               return 2;
-       }
 
-       fd = open_subarray(dev, st, quiet);
+       fd = open_subarray(dev, subarray, st, quiet);
        if (fd < 0)
                return 2;
 
@@ -1043,7 +1074,7 @@ int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident
        if (mdmon_running(st->devnum))
                st->update_tail = &st->updates;
 
-       rv = st->ss->update_subarray(st, update, ident);
+       rv = st->ss->update_subarray(st, subarray, update, ident);
 
        if (rv) {
                if (!quiet)
index 0f0adb54502186095c0463cf0818d70fb95b19fc..af701939d12a1794201607886edbaa59dca05e00 100644 (file)
--- a/Monitor.c
+++ b/Monitor.c
 #include       <limits.h>
 #include       <syslog.h>
 
-static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom,
-                 char *cmd, int dosyslog);
-
 /* The largest number of disks current arrays can manage is 384
  * This really should be dynamically, but that will have to wait
  * At least it isn't MD_SB_DISKS.
  */
 #define MaxDisks 384
-int Monitor(mddev_dev_t devlist,
+struct state {         /* one record per monitored array; records are chained through ->next */
+       char *devname;  /* device path of the array, taken from the config entry or the device list */
+       int devnum;     /* to sync with mdstat info; INT_MAX until it has been resolved */
+       long utime;     /* presumably array.utime from the last check, 0 = not yet seen - confirm in check_array() */
+       int err;        /* presumably set once DeviceDisappeared was reported, to avoid repeats - confirm in check_array() */
+       char *spare_group;      /* strdup() of the config entry's spare_group, NULL when it has none */
+       int active, working, failed, spare, raid;       /* presumably disk counts from the last GET_ARRAY_INFO - confirm */
+       int expected_spares;    /* spare_disks from the config entry; -1 when a listed device has no entry */
+       int devstate[MaxDisks]; /* presumably per-slot disk state bits from the last check - confirm */
+       dev_t devid[MaxDisks];  /* presumably per-slot makedev(major, minor) of the disk last seen - confirm */
+       int percent;    /* initialised to -2; presumably tracks the mdstat progress percentage - confirm */
+       int parent_dev; /* For subarray, devnum of parent.
+                        * For others, NoMdDev
+                        */
+       struct supertype *metadata;
+       struct state *subarray;/* for a container it is a link to first subarray
+                               * for a subarray it is a link to next subarray
+                               * in the same container */
+       struct state *parent;  /* for a subarray it is a link to its container
+                               */
+       struct state *next;     /* next record in the list headed by Monitor()'s statelist */
+};
+
+struct alert_info {    /* alert delivery settings; filled in once by Monitor() and passed by pointer to alert() and the check helpers */
+       char *mailaddr; /* Monitor()'s mailaddr: its argument, or conf_get_mailaddr() when that is NULL */
+       char *mailfrom; /* Monitor()'s mailfrom value */
+       char *alert_cmd;        /* Monitor()'s alert_cmd value - presumably a program to run per event, confirm in alert() */
+       int dosyslog;   /* Monitor()'s dosyslog flag - presumably enables syslog reporting, confirm in alert() */
+};
+static int make_daemon(char *pidfile);
+static int check_one_sharer(int scan);
+static void alert(char *event, char *dev, char *disc, struct alert_info *info);
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+                      int test, struct alert_info *info,
+                      int increments);
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+                         int test, struct alert_info *info);
+static void try_spare_migration(struct state *statelist, struct alert_info *info);
+static void link_containers_with_subarrays(struct state *list);
+
+int Monitor(struct mddev_dev *devlist,
            char *mailaddr, char *alert_cmd,
            int period, int daemonise, int scan, int oneshot,
-           int dosyslog, int test, char* pidfile, int increments)
+           int dosyslog, int test, char *pidfile, int increments,
+           int share)
 {
        /*
         * Every few seconds, scan every md device looking for changes
@@ -85,22 +123,11 @@ int Monitor(mddev_dev_t devlist,
         * that appears in /proc/mdstat
         */
 
-       struct state {
-               char *devname;
-               int devnum;     /* to sync with mdstat info */
-               long utime;
-               int err;
-               char *spare_group;
-               int active, working, failed, spare, raid;
-               int expected_spares;
-               int devstate[MaxDisks];
-               unsigned devid[MaxDisks];
-               int percent;
-               struct state *next;
-       } *statelist = NULL;
+       struct state *statelist = NULL;
        int finished = 0;
        struct mdstat_ent *mdstat = NULL;
        char *mailfrom = NULL;
+       struct alert_info info;
 
        if (!mailaddr) {
                mailaddr = conf_get_mailaddr();
@@ -120,44 +147,28 @@ int Monitor(mddev_dev_t devlist,
                fprintf(stderr, Name ": No mail address or alert command - not monitoring.\n");
                return 1;
        }
+       info.alert_cmd = alert_cmd;
+       info.mailaddr = mailaddr;
+       info.mailfrom = mailfrom;
+       info.dosyslog = dosyslog;
 
-       if (daemonise) {
-               int pid = fork();
-               if (pid > 0) {
-                       if (!pidfile)
-                               printf("%d\n", pid);
-                       else {
-                               FILE *pid_file;
-                               pid_file=fopen(pidfile, "w");
-                               if (!pid_file)
-                                       perror("cannot create pid file");
-                               else {
-                                       fprintf(pid_file,"%d\n", pid);
-                                       fclose(pid_file);
-                               }
-                       }
-                       return 0;
-               }
-               if (pid < 0) {
-                       perror("daemonise");
+       if (daemonise)
+               if (make_daemon(pidfile))
+                       return 1;
+
+       if (share) 
+               if (check_one_sharer(scan))
                        return 1;
-               }
-               close(0);
-               open("/dev/null", O_RDWR);
-               dup2(0,1);
-               dup2(0,2);
-               setsid();
-       }
 
        if (devlist == NULL) {
-               mddev_ident_t mdlist = conf_get_ident(NULL);
+               struct mddev_ident *mdlist = conf_get_ident(NULL);
                for (; mdlist; mdlist=mdlist->next) {
                        struct state *st;
                        if (mdlist->devname == NULL)
                                continue;
                        if (strcasecmp(mdlist->devname, "<ignore>") == 0)
                                continue;
-                       st = malloc(sizeof *st);
+                       st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        if (mdlist->devname[0] == '/')
@@ -167,33 +178,26 @@ int Monitor(mddev_dev_t devlist,
                                strcpy(strcpy(st->devname, "/dev/md/"),
                                       mdlist->devname);
                        }
-                       st->utime = 0;
                        st->next = statelist;
-                       st->err = 0;
                        st->devnum = INT_MAX;
                        st->percent = -2;
                        st->expected_spares = mdlist->spare_disks;
                        if (mdlist->spare_group)
                                st->spare_group = strdup(mdlist->spare_group);
-                       else
-                               st->spare_group = NULL;
                        statelist = st;
                }
        } else {
-               mddev_dev_t dv;
+               struct mddev_dev *dv;
                for (dv=devlist ; dv; dv=dv->next) {
-                       mddev_ident_t mdlist = conf_get_ident(dv->devname);
-                       struct state *st = malloc(sizeof *st);
+                       struct mddev_ident *mdlist = conf_get_ident(dv->devname);
+                       struct state *st = calloc(1, sizeof *st);
                        if (st == NULL)
                                continue;
                        st->devname = strdup(dv->devname);
-                       st->utime = 0;
                        st->next = statelist;
-                       st->err = 0;
                        st->devnum = INT_MAX;
                        st->percent = -2;
                        st->expected_spares = -1;
-                       st->spare_group = NULL;
                        if (mdlist) {
                                st->expected_spares = mdlist->spare_disks;
                                if (mdlist->spare_group)
@@ -207,305 +211,27 @@ int Monitor(mddev_dev_t devlist,
        while (! finished) {
                int new_found = 0;
                struct state *st;
+               int anydegraded = 0;
 
                if (mdstat)
                        free_mdstat(mdstat);
                mdstat = mdstat_read(oneshot?0:1, 0);
 
-               for (st=statelist; st; st=st->next) {
-                       struct { int state, major, minor; } info[MaxDisks];
-                       mdu_array_info_t array;
-                       struct mdstat_ent *mse = NULL, *mse2;
-                       char *dev = st->devname;
-                       int fd;
-                       int i;
-
-                       if (test)
-                               alert("TestMessage", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       fd = open(dev, O_RDONLY);
-                       if (fd < 0) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, NULL,
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-/*                                     fprintf(stderr, Name ": cannot open %s: %s\n",
-                                               dev, strerror(errno));
-*/                             st->err=1;
-                               continue;
-                       }
-                       fcntl(fd, F_SETFD, FD_CLOEXEC);
-                       if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, NULL,
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-/*                                     fprintf(stderr, Name ": cannot get array info for %s: %s\n",
-                                               dev, strerror(errno));
-*/                             st->err=1;
-                               close(fd);
-                               continue;
-                       }
-                       /* It's much easier to list what array levels can't
-                        * have a device disappear than all of them that can
-                        */
-                       if (array.level == 0 || array.level == -1) {
-                               if (!st->err)
-                                       alert("DeviceDisappeared", dev, "Wrong-Level",
-                                             mailaddr, mailfrom, alert_cmd, dosyslog);
-                               st->err = 1;
-                               close(fd);
-                               continue;
-                       }
-                       if (st->devnum == INT_MAX) {
-                               struct stat stb;
-                               if (fstat(fd, &stb) == 0 &&
-                                   (S_IFMT&stb.st_mode)==S_IFBLK) {
-                                       if (major(stb.st_rdev) == MD_MAJOR)
-                                               st->devnum = minor(stb.st_rdev);
-                                       else
-                                               st->devnum = -1- (minor(stb.st_rdev)>>6);
-                               }
-                       }
-
-                       for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
-                               if (mse2->devnum == st->devnum) {
-                                       mse2->devnum = INT_MAX; /* flag it as "used" */
-                                       mse = mse2;
-                               }
-
-                       if (array.utime == 0)
-                               /* external arrays don't update utime */
-                               array.utime = time(0);
-
-                       if (st->utime == array.utime &&
-                           st->failed == array.failed_disks &&
-                           st->working == array.working_disks &&
-                           st->spare == array.spare_disks &&
-                           (mse == NULL  || (
-                                   mse->percent == st->percent
-                                   ))) {
-                               close(fd);
-                               st->err = 0;
-                               continue;
-                       }
-                       if (st->utime == 0 && /* new array */
-                           mse &&      /* is in /proc/mdstat */
-                           mse->pattern && strchr(mse->pattern, '_') /* degraded */
-                               )
-                               alert("DegradedArray", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-
-                       if (st->utime == 0 && /* new array */
-                           st->expected_spares > 0 &&
-                           array.spare_disks < st->expected_spares)
-                               alert("SparesMissing", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       if (mse &&
-                           st->percent == -1 &&
-                           mse->percent >= 0)
-                               alert("RebuildStarted", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       if (mse &&
-                           st->percent >= 0 &&
-                           mse->percent >= 0 &&
-                           (mse->percent / increments) > (st->percent / increments)) {
-                               char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
-
-                               if((mse->percent / increments) == 0)
-                                       snprintf(percentalert, sizeof(percentalert), "RebuildStarted");
-                               else
-                                       snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent);
-
-                               alert(percentalert,
-                                     dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                       }
-
-                       if (mse &&
-                           mse->percent == -1 &&
-                           st->percent >= 0) {
-                               /* Rebuild/sync/whatever just finished.
-                                * If there is a number in /mismatch_cnt,
-                                * we should report that.
-                                */
-                               struct mdinfo *sra =
-                                      sysfs_read(-1, st->devnum, GET_MISMATCH);
-                               if (sra && sra->mismatch_cnt > 0) {
-                                       char cnt[40];
-                                       sprintf(cnt, " mismatches found: %d", sra->mismatch_cnt);
-                                       alert("RebuildFinished", dev, cnt, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               } else
-                                       alert("RebuildFinished", dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               if (sra)
-                                       free(sra);
-                       }
-
-                       if (mse)
-                               st->percent = mse->percent;
-
-
-                       for (i=0; i<MaxDisks && i <= array.raid_disks + array.nr_disks;
-                            i++) {
-                               mdu_disk_info_t disc;
-                               disc.number = i;
-                               if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) {
-                                       info[i].state = disc.state;
-                                       info[i].major = disc.major;
-                                       info[i].minor = disc.minor;
-                               } else
-                                       info[i].major = info[i].minor = 0;
-                       }
-                       close(fd);
-
-                       for (i=0; i<MaxDisks; i++) {
-                               mdu_disk_info_t disc = {0,0,0,0,0};
-                               int newstate=0;
-                               int change;
-                               char *dv = NULL;
-                               disc.number = i;
-                               if (i > array.raid_disks + array.nr_disks) {
-                                       newstate = 0;
-                                       disc.major = disc.minor = 0;
-                               } else if (info[i].major || info[i].minor) {
-                                       newstate = info[i].state;
-                                       dv = map_dev(info[i].major, info[i].minor, 1);
-                                       disc.state = newstate;
-                                       disc.major = info[i].major;
-                                       disc.minor = info[i].minor;
-                               } else if (mse &&  mse->pattern && i < (int)strlen(mse->pattern)) {
-                                       switch(mse->pattern[i]) {
-                                       case 'U': newstate = 6 /* ACTIVE/SYNC */; break;
-                                       case '_': newstate = 0; break;
-                                       }
-                                       disc.major = disc.minor = 0;
-                               }
-                               if (dv == NULL && st->devid[i])
-                                       dv = map_dev(major(st->devid[i]),
-                                                    minor(st->devid[i]), 1);
-                               change = newstate ^ st->devstate[i];
-                               if (st->utime && change && !st->err) {
-                                       if (i < array.raid_disks &&
-                                           (((newstate&change)&(1<<MD_DISK_FAULTY)) ||
-                                            ((st->devstate[i]&change)&(1<<MD_DISK_ACTIVE)) ||
-                                            ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)))
-                                               )
-                                               alert("Fail", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       else if (i >= array.raid_disks &&
-                                                (disc.major || disc.minor) &&
-                                                st->devid[i] == makedev(disc.major, disc.minor) &&
-                                                ((newstate&change)&(1<<MD_DISK_FAULTY))
-                                               )
-                                               alert("FailSpare", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       else if (i < array.raid_disks &&
-                                                ! (newstate & (1<<MD_DISK_REMOVED)) &&
-                                                (((st->devstate[i]&change)&(1<<MD_DISK_FAULTY)) ||
-                                                 ((newstate&change)&(1<<MD_DISK_ACTIVE)) ||
-                                                 ((newstate&change)&(1<<MD_DISK_SYNC)))
-                                               )
-                                               alert("SpareActive", dev, dv, mailaddr, mailfrom, alert_cmd, dosyslog);
-                               }
-                               st->devstate[i] = newstate;
-                               st->devid[i] = makedev(disc.major, disc.minor);
-                       }
-                       st->active = array.active_disks;
-                       st->working = array.working_disks;
-                       st->spare = array.spare_disks;
-                       st->failed = array.failed_disks;
-                       st->utime = array.utime;
-                       st->raid = array.raid_disks;
-                       st->err = 0;
-               }
+               for (st=statelist; st; st=st->next)
+                       if (check_array(st, mdstat, test, &info, increments))
+                               anydegraded = 1;
+               
                /* now check if there are any new devices found in mdstat */
-               if (scan) {
-                       struct mdstat_ent *mse;
-                       for (mse=mdstat; mse; mse=mse->next)
-                               if (mse->devnum != INT_MAX &&
-                                   mse->level &&
-                                   (strcmp(mse->level, "raid0")!=0 &&
-                                    strcmp(mse->level, "linear")!=0)
-                                       ) {
-                                       struct state *st = malloc(sizeof *st);
-                                       mdu_array_info_t array;
-                                       int fd;
-                                       if (st == NULL)
-                                               continue;
-                                       st->devname = strdup(get_md_name(mse->devnum));
-                                       if ((fd = open(st->devname, O_RDONLY)) < 0 ||
-                                           ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
-                                               /* no such array */
-                                               if (fd >=0) close(fd);
-                                               put_md_name(st->devname);
-                                               free(st->devname);
-                                               free(st);
-                                               continue;
-                                       }
-                                       close(fd);
-                                       st->utime = 0;
-                                       st->next = statelist;
-                                       st->err = 1;
-                                       st->devnum = mse->devnum;
-                                       st->percent = -2;
-                                       st->spare_group = NULL;
-                                       st->expected_spares = -1;
-                                       statelist = st;
-                                       if (test)
-                                               alert("TestMessage", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       alert("NewArray", st->devname, NULL, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                       new_found = 1;
-                               }
-               }
+               if (scan)
+                       new_found = add_new_arrays(mdstat, &statelist, test,
+                                                  &info);
+
                /* If an array has active < raid && spare == 0 && spare_group != NULL
                 * Look for another array with spare > 0 and active == raid and same spare_group
                 *  if found, choose a device and hotremove/hotadd
                 */
-               for (st = statelist; st; st=st->next)
-                       if (st->active < st->raid &&
-                           st->spare == 0 &&
-                           st->spare_group != NULL) {
-                               struct state *st2;
-                               for (st2=statelist ; st2 ; st2=st2->next)
-                                       if (st2 != st &&
-                                           st2->spare > 0 &&
-                                           st2->active == st2->raid &&
-                                           st2->spare_group != NULL &&
-                                           strcmp(st->spare_group, st2->spare_group) == 0) {
-                                               /* try to remove and add */
-                                               int fd1 = open(st->devname, O_RDONLY);
-                                               int fd2 = open(st2->devname, O_RDONLY);
-                                               int dev = -1;
-                                               int d;
-                                               if (fd1 < 0 || fd2 < 0) {
-                                                       if (fd1>=0) close(fd1);
-                                                       if (fd2>=0) close(fd2);
-                                                       continue;
-                                               }
-                                               for (d=st2->raid; d < MaxDisks; d++) {
-                                                       if (st2->devid[d] > 0 &&
-                                                           st2->devstate[d] == 0) {
-                                                               dev = st2->devid[d];
-                                                               break;
-                                                       }
-                                               }
-                                               if (dev > 0) {
-                                                       struct mddev_dev_s devlist;
-                                                       char devname[20];
-                                                       devlist.next = NULL;
-                                                       devlist.used = 0;
-                                                       devlist.re_add = 0;
-                                                       devlist.writemostly = 0;
-                                                       devlist.devname = devname;
-                                                       sprintf(devname, "%d:%d", major(dev), minor(dev));
-
-                                                       devlist.disposition = 'r';
-                                                       if (Manage_subdevs(st2->devname, fd2, &devlist, -1, 0) == 0) {
-                                                               devlist.disposition = 'a';
-                                                               if (Manage_subdevs(st->devname, fd1, &devlist, -1, 0) == 0) {
-                                                                       alert("MoveSpare", st->devname, st2->devname, mailaddr, mailfrom, alert_cmd, dosyslog);
-                                                                       close(fd1);
-                                                                       close(fd2);
-                                                                       break;
-                                                               }
-                                                               else Manage_subdevs(st2->devname, fd2, &devlist, -1, 0);
-                                                       }
-                                               }
-                                               close(fd1);
-                                               close(fd2);
-                                       }
-                       }
+               if (share && anydegraded)
+                       try_spare_migration(statelist, &info);
                if (!new_found) {
                        if (oneshot)
                                break;
@@ -519,18 +245,93 @@ int Monitor(mddev_dev_t devlist,
        return 0;
 }
 
+/*
+ * Fork so that monitoring can continue in a background process.
+ *
+ * pidfile: when non-NULL, the parent writes the child's pid to this file;
+ *          when NULL, the parent prints the child's pid on stdout.
+ *
+ * Returns 1 if fork() failed (after reporting it with perror), and 0
+ * otherwise - in the parent as well as in the child.
+ * NOTE(review): parent and child are indistinguishable by return value;
+ * confirm the caller does not need to tell them apart.
+ */
+static int make_daemon(char *pidfile)
+{
+       FILE *pid_file;
+       int pid = fork();
+
+       if (pid < 0) {
+               perror("daemonise");
+               return 1;
+       }
+       if (pid == 0) {
+               /* Child: after close(0) the open() is expected to
+                * land on descriptor 0, which is then duplicated
+                * onto stdout and stderr before starting a new session.
+                */
+               close(0);
+               open("/dev/null", O_RDWR);
+               dup2(0,1);
+               dup2(0,2);
+               setsid();
+               return 0;
+       }
+
+       /* Parent: report the pid of the child */
+       if (!pidfile) {
+               printf("%d\n", pid);
+               return 0;
+       }
+       pid_file=fopen(pidfile, "w");
+       if (!pid_file) {
+               /* failing to record the pid is reported but not fatal */
+               perror("cannot create pid file");
+               return 0;
+       }
+       fprintf(pid_file,"%d\n", pid);
+       fclose(pid_file);
+       return 0;
+}
 
-static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd,
-                 int dosyslog)
+/*
+ * Check whether another autorebuild (spare sharing) monitor is running,
+ * using the pid recorded in /var/run/mdadm/autorebuild.pid and the
+ * existence of the matching /proc/<pid> entry.
+ *
+ * scan: non-zero when running in scan mode.  In scan mode a second
+ *       instance is refused and, if none is found, our own pid is
+ *       written to the pid file.  Otherwise a running instance only
+ *       produces a warning and the pid file is left alone.
+ *
+ * Returns 1 if the caller should abort (scan mode and another instance
+ * is running), 0 otherwise.  Failure to create the pid file is reported
+ * but is not treated as an error.
+ */
+static int check_one_sharer(int scan)
+{
+       int pid, rv;
+       FILE *fp;
+       char dir[20];
+       struct stat buf;
+       fp = fopen("/var/run/mdadm/autorebuild.pid", "r");
+       if (fp) {
+               /* Only look in /proc if a pid was really parsed: an
+                * empty or corrupt file would otherwise leave 'pid'
+                * uninitialised.
+                */
+               if (fscanf(fp, "%d", &pid) == 1) {
+                       snprintf(dir, sizeof(dir), "/proc/%d", pid);
+                       rv = stat(dir, &buf);
+               } else
+                       rv = -1;
+               if (rv != -1) {
+                       if (scan) {
+                               fprintf(stderr, Name ": Only one "
+                                       "autorebuild process allowed"
+                                       " in scan mode, aborting\n");
+                               fclose(fp);
+                               return 1;
+                       } else {
+                               fprintf(stderr, Name ": Warning: One"
+                                       " autorebuild process already"
+                                       " running.\n");
+                       }
+               }
+               fclose(fp);
+       }
+       if (scan) {
+               if (mkdir("/var/run/mdadm", S_IRWXU) < 0 &&
+                   errno != EEXIST) {
+                       fprintf(stderr, Name ": Can't create "
+                               "autorebuild.pid file\n");
+               } else {
+                       fp = fopen("/var/run/mdadm/autorebuild.pid", "w");
+                       if (!fp)
+                               fprintf(stderr, Name ": Cannot create"
+                                       " autorebuild.pid"
+                                       " file\n");
+                       else {
+                               pid = getpid();
+                               fprintf(fp, "%d\n", pid);
+                               fclose(fp);
+                       }
+               }
+       }
+       return 0;
+}
+
+static void alert(char *event, char *dev, char *disc, struct alert_info *info)
 {
        int priority;
 
-       if (!cmd && !mailaddr) {
+       if (!info->alert_cmd && !info->mailaddr) {
                time_t now = time(0);
 
                printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device");
        }
-       if (cmd) {
+       if (info->alert_cmd) {
                int pid = fork();
                switch(pid) {
                default:
@@ -539,11 +340,12 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                case -1:
                        break;
                case 0:
-                       execl(cmd, cmd, event, dev, disc, NULL);
+                       execl(info->alert_cmd, info->alert_cmd,
+                             event, dev, disc, NULL);
                        exit(2);
                }
        }
-       if (mailaddr &&
+       if (info->mailaddr &&
            (strncmp(event, "Fail", 4)==0 ||
             strncmp(event, "Test", 4)==0 ||
             strncmp(event, "Spares", 6)==0 ||
@@ -554,20 +356,27 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        char hname[256];
                        gethostname(hname, sizeof(hname));
                        signal(SIGPIPE, SIG_IGN);
-                       if (mailfrom)
-                               fprintf(mp, "From: %s\n", mailfrom);
+                       if (info->mailfrom)
+                               fprintf(mp, "From: %s\n", info->mailfrom);
                        else
                                fprintf(mp, "From: " Name " monitoring <root>\n");
-                       fprintf(mp, "To: %s\n", mailaddr);
-                       fprintf(mp, "Subject: %s event on %s:%s\n\n", event, dev, hname);
+                       fprintf(mp, "To: %s\n", info->mailaddr);
+                       fprintf(mp, "Subject: %s event on %s:%s\n\n",
+                               event, dev, hname);
 
-                       fprintf(mp, "This is an automatically generated mail message from " Name "\n");
+                       fprintf(mp,
+                               "This is an automatically generated"
+                               " mail message from " Name "\n");
                        fprintf(mp, "running on %s\n\n", hname);
 
-                       fprintf(mp, "A %s event had been detected on md device %s.\n\n", event, dev);
+                       fprintf(mp,
+                               "A %s event had been detected on"
+                               " md device %s.\n\n", event, dev);
 
                        if (disc && disc[0] != ' ')
-                               fprintf(mp, "It could be related to component device %s.\n\n", disc);
+                               fprintf(mp,
+                                       "It could be related to"
+                                       " component device %s.\n\n", disc);
                        if (disc && disc[0] == ' ')
                                fprintf(mp, "Extra information:%s.\n\n", disc);
 
@@ -577,18 +386,19 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        if (mdstat) {
                                char buf[8192];
                                int n;
-                               fprintf(mp, "\nP.S. The /proc/mdstat file currently contains the following:\n\n");
+                               fprintf(mp,
+                                       "\nP.S. The /proc/mdstat file"
+                                       " currently contains the following:\n\n");
                                while ( (n=fread(buf, 1, sizeof(buf), mdstat)) > 0)
-                                       n=fwrite(buf, 1, n, mp); /* yes, i don't care about the result */
+                                       n=fwrite(buf, 1, n, mp);
                                fclose(mdstat);
                        }
                        pclose(mp);
                }
-
        }
 
        /* log the event to syslog maybe */
-       if (dosyslog) {
+       if (info->dosyslog) {
                /* Log at a different severity depending on the event.
                 *
                 * These are the critical events:  */
@@ -606,10 +416,549 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mail
                        priority = LOG_INFO;
 
                if (disc)
-                       syslog(priority, "%s event detected on md device %s, component device %s", event, dev, disc);
+                       syslog(priority,
+                              "%s event detected on md device %s,"
+                              " component device %s", event, dev, disc);
+               else
+                       syslog(priority,
+                              "%s event detected on md device %s",
+                              event, dev);
+       }
+}
+
+/*
+ * Compare the recorded state of one monitored array with what the kernel
+ * reports now, raise an alert for every change, and update the record.
+ *
+ * st:         state record of the array; updated in place
+ * mdstat:     parsed /proc/mdstat list; the entry matching this array has
+ *             its devnum set to INT_MAX to flag it as "used"
+ * test:       if non-zero, a "TestMessage" alert is sent first
+ * ainfo:      alert settings passed through to alert()
+ * increments: granularity (in percent) of the "RebuildNN" alerts
+ *
+ * Returns 1 when the array has fewer active than raid devices and no
+ * spare, 0 in every other case (including arrays that cannot be examined).
+ */
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+                      int test, struct alert_info *ainfo,
+                      int increments)
+{
+       /* Update the state 'st' to reflect any changes shown in mdstat,
+        * or found by directly examining the array, and return
+        * '1' if the array is degraded, or '0' if it is optimal (or dead).
+        */
+       struct { int state, major, minor; } info[MaxDisks];
+       mdu_array_info_t array;
+       struct mdstat_ent *mse = NULL, *mse2;
+       char *dev = st->devname;
+       int fd;
+       int i;
+
+       if (test)
+               alert("TestMessage", dev, NULL, ainfo);
+       fd = open(dev, O_RDONLY);
+       if (fd < 0) {
+               /* st->err remembers that the problem was already reported */
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, NULL, ainfo);
+               st->err=1;
+               return 0;
+       }
+       fcntl(fd, F_SETFD, FD_CLOEXEC);
+       if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, NULL, ainfo);
+               st->err=1;
+               close(fd);
+               return 0;
+       }
+       /* It's much easier to list what array levels can't
+        * have a device disappear than all of them that can
+        */
+       if (array.level == 0 || array.level == -1) {
+               if (!st->err)
+                       alert("DeviceDisappeared", dev, "Wrong-Level", ainfo);
+               st->err = 1;
+               close(fd);
+               return 0;
+       }
+       if (st->devnum == INT_MAX) {
+               /* device number not known yet: derive it from the node */
+               struct stat stb;
+               if (fstat(fd, &stb) == 0 &&
+                   (S_IFMT&stb.st_mode)==S_IFBLK) {
+                       if (major(stb.st_rdev) == MD_MAJOR)
+                               st->devnum = minor(stb.st_rdev);
+                       else
+                               st->devnum = -1- (minor(stb.st_rdev)>>6);
+               }
+       }
+
+       for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
+               if (mse2->devnum == st->devnum) {
+                       mse2->devnum = INT_MAX; /* flag it as "used" */
+                       mse = mse2;
+               }
+
+       if (!mse) {
+               /* duplicated array in statelist
+                * or re-created after reading mdstat*/
+               st->err = 1;
+               close(fd);
+               return 0;
+       }
+       /* this array is in /proc/mdstat */
+       if (array.utime == 0)
+               /* external arrays don't update utime, so
+                * just make sure it is always different. */
+               array.utime = st->utime + 1;
+
+       if (st->utime == array.utime &&
+           st->failed == array.failed_disks &&
+           st->working == array.working_disks &&
+           st->spare == array.spare_disks &&
+           (mse == NULL  || (
+                   mse->percent == st->percent
+                   ))) {
+               /* nothing changed since the last check */
+               close(fd);
+               st->err = 0;
+               if ((st->active < st->raid) && st->spare == 0)
+                       return 1;
+               else
+                       return 0;
+       }
+       if (st->utime == 0 && /* new array */
+           mse->pattern && strchr(mse->pattern, '_') /* degraded */
+               )
+               alert("DegradedArray", dev, NULL, ainfo);
+
+       if (st->utime == 0 && /* new array */
+           st->expected_spares > 0 &&
+           array.spare_disks < st->expected_spares)
+               alert("SparesMissing", dev, NULL, ainfo);
+       if (st->percent == -1 &&
+           mse->percent >= 0)
+               alert("RebuildStarted", dev, NULL, ainfo);
+       if (st->percent >= 0 &&
+           mse->percent >= 0 &&
+           (mse->percent / increments) > (st->percent / increments)) {
+               char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
+
+               if((mse->percent / increments) == 0)
+                       snprintf(percentalert, sizeof(percentalert), "RebuildStarted");
                else
-                       syslog(priority, "%s event detected on md device %s", event, dev);
+                       snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent);
+
+               alert(percentalert, dev, NULL, ainfo);
+       }
+
+       if (mse->percent == -1 &&
+           st->percent >= 0) {
+               /* Rebuild/sync/whatever just finished.
+                * If there is a number in /mismatch_cnt,
+                * we should report that.
+                */
+               struct mdinfo *sra =
+                       sysfs_read(-1, st->devnum, GET_MISMATCH);
+               if (sra && sra->mismatch_cnt > 0) {
+                       char cnt[40];
+                       snprintf(cnt, sizeof(cnt),
+                                " mismatches found: %d", sra->mismatch_cnt);
+                       alert("RebuildFinished", dev, cnt, ainfo);
+               } else
+                       alert("RebuildFinished", dev, NULL, ainfo);
+               if (sra)
+                       free(sra);
+       }
+       st->percent = mse->percent;
+
+       /* Snapshot the per-device state while the array is still open */
+       for (i=0; i<MaxDisks && i <= array.raid_disks + array.nr_disks;
+            i++) {
+               mdu_disk_info_t disc;
+               disc.number = i;
+               if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) {
+                       info[i].state = disc.state;
+                       info[i].major = disc.major;
+                       info[i].minor = disc.minor;
+               } else
+                       info[i].major = info[i].minor = 0;
+       }
+
+       /* metadata_version may be unset when mdstat reported no
+        * "super" field for this array, so test it before use.
+        */
+       if (mse->metadata_version &&
+           strncmp(mse->metadata_version, "external:", 9) == 0 &&
+           is_subarray(mse->metadata_version+9))
+               st->parent_dev =
+                       devname2devnum(mse->metadata_version+10);
+       else
+               st->parent_dev = NoMdDev;
+       if (st->metadata == NULL &&
+           st->parent_dev == NoMdDev)
+               st->metadata = super_by_fd(fd, NULL);
+
+       close(fd);
+
+       for (i=0; i<MaxDisks; i++) {
+               mdu_disk_info_t disc = {0,0,0,0,0};
+               int newstate=0;
+               int change;
+               char *dv = NULL;
+               disc.number = i;
+               if (i > array.raid_disks + array.nr_disks) {
+                       newstate = 0;
+                       disc.major = disc.minor = 0;
+               } else if (info[i].major || info[i].minor) {
+                       newstate = info[i].state;
+                       dv = map_dev(info[i].major, info[i].minor, 1);
+                       disc.state = newstate;
+                       disc.major = info[i].major;
+                       disc.minor = info[i].minor;
+               } else if (mse &&  mse->pattern && i < (int)strlen(mse->pattern)) {
+                       /* no device info: fall back to the mdstat pattern */
+                       switch(mse->pattern[i]) {
+                       case 'U': newstate = 6 /* ACTIVE/SYNC */; break;
+                       case '_': newstate = 0; break;
+                       }
+                       disc.major = disc.minor = 0;
+               }
+               if (dv == NULL && st->devid[i])
+                       dv = map_dev(major(st->devid[i]),
+                                    minor(st->devid[i]), 1);
+               /* bits that differ between the old and new state */
+               change = newstate ^ st->devstate[i];
+               if (st->utime && change && !st->err) {
+                       if (i < array.raid_disks &&
+                           (((newstate&change)&(1<<MD_DISK_FAULTY)) ||
+                            ((st->devstate[i]&change)&(1<<MD_DISK_ACTIVE)) ||
+                            ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)))
+                               )
+                               alert("Fail", dev, dv, ainfo);
+                       else if (i >= array.raid_disks &&
+                                (disc.major || disc.minor) &&
+                                st->devid[i] == makedev(disc.major, disc.minor) &&
+                                ((newstate&change)&(1<<MD_DISK_FAULTY))
+                               )
+                               alert("FailSpare", dev, dv, ainfo);
+                       else if (i < array.raid_disks &&
+                                ! (newstate & (1<<MD_DISK_REMOVED)) &&
+                                (((st->devstate[i]&change)&(1<<MD_DISK_FAULTY)) ||
+                                 ((newstate&change)&(1<<MD_DISK_ACTIVE)) ||
+                                 ((newstate&change)&(1<<MD_DISK_SYNC)))
+                               )
+                               alert("SpareActive", dev, dv, ainfo);
+               }
+               st->devstate[i] = newstate;
+               st->devid[i] = makedev(disc.major, disc.minor);
+       }
+       st->active = array.active_disks;
+       st->working = array.working_disks;
+       st->spare = array.spare_disks;
+       st->failed = array.failed_disks;
+       st->utime = array.utime;
+       st->raid = array.raid_disks;
+       st->err = 0;
+       if ((st->active < st->raid) && st->spare == 0)
+               return 1;
+       return 0;
+}
+
+/*
+ * Start monitoring every array listed in mdstat that has not been flagged
+ * as "used" (devnum == INT_MAX) and is not raid0 or linear.  Entries
+ * without a level are accepted too, so that containers are picked up.
+ *
+ * mdstat:    parsed /proc/mdstat list
+ * statelist: head of the list of monitored arrays; new records are
+ *            pushed onto the front
+ * test:      if non-zero, a "TestMessage" alert is sent for each new array
+ * info:      alert settings passed through to alert()
+ *
+ * Returns 1 if at least one array was added, 0 otherwise.
+ */
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+                         int test, struct alert_info *info)
+{
+       struct mdstat_ent *mse;
+       int new_found = 0;
+
+       for (mse=mdstat; mse; mse=mse->next)
+               if (mse->devnum != INT_MAX &&
+                   (!mse->level  || /* retrieve containers */
+                    (strcmp(mse->level, "raid0") != 0 &&
+                     strcmp(mse->level, "linear") != 0))
+                       ) {
+                       struct state *st = calloc(1, sizeof *st);
+                       mdu_array_info_t array;
+                       char *name;
+                       int fd;
+                       if (st == NULL)
+                               continue;
+                       /* Skip the array if no device name is available
+                        * or it cannot be copied.
+                        */
+                       name = get_md_name(mse->devnum);
+                       if (name)
+                               st->devname = strdup(name);
+                       if (st->devname == NULL) {
+                               free(st);
+                               continue;
+                       }
+                       if ((fd = open(st->devname, O_RDONLY)) < 0 ||
+                           ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
+                               /* no such array */
+                               if (fd >=0) close(fd);
+                               put_md_name(st->devname);
+                               free(st->devname);
+                               /* st->metadata has not been set yet, so
+                                * there is no metadata to release here.
+                                */
+                               free(st);
+                               continue;
+                       }
+                       close(fd);
+                       st->next = *statelist;
+                       st->err = 1;
+                       st->devnum = mse->devnum;
+                       st->percent = -2;
+                       st->expected_spares = -1;
+                       /* metadata_version may be unset when mdstat
+                        * reported no "super" field for this array.
+                        */
+                       if (mse->metadata_version &&
+                           strncmp(mse->metadata_version, "external:", 9) == 0 &&
+                           is_subarray(mse->metadata_version+9))
+                               st->parent_dev =
+                                       devname2devnum(mse->metadata_version+10);
+                       else
+                               st->parent_dev = NoMdDev;
+                       *statelist = st;
+                       if (test)
+                               alert("TestMessage", st->devname, NULL, info);
+                       alert("NewArray", st->devname, NULL, info);
+                       new_found = 1;
+               }
+       return new_found;
+}
+
+/*
+ * Ask the metadata handler of array 'st' for the smallest spare it would
+ * accept.
+ *
+ * Returns 0 when no limit can be determined: no metadata handler is
+ * recorded, the handler has no min_acceptable_spare_size method, the
+ * array device cannot be opened, or its metadata cannot be loaded.
+ */
+unsigned long long min_spare_size_required(struct state *st)
+{
+       int fd;
+       int err;
+       unsigned long long rv = 0;
+
+       if (!st->metadata ||
+           !st->metadata->ss->min_acceptable_spare_size)
+               return rv;
+
+       fd = open(st->devname, O_RDONLY);
+       if (fd < 0)
+               return 0;
+       err = st->metadata->ss->load_super(st->metadata, fd, st->devname);
+       close(fd);
+       if (err)
+               /* Don't query metadata that was never loaded.
+                * NOTE(review): assumes load_super returns non-zero on
+                * failure, like load_container as it is used in
+                * container_choose_spare - confirm.
+                */
+               return 0;
+       rv = st->metadata->ss->min_acceptable_spare_size(st->metadata);
+       st->metadata->ss->free_super(st->metadata);
+
+       return rv;
+}
+
+/*
+ * Move the spare device 'devid' from array 'from' to array 'to'.
+ *
+ * The device is first removed from 'from' and then added to 'to'; if
+ * the add fails it is added back to 'from'.  On success a "MoveSpare"
+ * alert is raised and ping_manager() is called for both arrays.
+ *
+ * Returns 1 if the spare was moved, 0 otherwise.
+ */
+static int move_spare(struct state *from, struct state *to,
+                     dev_t devid,
+                     struct alert_info *info)
+{
+       struct mddev_dev devlist;
+       char devname[20];
+       int moved = 0;
+
+       /* try to remove and add */
+       int fd_to = open(to->devname, O_RDONLY);
+       int fd_from = open(from->devname, O_RDONLY);
+
+       if (fd_to < 0 || fd_from < 0)
+               goto out;
+
+       /* describe the device as "major:minor" */
+       devlist.next = NULL;
+       devlist.used = 0;
+       devlist.re_add = 0;
+       devlist.writemostly = 0;
+       devlist.devname = devname;
+       sprintf(devname, "%d:%d", major(devid), minor(devid));
+
+       devlist.disposition = 'r';
+       if (Manage_subdevs(from->devname, fd_from, &devlist, -1, 0, NULL) != 0)
+               goto out;
+
+       devlist.disposition = 'a';
+       if (Manage_subdevs(to->devname, fd_to, &devlist, -1, 0, NULL) != 0) {
+               /* could not add it to the target: give it back */
+               Manage_subdevs(from->devname, fd_from, &devlist, -1, 0, NULL);
+               goto out;
        }
+       alert("MoveSpare", to->devname, from->devname, info);
+       /* make sure we will see newly added spare before next
+        * time through loop
+        */
+       ping_manager(to->devname);
+       ping_manager(from->devname);
+       moved = 1;
+out:
+       if (fd_to >= 0)
+               close(fd_to);
+       if (fd_from >= 0)
+               close(fd_from);
+       return moved;
+}
+
+/*
+ * Decide whether array 'from' may give up a spare for array 'to'.
+ *
+ * Returns 1 if 'from' is an acceptable donor and 0 otherwise.  A donor
+ * is rejected when it is 'to' itself, has a parent, is in error, has a
+ * subarray with fewer active than raid devices, uses non-external
+ * metadata while itself short of active devices, has no spare, or when
+ * 'domlist' is NULL.
+ */
+static int check_donor(struct state *from, struct state *to,
+                      struct domainlist *domlist)
+{
+       struct state *member;
+
+       /* Not ourselves, not a member (cannot move from a member),
+        * and not an array that is in error.
+        */
+       if (from == to || from->parent || from->err)
+               return 0;
+
+       /* If source array has degraded subarrays, don't
+        * remove anything
+        */
+       member = from->subarray;
+       while (member) {
+               if (member->active < member->raid)
+                       return 0;
+               member = member->subarray;
+       }
+
+       if (from->metadata->ss->external == 0 &&
+           from->active < from->raid)
+               return 0;
+
+       return from->spare > 0 && domlist != NULL;
+}
+
+/*
+ * Pick a spare of array 'from' that may be moved to array 'to'.
+ *
+ * Slots from from->raid upwards are examined in order.  A slot is a
+ * candidate when it has a device id and a device state of 0.  When
+ * 'min_size' is non-zero, a candidate whose size could be determined
+ * and is below 'min_size' is skipped.  The first candidate whose
+ * policy (extended by from->spare_group as a domain, if set) passes
+ * domain_test() against 'domlist' is chosen.
+ *
+ * Returns the device id of the chosen spare, or 0 if there is none.
+ */
+static dev_t choose_spare(struct state *from, struct state *to,
+                       struct domainlist *domlist, unsigned long long min_size)
+{
+       dev_t chosen = 0;
+       int slot;
+
+       for (slot = from->raid; slot < MaxDisks; slot++) {
+               struct dev_policy *pol;
+               unsigned long long dev_size;
+
+               /* only unused devices with a known id qualify */
+               if (!(from->devid[slot] > 0) ||
+                   from->devstate[slot] != 0)
+                       continue;
+
+               /* too small for the target array */
+               if (min_size &&
+                   dev_size_from_id(from->devid[slot], &dev_size) &&
+                   dev_size < min_size)
+                       continue;
+
+               pol = devnum_policy(from->devid[slot]);
+               if (from->spare_group)
+                       pol_add(&pol, pol_domain,
+                               from->spare_group, NULL);
+               if (domain_test(domlist, pol, to->metadata->ss->name))
+                       chosen = from->devid[slot];
+               dev_policy_free(pol);
+
+               if (chosen)
+                       break;
+       }
+       return chosen;
+}
+
+static dev_t container_choose_spare(struct state *from, struct state *to,
+                                   struct domainlist *domlist,
+                                   unsigned long long min_size)
+{
+       /* This is similar to choose_spare, but we cannot trust devstate,
+        * so we need to read the metadata instead.
+        *
+        * Returns the device number of the first disk listed by the
+        * container metadata with state 0 that is not known to be
+        * smaller than min_size (when min_size is non-zero) and that
+        * passes domain_test() against domlist, or 0 if there is none.
+        * When from == to the policy test is skipped.
+        */
+
+       struct supertype *st = from->metadata;
+       int fd;
+       int err;
+       struct mdinfo *disks, *d;
+       dev_t dev = 0;
+
+       /* Test for the method before opening the container: returning
+        * after open() without close() would leak the file descriptor.
+        */
+       if (!st->ss->getinfo_super_disks)
+               return 0;
+
+       fd = open(from->devname, O_RDONLY);
+       if (fd < 0)
+               return 0;
+
+       err = st->ss->load_container(st, fd, NULL);
+       close(fd);
+       if (err)
+               return 0;
+
+       /* the disk list is all we need from the loaded metadata */
+       disks = st->ss->getinfo_super_disks(st);
+       st->ss->free_super(st);
+
+       if (!disks)
+               return 0;
+
+       /* 'dev' doubles as the loop terminator: it is reset to 0
+        * whenever the current candidate is rejected.
+        */
+       for (d = disks->devs ; d && !dev ; d = d->next) {
+               if (d->disk.state == 0) {
+                       struct dev_policy *pol;
+                       unsigned long long dev_size;
+                       dev = makedev(d->disk.major,d->disk.minor);
+
+                       if (min_size &&
+                           dev_size_from_id(dev,  &dev_size) &&
+                           dev_size < min_size) {
+                               dev = 0;
+                               continue;
+                       }
+                       if (from == to)
+                               /* Just checking if destination already has
+                                * a spare, no need to check policy, we are
+                                * done.
+                                */
+                               break;
+
+                       pol = devnum_policy(dev);
+                       if (from->spare_group)
+                               pol_add(&pol, pol_domain,
+                                       from->spare_group, NULL);
+                       if (!domain_test(domlist, pol, to->metadata->ss->name))
+                               dev = 0;
+
+                       dev_policy_free(pol);
+               }
+       }
+       sysfs_free(disks);
+       return dev;
+}
+
+
+/* For every array in 'statelist' that is degraded, has no spare and
+ * has no error flagged, look for another array that can donate a
+ * suitable spare and move it across.
+ */
+static void try_spare_migration(struct state *statelist, struct alert_info *info)
+{
+       struct state *needy;
+
+       link_containers_with_subarrays(statelist);
+       for (needy = statelist; needy; needy = needy->next) {
+               struct domainlist *domlist = NULL;
+               struct state *to = needy;
+               struct state *from;
+               unsigned long long min_size;
+               int slot;
+
+               if (needy->active >= needy->raid ||
+                   needy->spare != 0 || needy->err)
+                       continue;
+
+               if (to->parent)
+                       /* member of a container */
+                       to = to->parent;
+
+               min_size = min_spare_size_required(to);
+
+               /* We must make sure there is
+                * no suitable spare in container already.
+                * If there is we don't add more */
+               if (to->metadata->ss->external &&
+                   container_choose_spare(to, to, NULL, min_size) > 0)
+                       continue;
+
+               /* collect the domains of the destination's devices,
+                * plus its spare-group if it has one
+                */
+               for (slot = 0; slot < MaxDisks; slot++)
+                       if (to->devid[slot])
+                               domainlist_add_dev(&domlist,
+                                                  to->devid[slot],
+                                                  to->metadata->ss->name);
+               if (to->spare_group)
+                       domain_add(&domlist, to->spare_group);
+
+               /* stop at the first donor whose spare is moved successfully */
+               for (from = statelist; from; from = from->next) {
+                       dev_t devid;
+
+                       if (!check_donor(from, to, domlist))
+                               continue;
+                       devid = from->metadata->ss->external
+                               ? container_choose_spare(from, to, domlist,
+                                                        min_size)
+                               : choose_spare(from, to, domlist, min_size);
+                       if (devid > 0 && move_spare(from, to, devid, info))
+                               break;
+               }
+               domain_free(domlist);
+       }
+}
+
+/* search the statelist to connect external
+ * metadata subarrays with their containers
+ * We always completely rebuild the tree from scratch as
+ * that is safest considering the possibility of entries
+ * disappearing or changing.
+ */
+static void link_containers_with_subarrays(struct state *list)
+{
+       struct state *member;
+       struct state *candidate;
+
+       /* drop every link left over from the previous pass */
+       for (member = list; member; member = member->next) {
+               member->parent = NULL;
+               member->subarray = NULL;
+       }
+
+       for (member = list; member; member = member->next) {
+               if (member->parent_dev == NoMdDev)
+                       /* has no parent device, nothing to link */
+                       continue;
+
+               candidate = list;
+               while (candidate) {
+                       if (!candidate->err &&
+                           candidate->parent_dev == NoMdDev &&
+                           candidate->devnum == member->parent_dev) {
+                               /* push 'member' onto the front of the
+                                * container's subarray chain
+                                */
+                               member->parent = candidate;
+                               member->subarray = candidate->subarray;
+                               candidate->subarray = member;
+                               break;
+                       }
+                       candidate = candidate->next;
+               }
+       }
 }
 
 /* Not really Monitor but ... */
diff --git a/Query.c b/Query.c
index 8847be7ec0b6a1e0880865d672b38f9f0374d2e2..f9857d6d9e2ddadd86726db70046324bf4b0ee7d 100644 (file)
--- a/Query.c
+++ b/Query.c
@@ -90,7 +90,7 @@ int Query(char *dev)
        close(fd);
        if (superror == 0) {
                /* array might be active... */
-               st->ss->getinfo_super(st, &info);
+               st->ss->getinfo_super(st, &info, NULL);
                if (st->ss == &super0) {
                        mddev = get_md_name(info.array.md_minor);
                        disc.number = info.disk.number;
index b97c55e76c62ea9be2ac852033dc1ed1cf5f3cc9..5714849293a766b18d2ef26242f02dadbb7a9c18 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -24,7 +24,7 @@
 
 #include "mdadm.h"
 
-char Version[] = Name " - v3.1.4 - 31st August 2010\n";
+char Version[] = Name " - v3.2-devel - 23rd November 2010\n";
 
 /*
  * File: ReadMe.c
@@ -93,8 +93,8 @@ char short_bitmap_auto_options[]=
                    "-ABCDEFGIQhVXWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
 
 struct option long_options[] = {
-    {"manage",    0, 0, '@'},
-    {"misc",      0, 0, '#'},
+    {"manage",    0, 0, ManageOpt},
+    {"misc",      0, 0, MiscOpt},
     {"assemble",  0, 0, 'A'},
     {"build",     0, 0, 'B'},
     {"create",    0, 0, 'C'},
@@ -116,29 +116,30 @@ struct option long_options[] = {
 
     /* after those will normally come the name of the md device */
     {"help",      0, 0, 'h'},
-    {"help-options",0,0,'h'},
+    {"help-options",0,0, HelpOptions},
     {"version",          0, 0, 'V'},
     {"verbose",   0, 0, 'v'},
     {"quiet",    0, 0, 'q'},
 
     /* For create or build: */
-    {"chunk",    1, 0, 'c'},
-    {"rounding",  1, 0, 'c'}, /* for linear, chunk is really a rounding number */
+    {"chunk",    1, 0, ChunkSize},
+    {"rounding",  1, 0, ChunkSize}, /* for linear, chunk is really a
+                                    * rounding number */
     {"level",     1, 0, 'l'}, /* 0,1,4,5,6,linear */
-    {"parity",    1, 0, 'p'}, /* {left,right}-{a,}symmetric */
-    {"layout",    1, 0, 'p'},
+    {"parity",    1, 0, Layout}, /* {left,right}-{a,}symmetric */
+    {"layout",    1, 0, Layout},
     {"raid-disks",1, 0, 'n'},
     {"raid-devices",1, 0, 'n'},
     {"spare-disks",1,0, 'x'},
     {"spare-devices",1,0, 'x'},
     {"size",     1, 0, 'z'},
-    {"auto",     1, 0, 'a'}, /* also for --assemble */
+    {"auto",     1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
     {"metadata",  1, 0, 'e'}, /* superblock format */
-    {"bitmap",   1, 0, 'b'},
+    {"bitmap",   1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
     {"write-behind", 2, 0, WriteBehind},
-    {"write-mostly",0, 0, 'W'},
+    {"write-mostly",0, 0, WriteMostly},
     {"re-add",    0, 0,  ReAdd},
     {"homehost",  1, 0,  HomeHost},
 #if 0
@@ -148,49 +149,54 @@ struct option long_options[] = {
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
-    {"super-minor",1,0, 'm'},
+    {"super-minor",1,0, SuperMinor},
     {"name",     1, 0, 'N'},
-    {"config",    1, 0, 'c'},
+    {"config",    1, 0, ConfigFile},
     {"scan",      0, 0, 's'},
-    {"force",    0, 0, 'f'},
+    {"force",    0, 0, Force},
     {"update",   1, 0, 'U'},
 
     /* Management */
-    {"add",       0, 0, 'a'},
-    {"remove",    0, 0, 'r'},
-    {"fail",      0, 0, 'f'},
-    {"set-faulty",0, 0, 'f'},
+    {"add",       0, 0, Add},
+    {"remove",    0, 0, Remove},
+    {"fail",      0, 0, Fail},
+    {"set-faulty",0, 0, Fail},
     {"run",       0, 0, 'R'},
     {"stop",      0, 0, 'S'},
     {"readonly",  0, 0, 'o'},
     {"readwrite", 0, 0, 'w'},
     {"no-degraded",0,0,  NoDegraded },
-    {"wait",     0, 0, 'W'},
+    {"wait",     0, 0,  WaitOpt},
     {"wait-clean", 0, 0, Waitclean },
 
     /* For Detail/Examine */
-    {"brief",    0, 0, 'b'},
+    {"brief",    0, 0, Brief},
     {"export",   0, 0, 'Y'},
     {"sparc2.2",  0, 0, Sparc22},
     {"test",      0, 0, 't'},
 
     /* For Follow/monitor */
-    {"mail",      1, 0, 'm'},
-    {"program",   1, 0, 'p'},
-    {"alert",     1, 0, 'p'},
-    {"increment", 1, 0, 'r'},
+    {"mail",      1, 0, EMail},
+    {"program",   1, 0, ProgramOpt},
+    {"alert",     1, 0, ProgramOpt},
+    {"increment", 1, 0, Increment},
     {"delay",     1, 0, 'd'},
-    {"daemonise", 0, 0, 'f'},
-    {"daemonize", 0, 0, 'f'},
+    {"daemonise", 0, 0, Fork},
+    {"daemonize", 0, 0, Fork},
     {"oneshot",   0, 0, '1'},
     {"pid-file",  1, 0, 'i'},
     {"syslog",    0, 0, 'y'},
+    {"no-sharing", 0, 0, NoSharing},
+
     /* For Grow */
     {"backup-file", 1,0, BackupFile},
+    {"invalid-backup",0,0,InvalidBackup},
     {"array-size", 1, 0, 'Z'},
 
     /* For Incremental */
-    {"rebuild-map", 0, 0, 'r'},
+    {"rebuild-map", 0, 0, RebuildMapOpt},
+    {"path", 1, 0, IncrementalPath},
+
     {0, 0, 0, 0}
 };
 
index 541a85d83e7ff17b7a0c508cb4c2d86d380fcb43..1f78c689ba5d4407d4331967c9cdb90ae86d8519 100644 (file)
--- a/config.c
+++ b/config.c
@@ -75,7 +75,7 @@ char DefaultConfFile[] = CONFFILE;
 char DefaultAltConfFile[] = CONFFILE2;
 
 enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
-               Homehost, AutoMode, LTEnd };
+               Homehost, AutoMode, Policy, PartPolicy, LTEnd };
 char *keywords[] = {
        [Devices]  = "devices",
        [Array]    = "array",
@@ -85,6 +85,8 @@ char *keywords[] = {
        [CreateDev]= "create",
        [Homehost] = "homehost",
        [AutoMode] = "auto",
+       [Policy]   = "policy",
+       [PartPolicy]="part-policy",
        [LTEnd]    = NULL
 };
 
@@ -229,11 +231,11 @@ struct conf_dev {
     char *name;
 } *cdevlist = NULL;
 
-mddev_dev_t load_partitions(void)
+struct mddev_dev *load_partitions(void)
 {
        FILE *f = fopen("/proc/partitions", "r");
        char buf[1024];
-       mddev_dev_t rv = NULL;
+       struct mddev_dev *rv = NULL;
        if (f == NULL) {
                fprintf(stderr, Name ": cannot open /proc/partitions\n");
                return NULL;
@@ -241,7 +243,7 @@ mddev_dev_t load_partitions(void)
        while (fgets(buf, 1024, f)) {
                int major, minor;
                char *name, *mp;
-               mddev_dev_t d;
+               struct mddev_dev *d;
 
                buf[1023] = '\0';
                if (buf[0] != ' ')
@@ -258,19 +260,18 @@ mddev_dev_t load_partitions(void)
                d->devname = strdup(name);
                d->next = rv;
                d->used = 0;
-               d->content = NULL;
                rv = d;
        }
        fclose(f);
        return rv;
 }
 
-mddev_dev_t load_containers(void)
+struct mddev_dev *load_containers(void)
 {
        struct mdstat_ent *mdstat = mdstat_read(1, 0);
        struct mdstat_ent *ent;
-       mddev_dev_t d;
-       mddev_dev_t rv = NULL;
+       struct mddev_dev *d;
+       struct mddev_dev *rv = NULL;
 
        if (!mdstat)
                return NULL;
@@ -288,7 +289,6 @@ mddev_dev_t load_containers(void)
                        }
                        d->next = rv;
                        d->used = 0;
-                       d->content = NULL;
                        rv = d;
                }
        free_mdstat(mdstat);
@@ -440,8 +440,8 @@ void devline(char *line)
        }
 }
 
-mddev_ident_t mddevlist = NULL;
-mddev_ident_t *mddevlp = &mddevlist;
+struct mddev_ident *mddevlist = NULL;
+struct mddev_ident **mddevlp = &mddevlist;
 
 static int is_number(char *w)
 {
@@ -458,8 +458,8 @@ void arrayline(char *line)
 {
        char *w;
 
-       struct mddev_ident_s mis;
-       mddev_ident_t mi;
+       struct mddev_ident mis;
+       struct mddev_ident *mi;
 
        mis.uuid_set = 0;
        mis.super_minor = UnSet;
@@ -675,24 +675,113 @@ void homehostline(char *line)
        }
 }
 
-static char *auto_options = NULL;
+char auto_yes[] = "yes";
+char auto_no[] = "no";
+char auto_homehost[] = "homehost";
+
+static int auto_seen = 0;
 void autoline(char *line)
 {
        char *w;
+       char *seen;
+       int super_cnt;
+       char *dflt = auto_yes;
+       int homehost = 0;
+       int i;
 
-       if (auto_options) {
+       if (auto_seen) {
                fprintf(stderr, Name ": AUTO line may only be give once."
                        "  Subsequent lines ignored\n");
                return;
        }
+       /* Parse the 'auto' line creating policy statements for the 'auto' policy.
+        *
+        * The default is 'yes' but the 'auto' line might over-ride that.
+        * Words in the line are processed in order with the first
+        * match winning.
+        * word can be:
+        *   +version   - that version can be assembled
+        *   -version   - that version cannot be auto-assembled
+        *   yes or +all - any other version can be assembled
+        *   no or -all  - no other version can be assembled.
+        *   homehost   - any array associated by 'homehost' to this
+        *                host can be assembled.
+        *
+        * Thus:
+        *   +ddf -0.90 homehost -all
+        * will auto-assemble any ddf array, no 0.90 array, and
+        * any other array (imsm, 1.x) if and only if it is identified
+        * as belonging to this host.
+        *
+        * We translate that to policy by creating 'auto=yes' when we see
+        * a '+version' line, 'auto=no' if we see '-version' before 'homehost',
+        * or 'auto=homehost' if we see '-version' after 'homehost'.
+        * When we see yes, no, +all or -all we stop an any version that hasn't
+        * been seen gets an appropriate auto= entry.
+        */
 
-       auto_options = dl_strdup(line);
-       dl_init(auto_options);
+       for (super_cnt = 0; superlist[super_cnt]; super_cnt++)
+               ;
+       seen = calloc(super_cnt, 1);
 
-       for (w=dl_next(line); w != line ; w=dl_next(w)) {
-               char *w2 = dl_strdup(w);
-               dl_add(auto_options, w2);
+       for (w = dl_next(line); w != line ; w = dl_next(w)) {
+               char *val;
+
+               if (strcasecmp(w, "yes") == 0) {
+                       dflt = auto_yes;
+                       break;
+               }
+               if (strcasecmp(w, "no") == 0) {
+                       if (homehost)
+                               dflt = auto_homehost;
+                       else
+                               dflt = auto_no;
+                       break;
+               }
+               if (strcasecmp(w, "homehost") == 0) {
+                       homehost = 1;
+                       continue;
+               }
+               if (w[0] == '+')
+                       val = auto_yes;
+               else if (w[0] == '-') {
+                       if (homehost)
+                               val = auto_homehost;
+                       else
+                               val = auto_no;
+               } else
+                       continue;
+
+               if (strcasecmp(w+1, "all") == 0) {
+                       dflt = val;
+                       break;
+               }
+               for (i = 0; superlist[i]; i++) {
+                       const char *version = superlist[i]->name;
+                       if (strcasecmp(w+1, version) == 0)
+                               break;
+                       /* 1 matches 1.x, 0 matches 0.90 */
+                       if (version[1] == '.' &&
+                           strlen(w+1) == 1 &&
+                           w[1] == version[0])
+                               break;
+                       /* 1.anything matches 1.x */
+                       if (strcmp(version, "1.x") == 0 &&
+                           strncmp(w+1, "1.", 2) == 0)
+                               break;
+               }
+               if (superlist[i] == NULL)
+                       /* ignore this word */
+                       continue;
+               if (seen[i])
+                       /* already know about this metadata */
+                       continue;
+               policy_add(rule_policy, pol_auto, val, pol_metadata, superlist[i]->name, NULL);
+               seen[i] = 1;
        }
+       for (i = 0; i < super_cnt; i++)
+               if (!seen[i])
+                       policy_add(rule_policy, pol_auto, dflt, pol_metadata, superlist[i]->name, NULL);
 }
 
 int loaded = 0;
@@ -767,6 +856,12 @@ void load_conffile(void)
                case AutoMode:
                        autoline(line);
                        break;
+               case Policy:
+                       policyline(line, rule_policy);
+                       break;
+               case PartPolicy:
+                       policyline(line, rule_part);
+                       break;
                default:
                        fprintf(stderr, Name ": Unknown keyword %s\n", line);
                }
@@ -810,9 +905,9 @@ struct createinfo *conf_get_create_info(void)
        return &createinfo;
 }
 
-mddev_ident_t conf_get_ident(char *dev)
+struct mddev_ident *conf_get_ident(char *dev)
 {
-       mddev_ident_t rv;
+       struct mddev_ident *rv;
        load_conffile();
        rv = mddevlist;
        while (dev && rv && (rv->devname == NULL
@@ -821,23 +916,23 @@ mddev_ident_t conf_get_ident(char *dev)
        return rv;
 }
 
-static void append_dlist(mddev_dev_t *dlp, mddev_dev_t list)
+static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list)
 {
        while (*dlp)
                dlp = &(*dlp)->next;
        *dlp = list;
 }
 
-mddev_dev_t conf_get_devs()
+struct mddev_dev *conf_get_devs()
 {
        glob_t globbuf;
        struct conf_dev *cd;
        int flags = 0;
-       static mddev_dev_t dlist = NULL;
+       static struct mddev_dev *dlist = NULL;
        unsigned int i;
 
        while (dlist) {
-               mddev_dev_t t = dlist;
+               struct mddev_dev *t = dlist;
                dlist = dlist->next;
                free(t->devname);
                free(t);
@@ -863,11 +958,10 @@ mddev_dev_t conf_get_devs()
        }
        if (flags & GLOB_APPEND) {
                for (i=0; i<globbuf.gl_pathc; i++) {
-                       mddev_dev_t t = malloc(sizeof(*t));
+                       struct mddev_dev *t = malloc(sizeof(*t));
                        t->devname = strdup(globbuf.gl_pathv[i]);
                        t->next = dlist;
                        t->used = 0;
-                       t->content = NULL;
                        dlist = t;
 /*     printf("one dev is %s\n", t->devname);*/
                }
@@ -892,64 +986,30 @@ int conf_test_dev(char *devname)
        return 0;
 }
 
-int conf_test_metadata(const char *version, int is_homehost)
+int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost)
 {
-       /* Check if the given metadata version is allowed
-        * to be auto-assembled.
-        * The default is 'yes' but the 'auto' line might over-ride that.
-        * Words in auto_options are processed in order with the first
-        * match winning.
-        * word can be:
-        *   +version   - that version can be assembled
-        *   -version   - that version cannot be auto-assembled
-        *   yes or +all - any other version can be assembled
-        *   no or -all  - no other version can be assembled.
-        *   homehost   - any array associated by 'homehost' to this
-        *                host can be assembled.
-        *
-        * Thus:
-        *   +ddf -0.90 homehost -all
-        * will auto-assemble any ddf array, no 0.90 array, and
-        * any other array (imsm, 1.x) if and only if it is identified
-        * as belonging to this host.
+       /* Check the 'auto' policy entries in 'pol' that apply to
+        * metadata 'version' and return 1 if auto-assembly is
+        * allowed, 0 if not.
+        *
+        * If anyone said 'yes', that sticks.
+        * else if homehost applies, use that
+        * else if there is a 'no', say 'no'.
+        * else 'yes'.
         */
-       char *w;
+       struct dev_policy *p;
+       int no=0, found_homehost=0;
        load_conffile();
-       if (!auto_options)
-               return 1;
-       for (w = dl_next(auto_options); w != auto_options; w = dl_next(w)) {
-               int rv;
-               if (strcasecmp(w, "yes") == 0)
+
+       pol = pol_find(pol, pol_auto);
+       pol_for_each(p, pol, version) {
+               if (strcmp(p->value, "yes") == 0)
                        return 1;
-               if (strcasecmp(w, "no") == 0)
-                       return 0;
-               if (strcasecmp(w, "homehost") == 0) {
-                       if (is_homehost)
-                               return 1;
-                       else
-                               continue;
-               }
-               if (w[0] == '+')
-                       rv = 1;
-               else if (w[0] == '-')
-                       rv = 0;
-               else continue;
-
-               if (strcasecmp(w+1, "all") == 0)
-                       return rv;
-               if (strcasecmp(w+1, version) == 0)
-                       return rv;
-               /* allow  '0' to match version '0.90'
-                * and 1 or 1.whatever to match version '1.x'
-                */
-               if (version[1] == '.' &&
-                   strlen(w+1) == 1 &&
-                   w[1] == version[0])
-                       return rv;
-               if (version[1] == '.' && version[2] == 'x' &&
-                   strncmp(w+1, version, 2) == 0)
-                       return rv;
+               /* the AUTO line parser stores the value "homehost"
+                * (auto_homehost) for the homehost rule
+                */
+               if (strcmp(p->value, "homehost") == 0)
+                       found_homehost = 1;
+               if (strcmp(p->value, "no") == 0)
+                       no = 1;
        }
+       if (is_homehost && found_homehost)
+               return 1;
+       if (no)
+               return 0;
        return 1;
 }
 
@@ -959,7 +1019,6 @@ int match_oneof(char *devices, char *devname)
      * matches devname
      */
 
-
     while (devices && *devices) {
        char patn[1024];
        char *p = devices;
@@ -1016,7 +1075,7 @@ int conf_name_is_free(char *name)
         * It can be taken either by a match on devname, name, or
         * even super-minor.
         */
-       mddev_ident_t dev;
+       struct mddev_ident *dev;
 
        load_conffile();
        for (dev = mddevlist; dev; dev = dev->next) {
@@ -1033,9 +1092,9 @@ int conf_name_is_free(char *name)
        return 1;
 }
 
-struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st)
+struct mddev_ident *conf_match(struct mdinfo *info, struct supertype *st)
 {
-       struct mddev_ident_s *array_list, *match;
+       struct mddev_ident *array_list, *match;
        int verbose = 0;
        char *devname = NULL;
        array_list = conf_get_ident(NULL);
diff --git a/external-reshape-design.txt b/external-reshape-design.txt
new file mode 100644 (file)
index 0000000..28e3434
--- /dev/null
@@ -0,0 +1,168 @@
+External Reshape
+
+1 Problem statement
+
+External (third-party metadata) reshape differs from native-metadata
+reshape in three key ways:
+
+1.1 Format specific constraints
+
+In the native case reshape is limited by what is implemented in the
+generic reshape routine (Grow_reshape()) and what is supported by the
+kernel.  There are exceptional cases where Grow_reshape() may block
+operations when it knows that the kernel implementation is broken, but
+otherwise the kernel is relied upon to be the final arbiter of what
+reshape operations are supported.
+
+In the external case the kernel, and the generic checks in
+Grow_reshape(), become the super-set of what reshapes are possible.  The
+metadata format may not support, or have yet to implement a given
+reshape type.  The implication for Grow_reshape() is that it must query
+the metadata handler and effect changes in the metadata before the new
+geometry is posted to the kernel.  The ->reshape_super method allows
+Grow_reshape() to validate the requested operation and post the metadata
+update.
+
+1.2 Scope of reshape
+
+Native metadata reshape is always performed at the array scope (no
+metadata relationship with sibling arrays on the same disks).  External
+reshape, depending on the format, may not allow the number of member
+disks to be changed in a subarray unless the change is simultaneously
+applied to all subarrays in the container.  For example the imsm format
+requires all member disks to be a member of all subarrays, so a 4-disk
+raid5 in a container that also houses a 4-disk raid10 array could not be
+reshaped to 5 disks as the imsm format does not support a 5-disk raid10
+representation.  This requires the ->reshape_super method to check the
+contents of the array and ask the user to run the reshape at container
+scope (if both subarrays are agreeable to the change), or report an
+error in the case where one subarray cannot support the change.
+
+1.3 Monitoring / checkpointing
+
+Reshape, unlike rebuild/resync, requires strict checkpointing to survive
+interrupted reshape operations.  For example when expanding a raid5
+array the first few stripes of the array will be overwritten in a
+destructive manner.  When restarting the reshape process we need to know
+the exact location of the last successfully written stripe, and we need
+to restore the data in any partially overwritten stripe.  Native
+metadata stores this backup data in the unused portion of spares that
+are being promoted to array members, or in an external backup file
+(located on a non-involved block device).
+
+The kernel is in charge of recording checkpoints of reshape progress,
+but mdadm is delegated the task of managing the backup space which
+involves:
+1/ Identifying what data will be overwritten in the next unit of reshape
+   operation
+2/ Suspending access to that region so that a snapshot of the data can
+   be transferred to the backup space.
+3/ Allowing the kernel to reshape the saved region and setting the
+   boundary for the next backup.
+
+In the external reshape case we want to preserve this mdadm
+'reshape-manager' arrangement, but have a third actor, mdmon, to
+consider.  It is tempting to give the role of managing reshape to mdmon,
+but that is counter to its role as a monitor, and conflicts with the
+existing capabilities and role of mdadm to manage the progress of
+reshape.  For clarity the external reshape implementation maintains the
+role of mdmon as a (mostly) passive recorder of raid events, and mdadm
+treats it as it would the kernel in the native reshape case (modulo
+needing to send explicit metadata update messages and checking that
+mdmon took the expected action).
+
+External reshape can use the generic md backup file as a fallback, but in the
+optimal/firmware-compatible case the reshape-manager will use the metadata
+specific areas for managing reshape.  The implementation also needs to spawn a
+reshape-manager per subarray when the reshape is being carried out at the
+container level.  For these two reasons the ->manage_reshape() method is
+introduced.  This method in addition to base tasks mentioned above:
+1/ Spawns a manager per-subarray, when necessary
+2/ Uses either generic routines in Grow.c for md-style backup file
+   support, or uses the metadata-format specific location for storing
+   recovery data.
+This aims to avoid a "midlayer mistake"[1] and lets the metadata handler
+optionally take advantage of generic infrastructure in Grow.c
+
+2 Details for specific reshape requests
+
+There are quite a few moving pieces spread out across md, mdadm, and mdmon for
+the support of external reshape, and there are several different types of
+reshape that need to be comprehended by the implementation.  A rundown of
+these details follows.
+
+2.0 General provisions:
+
+Obtain an exclusive open on the container to make sure we are not
+running concurrently with a Create() event.
+
+2.1 Freezing sync_action
+
+2.2 Reshape size
+
+   1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+      initializes st->update_tail
+   2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change
+      is allowed (being performed at subarray scope / enough room) and prepares a
+      metadata update
+   3/ mdadm::Grow_reshape(): flushes the metadata update (via
+      flush_metadata_update(), or ->sync_metadata())
+   4/ mdadm::Grow_reshape(): post the new size to the kernel
+
+
+2.3 Reshape level (simple-takeover)
+
+"simple-takeover" implies the level change can be satisfied without touching
+sync_action
+
+    1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+       initializes st->update_tail
+    2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change
+       is allowed (being performed at subarray scope) and prepares a
+       metadata update
+       2a/ raid10 --> raid0: degrade all mirror legs prior to calling
+           ->reshape_super
+    3/ mdadm::Grow_reshape(): flushes the metadata update (via
+       flush_metadata_update(), or ->sync_metadata())
+    4/ mdadm::Grow_reshape(): post the new level to the kernel
+
+2.4 Reshape chunk, layout
+
+2.5 Reshape raid disks (grow)
+
+    1/ mdadm::Grow_reshape(): unconditionally initializes st->update_tail
+       because only redundant raid levels can modify the number of raid disks
+    2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the level
+       change is allowed (being performed at proper scope / permissible
+       geometry / proper spares available in the container) and prepares a
+       metadata update.
+    3/ mdadm::Grow_reshape(): Converts each subarray in the container to the
+       raid level that can perform the reshape and starts mdmon.
+    4/ mdadm::Grow_reshape(): Pushes the update to mdmon...
+       4a/ mdmon::process_update(): marks the array as reshaping
+       4b/ mdmon::manage_member(): adds the spares (without assigning a slot)
+    5/ mdadm::Grow_reshape(): Notes that mdmon has assigned spares and invokes
+       ->manage_reshape()
+    6/ mdadm::<format>->manage_reshape(): (for each subarray) sets sync_max to
+       zero, starts the reshape, and pings mdmon
+       6a/ mdmon::read_and_act(): notices that reshape has started and notifies
+           the metadata handler to record the slots chosen by the kernel
+    7/ mdadm::<format>->manage_reshape(): saves data that will be overwritten by
+       the kernel to either the backup file or the metadata specific location,
+       advances sync_max, waits for reshape, pings mdmon, and repeats.
+       7a/ mdmon::read_and_act(): records checkpoints
+    8/ mdadm::<format>->manage_reshape(): Once reshape completes changes the raid
+       level back to the nominal raid level (if necessary)
+
+       FIXME: native metadata does not have the capability to record the original
+       raid level in reshape-restart case because the kernel always records current
+       raid level to the metadata, whereas external metadata can masquerade at an
+       alternate level based on the reshape state.
+
+2.6 Reshape raid disks (shrink)
+
+3 TODO
+
+...
+
+[1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/
index bab0397921a7d30dc24e012cfa9decc8d5e877f5..ebd9b73ff2024aa6e50d7737f8800bca376381f9 100644 (file)
@@ -120,6 +120,8 @@ static void close_aa(struct active_array *aa)
        close(aa->action_fd);
        close(aa->info.state_fd);
        close(aa->resync_start_fd);
+       close(aa->metadata_fd);
+       close(aa->sync_completed_fd);
 }
 
 static void free_aa(struct active_array *aa)
@@ -276,7 +278,7 @@ static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
         */
        st2 = dup_super(st);
        if (st2->ss->load_super(st2, dfd, NULL) == 0) {
-               st2->ss->getinfo_super(st, &info);
+               st2->ss->getinfo_super(st, &info, NULL);
                if (st->ss->compare_super(st, st2) == 0 &&
                    info.disk.raid_disk >= 0) {
                        /* Looks like a good member of array.
@@ -394,12 +396,20 @@ static void manage_member(struct mdstat_ent *mdstat,
         * trying to find and assign a spare.
         * We do that whenever the monitor tells us too.
         */
+       char buf[64];
+       int frozen;
+
        // FIXME
        a->info.array.raid_disks = mdstat->raid_disks;
-       a->info.array.chunk_size = mdstat->chunk_size;
        // MORE
 
-       if (a->check_degraded) {
+       /* honor 'frozen' */
+       if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0)
+               frozen = buf[9] == '-';
+       else
+               frozen = 1; /* can't read metadata_version assume the worst */
+
+       if (a->check_degraded && !frozen) {
                struct metadata_update *updates = NULL;
                struct mdinfo *newdev = NULL;
                struct active_array *newa;
@@ -511,7 +521,7 @@ static void manage_new(struct mdstat_ent *mdstat,
 
        new->container = container;
 
-       inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+       inst = to_subarray(mdstat, container->devname);
 
        new->info.array = mdi->array;
        new->info.component_size = mdi->component_size;
@@ -656,7 +666,13 @@ void read_sock(struct supertype *container)
                /* read and validate the message */
                if (receive_message(fd, &msg, tmo) == 0) {
                        handle_message(container, &msg);
-                       if (ack(fd, tmo) < 0)
+                       if (msg.len == 0) {
+                               /* ping reply with version */
+                               msg.buf = Version;
+                               msg.len = strlen(Version) + 1;
+                               if (send_message(fd, &msg, tmo) < 0)
+                                       terminate = 1;
+                       } else if (ack(fd, tmo) < 0)
                                terminate = 1;
                } else
                        terminate = 1;
index f334822bf697783ba01845b7aee814c7fff7989b..1cc61d1b82aa702437768fbdfaa2dbb0b6028452 100644 (file)
--- a/mapfile.c
+++ b/mapfile.c
@@ -334,31 +334,20 @@ struct map_ent *map_by_name(struct map_ent **map, char *name)
  * version super_by_fd does this automatically, this routine is meant as
  * a supplement for guess_super()
  */
-static void set_member_info(struct supertype *st, struct mdstat_ent *ent)
+static char *get_member_info(struct mdstat_ent *ent)
 {
 
-       st->subarray[0] = '\0';
-
        if (ent->metadata_version == NULL ||
            strncmp(ent->metadata_version, "external:", 9) != 0)
-               return;
+               return NULL;
 
        if (is_subarray(&ent->metadata_version[9])) {
-               char version[strlen(ent->metadata_version)+1];
                char *subarray;
-               char *name = &version[10];
-
-               strcpy(version, ent->metadata_version);
-               subarray = strrchr(version, '/');
-               name = &version[10];
 
-               if (!subarray)
-                       return;
-               *subarray++ = '\0';
-
-               st->container_dev = devname2devnum(name);
-               strncpy(st->subarray, subarray, sizeof(st->subarray));
+               subarray = strrchr(ent->metadata_version, '/');
+               return subarray + 1;
        }
+       return NULL;
 }
 
 void RebuildMap(void)
@@ -391,8 +380,9 @@ void RebuildMap(void)
                        int dfd;
                        int ok;
                        struct supertype *st;
+                       char *subarray;
                        char *path;
-                       struct mdinfo info;
+                       struct mdinfo *info;
 
                        sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor);
                        dfd = dev_open(dn, O_RDONLY);
@@ -402,13 +392,14 @@ void RebuildMap(void)
                        if ( st == NULL)
                                ok = -1;
                        else {
-                               set_member_info(st, md);
+                               subarray = get_member_info(md);
                                ok = st->ss->load_super(st, dfd, NULL);
                        }
                        close(dfd);
                        if (ok != 0)
                                continue;
-                       st->ss->getinfo_super(st, &info);
+                       info = st->ss->container_content(st, subarray);
+
                        if (md->devnum >= 0)
                                path = map_dev(MD_MAJOR, md->devnum, 0);
                        else
@@ -428,7 +419,7 @@ void RebuildMap(void)
                                 *   find a unique name based on metadata name.
                                 *   
                                 */
-                               struct mddev_ident_s *match = conf_match(&info, st);
+                               struct mddev_ident *match = conf_match(info, st);
                                struct stat stb;
                                if (match && match->devname && match->devname[0] == '/') {
                                        path = match->devname;
@@ -446,13 +437,13 @@ void RebuildMap(void)
                                             st->ss->match_home(st, homehost) != 1) &&
                                            st->ss->match_home(st, "any") != 1 &&
                                            (require_homehost
-                                            || ! conf_name_is_free(info.name)))
+                                            || ! conf_name_is_free(info->name)))
                                                /* require a numeric suffix */
                                                unum = 0;
                                        else
                                                /* allow name to be used as-is if no conflict */
                                                unum = -1;
-                                       name = info.name;
+                                       name = info->name;
                                        if (!*name) {
                                                name = st->ss->name;
                                                if (!isdigit(name[strlen(name)-1]) &&
@@ -485,9 +476,10 @@ void RebuildMap(void)
                                }
                        }
                        map_add(&map, md->devnum,
-                               info.text_version,
-                               info.uuid, path);
+                               info->text_version,
+                               info->uuid, path);
                        st->ss->free_super(st);
+                       free(info);
                        break;
                }
                sysfs_free(sra);
index 00c32dc22286a32119fa6b0e759bea933a5cb870..ac87b47b6dac52fccbadac44afd500954107c72c 100644 (file)
@@ -414,6 +414,9 @@ If this is not specified
 size, though if there is a variance among the drives of greater than 1%, a warning is
 issued.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 This value can be set with
 .B \-\-grow
 for RAID level 1/4/5/6.  If the array was created with a size smaller
@@ -432,7 +435,7 @@ metadata such as DDF and IMSM.
 .BR \-Z ", " \-\-array-size=
 This is only meaningful with
 .B \-\-grow
-and its effect is not persistent: when the array is stopped an
+and its effect is not persistent: when the array is stopped and
 restarted the default array size will be restored.
 
 Setting the array-size causes the array to appear smaller to programs
@@ -443,6 +446,13 @@ but setting the size with
 is, it is required that the array size is reduced as appropriate
 before the number of devices in the array is reduced.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+A value of
+.B max
+restores the apparent size of the array to be whatever the real
+amount of available space is.
+
 .TP
 .BR \-c ", " \-\-chunk=
 Specify chunk size of kibibytes.  The default when creating an
@@ -450,6 +460,9 @@ array is 512KB.  To ensure compatibility with earlier versions, the
 default when Building and array with no persistent metadata is 64KB.
 This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 .TP
 .BR \-\-rounding=
 Specify rounding factor for a Linear array.  The size of each
@@ -622,6 +635,9 @@ When using an
 bitmap, the chunksize defaults to 64Meg, or larger if necessary to
 fit the bitmap into the available space.
 
+A suffix of 'M' or 'G' can be given to indicate Megabytes or
+Gigabytes respectively.
+
 .TP
 .BR \-W ", " \-\-write\-mostly
 subsequent devices listed in a
@@ -670,21 +686,6 @@ or layout.  See the GROW MODE section below on RAID\-DEVICES CHANGES.
 The file must be stored on a separate device, not on the RAID array
 being reshaped.
 
-.TP
-.BR \-\-array-size= ", " \-Z
-Set the size of the array which is seen by users of the device such as
-filesystems.  This can be less that the real size, but never greater.
-The size set this way does not persist across restarts of the array.
-
-This is most useful when reducing the number of devices in a RAID5 or
-RAID6.  Such arrays require the array-size to be reduced before a
-reshape can be performed that reduces the real size.
-
-A value of
-.B max
-restores the apparent size of the array to be whatever the real
-amount of available space is.
-
 .TP
 .BR \-N ", " \-\-name=
 Set a
@@ -887,15 +888,25 @@ bitmap, there is no need to specify this when assembling the array.
 .BR \-\-backup\-file=
 If
 .B \-\-backup\-file
-was used when requesting a grow, shrink, RAID level change or other
-reshape, and the system crashed during the critical section, then the
-same
+was used while reshaping an array (e.g. changing number of devices or
+chunk size) and the system crashed during the critical section, then the same
 .B \-\-backup\-file
 must be presented to
 .B \-\-assemble
 to allow possibly corrupted data to be restored, and the reshape
 to be completed.
 
+.TP
+.BR \-\-invalid\-backup
+If the file needed for the above option is not available for any
+reason an empty file can be given together with this option to
+indicate that the backup file is invalid.  In this case the data that
+was being rearranged at the time of the crash could be irrecoverably
+lost, but the rest of the array may still be recoverable.  This option
+should only be used as a last resort if there is no way to recover the
+backup file.
+
+
 .TP
 .BR \-U ", " \-\-update=
 Update the superblock on each device while assembling the array.  The
@@ -908,6 +919,7 @@ argument given to this flag can be one of
 .BR resync ,
 .BR byteorder ,
 .BR devicesize ,
+.BR no\-bitmap ,
 or
 .BR super\-minor .
 
@@ -990,7 +1002,7 @@ counts of total, working, active, failed, and spare devices.
 
 The
 .B devicesize
-will rarely be of use.  It applies to version 1.1 and 1.2 metadata
+option will rarely be of use.  It applies to version 1.1 and 1.2 metadata
 only (where the metadata is at the start of the device) and is only
 useful when the component device has changed size (typically become
 larger).  The version 1 metadata records the amount of the device that
@@ -1004,6 +1016,12 @@ This will cause
 to determine the maximum usable amount of space on each device and
 update the relevant field in the metadata.
 
+The
+.B no\-bitmap
+option can be used when an array has an internal bitmap which is
+corrupt in some way so that assembling the array normally fails.  It
+will cause any internal bitmap to be ignored.
+
 .ig
 .TP
 .B \-\-auto\-update\-homehost
@@ -1035,7 +1053,7 @@ will report failure if these specifiers didn't find any match.
 .BR \-a ", " \-\-add
 hot-add listed devices.
 If a device appears to have recently been part of the array
-(possibly it failed or was removed) the device is re-added as describe
+(possibly it failed or was removed) the device is re\-added as described
 in the next point.
 If that fails or the device was never part of the array, the device is
 added as a hot-spare.
@@ -1061,6 +1079,13 @@ When used on an array that has no metadata (i.e. it was built with
 it will be assumed that bitmap-based recovery is enough to make the
 device fully consistent with the array.
 
+The
+.B \-\-re\-add
+option can be accompanied by
+.BR \-\-update=devicesize .
+See the description of this option when used in Assemble mode for an
+explanation of its use.
+
 If the device name given is
 .B missing
 then mdadm will try to find any device that looks like it should be
diff --git a/mdadm.c b/mdadm.c
index 08e8ea4e0052950ce4f695a633318a9eb4721c10..2ffe94f7c6c7c30bdec788f1d5ec6900124bfb22 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -46,25 +46,25 @@ int main(int argc, char *argv[])
        int layout = UnSet;
        char *layout_str = NULL;
        int raiddisks = 0;
-       int max_disks = MD_SB_DISKS; /* just a default */
        int sparedisks = 0;
-       struct mddev_ident_s ident;
+       struct mddev_ident ident;
        char *configfile = NULL;
        char *cp;
        char *update = NULL;
        int scan = 0;
-       char devmode = 0;
+       int devmode = 0;
        int runstop = 0;
        int readonly = 0;
        int write_behind = 0;
        int bitmap_fd = -1;
        char *bitmap_file = NULL;
        char *backup_file = NULL;
+       int invalid_backup = 0;
        int bitmap_chunk = UnSet;
        int SparcAdjust = 0;
-       mddev_dev_t devlist = NULL;
-       mddev_dev_t *devlistend = & devlist;
-       mddev_dev_t dv;
+       struct mddev_dev *devlist = NULL;
+       struct mddev_dev **devlistend = & devlist;
+       struct mddev_dev *dv;
        int devs_found = 0;
        int verbose = 0;
        int quiet = 0;
@@ -96,6 +96,7 @@ int main(int argc, char *argv[])
        int daemonise = 0;
        char *pidfile = NULL;
        int oneshot = 0;
+       int spare_sharing = 1;
        struct supertype *ss = NULL;
        int writemostly = 0;
        int re_add = 0;
@@ -104,6 +105,7 @@ int main(int argc, char *argv[])
        int rebuild_map = 0;
        int auto_update_home = 0;
        char *subarray = NULL;
+       char *remove_path = NULL;
 
        int print_help = 0;
        FILE *outf;
@@ -133,12 +135,11 @@ int main(int argc, char *argv[])
                int newmode = mode;
                /* firstly, some mode-independent options */
                switch(opt) {
+               case HelpOptions:
+                       print_help = 2;
+                       continue;
                case 'h':
-                       if (option_index > 0 &&
-                           strcmp(long_options[option_index].name, "help-options")==0)
-                               print_help = 2;
-                       else
-                               print_help = 1;
+                       print_help = 1;
                        continue;
 
                case 'V':
@@ -152,9 +153,11 @@ int main(int argc, char *argv[])
                        continue;
 
                case 'b':
-                       if (mode == ASSEMBLE || mode == BUILD || mode == CREATE || mode == GROW ||
-                           mode == INCREMENTAL || mode == MANAGE)
+                       if (mode == ASSEMBLE || mode == BUILD || mode == CREATE
+                           || mode == GROW || mode == INCREMENTAL
+                           || mode == MANAGE)
                                break; /* b means bitmap */
+               case Brief:
                        brief = 1;
                        continue;
 
@@ -179,13 +182,16 @@ int main(int argc, char *argv[])
                 */
 
                switch(opt) {
-               case '@': /* just incase they say --manage */
+               case ManageOpt:
                        newmode = MANAGE;
                        shortopt = short_bitmap_options;
                        break;
                case 'a':
+               case Add:
                case 'r':
+               case Remove:
                case 'f':
+               case Fail:
                case ReAdd: /* re-add */
                        if (!mode) {
                                newmode = MANAGE;
@@ -205,7 +211,7 @@ int main(int argc, char *argv[])
                case AutoDetect:
                        newmode = AUTODETECT; break;
 
-               case '#':
+               case MiscOpt:
                case 'D':
                case 'E':
                case 'X':
@@ -215,18 +221,21 @@ int main(int argc, char *argv[])
                case 'o':
                case 'w':
                case 'W':
+               case WaitOpt:
                case Waitclean:
                case DetailPlatform:
                case KillSubarray:
                case UpdateSubarray:
                        if (opt == KillSubarray || opt == UpdateSubarray) {
                                if (subarray) {
-                                       fprintf(stderr, Name ": subarray can only be specified once\n");
+                                       fprintf(stderr, Name ": subarray can only"
+                                               " be specified once\n");
                                        exit(2);
                                }
                                subarray = optarg;
                        }
                case 'K': if (!mode) newmode = MISC; break;
+               case NoSharing: newmode = MONITOR; break;
                }
                if (mode && newmode == mode) {
                        /* everybody happy ! */
@@ -245,7 +254,7 @@ int main(int argc, char *argv[])
                        mode = newmode;
                } else {
                        /* special case of -c --help */
-                       if (opt == 'c' &&
+                       if ((opt == 'c' || opt == ConfigFile) &&
                            ( strncmp(optarg, "--h", 3)==0 ||
                              strncmp(optarg, "-h", 2)==0)) {
                                fputs(Help_config, stdout);
@@ -265,7 +274,6 @@ int main(int argc, char *argv[])
                                        dv->writemostly = writemostly;
                                        dv->re_add = re_add;
                                        dv->used = 0;
-                                       dv->content = NULL;
                                        dv->next = NULL;
                                        *devlistend = dv;
                                        devlistend = &dv->next;
@@ -288,8 +296,8 @@ int main(int argc, char *argv[])
 
                /* if we just set the mode, then done */
                switch(opt) {
-               case '@':
-               case '#':
+               case ManageOpt:
+               case MiscOpt:
                case 'A':
                case 'B':
                case 'C':
@@ -302,12 +310,14 @@ int main(int argc, char *argv[])
                if (opt == 1) {
                        /* an undecorated option - must be a device name.
                         */
-                       if (devs_found > 0 && mode == '@' && !devmode) {
-                               fprintf(stderr, Name ": Must give one of -a/-r/-f for subsequent devices at %s\n", optarg);
+                       if (devs_found > 0 && mode == MANAGE && !devmode) {
+                               fprintf(stderr, Name ": Must give one of -a/-r/-f"
+                                       " for subsequent devices at %s\n", optarg);
                                exit(2);
                        }
-                       if (devs_found > 0 && mode == 'G' && !devmode) {
-                               fprintf(stderr, Name ": Must give one of -a for devices do add: %s\n", optarg);
+                       if (devs_found > 0 && mode == GROW && !devmode) {
+                               fprintf(stderr, Name ": Must give -a/--add for"
+                                       " devices to add: %s\n", optarg);
                                exit(2);
                        }
                        dv = malloc(sizeof(*dv));
@@ -320,7 +330,6 @@ int main(int argc, char *argv[])
                        dv->writemostly = writemostly;
                        dv->re_add = re_add;
                        dv->used = 0;
-                       dv->content = NULL;
                        dv->next = NULL;
                        *devlistend = dv;
                        devlistend = &dv->next;
@@ -331,22 +340,27 @@ int main(int argc, char *argv[])
 
                /* We've got a mode, and opt is now something else which
                 * could depend on the mode */
-#define O(a,b) ((a<<8)|b)
+#define O(a,b) ((a<<16)|b)
                switch (O(mode,opt)) {
                case O(GROW,'c'):
+               case O(GROW,ChunkSize):
                case O(CREATE,'c'):
+               case O(CREATE,ChunkSize):
                case O(BUILD,'c'): /* chunk or rounding */
+               case O(BUILD,ChunkSize): /* chunk or rounding */
                        if (chunk) {
                                fprintf(stderr, Name ": chunk/rounding may only be specified once. "
                                        "Second value is %s.\n", optarg);
                                exit(2);
                        }
-                       chunk = strtol(optarg, &c, 10);
-                       if (!optarg[0] || *c || chunk<4 || ((chunk-1)&chunk)) {
+                       chunk = parse_size(optarg);
+                       if (chunk < 8 || ((chunk-1)&chunk)) {
                                fprintf(stderr, Name ": invalid chunk/rounding value: %s\n",
                                        optarg);
                                exit(2);
                        }
+                       /* Convert sectors to K */
+                       chunk /= 2;
                        continue;
 
 #if 0
@@ -369,12 +383,14 @@ int main(int argc, char *argv[])
                                fprintf(stderr, Name ": unrecognised metadata identifier: %s\n", optarg);
                                exit(2);
                        }
-                       max_disks = ss->max_devs;
                        continue;
 
                case O(MANAGE,'W'):
+               case O(MANAGE,WriteMostly):
                case O(BUILD,'W'):
+               case O(BUILD,WriteMostly):
                case O(CREATE,'W'):
+               case O(CREATE,WriteMostly):
                        /* set write-mostly for following devices */
                        writemostly = 1;
                        continue;
@@ -456,6 +472,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(GROW, 'p'): /* new layout */
+               case O(GROW, Layout):
                        if (layout_str) {
                                fprintf(stderr,Name ": layout may only be sent once.  "
                                        "Second value was %s\n", optarg);
@@ -466,7 +483,9 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(CREATE,'p'): /* raid5 layout */
+               case O(CREATE,Layout):
                case O(BUILD,'p'): /* faulty layout */
+               case O(BUILD,Layout):
                        if (layout != UnSet) {
                                fprintf(stderr,Name ": layout may only be sent once.  "
                                        "Second value was %s\n", optarg);
@@ -561,9 +580,13 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(CREATE,'a'):
+               case O(CREATE,Auto):
                case O(BUILD,'a'):
+               case O(BUILD,Auto):
                case O(INCREMENTAL,'a'):
-               case O(ASSEMBLE,'a'): /* auto-creation of device node */
+               case O(INCREMENTAL,Auto):
+               case O(ASSEMBLE,'a'):
+               case O(ASSEMBLE,Auto): /* auto-creation of device node */
                        autof = parse_auto(optarg, "--auto flag", 0);
                        continue;
 
@@ -574,10 +597,15 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(BUILD,'f'): /* force honouring '-n 1' */
+               case O(BUILD,Force): /* force honouring '-n 1' */
                case O(GROW,'f'): /* ditto */
+               case O(GROW,Force): /* ditto */
                case O(CREATE,'f'): /* force honouring of device list */
+               case O(CREATE,Force): /* force honouring of device list */
                case O(ASSEMBLE,'f'): /* force assembly */
+               case O(ASSEMBLE,Force): /* force assembly */
                case O(MISC,'f'): /* force zero */
+               case O(MISC,Force): /* force zero */
                        force=1;
                        continue;
 
@@ -618,6 +646,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(ASSEMBLE,'m'): /* super-minor for array */
+               case O(ASSEMBLE,SuperMinor):
                        if (ident.super_minor != UnSet) {
                                fprintf(stderr, Name ": super-minor cannot be set twice.  "
                                        "Second value: %s.\n", optarg);
@@ -637,12 +666,14 @@ int main(int argc, char *argv[])
                case O(ASSEMBLE,'U'): /* update the superblock */
                case O(MISC,'U'):
                        if (update) {
-                               fprintf(stderr, Name ": Can only update one aspect of superblock, both %s and %s given.\n",
+                               fprintf(stderr, Name ": Can only update one aspect"
+                                       " of superblock, both %s and %s given.\n",
                                        update, optarg);
                                exit(2);
                        }
                        if (mode == MISC && !subarray) {
-                               fprintf(stderr, Name ": Only subarrays can be updated in misc mode\n");
+                               fprintf(stderr, Name ": Only subarrays can be"
+                                       " updated in misc mode\n");
                                exit(2);
                        }
                        update = optarg;
@@ -662,15 +693,21 @@ int main(int argc, char *argv[])
                                continue;
                        if (strcmp(update, "devicesize")==0)
                                continue;
+                       if (strcmp(update, "no-bitmap")==0)
+                               continue;
                        if (strcmp(update, "byteorder")==0) {
                                if (ss) {
-                                       fprintf(stderr, Name ": must not set metadata type with --update=byteorder.\n");
+                                       fprintf(stderr,
+                                               Name ": must not set metadata"
+                                               " type with --update=byteorder.\n");
                                        exit(2);
                                }
                                for(i=0; !ss && superlist[i]; i++)
-                                       ss = superlist[i]->match_metadata_desc("0.swap");
+                                       ss = superlist[i]->match_metadata_desc(
+                                               "0.swap");
                                if (!ss) {
-                                       fprintf(stderr, Name ": INTERNAL ERROR cannot find 0.swap\n");
+                                       fprintf(stderr, Name ": INTERNAL ERROR"
+                                               " cannot find 0.swap\n");
                                        exit(2);
                                }
 
@@ -688,9 +725,31 @@ int main(int argc, char *argv[])
                        }
                        fprintf(outf, "Valid --update options are:\n"
                "     'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n"
-               "     'summaries', 'homehost', 'byteorder', 'devicesize'.\n");
+               "     'summaries', 'homehost', 'byteorder', 'devicesize',\n"
+               "     'no-bitmap'\n");
                        exit(outf == stdout ? 0 : 2);
 
+               case O(MANAGE,'U'):
+                       /* update=devicesize is allowed with --re-add */
+                       if (devmode != 'a' || re_add != 1) {
+                               fprintf(stderr, Name "--update in Manage mode only"
+                                       " allowed with --re-add.\n");
+                               exit(1);
+                       }
+                       if (update) {
+                               fprintf(stderr, Name ": Can only update one aspect"
+                                       " of superblock, both %s and %s given.\n",
+                                       update, optarg);
+                               exit(2);
+                       }
+                       update = optarg;
+                       if (strcmp(update, "devicesize") != 0) {
+                               fprintf(stderr, Name ": only 'devicesize' can be"
+                                       " updated with --re-add\n");
+                               exit(2);
+                       }
+                       continue;
+
                case O(INCREMENTAL,NoDegraded):
                        fprintf(stderr, Name ": --no-degraded is deprecated in Incremental mode\n");
                case O(ASSEMBLE,NoDegraded): /* --no-degraded */
@@ -698,10 +757,14 @@ int main(int argc, char *argv[])
                                       * so we overload slightly */
                        continue;
 
-               case O(ASSEMBLE,'c'): /* config file */
+               case O(ASSEMBLE,'c'):
+               case O(ASSEMBLE,ConfigFile):
                case O(INCREMENTAL, 'c'):
+               case O(INCREMENTAL, ConfigFile):
                case O(MISC, 'c'):
+               case O(MISC, ConfigFile):
                case O(MONITOR,'c'):
+               case O(MONITOR,ConfigFile):
                        if (configfile) {
                                fprintf(stderr, Name ": configfile cannot be set twice.  "
                                        "Second value is %s.\n", optarg);
@@ -719,6 +782,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'m'): /* mail address */
+               case O(MONITOR,EMail):
                        if (mailaddr)
                                fprintf(stderr, Name ": only specify one mailaddress. %s ignored.\n",
                                        optarg);
@@ -727,6 +791,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'p'): /* alert program */
+               case O(MONITOR,ProgramOpt): /* alert program */
                        if (program)
                                fprintf(stderr, Name ": only specify one alter program. %s ignored.\n",
                                        optarg);
@@ -735,6 +800,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(MONITOR,'r'): /* rebuild increments */
+               case O(MONITOR,Increment):
                        increments = atoi(optarg);
                        if (increments>99 || increments<1) {
                                fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n");
@@ -759,6 +825,7 @@ int main(int argc, char *argv[])
                        }
                        continue;
                case O(MONITOR,'f'): /* daemonise */
+               case O(MONITOR,Fork):
                        daemonise = 1;
                        continue;
                case O(MONITOR,'i'): /* pid */
@@ -778,12 +845,16 @@ int main(int argc, char *argv[])
                        openlog("mdadm", LOG_PID, SYSLOG_FACILITY);
                        dosyslog = 1;
                        continue;
-
+               case O(MONITOR, NoSharing):
+                       spare_sharing = 0;
+                       continue;
                        /* now the general management options.  Some are applicable
                         * to other modes. None have arguments.
                         */
                case O(GROW,'a'):
-               case O(MANAGE,'a'): /* add a drive */
+               case O(GROW,Add):
+               case O(MANAGE,'a'):
+               case O(MANAGE,Add): /* add a drive */
                        devmode = 'a';
                        re_add = 0;
                        continue;
@@ -792,10 +863,14 @@ int main(int argc, char *argv[])
                        re_add = 1;
                        continue;
                case O(MANAGE,'r'): /* remove a drive */
+               case O(MANAGE,Remove):
                        devmode = 'r';
                        continue;
                case O(MANAGE,'f'): /* set faulty */
-               case O(INCREMENTAL,'f'): /* r for incremental is taken, use f
+               case O(MANAGE,Fail):
+               case O(INCREMENTAL,'f'):
+               case O(INCREMENTAL,Remove):
+               case O(INCREMENTAL,Fail): /* r for incremental is taken, use f
                                          * even though we will both fail and
                                          * remove the device */
                        devmode = 'f';
@@ -832,6 +907,7 @@ int main(int argc, char *argv[])
                case O(MISC,'o'):
                case O(MISC,'w'):
                case O(MISC,'W'):
+               case O(MISC, WaitOpt):
                case O(MISC, Waitclean):
                case O(MISC, DetailPlatform):
                case O(MISC, KillSubarray):
@@ -866,6 +942,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */
+               case O(ASSEMBLE,Bitmap):
                        if (!optarg) {
                                fprintf(stderr, Name ": bitmap file needed with -b in --assemble mode\n");
                                exit(2);
@@ -894,8 +971,17 @@ int main(int argc, char *argv[])
                        backup_file = optarg;
                        continue;
 
+               case O(ASSEMBLE, InvalidBackup):
+                       /* Acknowledge that the backupfile is invalid, but ask
+                        * to continue anyway
+                        */
+                       invalid_backup = 1;
+                       continue;
+
                case O(BUILD,'b'):
-               case O(CREATE,'b'): /* here we create the bitmap */
+               case O(BUILD,Bitmap):
+               case O(CREATE,'b'):
+               case O(CREATE,Bitmap): /* here we create the bitmap */
                        if (strcmp(optarg, "none") == 0) {
                                fprintf(stderr, Name ": '--bitmap none' only"
                                        " support for --grow\n");
@@ -903,6 +989,7 @@ int main(int argc, char *argv[])
                        }
                        /* FALL THROUGH */
                case O(GROW,'b'):
+               case O(GROW,Bitmap):
                        if (strcmp(optarg, "internal")== 0 ||
                            strcmp(optarg, "none")== 0 ||
                            strchr(optarg, '/') != NULL) {
@@ -917,15 +1004,16 @@ int main(int argc, char *argv[])
                case O(GROW,BitmapChunk):
                case O(BUILD,BitmapChunk):
                case O(CREATE,BitmapChunk): /* bitmap chunksize */
-                       bitmap_chunk = strtol(optarg, &c, 10);
-                       if (!optarg[0] || *c || bitmap_chunk < 0 ||
-                                       bitmap_chunk & (bitmap_chunk - 1)) {
-                               fprintf(stderr, Name ": invalid bitmap chunksize: %s\n",
-                                               optarg);
+                       bitmap_chunk = parse_size(optarg);
+                       if (bitmap_chunk < 0 ||
+                           bitmap_chunk & (bitmap_chunk - 1)) {
+                               fprintf(stderr,
+                                       Name ": invalid bitmap chunksize: %s\n",
+                                       optarg);
                                exit(2);
                        }
-                       /* convert K to B, chunk of 0K means 512B */
-                       bitmap_chunk = bitmap_chunk ? bitmap_chunk * 1024 : 512;
+                       /* convert sectors to B, chunk of 0 means 512B */
+                       bitmap_chunk = bitmap_chunk ? bitmap_chunk * 512 : 512;
                        continue;
 
                case O(GROW, WriteBehind):
@@ -943,8 +1031,12 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(INCREMENTAL, 'r'):
+               case O(INCREMENTAL, RebuildMapOpt):
                        rebuild_map = 1;
                        continue;
+               case O(INCREMENTAL, IncrementalPath):
+                       remove_path = optarg;
+                       continue;
                }
                /* We have now processed all the valid options. Anything else is
                 * an error
@@ -1055,11 +1147,6 @@ int main(int argc, char *argv[])
        }
 
        if (raiddisks) {
-               if (raiddisks > max_disks) {
-                       fprintf(stderr, Name ": invalid number of raid devices: %d\n",
-                               raiddisks);
-                       exit(2);
-               }
                if (raiddisks == 1 &&  !force && level != -5) {
                        fprintf(stderr, Name ": '1' is an unusual number of drives for an array, so it is probably\n"
                                "     a mistake.  If you really mean it you will need to specify --force before\n"
@@ -1067,13 +1154,6 @@ int main(int argc, char *argv[])
                        exit(2);
                }
        }
-       if (sparedisks) {
-               if ( sparedisks > max_disks - raiddisks) {
-                       fprintf(stderr, Name ": invalid number of spare-devices: %d\n",
-                               sparedisks);
-                       exit(2);
-               }
-       }
 
        if (homehost == NULL)
                homehost = conf_get_homehost(&require_homehost);
@@ -1100,7 +1180,8 @@ int main(int argc, char *argv[])
                        rv = Manage_ro(devlist->devname, mdfd, readonly);
                if (!rv && devs_found>1)
                        rv = Manage_subdevs(devlist->devname, mdfd,
-                                           devlist->next, verbose-quiet, test);
+                                           devlist->next, verbose-quiet, test,
+                                           update);
                if (!rv && readonly < 0)
                        rv = Manage_ro(devlist->devname, mdfd, readonly);
                if (!rv && runstop)
@@ -1110,7 +1191,7 @@ int main(int argc, char *argv[])
                if (devs_found == 1 && ident.uuid_set == 0 &&
                    ident.super_minor == UnSet && ident.name[0] == 0 && !scan ) {
                        /* Only a device has been given, so get details from config file */
-                       mddev_ident_t array_ident = conf_get_ident(devlist->devname);
+                       struct mddev_ident *array_ident = conf_get_ident(devlist->devname);
                        if (array_ident == NULL) {
                                fprintf(stderr, Name ": %s not identified in config file.\n",
                                        devlist->devname);
@@ -1121,14 +1202,14 @@ int main(int argc, char *argv[])
                                if (array_ident->autof == 0)
                                        array_ident->autof = autof;
                                rv |= Assemble(ss, devlist->devname, array_ident,
-                                              NULL, backup_file,
+                                              NULL, backup_file, invalid_backup,
                                               readonly, runstop, update,
                                               homehost, require_homehost,
                                               verbose-quiet, force);
                        }
                } else if (!scan)
                        rv = Assemble(ss, devlist->devname, &ident,
-                                     devlist->next, backup_file,
+                                     devlist->next, backup_file, invalid_backup,
                                      readonly, runstop, update,
                                      homehost, require_homehost,
                                      verbose-quiet, force);
@@ -1142,7 +1223,7 @@ int main(int argc, char *argv[])
                                exit(1);
                        }
                        for (dv = devlist ; dv ; dv=dv->next) {
-                               mddev_ident_t array_ident = conf_get_ident(dv->devname);
+                               struct mddev_ident *array_ident = conf_get_ident(dv->devname);
                                if (array_ident == NULL) {
                                        fprintf(stderr, Name ": %s not identified in config file.\n",
                                                dv->devname);
@@ -1152,14 +1233,14 @@ int main(int argc, char *argv[])
                                if (array_ident->autof == 0)
                                        array_ident->autof = autof;
                                rv |= Assemble(ss, dv->devname, array_ident,
-                                              NULL, backup_file,
+                                              NULL, backup_file, invalid_backup,
                                               readonly, runstop, update,
                                               homehost, require_homehost,
                                               verbose-quiet, force);
                        }
                } else {
-                       mddev_ident_t a, array_list =  conf_get_ident(NULL);
-                       mddev_dev_t devlist = conf_get_devs();
+                       struct mddev_ident *a, *array_list =  conf_get_ident(NULL);
+                       struct mddev_dev *devlist = conf_get_devs();
                        int cnt = 0;
                        int failures, successes;
                        if (devlist == NULL) {
@@ -1193,7 +1274,7 @@ int main(int argc, char *argv[])
                                
                                        r = Assemble(ss, a->devname,
                                                     a,
-                                                    NULL, NULL,
+                                                    NULL, NULL, 0,
                                                     readonly, runstop, NULL,
                                                     homehost, require_homehost,
                                                     verbose-quiet, force);
@@ -1215,12 +1296,12 @@ int main(int argc, char *argv[])
                                int acnt;
                                ident.autof = autof;
                                do {
-                                       mddev_dev_t devlist = conf_get_devs();
+                                       struct mddev_dev *devlist = conf_get_devs();
                                        acnt = 0;
                                        do {
                                                rv2 = Assemble(ss, NULL,
                                                               &ident,
-                                                              devlist, NULL,
+                                                              devlist, NULL, 0,
                                                               readonly, runstop, NULL,
                                                               homehost, require_homehost,
                                                               verbose-quiet, force);
@@ -1242,12 +1323,15 @@ int main(int argc, char *argv[])
                                        do {
                                                acnt = 0;
                                                do {
-                                                       rv2 = Assemble(ss, NULL,
-                                                                      &ident,
-                                                                      NULL, NULL,
-                                                                      readonly, runstop, "homehost",
-                                                                      homehost, require_homehost,
-                                                                      verbose-quiet, force);
+                                                       rv2 = Assemble(
+                                                               ss, NULL,
+                                                               &ident,
+                                                               NULL, NULL, 0,
+                                                               readonly, runstop,
+                                                               "homehost",
+                                                               homehost,
+                                                               require_homehost,
+                                                               verbose-quiet, force);
                                                        if (rv2==0) {
                                                                cnt++;
                                                                acnt++;
@@ -1438,6 +1522,7 @@ int main(int argc, char *argv[])
                                case 'X':
                                        rv |= ExamineBitmap(dv->devname, brief, ss); continue;
                                case 'W':
+                               case WaitOpt:
                                        rv |= Wait(dv->devname); continue;
                                case Waitclean:
                                        rv |= WaitClean(dv->devname, -1, verbose-quiet); continue;
@@ -1484,7 +1569,7 @@ int main(int argc, char *argv[])
                        break;
                }
                if (delay == 0) {
-                       if (get_linux_version() > 20616)
+                       if (get_linux_version() > 2006016)
                                /* mdstat responds to poll */
                                delay = 1000;
                        else
@@ -1492,7 +1577,7 @@ int main(int argc, char *argv[])
                }
                rv= Monitor(devlist, mailaddr, program,
                            delay?delay:60, daemonise, scan, oneshot,
-                           dosyslog, test, pidfile, increments);
+                           dosyslog, test, pidfile, increments, spare_sharing);
                break;
 
        case GROW:
@@ -1553,7 +1638,8 @@ int main(int argc, char *argv[])
                } else if (size >= 0 || raiddisks != 0 || layout_str != NULL
                           || chunk != 0 || level != UnSet) {
                        rv = Grow_reshape(devlist->devname, mdfd, quiet, backup_file,
-                                         size, level, layout_str, chunk, raiddisks);
+                                         size, level, layout_str, chunk, raiddisks,
+                                         force);
                } else if (array_size < 0)
                        fprintf(stderr, Name ": no changes to --grow\n");
                break;
@@ -1588,12 +1674,13 @@ int main(int argc, char *argv[])
                        rv = 1;
                        break;
                }
-               if (devmode == 'f') {
-                       rv = IncrementalRemove(devlist->devname, verbose-quiet);
-                       break;
-               }
-               rv = Incremental(devlist->devname, verbose-quiet, runstop,
-                                ss, homehost, require_homehost, autof);
+               if (devmode == 'f')
+                       rv = IncrementalRemove(devlist->devname, remove_path,
+                                              verbose-quiet);
+               else
+                       rv = Incremental(devlist->devname, verbose-quiet,
+                                        runstop, ss, homehost,
+                                        require_homehost, autof);
                break;
        case AUTODETECT:
                autodetect();
diff --git a/mdadm.h b/mdadm.h
index 03dd41c61666cb67eabafff07db21e0beb681412..a5d94d7195ee1c2fe34b7b8748264ab28d396e33 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -93,6 +93,14 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define MDMON_DIR "/dev/.mdadm/"
 #endif /* MDMON_DIR */
 
+/* FAILED_SLOTS_DIR is where to save files recording the recent removal
+ * of an array member, so that a disk later inserted in the same slot
+ * can be reused for array recovery.
+ */
+#ifndef FAILED_SLOTS_DIR
+#define FAILED_SLOTS_DIR "/dev/.mdadm/failed-slots"
+#endif /* FAILED_SLOTS */
+
 #include       "md_u.h"
 #include       "md_p.h"
 #include       "bitmap.h"
@@ -261,17 +269,16 @@ extern char Version[], Usage[], Help[], OptionHelp[],
        Help_manage[], Help_misc[], Help_monitor[], Help_config[];
 
 /* for option that don't have short equivilents, we assign arbitrary
- * small numbers.  '1' means an undecorated option, so we start at '2'.
- * (note we must stop before we get to 65 i.e. 'A')
+ * numbers later than any 'short' character option.
  */
 enum special_options {
-       AssumeClean = 2,
+       AssumeClean = 300,
        BitmapChunk,
        WriteBehind,
        ReAdd,
        NoDegraded,
        Sparc22,
-       BackupFile, /* 8 */
+       BackupFile,
        HomeHost,
        AutoHomeHost,
        Symlinks,
@@ -279,7 +286,31 @@ enum special_options {
        Waitclean,
        DetailPlatform,
        KillSubarray,
-       UpdateSubarray, /* 16 */
+       UpdateSubarray,
+       IncrementalPath,
+       NoSharing,
+       HelpOptions,
+       Brief,
+       ManageOpt,
+       Add,
+       Remove,
+       Fail,
+       MiscOpt,
+       WaitOpt,
+       ConfigFile,
+       ChunkSize,
+       WriteMostly,
+       Layout,
+       Auto,
+       Force,
+       SuperMinor,
+       EMail,
+       ProgramOpt,
+       Increment,
+       Fork,
+       Bitmap,
+       RebuildMapOpt,
+       InvalidBackup,
 };
 
 /* structures read from config file */
@@ -293,7 +324,7 @@ enum special_options {
  * devices is considered
  */
 #define UnSet (0xfffe)
-typedef struct mddev_ident_s {
+struct mddev_ident {
        char    *devname;
 
        int     uuid_set;
@@ -321,26 +352,24 @@ typedef struct mddev_ident_s {
                                 */
        char    *member;        /* subarray within a container */
 
-       struct mddev_ident_s *next;
+       struct mddev_ident *next;
        union {
                /* fields needed by different users of this structure */
                int assembled;  /* set when assembly succeeds */
        };
-} *mddev_ident_t;
+};
 
 /* List of device names - wildcards expanded */
-typedef struct mddev_dev_s {
+struct mddev_dev {
        char *devname;
-       char disposition;       /* 'a' for add, 'r' for remove, 'f' for fail.
+       int disposition;        /* 'a' for add, 'r' for remove, 'f' for fail.
                                 * Not set for names read from .config
                                 */
        char writemostly;       /* 1 for 'set writemostly', 2 for 'clear writemostly' */
        char re_add;
        char used;              /* set when used */
-       struct mdinfo *content; /* If devname is a container, this might list
-                                * the remaining member arrays. */
-       struct mddev_dev_s *next;
-} *mddev_dev_t;
+       struct mddev_dev *next;
+};
 
 typedef struct mapping {
        char *name;
@@ -355,10 +384,9 @@ struct mdstat_ent {
        char            *level;
        char            *pattern; /* U or up, _ for down */
        int             percent; /* -1 if no resync */
-       int             resync; /* 1 if resync, 0 if recovery */
+       int             resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */
        int             devcnt;
        int             raid_disks;
-       int             chunk_size;
        char *          metadata_version;
        struct dev_member {
                char                    *name;
@@ -436,6 +464,8 @@ extern int sysfs_fd_get_ll(int fd, unsigned long long *val);
 extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                        char *name, unsigned long long *val);
 extern int sysfs_fd_get_str(int fd, char *val, int size);
+extern int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev,
+                                    char *name);
 extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, char *val, int size);
 extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
@@ -443,8 +473,27 @@ extern int sysfs_set_array(struct mdinfo *info, int vers);
 extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume);
 extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
 extern int sysfs_unique_holder(int devnum, long rdev);
+extern int sysfs_freeze_array(struct mdinfo *sra);
 extern int load_sys(char *path, char *buf);
-
+extern int reshape_prepare_fdlist(char *devname,
+                                 struct mdinfo *sra,
+                                 int raid_disks,
+                                 int nrdisks,
+                                 unsigned long blocks,
+                                 char *backup_file,
+                                 int *fdlist,
+                                 unsigned long long *offsets);
+extern void reshape_free_fdlist(int *fdlist,
+                               unsigned long long *offsets,
+                               int size);
+extern int reshape_open_backup_file(char *backup,
+                                   int fd,
+                                   char *devname,
+                                   long blocks,
+                                   int *fdlist,
+                                   unsigned long long *offsets);
+extern unsigned long compute_backup_blocks(int nchunk, int ochunk,
+                                          unsigned int ndata, unsigned int odata);
 
 extern int save_stripes(int *source, unsigned long long *offsets,
                        int raid_disks, int chunk_size, int level, int layout,
@@ -534,9 +583,13 @@ extern struct superswitch {
         * The particular device should be:
         *   The last device added by add_to_super
         *   The device the metadata was loaded from by load_super
+        * If 'map' is present, then it is an array raid_disks long
+        * (raid_disks must already be set and correct) and it is filled
+        * with 1 for slots that are thought to be active and 0 for slots which
+        * appear to be failed/missing.
         */
-       void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
-
+       void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map);
+       struct mdinfo *(*getinfo_super_disks)(struct supertype *st);
        /* Check if the given metadata is flagged as belonging to "this"
         * host.  0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
         */
@@ -593,8 +646,10 @@ extern struct superswitch {
        int (*write_init_super)(struct supertype *st);
        int (*compare_super)(struct supertype *st, struct supertype *tst);
        int (*load_super)(struct supertype *st, int fd, char *devname);
+       int (*load_container)(struct supertype *st, int fd, char *devname);
        struct supertype * (*match_metadata_desc)(char *arg);
        __u64 (*avail_size)(struct supertype *st, __u64 size);
+       unsigned long long (*min_acceptable_spare_size)(struct supertype *st);
        int (*add_internal_bitmap)(struct supertype *st, int *chunkp,
                                   int delay, int write_behind,
                                   unsigned long long size, int may_change, int major);
@@ -609,7 +664,7 @@ extern struct superswitch {
         * added to validate changing size and new devices.  If there are
         * inter-device dependencies, it should record sufficient details
         * so these can be validated.
-        * Both 'size' and '*freesize' are in sectors.  chunk is bytes.
+        * Both 'size' and '*freesize' are in sectors.  chunk is KiB.
         */
        int (*validate_geometry)(struct supertype *st, int level, int layout,
                                 int raiddisks,
@@ -617,15 +672,23 @@ extern struct superswitch {
                                 char *subdev, unsigned long long *freesize,
                                 int verbose);
 
-       struct mdinfo *(*container_content)(struct supertype *st);
-       /* Allow a metadata handler to override mdadm's default layouts */
-       int (*default_layout)(int level); /* optional */
-       /* query the supertype for default chunk size */
-       int (*default_chunk)(struct supertype *st); /* optional */
+       struct mdinfo *(*container_content)(struct supertype *st, char *subarray);
+       /* query the supertype for default geometry */
+       void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */
        /* Permit subarray's to be deleted from inactive containers */
        int (*kill_subarray)(struct supertype *st); /* optional */
        /* Permit subarray's to be modified */
-       int (*update_subarray)(struct supertype *st, char *update, mddev_ident_t ident); /* optional */
+       int (*update_subarray)(struct supertype *st, char *subarray,
+                              char *update, struct mddev_ident *ident); /* optional */
+       /* Check if reshape is supported for this external format.
+        * st is obtained from super_by_fd() where st->subarray[0] is
+        * initialized to indicate if reshape is being performed at the
+        * container or subarray level
+        */
+       int (*reshape_super)(struct supertype *st, long long size, int level,
+                            int layout, int chunksize, int raid_disks,
+                            char *backup, char *dev, int verbose); /* optional */
+       int (*manage_reshape)(struct supertype *st, char *backup); /* optional */
 
 /* for mdmon */
        int (*open_new)(struct supertype *c, struct active_array *a,
@@ -667,13 +730,30 @@ extern struct superswitch {
         */
        struct mdinfo *(*activate_spare)(struct active_array *a,
                                         struct metadata_update **updates);
+       /*
+        * Return a statically allocated string that represents the metadata
+        * specific controller domain of the disk. The domain is used in disk
+        * domain matching functions. Disks belong to the same domain if they
+        * have the same domain from mdadm.conf and belong to the same metadata
+        * domain.
+        * does not distinguish the differences between disks that belong to
+        * different controllers. They are in the domain specified by
+        * configuration file (mdadm.conf).
+        * When the metadata has the notion of domains based on disk
+        * controller, it shall return NULL for disks that do not belong to
+        * any of the supported domains. Such disks will form another domain
+        * and won't be mixed with supported ones.
+        */
+       const char *(*get_disk_controller_domain)(const char *path);
 
        int swapuuid; /* true if uuid is bigending rather than hostendian */
        int external;
        const char *name; /* canonical metadata name */
-} super0, super1, super_ddf, *superlist[];
+} *superlist[];
 
-extern struct superswitch super_imsm;
+extern struct superswitch super0, super1;
+extern struct superswitch super_imsm, super_ddf;
+extern struct superswitch mbr, gpt;
 
 struct metadata_update {
        int     len;
@@ -704,11 +784,8 @@ struct supertype {
        int minor_version;
        int max_devs;
        int container_dev;    /* devnum of container */
-       char subarray[32];      /* name of array inside container */
        void *sb;
        void *info;
-       int loaded_container;   /* Set if load_super found a container,
-                                * not just one device */
 
        struct metadata_update *updates;
        struct metadata_update **update_tail;
@@ -726,14 +803,117 @@ struct supertype {
 
 };
 
-extern struct supertype *super_by_fd(int fd);
-extern struct supertype *guess_super(int fd);
+extern struct supertype *super_by_fd(int fd, char **subarray);
+enum guess_types { guess_any, guess_array, guess_partitions };
+extern struct supertype *guess_super_type(int fd, enum guess_types guess_type);
+static inline struct supertype *guess_super(int fd) {
+       return guess_super_type(fd, guess_any);
+}
 extern struct supertype *dup_super(struct supertype *st);
 extern int get_dev_size(int fd, char *dname, unsigned long long *sizep);
+extern int must_be_container(int fd);
+extern int dev_size_from_id(dev_t id, unsigned long long *size);
 extern void get_one_disk(int mdfd, mdu_array_info_t *ainf,
                         mdu_disk_info_t *disk);
 void wait_for(char *dev, int fd);
 
+/*
+ * Data structures for policy management.
+ * Each device can have a policy structure that lists
+ * various name/value pairs each possibly with a metadata associated.
+ * The policy list is sorted by name/value/metadata
+ */
+struct dev_policy {
+       struct dev_policy *next;
+       char *name;     /* None of these strings are allocated.  They are
+                        * all just references to strings which are known
+                        * to exist elsewhere.
+                        * name and metadata can be compared by address equality.
+                        */
+       const char *metadata;
+       const char *value;
+};
+
+extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[];
+
+/* iterate over the sublist starting at list, having the same
+ * 'name' as 'list', and matching the given metadata (where
+ * NULL matches anything).
+ */
+#define pol_for_each(item, list, _metadata)                            \
+       for (item = list;                                               \
+            item && item->name == list->name;                          \
+            item = item->next)                                         \
+               if (!(!_metadata || !item->metadata || _metadata == item->metadata)) \
+                       ; else
+
+/*
+ * policy records read from mdadm are largely just name-value pairs.
+ * The names are constants, not strdupped
+ */
+struct pol_rule {
+       struct pol_rule *next;
+       char *type;     /* rule_policy or rule_part */
+       struct rule {
+               struct rule *next;
+               char *name;
+               char *value;
+               char *dups; /* duplicates of 'value' with a partNN appended */
+       } *rule;
+};
+
+extern char rule_policy[], rule_part[];
+extern char rule_path[], rule_type[];
+extern char type_part[], type_disk[];
+
+extern void policyline(char *line, char *type);
+extern void policy_add(char *type, ...);
+extern void policy_free(void);
+
+extern struct dev_policy *path_policy(char *path, char *type);
+extern struct dev_policy *disk_policy(struct mdinfo *disk);
+extern struct dev_policy *devnum_policy(int dev);
+extern void dev_policy_free(struct dev_policy *p);
+
+//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern struct dev_policy *pol_find(struct dev_policy *pol, char *name);
+
+enum policy_action {
+       act_default,
+       act_include,
+       act_re_add,
+       act_spare,      /* This only applies to bare devices */
+       act_spare_same_slot, /* this allows non-bare devices,
+                             * but only if recent removal */
+       act_force_spare, /* this allow non-bare devices in any case */
+       act_err
+};
+
+extern int policy_action_allows(struct dev_policy *plist, const char *metadata,
+                               enum policy_action want);
+extern int disk_action_allows(struct mdinfo *disk, const char *metadata,
+                             enum policy_action want);
+
+struct domainlist {
+       struct domainlist *next;
+       const char *dom;
+};
+
+extern int domain_test(struct domainlist *dom, struct dev_policy *pol,
+                      const char *metadata);
+extern struct domainlist *domain_from_array(struct mdinfo *mdi,
+                                           const char *metadata);
+extern void domainlist_add_dev(struct domainlist **dom, int devnum,
+                              const char *metadata);
+extern void domain_free(struct domainlist *dl);
+extern void domain_merge(struct domainlist **domp, struct dev_policy *pol,
+                        const char *metadata);
+void domain_add(struct domainlist **domp, char *domain);
+
+extern void policy_save_path(char *id_path, struct map_ent *array);
+extern int policy_check_path(struct mdinfo *disk, struct map_ent *array);
+
 #if __GNUC__ < 3
 struct stat64;
 #endif
@@ -779,27 +959,30 @@ extern int Manage_ro(char *devname, int fd, int readonly);
 extern int Manage_runstop(char *devname, int fd, int runstop, int quiet);
 extern int Manage_resize(char *devname, int fd, long long size, int raid_disks);
 extern int Manage_subdevs(char *devname, int fd,
-                         mddev_dev_t devlist, int verbose, int test);
+                         struct mddev_dev *devlist, int verbose, int test,
+                         char *update);
 extern int autodetect(void);
 extern int Grow_Add_device(char *devname, int fd, char *newdev);
 extern int Grow_addbitmap(char *devname, int fd, char *file, int chunk, int delay, int write_behind, int force);
 extern int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        long long size,
-                       int level, char *layout_str, int chunksize, int raid_disks);
+                       int level, char *layout_str, int chunksize, int raid_disks,
+                       int force);
 extern int Grow_restart(struct supertype *st, struct mdinfo *info,
                        int *fdlist, int cnt, char *backup_file, int verbose);
 extern int Grow_continue(int mdfd, struct supertype *st,
                         struct mdinfo *info, char *backup_file);
 
 extern int Assemble(struct supertype *st, char *mddev,
-                   mddev_ident_t ident,
-                   mddev_dev_t devlist, char *backup_file,
+                   struct mddev_ident *ident,
+                   struct mddev_dev *devlist,
+                   char *backup_file, int invalid_backup,
                    int readonly, int runstop,
                    char *update, char *homehost, int require_homehost,
                    int verbose, int force);
 
 extern int Build(char *mddev, int chunk, int level, int layout,
-                int raiddisks, mddev_dev_t devlist, int assume_clean,
+                int raiddisks, struct mddev_dev *devlist, int assume_clean,
                 char *bitmap_file, int bitmap_chunk, int write_behind,
                 int delay, int verbose, int autof, unsigned long long size);
 
@@ -807,35 +990,33 @@ extern int Build(char *mddev, int chunk, int level, int layout,
 extern int Create(struct supertype *st, char *mddev,
                  int chunk, int level, int layout, unsigned long long size, int raiddisks, int sparedisks,
                  char *name, char *homehost, int *uuid,
-                 int subdevs, mddev_dev_t devlist,
+                 int subdevs, struct mddev_dev *devlist,
                  int runstop, int verbose, int force, int assume_clean,
                  char *bitmap_file, int bitmap_chunk, int write_behind, int delay, int autof);
 
 extern int Detail(char *dev, int brief, int export, int test, char *homehost);
 extern int Detail_Platform(struct superswitch *ss, int scan, int verbose);
 extern int Query(char *dev);
-extern int Examine(mddev_dev_t devlist, int brief, int export, int scan,
+extern int Examine(struct mddev_dev *devlist, int brief, int export, int scan,
                   int SparcAdjust, struct supertype *forcest, char *homehost);
-extern int Monitor(mddev_dev_t devlist,
+extern int Monitor(struct mddev_dev *devlist,
                   char *mailaddr, char *alert_cmd,
                   int period, int daemonise, int scan, int oneshot,
-                  int dosyslog, int test, char *pidfile, int increments);
+                  int dosyslog, int test, char *pidfile, int increments,
+                  int share);
 
 extern int Kill(char *dev, struct supertype *st, int force, int quiet, int noexcl);
 extern int Kill_subarray(char *dev, char *subarray, int quiet);
-extern int Update_subarray(char *dev, char *subarray, char *update, mddev_ident_t ident, int quiet);
+extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet);
 extern int Wait(char *dev);
 extern int WaitClean(char *dev, int sock, int verbose);
 
 extern int Incremental(char *devname, int verbose, int runstop,
                       struct supertype *st, char *homehost, int require_homehost,
                       int autof);
-extern int Incremental_container(struct supertype *st, char *devname,
-                                int verbose, int runstop, int autof,
-                                int trustworthy);
 extern void RebuildMap(void);
 extern int IncrementalScan(int verbose);
-extern int IncrementalRemove(char *devname, int verbose);
+extern int IncrementalRemove(char *devname, char *path, int verbose);
 extern int CreateBitmap(char *filename, int force, char uuid[16],
                        unsigned long chunksize, unsigned long daemon_sleep,
                        unsigned long write_behind,
@@ -847,6 +1028,7 @@ extern unsigned long bitmap_sectors(struct bitmap_super_s *bsb);
 
 extern int md_get_version(int fd);
 extern int get_linux_version(void);
+extern int mdadm_version(char *version);
 extern long long parse_size(char *size);
 extern int parse_uuid(char *str, int uuid[4]);
 extern int parse_layout_10(char *layout);
@@ -864,10 +1046,10 @@ extern int is_standard(char *dev, int *nump);
 extern int same_dev(char *one, char *two);
 
 extern int parse_auto(char *str, char *msg, int config);
-extern mddev_ident_t conf_get_ident(char *dev);
-extern mddev_dev_t conf_get_devs(void);
+extern struct mddev_ident *conf_get_ident(char *dev);
+extern struct mddev_dev *conf_get_devs(void);
 extern int conf_test_dev(char *devname);
-extern int conf_test_metadata(const char *version, int is_homehost);
+extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost);
 extern struct createinfo *conf_get_create_info(void);
 extern void set_conffile(char *file);
 extern char *conf_get_mailaddr(void);
@@ -878,7 +1060,8 @@ extern char *conf_line(FILE *file);
 extern char *conf_word(FILE *file, int allow_key);
 extern int conf_name_is_free(char *name);
 extern int devname_matches(char *name, char *match);
-extern struct mddev_ident_s *conf_match(struct mdinfo *info, struct supertype *st);
+extern struct mddev_ident *conf_match(struct mdinfo *info, struct supertype *st);
+extern int experimental(void);
 
 extern void free_line(char *line);
 extern int match_oneof(char *devices, char *devname);
@@ -892,6 +1075,7 @@ extern char *fname_from_uuid(struct supertype *st,
 extern unsigned long calc_csum(void *super, int bytes);
 extern int enough(int level, int raid_disks, int layout, int clean,
                   char *avail, int avail_disks);
+extern int enough_fd(int fd);
 extern int ask(char *mesg);
 extern unsigned long long get_component_size(int fd);
 extern void remove_partitions(int fd);
@@ -906,6 +1090,8 @@ extern int assemble_container_content(struct supertype *st, int mdfd,
 
 extern int add_disk(int mdfd, struct supertype *st,
                    struct mdinfo *sra, struct mdinfo *info);
+extern int remove_disk(int mdfd, struct supertype *st,
+                      struct mdinfo *sra, struct mdinfo *info);
 extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
 unsigned long long min_recovery_start(struct mdinfo *array);
 
@@ -933,7 +1119,7 @@ extern int open_container(int fd);
 extern int is_container_member(struct mdstat_ent *ent, char *devname);
 extern int is_subarray_active(char *subarray, char *devname);
 int is_container_active(char *devname);
-extern int open_subarray(char *dev, struct supertype *st, int quiet);
+extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet);
 extern struct superswitch *version_to_superswitch(char *vers);
 
 extern int mdmon_running(int devnum);
@@ -972,13 +1158,20 @@ static inline int is_subarray(char *vers)
        /* The version string for a 'subarray' (an array in a container)
         * is 
         *    /containername/componentname    for normal read-write arrays
-        *    -containername/componentname    for read-only arrays.
+        *    -containername/componentname    for arrays which mdmon must not
+        *                                    reconfigure.  They might be read-only
+        *                                    or might be undergoing reshape etc.
         * containername is e.g. md0, md_d1
         * componentname is dependant on the metadata. e.g. '1' 'S1' ...
         */
        return (*vers == '/' || *vers == '-');
 }
 
+static inline char *to_subarray(struct mdstat_ent *ent, char *container)
+{
+       return &ent->metadata_version[10+strlen(container)+1];
+}
+
 #ifdef DEBUG
 #define dprintf(fmt, arg...) \
        fprintf(stderr, fmt, ##arg)
index d0d0707e123412b6233fad96285ad6eb8114c137..28f431b45d3bb4b16733161ee7c8f793a2973444 100644 (file)
@@ -88,7 +88,7 @@ int verbose = 0;
 int force = 0;
 
 int main(int argc, char *argv[]) {
-       mddev_ident_t array_list =  conf_get_ident(NULL);
+       struct mddev_ident *array_list =  conf_get_ident(NULL);
        if (!array_list) {
                fprintf(stderr, Name ": No arrays found in config file\n");
                rv = 1;
diff --git a/mdmon.c b/mdmon.c
index e416b2e42f4559c69b896252f6d4cb4f8efa4447..f56e57f320ebf3e061bef58625012a6bbae1fc8e 100644 (file)
--- a/mdmon.c
+++ b/mdmon.c
@@ -398,7 +398,6 @@ static int mdmon(char *devname, int devnum, int must_fork, int takeover)
        container->devnum = devnum;
        container->devname = devname;
        container->arrays = NULL;
-       container->subarray[0] = 0;
        container->sock = -1;
 
        if (!container->devname) {
@@ -469,7 +468,7 @@ static int mdmon(char *devname, int devnum, int must_fork, int takeover)
                }
                close(victim_sock);
        }
-       if (container->ss->load_super(container, mdfd, devname)) {
+       if (container->ss->load_container(container, mdfd, devname)) {
                fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
                        devname);
                exit(3);
index 47be2bba21fabaac7a617f8b4f0264f7fcd966a3..c5a07b5ba9d651f8dec74d9acdf4d2168528541a 100644 (file)
--- a/mdstat.c
+++ b/mdstat.c
@@ -185,7 +185,6 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                ent->resync = 0;
                ent->metadata_version = NULL;
                ent->raid_disks = 0;
-               ent->chunk_size = 0;
                ent->devcnt = 0;
                ent->members = NULL;
 
@@ -241,11 +240,27 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                                   w[l-1] == '%' &&
                                   (eq=strchr(w, '=')) != NULL ) {
                                ent->percent = atoi(eq+1);
-                               if (strncmp(w,"resync", 4)==0)
+                               if (strncmp(w,"resync", 6)==0)
                                        ent->resync = 1;
+                               else if (strncmp(w, "reshape", 7)==0)
+                                       ent->resync = 2;
+                               else
+                                       ent->resync = 0;
                        } else if (ent->percent == -1 &&
-                                  strncmp(w, "resync", 4)==0) {
-                               ent->resync = 1;
+                                  (w[0] == 'r' || w[0] == 'c')) {
+                               if (strncmp(w, "resync", 4)==0)
+                                       ent->resync = 1;
+                               if (strncmp(w, "reshape", 7)==0)
+                                       ent->resync = 2;
+                               if (strncmp(w, "recovery", 8)==0)
+                                       ent->resync = 0;
+                               if (strncmp(w, "check", 5)==0)
+                                       ent->resync = 3;
+
+                               if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0)
+                                       ent->percent = 0;
+                               if (l > 8 && strcmp(w+l-8, "=PENDING") == 0)
+                                       ent->percent = 0;
                        } else if (ent->percent == -1 &&
                                   w[0] >= '0' &&
                                   w[0] <= '9' &&
index 59b4181954c24c0dc209e93c17c6ba391e2a4eab..f166bc8fd44b0b0140ed313e3720b4ffdd1b6764 100644 (file)
--- a/monitor.c
+++ b/monitor.c
@@ -525,6 +525,7 @@ static int wait_and_act(struct supertype *container, int nowait)
                                remove_pidfile(container->devname);
                        exit_now = 1;
                        signal_manager();
+                       close(fd);
                        exit(0);
                }
        }
diff --git a/msg.c b/msg.c
index aabfa8f566072b64d900b91c8e50cfaf0fde0057..5511ecdb75af3b752ebc79aeedadedefc155ad45 100644 (file)
--- a/msg.c
+++ b/msg.c
@@ -135,7 +135,15 @@ int ack(int fd, int tmo)
 int wait_reply(int fd, int tmo)
 {
        struct metadata_update msg;
-       return receive_message(fd, &msg, tmo);
+       int err = receive_message(fd, &msg, tmo);
+
+       /* mdmon sent extra data, but caller only cares that we got a
+        * successful reply
+        */
+       if (err == 0 && msg.len > 0)
+               free(msg.buf);
+
+       return err;
 }
 
 int connect_monitor(char *devname)
@@ -195,7 +203,6 @@ int fping_monitor(int sfd)
        return err;
 }
 
-
 /* give the monitor a chance to update the metadata */
 int ping_monitor(char *devname)
 {
@@ -206,6 +213,203 @@ int ping_monitor(char *devname)
        return err;
 }
 
+static char *ping_monitor_version(char *devname)
+{
+       int sfd = connect_monitor(devname);
+       struct metadata_update msg;
+       int err = 0;
+
+       if (sfd < 0)
+               return NULL;
+
+       if (ack(sfd, 20) != 0)
+               err = -1;
+
+       if (!err && receive_message(sfd, &msg, 20) != 0)
+               err = -1;
+
+       close(sfd);
+
+       if (err || !msg.len || !msg.buf)
+               return NULL;
+       return msg.buf;
+}
+
+static int unblock_subarray(struct mdinfo *sra, const int unfreeze)
+{
+       char buf[64];
+       int rc = 0;
+
+       if (sra) {
+               sprintf(buf, "external:%s\n", sra->text_version);
+               buf[9] = '/';
+       } else
+               buf[9] = '-';
+
+       if (buf[9] == '-' ||
+           sysfs_set_str(sra, NULL, "metadata_version", buf) ||
+           (unfreeze &&
+            sysfs_attribute_available(sra, NULL, "sync_action") &&
+            sysfs_set_str(sra, NULL, "sync_action", "idle")))
+               rc = -1;
+       return rc;
+}
+
+/**
+ * block_monitor - prevent mdmon spare assignment
+ * @container - container to block
+ * @freeze - flag to additionally freeze sync_action
+ *
+ * This is used by the reshape code to freeze the container, and the
+ * auto-rebuild implementation to atomically move spares.
+ * In both cases we need to stop mdmon from assigning spares to replace
+ * failed devices as we might have other plans for the spare.
+ * For the reshape case we also need to 'freeze' sync_action so that
+ * no recovery happens until we have fully prepared for the reshape.
+ *
+ * We tell mdmon that the array is frozen by marking the 'metadata' name
+ * with a leading '-'.  This previously told mdmon "Don't make this array
+ * read/write, leave it readonly".  Now it means a more general "Don't
+ * reconfigure this array at all".
+ * As older versions of mdmon (which might run from initrd) don't understand
+ * this, we first check that the running mdmon is new enough.
+ */
+int block_monitor(char *container, const int freeze)
+{
+       int devnum = devname2devnum(container);
+       struct mdstat_ent *ent, *e, *e2;
+       struct mdinfo *sra = NULL;
+       char *version = NULL;
+       char buf[64];
+       int rv = 0;
+
+       if (!mdmon_running(devnum)) {
+               /* if mdmon is not active we assume that any instance that is
+                * later started will match the current mdadm version, if this
+                * assumption is violated we may inadvertently rebuild an array
+                * that was meant for reshape, or start rebuild on a spare that
+                * was to be moved to another container
+                */
+               /* pass */;
+       } else {
+               int ver;
+
+               version = ping_monitor_version(container);
+               ver = version ? mdadm_version(version) : -1;
+               free(version);
+               if (ver < 3002000) {
+                       fprintf(stderr, Name
+                               ": mdmon instance for %s cannot be disabled\n",
+                               container);
+                       return -1;
+               }
+       }
+
+       ent = mdstat_read(0, 0);
+       if (!ent) {
+               fprintf(stderr, Name
+                       ": failed to read /proc/mdstat while disabling mdmon\n");
+               return -1;
+       }
+
+       /* freeze container contents */
+       for (e = ent; e; e = e->next) {
+               if (!is_container_member(e, container))
+                       continue;
+               sysfs_free(sra);
+               sra = sysfs_read(-1, e->devnum, GET_VERSION);
+               if (!sra) {
+                       fprintf(stderr, Name
+                               ": failed to read sysfs for subarray%s\n",
+                               to_subarray(e, container));
+                       break;
+               }
+               /* can't reshape an array that we can't monitor */
+               if (sra->text_version[0] == '-')
+                       break;
+
+               if (freeze && sysfs_freeze_array(sra) < 1)
+                       break;
+               /* flag this array to not be modified by mdmon (close race with
+                * takeover in reshape case and spare reassignment in the
+                * auto-rebuild case)
+                */
+               sprintf(buf, "external:%s\n", sra->text_version);
+               buf[9] = '-';
+               if (sysfs_set_str(sra, NULL, "metadata_version", buf))
+                       break;
+               ping_monitor(container);
+
+               /* When freezing, check that we did not race with recovery;
+                * without 'freeze' there is nothing further to verify.
+                */
+               if (!freeze ||
+                   !sysfs_attribute_available(sra, NULL, "sync_action") ||
+                   (sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 &&
+                    strcmp(buf, "frozen\n") == 0))
+                       /* pass */;
+               else
+                       break;
+       }
+
+       if (e) {
+               fprintf(stderr, Name ": failed to freeze subarray%s\n",
+                       to_subarray(e, container));
+
+               /* thaw the partially frozen container */
+               for (e2 = ent; e2 && e2 != e; e2 = e2->next) {
+                       if (!is_container_member(e2, container))
+                               continue;
+                       sysfs_free(sra);
+                       sra = sysfs_read(-1, e2->devnum, GET_VERSION);
+                       if (unblock_subarray(sra, freeze))
+                               fprintf(stderr, Name ": Failed to unfreeze %s\n", e2->dev);
+               }
+
+               ping_monitor(container); /* cleared frozen */
+               rv = -1;
+       }
+
+       sysfs_free(sra);
+       free_mdstat(ent);
+       free(container); /* NOTE(review): not freed on the early returns above - confirm ownership with callers */
+
+       return rv;
+}
+
+void unblock_monitor(char *container, const int unfreeze)
+{
+       struct mdstat_ent *ent, *e;
+       struct mdinfo *sra = NULL;
+       int to_ping = 0;
+
+       ent = mdstat_read(0, 0);
+       if (!ent) {
+               fprintf(stderr, Name
+                       ": failed to read /proc/mdstat while unblocking container\n");
+               return;
+       }
+
+       /* undo block_monitor(): unblock each member array of the container */
+       for (e = ent; e; e = e->next) {
+               if (!is_container_member(e, container))
+                       continue;
+               sysfs_free(sra);
+               sra = sysfs_read(-1, e->devnum, GET_VERSION|GET_LEVEL);
+               if (sra && sra->array.level > 0) /* sysfs_read() may return NULL */
+                       to_ping++;
+               if (unblock_subarray(sra, unfreeze)) /* reports failure for a NULL sra */
+                       fprintf(stderr, Name ": Failed to unfreeze %s\n", e->dev);
+       }
+       if (to_ping)
+               ping_monitor(container);
+
+       sysfs_free(sra);
+       free_mdstat(ent);
+}
+
+
+
 /* give the manager a chance to view the updated container state.  This
  * would naturally happen due to the manager noticing a change in
  * /proc/mdstat; however, pinging encourages this detection to happen
diff --git a/msg.h b/msg.h
index f8e89fdccc989a29923026127d8cfdbba8a4ec3d..1f916debe6aa477b376b2214eedd714b9fa72226 100644 (file)
--- a/msg.h
+++ b/msg.h
@@ -27,6 +27,8 @@ extern int ack(int fd, int tmo);
 extern int wait_reply(int fd, int tmo);
 extern int connect_monitor(char *devname);
 extern int ping_monitor(char *devname);
+extern int block_monitor(char *container, const int freeze);
+extern void unblock_monitor(char *container, const int unfreeze);
 extern int fping_monitor(int sock);
 extern int ping_manager(char *devname);
 
diff --git a/part.h b/part.h
new file mode 100644 (file)
index 0000000..0afea33
--- /dev/null
+++ b/part.h
@@ -0,0 +1,82 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/* Structure definitions for MBR and GPT partition tables
+ */
+
+
+#define        MBR_SIGNATURE_MAGIC     __cpu_to_le16(0xAA55)
+#define MBR_PARTITIONS               4
+
+struct MBR_part_record {
+  __u8 bootable;
+  __u8 first_head;
+  __u8 first_sector;
+  __u8 first_cyl;
+  __u8 part_type;
+  __u8 last_head;
+  __u8 last_sector;
+  __u8 last_cyl;
+  __u32 first_sect_lba;
+  __u32 blocks_num;
+};
+
+struct MBR {
+       __u8 pad[446];
+       struct MBR_part_record parts[MBR_PARTITIONS];
+       __u16 magic;
+} __attribute__((packed));
+
+
+
+#define        GPT_SIGNATURE_MAGIC     __cpu_to_le64(0x5452415020494645ULL)
+#define MBR_GPT_PARTITION_TYPE       0xEE
+
+struct GPT_part_entry {
+       unsigned char type_guid[16];
+       unsigned char partition_guid[16];
+       __u64 starting_lba;
+       __u64 ending_lba;
+       unsigned char attr_bits[8];
+       unsigned char name[72];
+} __attribute__((packed));
+
+struct GPT {
+       __u64 magic;
+       __u32 revision;
+       __u32 header_size;
+       __u32 crc;
+       __u32 pad1;
+       __u64 current_lba;
+       __u64 backup_lba;
+       __u64 first_lba;
+       __u64 last_lba;
+       __u8 guid[16];
+       __u64 part_start;
+       __u32 part_cnt;
+       __u32 part_size;
+       __u32 part_crc;
+       __u8 pad2[420];
+} __attribute__((packed));
diff --git a/policy.c b/policy.c
new file mode 100644 (file)
index 0000000..ba976db
--- /dev/null
+++ b/policy.c
@@ -0,0 +1,766 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <dirent.h>
+#include <fnmatch.h>
+#include <ctype.h>
+#include "dlink.h"
+/*
+ * Policy module for mdadm.
+ * A policy statement about a device lists a set of values for each
+ * of a set of names.  Each value can have a metadata type as context.
+ *
+ * names include:
+ *   action - the actions that can be taken on hot-plug
+ *   domain - the domain(s) that the device is part of
+ *
+ * Policy information is extracted from various sources, but
+ * particularly from a set of policy rules in mdadm.conf
+ */
+
+static void pol_new(struct dev_policy **pol, char *name, const char *val,
+                   const char *metadata)
+{
+       struct dev_policy *n = malloc(sizeof(*n));
+       const char *real_metadata = NULL;
+       int i;
+
+       n->name = name;
+       n->value = val;
+
+       /* We need to normalise the metadata name */
+       if (metadata) {
+               for (i = 0; superlist[i] ; i++)
+                       if (strcmp(metadata, superlist[i]->name) == 0) {
+                               real_metadata = superlist[i]->name;
+                               break;
+                       }
+               if (!real_metadata) {
+                       if (strcmp(metadata, "1") == 0 ||
+                           strcmp(metadata, "1.0") == 0 ||
+                           strcmp(metadata, "1.1") == 0 ||
+                           strcmp(metadata, "1.2") == 0)
+                               real_metadata = super1.name;
+               }
+               if (!real_metadata) {
+                       static const char *prev = NULL;
+                       if (prev != metadata) {
+                               fprintf(stderr, Name ": metadata=%s unrecognised - ignoring rule\n",
+                                       metadata);
+                               prev = metadata;
+                       }
+                       real_metadata = "unknown";
+               }
+       }
+
+       n->metadata = real_metadata;
+       n->next = *pol;
+       *pol = n;
+}
+
+static int pol_lesseq(struct dev_policy *a, struct dev_policy *b)
+{
+       int cmp;
+
+       if (a->name < b->name)
+               return 1;
+       if (a->name > b->name)
+               return 0;
+
+       cmp = strcmp(a->value, b->value);
+       if (cmp < 0)
+               return 1;
+       if (cmp > 0)
+               return 0;
+
+       return (a->metadata <= b->metadata);
+}
+
+static void pol_sort(struct dev_policy **pol)
+{
+       /* sort policy list in *pol by name/value/metadata
+        * (the order defined by pol_lesseq) using merge sort
+        */
+
+       struct dev_policy *pl[2];
+       pl[0] = *pol;
+       pl[1] = NULL;
+
+       do {
+               struct dev_policy **plp[2], *p[2];
+               int curr = 0;
+               struct dev_policy nul = { NULL, NULL, NULL, NULL };
+               struct dev_policy *prev = &nul;
+               int next = 0;
+
+               /* p[] are the two lists that we are merging.
+                * plp[] are the ends of the two lists we create
+                * from the merge.
+                * 'curr' is which of plp[] that we are currently
+                *   adding items to.
+                * 'next' is which of p[] we will take the next
+                *   item from.
+                * 'prev' is the last value, which was placed in
+                * plp[curr].
+                */
+               plp[0] = &pl[0];
+               plp[1] = &pl[1];
+               p[0] = pl[0];
+               p[1] = pl[1];
+
+               /* take least of p[0] and p[1]
+                * if it is larger than prev, add to
+                * plp[curr], else swap curr then add
+                */
+               while (p[0] || p[1]) {
+                       if (p[next] == NULL ||
+                           (p[1-next] != NULL &&
+                            !(pol_lesseq(prev, p[1-next])
+                              ^pol_lesseq(prev, p[next])
+                              ^pol_lesseq(p[next], p[1-next])))
+                               )
+                               next = 1 - next;
+
+                       if (!pol_lesseq(prev, p[next]))
+                               curr = 1 - curr;
+
+                       *plp[curr] = prev = p[next];
+                       plp[curr] = &p[next]->next;
+                       p[next] = p[next]->next;
+               }
+               *plp[0] = NULL;
+               *plp[1] = NULL;
+       } while (pl[0] && pl[1]);
+       if (pl[0])
+               *pol = pl[0];
+       else
+               *pol = pl[1];
+}
+
+static void pol_dedup(struct dev_policy *pol)
+{
+       /* This is a sorted list - remove duplicates. */
+       while (pol && pol->next) {
+               if (pol_lesseq(pol->next, pol)) {
+                       struct dev_policy *tmp = pol->next;
+                       pol->next = tmp->next;
+                       free(tmp);
+               } else
+                       pol = pol->next;
+       }
+}
+
+/*
+ * pol_find finds the first entry in the policy
+ * list to match name.
+ * If it returns non-NULL there is at least one
+ * value, but how many can only be found by
+ * iterating through the list.
+ */
+struct dev_policy *pol_find(struct dev_policy *pol, char *name)
+{
+       while (pol && pol->name < name)
+               pol = pol->next;
+
+       if (!pol || pol->name != name)
+               return NULL;
+       return pol;
+}
+
+static char *disk_path(struct mdinfo *disk)
+{
+       struct stat stb;
+       int prefix_len;
+       DIR *by_path;
+       char symlink[PATH_MAX] = "/dev/disk/by-path/";
+       struct dirent *ent;
+
+       by_path = opendir(symlink);
+       if (!by_path)
+               return NULL;
+       prefix_len = strlen(symlink);
+       /* find the by-path symlink that resolves to this block device */
+       while ((ent = readdir(by_path)) != NULL) {
+               if (ent->d_type != DT_LNK)
+                       continue;
+               snprintf(symlink + prefix_len,
+                        sizeof(symlink) - prefix_len,
+                        "%s", ent->d_name); /* always NUL-terminated */
+               if (stat(symlink, &stb) < 0)
+                       continue;
+               if ((stb.st_mode & S_IFMT) != S_IFBLK)
+                       continue;
+               if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor))
+                       continue;
+               closedir(by_path); /* invalidates 'ent': return our own copy */
+               return strdup(symlink + prefix_len);
+       }
+       closedir(by_path);
+       return NULL;
+}
+
+char type_part[] = "part";
+char type_disk[] = "disk";
+static char *disk_type(struct mdinfo *disk)
+{
+       char buf[30+20+20];
+       struct stat stb;
+       sprintf(buf, "/sys/dev/block/%d:%d/partition",
+               disk->disk.major, disk->disk.minor);
+       if (stat(buf, &stb) == 0)
+               return type_part;
+       else
+               return type_disk;
+}
+
+static int pol_match(struct rule *rule, char *path, char *type)
+{
+       /* Decide whether a rule list accepts this path and type.
+        * Each state is 0 while no rule of that kind has been seen,
+        * -1 once one was seen but none matched, and 1 as soon as
+        * any of them matches.  The list is accepted unless a kind
+        * of rule was present and never matched.
+        */
+       int path_state = 0;
+       int type_state = 0;
+       struct rule *r;
+
+       for (r = rule; r != NULL; r = r->next) {
+               if (r->name == rule_path) {
+                       if (fnmatch(r->value, path, 0) == 0)
+                               path_state = 1;
+                       else if (path_state == 0)
+                               path_state = -1;
+               } else if (r->name == rule_type) {
+                       if (strcmp(r->value, type) == 0)
+                               type_state = 1;
+                       else if (type_state == 0)
+                               type_state = -1;
+               }
+       }
+       return path_state >= 0 && type_state >= 0;
+}
+
+static void pol_merge(struct dev_policy **pol, struct rule *rule)
+{
+       /* Add every "action" and "domain" assignment found in the
+        * rule list to *pol.  Each is tagged with the value of the
+        * last "metadata" entry in the list (NULL if there is none).
+        */
+       char *md = NULL;
+       struct rule *cur;
+
+       cur = rule;
+       while (cur) {
+               if (cur->name == pol_metadata)
+                       md = cur->value;
+               cur = cur->next;
+       }
+
+       for (cur = rule; cur; cur = cur->next) {
+               if (cur->name != pol_act && cur->name != pol_domain)
+                       continue;
+               pol_new(pol, cur->name, cur->value, md);
+       }
+}
+
+static int path_has_part(char *path, char **part)
+{
+       /* check if path ends with "-partNN" and
+        * if it does, place a pointer to "-partNN"
+        * (i.e. to the '-') in 'part' and return 1.
+        * Otherwise return 0 and leave 'part' untouched.
+        * The caller relies on *part being the '-': it
+        * overwrites it with NUL to get the whole-disk path,
+        * then restores it with '-'.
+        */
+       int l = strlen(path);
+       /* cast: passing a negative char to isdigit() is undefined */
+       while (l > 1 && isdigit((unsigned char)path[l-1]))
+               l--;
+       if (l < 5 || strncmp(path+l-5, "-part", 5) != 0)
+               return 0;
+       *part = path+l-5;
+       return 1;
+}
+
+static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part)
+{
+       /* Copy any name assignments from rule into pol, appending
+        * "-<part>" to the value of every domain.  "action" entries
+        * are copied unchanged.  The string with the suffix appended
+        * is kept on the rule's 'dups' list so that its lifetime
+        * matches that of the rule; policy_free() releases that list
+        * with free_line().
+        */
+       struct rule *r;
+       char *metadata = NULL;
+       /* the last "metadata" entry in the list tags every policy added */
+       for (r = rule; r ; r = r->next)
+               if (r->name == pol_metadata)
+                       metadata = r->value;
+
+       for (r = rule; r ; r = r->next) {
+               if (r->name == pol_act)
+                       pol_new(pol, r->name, r->value, metadata);
+               else if (r->name == pol_domain) {
+                       char *dom;
+                       int len;
+                       /* create the cache list on first use */
+                       if (r->dups == NULL)
+                               r->dups = dl_head();
+                       len = strlen(r->value);
+                       /* cached entries are "<value>-<part>", so the
+                        * part starts len+1 characters in; reuse an
+                        * entry built earlier for the same part
+                        */
+                       for (dom = dl_next(r->dups); dom != r->dups;
+                            dom = dl_next(dom))
+                               if (strcmp(dom+len+1, part)== 0)
+                                       break;
+                       if (dom == r->dups) {
+                               /* not cached yet.
+                                * NOTE(review): presumably dl_strndup()
+                                * allocates room for the requested length
+                                * plus a NUL and copies r->value - confirm
+                                * in dlink.c, the strcat()s depend on it.
+                                */
+                               char *newdom = dl_strndup(
+                                       r->value, len + 1 + strlen(part));
+                               strcat(strcat(newdom, "-"), part);
+                               dl_add(r->dups, newdom);
+                               dom = newdom;
+                       }
+                       pol_new(pol, r->name, dom, metadata);
+               }
+       }
+}
+
+static struct pol_rule *config_rules = NULL;
+static struct pol_rule **config_rules_end = NULL;
+static int config_rules_has_path = 0;
+
+/*
+ * Most policy comes from a set of policy rules that are
+ * read from the config file.
+ * path_policy() gathers policy information for the
+ * disk described by the given 'path' and 'type'.
+ */
+struct dev_policy *path_policy(char *path, char *type)
+{
+       /* Build the policy list for a device from the config rules
+        * plus whatever each metadata handler knows about the path.
+        * Returns NULL when 'type' is NULL; otherwise the sorted,
+        * de-duplicated list (which may be empty, i.e. NULL).
+        */
+       struct dev_policy *result = NULL;
+       struct pol_rule *pr;
+       int n;
+
+       if (type == NULL)
+               return NULL;
+
+       for (pr = config_rules; pr; pr = pr->next) {
+               char *part;
+
+               if (pr->type == rule_policy &&
+                   pol_match(pr->rule, path, type))
+                       pol_merge(&result, pr->rule);
+
+               /* part-policy rules only concern partitions whose
+                * path carries a partition suffix
+                */
+               if (pr->type != rule_part || strcmp(type, type_part) != 0)
+                       continue;
+               if (!path_has_part(path, &part))
+                       continue;
+
+               /* temporarily terminate the path at 'part' so the rule
+                * is matched as a whole disk; restored just below
+                */
+               *part = 0;
+               if (pol_match(pr->rule, path, type_disk))
+                       pol_merge_part(&result, pr->rule, part+1);
+               *part = '-';
+       }
+
+       /* Now add any metadata-specific internal knowledge
+        * about this path
+        */
+       for (n = 0; superlist[n]; n++) {
+               const char *dom;
+
+               if (!superlist[n]->get_disk_controller_domain)
+                       continue;
+               dom = superlist[n]->get_disk_controller_domain(path);
+               if (dom)
+                       pol_new(&result, pol_domain, dom, superlist[n]->name);
+       }
+
+       pol_sort(&result);
+       pol_dedup(result);
+       return result;
+}
+
+/* Add one name/value pair, tagged with 'metadata', to the policy list
+ * at *pol, then re-sort and de-duplicate the whole list.
+ */
+void pol_add(struct dev_policy **pol,
+                   char *name, char *val,
+                   char *metadata)
+{
+       pol_new(pol, name, val, metadata);
+       pol_sort(pol);
+       pol_dedup(*pol);
+}
+
+
+/*
+ * disk_policy() gathers policy information for the
+ * disk described in the given mdinfo (disk.{major,minor}).
+ */
+struct dev_policy *disk_policy(struct mdinfo *disk)
+{
+       /* The by-path name is only looked up when some config rule
+        * mentions a path; without a path no policy is returned.
+        * The caller owns the returned list (see dev_policy_free()).
+        */
+       struct dev_policy *result;
+       char *by_path = NULL;
+       char *type = disk_type(disk);
+
+       if (type == NULL)
+               return NULL;
+       if (config_rules_has_path)
+               by_path = disk_path(disk);
+       if (by_path == NULL)
+               return NULL;
+
+       result = path_policy(by_path, type);
+       free(by_path);
+       return result;
+}
+
+/* Convenience wrapper: gather policy for the device number 'dev' by
+ * splitting it into major/minor and calling disk_policy().
+ */
+struct dev_policy *devnum_policy(int dev)
+{
+       /* only disk.major and disk.minor are filled in; the rest of
+        * 'disk' is left uninitialised
+        */
+       struct mdinfo disk;
+       disk.disk.major = major(dev);
+       disk.disk.minor = minor(dev);
+       return disk_policy(&disk);
+}
+
+/*
+ * process policy rules read from config file.
+ */
+
+/* Keywords that select which devices a rule applies to.  try_rule()
+ * stores a pointer to the array itself as the rule's name, and the
+ * matching code compares names by pointer, not with strcmp().
+ */
+char rule_path[] = "path";
+char rule_type[] = "type";
+
+/* Rule-line types, compared by pointer against pol_rule->type. */
+char rule_policy[] = "policy";
+char rule_part[] = "part-policy";
+
+/* Keywords that assign policy within a rule. */
+char pol_metadata[] = "metadata";
+char pol_act[] = "action";
+char pol_domain[] = "domain";
+char pol_auto[] = "auto";
+
+static int try_rule(char *w, char *name, struct rule **rp)
+{
+       /* If word 'w' has the form "<name>=<value>", push a new rule
+        * holding 'name' (the pointer itself) and a copy of <value>
+        * onto the front of *rp and return 1.  Otherwise return 0.
+        */
+       struct rule *entry;
+       int klen = strlen(name);
+
+       if (strncmp(w, name, klen) != 0)
+               return 0;
+       if (w[klen] != '=')
+               return 0;
+
+       entry = malloc(sizeof(*entry));
+       entry->name = name;
+       entry->value = strdup(w + klen + 1);
+       entry->dups = NULL;
+       entry->next = *rp;
+       *rp = entry;
+       return 1;
+}
+
+void policyline(char *line, char *type)
+{
+       /* Turn one config line (a dl list of words) into a pol_rule of
+        * the given type and put it at the head of config_rules.
+        * Words that are not a recognised "keyword=value" are reported
+        * and ignored.
+        */
+       struct pol_rule *pr;
+       char *word;
+
+       if (config_rules_end == NULL)
+               config_rules_end = &config_rules;
+
+       pr = malloc(sizeof(*pr));
+       pr->type = type;
+       pr->rule = NULL;
+       for (word = dl_next(line); word != line ; word = dl_next(word)) {
+               if (try_rule(word, rule_path, &pr->rule)) {
+                       /* remember that by-path lookups are now needed */
+                       config_rules_has_path = 1;
+                       continue;
+               }
+               if (try_rule(word, rule_type, &pr->rule) ||
+                   try_rule(word, pol_metadata, &pr->rule) ||
+                   try_rule(word, pol_act, &pr->rule) ||
+                   try_rule(word, pol_domain, &pr->rule) ||
+                   try_rule(word, pol_auto, &pr->rule))
+                       continue;
+               fprintf(stderr, Name ": policy rule %s unrecognised and ignored\n",
+                       word);
+       }
+       pr->next = config_rules;
+       config_rules = pr;
+}
+
+/*
+ * Add a rule of the given type built from the variable arguments,
+ * which are (char *name, char *value) pairs terminated by a NULL name.
+ * 'name' is stored as given (callers pass the keyword arrays so that
+ * pointer comparison works); 'value' is copied.  The new rule goes at
+ * the head of config_rules.
+ */
+void policy_add(char *type, ...)
+{
+       va_list ap;
+       struct pol_rule *pr;
+       char *name, *val;
+
+       pr = malloc(sizeof(*pr));
+       pr->type = type;
+       pr->rule = NULL;
+
+       va_start(ap, type);
+       while ((name = va_arg(ap, char*)) != NULL) {
+               struct rule *r;
+
+               val = va_arg(ap, char*);
+               r = malloc(sizeof(*r));
+               r->next = pr->rule;
+               r->name = name;
+               r->value = strdup(val);
+               r->dups = NULL;
+               pr->rule = r;
+       }
+       /* every va_start() must be paired with va_end() */
+       va_end(ap);
+       pr->next = config_rules;
+       config_rules = pr;
+}
+
+void policy_free(void)
+{
+       /* Release every configured rule together with its values and
+        * cached domain strings, and reset the bookkeeping globals.
+        */
+       struct pol_rule *pr;
+
+       while ((pr = config_rules) != NULL) {
+               struct rule *r = pr->rule;
+
+               config_rules = pr->next;
+
+               while (r) {
+                       struct rule *doomed = r;
+
+                       r = r->next;
+                       free(doomed->value);
+                       if (doomed->dups)
+                               free_line(doomed->dups);
+                       free(doomed);
+               }
+               free(pr);
+       }
+       config_rules_end = NULL;
+       config_rules_has_path = 0;
+}
+
+void dev_policy_free(struct dev_policy *p)
+{
+       /* free each node of the list; the strings the nodes point to
+        * are not freed here
+        */
+       while (p != NULL) {
+               struct dev_policy *next = p->next;
+
+               free(p);
+               p = next;
+       }
+}
+
+static enum policy_action map_act(const char *act)
+{
+       /* translate the text of an "action=" value into its enum;
+        * anything unrecognised maps to act_err
+        */
+       static const struct {
+               const char *word;
+               enum policy_action action;
+       } known[] = {
+               { "include",         act_include },
+               { "re-add",          act_re_add },
+               { "spare",           act_spare },
+               { "spare-same-slot", act_spare_same_slot },
+               { "force-spare",     act_force_spare },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
+               if (strcmp(act, known[i].word) == 0)
+                       return known[i].action;
+       return act_err;
+}
+
+static enum policy_action policy_action(struct dev_policy *plist, const char *metadata)
+{
+       /* Return the highest-valued action among the "action" entries
+        * of plist visited for 'metadata', or act_default if there
+        * are none.
+        */
+       enum policy_action best = act_default;
+       struct dev_policy *ent;
+
+       plist = pol_find(plist, pol_act);
+       pol_for_each(ent, plist, metadata) {
+               enum policy_action cand = map_act(ent->value);
+
+               best = (cand > best) ? cand : best;
+       }
+       return best;
+}
+
+int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want)
+{
+       /* true when the effective action for 'metadata' is valid and
+        * ranks at least as high as 'want'
+        */
+       enum policy_action got = policy_action(plist, metadata);
+
+       return got != act_err && got >= want;
+}
+
+/* Like policy_action_allows(), but gathers the policy for 'disk'
+ * itself and frees it again before returning.
+ */
+int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want)
+{
+       struct dev_policy *pol = disk_policy(disk);
+       int rv = policy_action_allows(pol, metadata, want);
+
+       dev_policy_free(pol);
+       return rv;
+}
+
+
+/* Domain policy:
+ * Any device can have a list of domains asserted by different policy
+ * statements.
+ * An array also has a list of domains comprising all the domains of
+ * all the devices in an array.
+ * Where an array has a spare-group, that becomes an additional domain for
+ * every device in the array and thus for the array.
+ *
+ * We keep the list of domains in a sorted linked list
+ * As dev policies are already sorted, this is fairly easy to manage.
+ */
+
+static struct domainlist **domain_merge_one(struct domainlist **domp,
+                                           const char *domain)
+{
+       /* Insert 'domain' into the sorted list unless it is already
+        * present.  Returns the link that points at the matching or
+        * newly inserted entry.  The name is stored by pointer, not
+        * copied.
+        */
+       struct domainlist *cur;
+
+       /* advance to the first entry that does not sort before 'domain' */
+       for (cur = *domp; cur && strcmp(cur->dom, domain) < 0; cur = *domp)
+               domp = &cur->next;
+
+       if (cur && strcmp(cur->dom, domain) == 0)
+               return domp;
+
+       cur = malloc(sizeof(*cur));
+       cur->dom = domain;
+       cur->next = *domp;
+       *domp = cur;
+       return domp;
+}
+
+#if (DEBUG)
+void dump_policy(struct dev_policy *policy)
+{
+       /* debug aid: print every entry of a policy list */
+       struct dev_policy *ent;
+
+       for (ent = policy; ent; ent = ent->next)
+               dprintf("policy: %p name: %s value: %s metadata: %s\n",
+                       ent, ent->name, ent->value, ent->metadata);
+}
+#endif
+
+void domain_merge(struct domainlist **domp, struct dev_policy *pollist,
+                        const char *metadata)
+{
+       /* Add to 'domp' all the domains in pollist that apply to
+        * 'metadata' and are not already in domp.  The domain list
+        * stores pointers to the policy values, it does not copy them.
+        */
+       struct dev_policy *pol;
+       /* skip ahead to the first "domain" entry of the list */
+       pollist = pol_find(pollist, pol_domain);
+       pol_for_each(pol, pollist, metadata)
+               domain_merge_one(domp, pol->value);
+}
+
+int domain_test(struct domainlist *dom, struct dev_policy *pol,
+               const char *metadata)
+{
+       /* Return 1 only if pol has at least one domain (for metadata)
+        * and every such domain also appears in dom.  Both lists are
+        * sorted, so dom is walked forward once.  A device without
+        * any domain is unknown to us and is rejected.
+        */
+       int seen = 0;
+       struct dev_policy *ent;
+
+       pol = pol_find(pol, pol_domain);
+       pol_for_each(ent, pol, metadata) {
+               seen = 1;
+               for (; dom; dom = dom->next)
+                       if (strcmp(dom->dom, ent->value) >= 0)
+                               break;
+               if (dom == NULL || strcmp(dom->dom, ent->value) != 0)
+                       return 0;
+       }
+       return seen;
+}
+
+/* Merge the domains of device number 'devnum' (for 'metadata') into *dom. */
+void domainlist_add_dev(struct domainlist **dom, int devnum, const char *metadata)
+{
+       struct dev_policy *pol = devnum_policy(devnum);
+       domain_merge(dom, pol, metadata);
+       /* only the policy list nodes are freed here; the domain list
+        * keeps pointing at the value strings
+        */
+       dev_policy_free(pol);
+}
+
+struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata)
+{
+       /* collect the domains of every device listed in mdi->devs;
+        * the caller releases the result with domain_free()
+        */
+       struct domainlist *result = NULL;
+       struct mdinfo *dev;
+
+       for (dev = mdi->devs; dev; dev = dev->next) {
+               int devnum = makedev(dev->disk.major, dev->disk.minor);
+
+               domainlist_add_dev(&result, devnum, metadata);
+       }
+       return result;
+}
+
+/* Add a single named domain to the sorted list at *domp (no-op if it
+ * is already present).  'domain' is stored by pointer, not copied.
+ */
+void domain_add(struct domainlist **domp, char *domain)
+{
+       domain_merge_one(domp, domain);
+}
+
+
+void domain_free(struct domainlist *dl)
+{
+       /* free the list nodes only; the names they point to are not
+        * freed here
+        */
+       struct domainlist *next;
+
+       for (; dl; dl = next) {
+               next = dl->next;
+               free(dl);
+       }
+}
+
+/*
+ * same-path policy.
+ * Some policy decisions are guided by knowledge of which
+ * array previously owned the device at a given physical location (path).
+ * When removing a device from an array we might record the array against
+ * the path, and when finding a new device, we might look for which
+ * array previously used that path.
+ *
+ * The 'array' is described by a map_ent, and the path by the disk in an
+ * mdinfo, or by a string.
+ */
+
+void policy_save_path(char *id_path, struct map_ent *array)
+{
+       /* Record, in FAILED_SLOTS_DIR/<id_path>, the metadata name and
+        * uuid of 'array' so that a later policy_check_path() can tell
+        * which array used this path.  Failures are reported on stderr
+        * only.
+        */
+       char cookie[PATH_MAX];
+       FILE *out;
+       int written;
+
+       /* the directory may already exist; that is not an error */
+       if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) {
+               fprintf(stderr, Name ": can't create file to save path "
+                       "to old disk: %s\n", strerror(errno));
+               return;
+       }
+
+       snprintf(cookie, sizeof(cookie), FAILED_SLOTS_DIR "/%s", id_path);
+       out = fopen(cookie, "w");
+       if (out == NULL) {
+               fprintf(stderr, Name ": can't create file to"
+                       " save path to old disk: %s\n",
+                       strerror(errno));
+               return;
+       }
+
+       written = fprintf(out, "%s %08x:%08x:%08x:%08x\n",
+                         array->metadata,
+                         array->uuid[0], array->uuid[1],
+                         array->uuid[2], array->uuid[3]);
+       if (written <= 0)
+               fprintf(stderr, Name ": Failed to write to "
+                       "<id_path> cookie\n");
+
+       fclose(out);
+}
+
+/*
+ * Look up which array previously used the path of 'disk'.
+ * Reads FAILED_SLOTS_DIR/<by-path name>, as written by
+ * policy_save_path(), into array->metadata and array->uuid.
+ * Returns 1 if all five fields were read, 0 otherwise (no by-path
+ * name, no such file, or a malformed file).
+ */
+int policy_check_path(struct mdinfo *disk, struct map_ent *array)
+{
+       char path[PATH_MAX];
+       FILE *f = NULL;
+       char *id_path = disk_path(disk);
+       int rv;
+
+       if (!id_path)
+               return 0;
+
+       snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path);
+       /* disk_path() returns malloc()ed memory; it is no longer needed
+        * once the file name has been built, so release it here to
+        * cover every return path below
+        */
+       free(id_path);
+
+       f = fopen(path, "r");
+       if (!f)
+               return 0;
+
+       /* NOTE(review): "%s" is unbounded - confirm the size of
+        * array->metadata and add a matching field width.
+        */
+       rv = fscanf(f, " %s %x:%x:%x:%x\n",
+                   array->metadata,
+                   array->uuid,
+                   array->uuid+1,
+                   array->uuid+2,
+                   array->uuid+3);
+       fclose(f);
+       return rv == 5;
+}
index 3074693bd2c3f8f36faa4f63ed76a01cacaf07d1..d33dbbad81005992ce46f1eb4e6eae7317417167 100644 (file)
@@ -43,6 +43,11 @@ static int geo_map(int block, unsigned long long stripe, int raid_disks,
         */
        int pd;
 
+       /* layout is not relevant for raid0 and raid4 */
+       if ((level == 0) ||
+           (level == 4))
+               layout = 0;
+
        switch(level*100 + layout) {
        case 000:
        case 400:
index dba59703a03cc2f054e70e1c8f3a5b74677c3ee7..b3890aae7d63849c20d9335c6df5ecd1a57474b2 100644 (file)
@@ -760,7 +760,7 @@ static int load_ddf_local(int fd, struct ddf_super *super,
 
 #ifndef MDASSEMBLE
 static int load_super_ddf_all(struct supertype *st, int fd,
-                             void **sbp, char *devname, int keep_fd);
+                             void **sbp, char *devname);
 #endif
 
 static void free_super_ddf(struct supertype *st);
@@ -774,11 +774,9 @@ static int load_super_ddf(struct supertype *st, int fd,
 
 #ifndef MDASSEMBLE
        /* if 'fd' is a container, load metadata from all the devices */
-       if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
+       if (load_super_ddf_all(st, fd, &st->sb, devname) == 0)
                return 0;
 #endif
-       if (st->subarray[0])
-               return 1; /* FIXME Is this correct */
 
        if (get_dev_size(fd, devname, &dsize) == 0)
                return 1;
@@ -844,26 +842,6 @@ static int load_super_ddf(struct supertype *st, int fd,
                return rv;
        }
 
-       if (st->subarray[0]) {
-               unsigned long val;
-               struct vcl *v;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free(super);
-                       return 1;
-               }
-
-               for (v = super->conflist; v; v = v->next)
-                       if (v->vcnum == val)
-                               super->currentconf = v;
-               if (!super->currentconf) {
-                       free(super);
-                       return 1;
-               }
-       }
-
        /* Should possibly check the sections .... */
 
        st->sb = super;
@@ -872,7 +850,6 @@ static int load_super_ddf(struct supertype *st, int fd,
                st->minor_version = 0;
                st->max_devs = 512;
        }
-       st->loaded_container = 0;
        return 0;
 
 }
@@ -915,6 +892,7 @@ static struct supertype *match_metadata_desc_ddf(char *arg)
 
        st = malloc(sizeof(*st));
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super_ddf;
        st->max_devs = 512;
        st->minor_version = 0;
@@ -1187,7 +1165,7 @@ static void examine_super_ddf(struct supertype *st, char *homehost)
        examine_pds(sb);
 }
 
-static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map);
 
 static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
 
@@ -1197,7 +1175,7 @@ static void brief_examine_super_ddf(struct supertype *st, int verbose)
         */
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
 
        printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5);
@@ -1211,7 +1189,7 @@ static void brief_examine_subarrays_ddf(struct supertype *st, int verbose)
        struct mdinfo info;
        unsigned int i;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
 
        for (i = 0; i < __be16_to_cpu(ddf->virt->max_vdes); i++) {
@@ -1233,7 +1211,7 @@ static void export_examine_super_ddf(struct supertype *st)
 {
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("MD_METADATA=ddf\n");
        printf("MD_LEVEL=container\n");
@@ -1259,7 +1237,7 @@ static void brief_detail_super_ddf(struct supertype *st)
 //     struct ddf_super *ddf = st->sb;
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_ddf(st, &info);
+       getinfo_super_ddf(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf,':');
        printf(" UUID=%s", nbuf + 5);
 }
@@ -1346,14 +1324,15 @@ static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
        memcpy(uuid, buf, 4*4);
 }
 
-static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map);
 
-static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct ddf_super *ddf = st->sb;
+       int map_disks = info->array.raid_disks;
 
        if (ddf->currentconf) {
-               getinfo_super_ddf_bvd(st, info);
+               getinfo_super_ddf_bvd(st, info, map);
                return;
        }
 
@@ -1397,17 +1376,29 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
 
        uuid_from_super_ddf(st, info->uuid);
 
+       if (map) {
+               int i;
+               for (i = 0 ; i < map_disks; i++) {
+                       if (i < info->array.raid_disks &&
+                           (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) &&
+                           !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed))
+                               map[i] = 1;
+                       else
+                               map[i] = 0;
+               }
+       }
 }
 
 static int rlq_to_layout(int rlq, int prl, int raiddisks);
 
-static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct ddf_super *ddf = st->sb;
        struct vcl *vc = ddf->currentconf;
        int cd = ddf->currentdev;
        int j;
        struct dl *dl;
+       int map_disks = info->array.raid_disks;
 
        /* FIXME this returns BVD info - what if we want SVD ?? */
 
@@ -1457,12 +1448,11 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
 
        uuid_from_super_ddf(st, info->uuid);
 
-       info->container_member = atoi(st->subarray);
        info->array.major_version = -1;
        info->array.minor_version = -2;
-       sprintf(info->text_version, "/%s/%s",
+       sprintf(info->text_version, "/%s/%d",
                devnum2devname(st->container_dev),
-               st->subarray);
+               info->container_member);
        info->safe_mode_delay = 200;
 
        memcpy(info->name, ddf->virt->entries[info->container_member].name, 16);
@@ -1470,6 +1460,18 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
        for(j=0; j<16; j++)
                if (info->name[j] == ' ')
                        info->name[j] = 0;
+
+       if (map)
+               for (j = 0; j < map_disks; j++) {
+                       map[j] = 0;
+                       if (j <  info->array.raid_disks) {
+                               int i = find_phys(ddf, vc->conf.phys_refnum[j]);
+                               if (i >= 0 && 
+                                   (__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Online) &&
+                                   !(__be16_to_cpu(ddf->phys->entries[i].state) & DDF_Failed))
+                                       map[i] = 1;
+                       }
+               }
 }
 
 
@@ -1515,28 +1517,27 @@ static int update_super_ddf(struct supertype *st, struct mdinfo *info,
 
        if (strcmp(update, "grow") == 0) {
                /* FIXME */
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
 //             info->resync_checkpoint = 0;
-       }
-       /* We ignore UUID updates as they make even less sense
-        * with DDF
-        */
-       if (strcmp(update, "homehost") == 0) {
+       } else if (strcmp(update, "homehost") == 0) {
                /* homehost is stored in controller->vendor_data,
                 * or it is when we are the vendor
                 */
 //             if (info->vendor_is_local)
 //                     strcpy(ddf->controller.vendor_data, homehost);
-       }
-       if (strcmp(update, "name") == 0) {
+               rv = -1;
+       } else if (strcmp(update, "name") == 0) {
                /* name is stored in virtual_entry->name */
 //             memset(ve->name, ' ', 16);
 //             strncpy(ve->name, info->name, 16);
-       }
-       if (strcmp(update, "_reshape_progress") == 0) {
+               rv = -1;
+       } else if (strcmp(update, "_reshape_progress") == 0) {
                /* We don't support reshape yet */
-       }
+       } else if (strcmp(update, "assemble") == 0 ) {
+               /* Do nothing, just succeed */
+               rv = 0;
+       } else
+               rv = -1;
 
 //     update_all_csum(ddf);
 
@@ -1960,6 +1961,19 @@ static int init_super_ddf_bvd(struct supertype *st,
                return 0;
        }
 
+       if (name)
+               for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
+                       if (!all_ff(ddf->virt->entries[venum].guid)) {
+                               char *n = ddf->virt->entries[venum].name;
+
+                               if (strncmp(name, n, 16) == 0) {
+                                       fprintf(stderr, Name ": This ddf already"
+                                               " has an array called %s\n",
+                                               name);
+                                       return 0;
+                               }
+                       }
+
        for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
                if (all_ff(ddf->virt->entries[venum].guid))
                        break;
@@ -1999,7 +2013,6 @@ static int init_super_ddf_bvd(struct supertype *st,
        }
        vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
        vcl->vcnum = venum;
-       sprintf(st->subarray, "%d", venum);
        vcl->block_sizes = NULL; /* FIXME not for CONCAT */
 
        vc = &vcl->conf;
@@ -2659,7 +2672,7 @@ static int validate_geometry_ddf(struct supertype *st,
                 * and try to create a bvd
                 */
                struct ddf_super *ddf;
-               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) {
                        st->sb = ddf;
                        st->container_dev = fd2devnum(cfd);
                        close(cfd);
@@ -2806,7 +2819,7 @@ static int validate_geometry_ddf_bvd(struct supertype *st,
 }
 
 static int load_super_ddf_all(struct supertype *st, int fd,
-                             void **sbp, char *devname, int keep_fd)
+                             void **sbp, char *devname)
 {
        struct mdinfo *sra;
        struct ddf_super *super;
@@ -2862,49 +2875,35 @@ static int load_super_ddf_all(struct supertype *st, int fd,
                int rv;
 
                sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
-               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               dfd = dev_open(nm, O_RDWR);
                if (dfd < 0)
                        return 2;
                rv = load_ddf_headers(dfd, super, NULL);
                if (rv == 0)
-                       rv = load_ddf_local(dfd, super, NULL, keep_fd);
-               if (!keep_fd) close(dfd);
+                       rv = load_ddf_local(dfd, super, NULL, 1);
                if (rv)
                        return 1;
        }
-       if (st->subarray[0]) {
-               unsigned long val;
-               struct vcl *v;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free(super);
-                       return 1;
-               }
-
-               for (v = super->conflist; v; v = v->next)
-                       if (v->vcnum == val)
-                               super->currentconf = v;
-               if (!super->currentconf) {
-                       free(super);
-                       return 1;
-               }
-       }
 
        *sbp = super;
        if (st->ss == NULL) {
                st->ss = &super_ddf;
                st->minor_version = 0;
                st->max_devs = 512;
-               st->container_dev = fd2devnum(fd);
        }
-       st->loaded_container = 1;
+       st->container_dev = fd2devnum(fd);
        return 0;
 }
+
+/* load_container entry point for DDF: load the metadata from all the
+ * devices of the container open on 'fd' into st->sb, by way of
+ * load_super_ddf_all().  Returns that function's result.
+ */
+static int load_container_ddf(struct supertype *st, int fd,
+                             char *devname)
+{
+       return load_super_ddf_all(st, fd, &st->sb, devname);
+}
+
 #endif /* MDASSEMBLE */
 
-static struct mdinfo *container_content_ddf(struct supertype *st)
+static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray)
 {
        /* Given a container loaded by load_super_ddf_all,
         * extract information about all the arrays into
@@ -2923,6 +2922,13 @@ static struct mdinfo *container_content_ddf(struct supertype *st)
                unsigned int i;
                unsigned int j;
                struct mdinfo *this;
+               char *ep;
+
+               if (subarray &&
+                   (strtoul(subarray, &ep, 10) != vc->vcnum ||
+                    *ep != '\0'))
+                       continue;
+
                this = malloc(sizeof(*this));
                memset(this, 0, sizeof(*this));
                this->next = rest;
@@ -3653,6 +3659,15 @@ static int ddf_level_to_layout(int level)
        }
 }
 
+/* Fill in defaults for geometry values the caller left UnSet:
+ * the level defaults to LEVEL_CONTAINER, and the layout to whatever
+ * ddf_level_to_layout() gives for the (possibly just defaulted) level.
+ * 'level' and 'layout' may be NULL; 'st' and 'chunk' are not used.
+ */
+static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk)
+{
+       if (level && *level == UnSet)
+               *level = LEVEL_CONTAINER;
+
+       /* the layout can only be derived when a level is available */
+       if (level && layout && *layout == UnSet)
+               *layout = ddf_level_to_layout(*level);
+}
+
 struct superswitch super_ddf = {
 #ifndef        MDASSEMBLE
        .examine_super  = examine_super_ddf,
@@ -3664,6 +3679,7 @@ struct superswitch super_ddf = {
        .validate_geometry = validate_geometry_ddf,
        .write_init_super = write_init_super_ddf,
        .add_to_super   = add_to_super_ddf,
+       .load_container = load_container_ddf,
 #endif
        .match_home     = match_home_ddf,
        .uuid_from_super= uuid_from_super_ddf,
@@ -3680,7 +3696,7 @@ struct superswitch super_ddf = {
        .free_super     = free_super_ddf,
        .match_metadata_desc = match_metadata_desc_ddf,
        .container_content = container_content_ddf,
-       .default_layout = ddf_level_to_layout,
+       .default_geometry = default_geometry_ddf,
 
        .external       = 1,
 
diff --git a/super-gpt.c b/super-gpt.c
new file mode 100644 (file)
index 0000000..e70a6fa
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'gpt' is a pseudo metadata type for devices which have a
+ * GPT partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the partition-table buffer held in st->sb, leaving st->sb NULL. */
+static void free_gpt(struct supertype *st)
+{
+       void *buf = st->sb;
+
+       st->sb = NULL;
+       free(buf);
+}
+
+#ifndef MDASSEMBLE
+/* Print the GPT header magic and revision, then one line per partition
+ * table entry giving its length in sectors followed by its starting LBA.
+ * Within st->sb the GPT header is read from byte offset 512 and the
+ * partition entries from byte offset 1024.  'homehost' is not used.
+ */
+static void examine_gpt(struct supertype *st, char *homehost)
+{
+       struct GPT *gpt = st->sb + 512;
+       struct GPT_part_entry *gpe = st->sb + 1024;
+       unsigned int i;
+
+       printf("    GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic));
+       printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision));
+       for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) {
+               unsigned long long first =
+                       (unsigned long long)__le64_to_cpu(gpe[i].starting_lba);
+               unsigned long long last =
+                       (unsigned long long)__le64_to_cpu(gpe[i].ending_lba);
+
+               /* the format reads "<length> sectors at <start>", so the
+                * length has to be passed before the start; 'i' is
+                * unsigned, hence %u
+                */
+               printf("  Partition[%02u] : %12llu sectors at %12llu\n",
+                      i, last - first + 1, first);
+       }
+}
+#endif /* MDASSEMBLE */
+
+/* Read the partition table from 'fd' into a newly allocated buffer and
+ * attach it to st->sb.  The buffer holds, back to back, the MBR, the GPT
+ * header and the partition entries.
+ * Returns 0 on success and 1 if memory cannot be allocated, the device
+ * cannot be read, or no acceptable GPT is found; for the latter two a
+ * message naming 'devname' is printed when 'devname' is not NULL.
+ */
+static int load_gpt(struct supertype *st, int fd, char *devname)
+{
+       struct MBR *super;
+       struct GPT *gpt_head;
+       int to_read;
+       /* Size the buffer from what it must hold: the MBR, the GPT header
+        * and the largest partition table accepted below (127 entries),
+        * rounded up to whole 512-byte blocks.  A fixed 32*512 bytes need
+        * not be large enough once the MBR and header are counted, which
+        * would let the final read() run past the end of the allocation.
+        */
+       size_t sb_size = sizeof(*super) + sizeof(*gpt_head) +
+               ((127 * sizeof(struct GPT_part_entry) + 511) / 512) * 512;
+
+       free_gpt(st);
+
+       if (posix_memalign((void**)&super, 512, sb_size) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, 0);
+       if (read(fd, super, sizeof(*super)) != sizeof(*super)) {
+       no_read:
+               /* also reached by goto when a later read comes up short */
+               if (devname)
+                       fprintf(stderr, Name ": Cannot read partition table on %s\n",
+                               devname);
+               free(super);
+               return 1;
+       }
+       if (super->magic != MBR_SIGNATURE_MAGIC ||
+           super->parts[0].part_type != MBR_GPT_PARTITION_TYPE) {
+       not_found:
+               /* also reached by goto when the GPT header is rejected */
+               if (devname)
+                       fprintf(stderr, Name ": No partition table found on %s\n",
+                               devname);
+               free(super);
+               return 1;
+       }
+       /* Seem to have GPT, load the header */
+       gpt_head = (struct GPT*)(super+1);
+       if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head))
+               goto no_read;
+       if (gpt_head->magic != GPT_SIGNATURE_MAGIC)
+               goto not_found;
+       /* sb_size above is computed for at most 127 entries */
+       if (__le32_to_cpu(gpt_head->part_cnt) >= 128)
+               goto not_found;
+
+       /* the entries follow the header and are read in whole blocks */
+       to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry);
+       to_read =  ((to_read+511)/512) * 512;
+       if (read(fd, gpt_head+1, to_read) != to_read)
+               goto no_read;
+
+       st->sb = super;
+
+       if (st->ss == NULL) {
+               st->ss = &gpt;
+               st->minor_version = 0;
+               st->max_devs = 1;
+               st->info = NULL;
+       }
+       return 0;
+}
+
+/* Write the partition table image held in st->sb (MBR, GPT header and
+ * partition entries) to the start of 'fd', then ask the kernel to
+ * re-read the partition table.
+ * Returns 0 on success and 4 if the image could not be written in full.
+ */
+static int store_gpt(struct supertype *st, int fd)
+{
+       /* FIXME should I save the boot loader */
+       /* need to write two copies! */
+       /* FIXME allow for blocks != 512 bytes
+        *etc
+        */
+       struct MBR *super = st->sb;
+       struct GPT *gpt;
+       int to_write;
+
+       gpt = (struct GPT*)(super+1);
+
+       to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry);
+       to_write =  ((to_write+511)/512) * 512;
+       /* The write starts at the MBR, so the MBR and the GPT header
+        * that precede the entries must be counted as well; otherwise
+        * the tail of the image is never written.
+        */
+       to_write += sizeof(*super) + sizeof(*gpt);
+
+       lseek(fd, 0, 0);
+       if (write(fd, st->sb, to_write) != to_write)
+               return 4;
+
+       fsync(fd);
+       ioctl(fd, BLKRRPART, 0);
+       return 0;
+}
+
+/* Describe a GPT-labelled device in *info: the array and disk records are
+ * zeroed, text_version and name are both set to "gpt", and component_size
+ * becomes the largest ending_lba of any partition entry (0 if there are
+ * none).  'map' is not used.
+ */
+static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map)
+{
+       struct GPT *head = st->sb + 512;
+       struct GPT_part_entry *entry = st->sb + 1024;
+       unsigned int idx = 0;
+
+       info->component_size = 0;
+       strcpy(info->name, "gpt");
+       strcpy(info->text_version, "gpt");
+       memset(&info->disk, 0, sizeof(info->disk));
+       memset(&info->array, 0, sizeof(info->array));
+
+       /* keep the highest ending LBA seen so far */
+       while (idx < __le32_to_cpu(head->part_cnt)) {
+               unsigned long long end =
+                       (unsigned long long)__le64_to_cpu(entry[idx].ending_lba);
+
+               if (end > info->component_size)
+                       info->component_size = end;
+               idx++;
+       }
+}
+
+/* Return a newly allocated supertype for the "gpt" pseudo metadata when
+ * 'arg' names it.  Returns NULL if 'arg' is anything else or if memory
+ * cannot be allocated.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+       struct supertype *st;
+
+       /* test the name before allocating, so that a mismatch cannot
+        * leak the new supertype
+        */
+       if (strcmp(arg, "gpt") != 0)
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               return st;
+       /* start from a fully defined state, as the imsm handler does,
+        * rather than leaving the remaining fields indeterminate
+        */
+       memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
+       st->ss = &gpt;
+       st->info = NULL;
+       st->minor_version = 0;
+       st->max_devs = 1;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+/* The 'gpt' type cannot validate any geometry: say so on stderr and
+ * return 0 regardless of the arguments.
+ */
+static int validate_geometry(struct supertype *st, int level,
+                            int layout, int raiddisks,
+                            int chunk, unsigned long long size,
+                            char *subdev, unsigned long long *freesize,
+                            int verbose)
+{
+       static const char msg[] = Name ": gpt metadata cannot be used this way\n";
+
+       fputs(msg, stderr);
+       return 0;
+}
+#endif
+
+/* Method table for the 'gpt' pseudo metadata type. */
+struct superswitch gpt = {
+#ifndef MDASSEMBLE
+       .examine_super = examine_gpt,
+       /* validate_geometry() is only compiled when MDASSEMBLE is not
+        * defined, so it may only be referenced inside this guard
+        */
+       .validate_geometry = validate_geometry,
+#endif
+       .match_metadata_desc = match_metadata_desc,
+       .load_super = load_gpt,
+       .store_super = store_gpt,
+       .getinfo_super = getinfo_gpt,
+       .free_super = free_gpt,
+       .name = "gpt",
+};
index b3a116f90308d75f2229c6ba1b70a3e78a7a1ce0..38f9622591c0fd5d6a911645c7883ff93535aa5c 100644 (file)
@@ -333,6 +333,7 @@ static struct supertype *match_metadata_desc_imsm(char *arg)
        if (!st)
                return NULL;
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super_imsm;
        st->max_devs = IMSM_MAX_DEVICES;
        st->minor_version = 0;
@@ -646,6 +647,37 @@ static int is_failed(struct imsm_disk *disk)
        return (disk->status & FAILED_DISK) == FAILED_DISK;
 }
 
+/* Return the minimum size of a spare that can be used in this array:
+ * the end of the last extent on the first active disk plus the metadata
+ * space, multiplied by 512.  Returns 0 when no metadata is loaded, no
+ * active disk is found or its extents cannot be obtained.
+ */
+static unsigned long long min_acceptable_spare_size_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct dl *active;
+       struct extent *ext;
+       struct extent *last;
+       unsigned long long sectors = 0;
+
+       if (!super)
+               return 0;
+
+       /* first disk that is neither failed nor without an index (-1) */
+       for (active = super->disks; active; active = active->next)
+               if (!is_failed(&active->disk) && active->index != -1)
+                       break;
+       if (!active)
+               return 0;
+
+       ext = get_extents(super, active);
+       if (!ext)
+               return 0;
+       /* walk to the zero-size terminator; the entry before it, if
+        * there is one, ends at the last lba used by subarrays
+        */
+       for (last = ext; last->size; last++)
+               continue;
+       if (last != ext) {
+               last--;
+               sectors = last->start + last->size;
+       }
+       free(ext);
+
+       /* add the amount of space needed for metadata */
+       sectors = sectors + MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+       return sectors * 512;
+}
+
 #ifndef MDASSEMBLE
 static __u64 blocks_per_migr_unit(struct imsm_dev *dev);
 
@@ -741,7 +773,7 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved)
               human_size(sz * 512));
 }
 
-static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map);
 
 static void examine_super_imsm(struct supertype *st, char *homehost)
 {
@@ -762,7 +794,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
        printf("    Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num));
        printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
        printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("           UUID : %s\n", nbuf + 5);
        sum = __le32_to_cpu(mpb->check_sum);
@@ -789,7 +821,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
                struct imsm_dev *dev = __get_imsm_dev(mpb, i);
 
                super->current_vol = i;
-               getinfo_super_imsm(st, &info);
+               getinfo_super_imsm(st, &info, NULL);
                fname_from_uuid(st, &info, nbuf, ':');
                print_imsm_dev(dev, nbuf + 5, super->disks->index);
        }
@@ -812,7 +844,7 @@ static void brief_examine_super_imsm(struct supertype *st, int verbose)
                return;
        }
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5);
 }
@@ -829,13 +861,13 @@ static void brief_examine_subarrays_imsm(struct supertype *st, int verbose)
        if (!super->anchor->num_raid_devs)
                return;
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        for (i = 0; i < super->anchor->num_raid_devs; i++) {
                struct imsm_dev *dev = get_imsm_dev(super, i);
 
                super->current_vol = i;
-               getinfo_super_imsm(st, &info);
+               getinfo_super_imsm(st, &info, NULL);
                fname_from_uuid(st, &info, nbuf1, ':');
                printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n",
                       dev->volume, nbuf + 5, i, nbuf1 + 5);
@@ -849,7 +881,7 @@ static void export_examine_super_imsm(struct supertype *st)
        struct mdinfo info;
        char nbuf[64];
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("MD_METADATA=imsm\n");
        printf("MD_LEVEL=container\n");
@@ -862,7 +894,7 @@ static void detail_super_imsm(struct supertype *st, char *homehost)
        struct mdinfo info;
        char nbuf[64];
 
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf("\n           UUID : %s\n", nbuf + 5);
 }
@@ -871,7 +903,7 @@ static void brief_detail_super_imsm(struct supertype *st)
 {
        struct mdinfo info;
        char nbuf[64];
-       getinfo_super_imsm(st, &info);
+       getinfo_super_imsm(st, &info, NULL);
        fname_from_uuid(st, &info, nbuf, ':');
        printf(" UUID=%s", nbuf + 5);
 }
@@ -1434,13 +1466,14 @@ static int imsm_level_to_layout(int level)
        return UnSet;
 }
 
-static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap)
 {
        struct intel_super *super = st->sb;
        struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
        struct imsm_map *map = get_imsm_map(dev, 0);
        struct dl *dl;
        char *devname;
+       int map_disks = info->array.raid_disks;
 
        for (dl = super->disks; dl; dl = dl->next)
                if (dl->raiddisk == info->disk.raid_disk)
@@ -1512,12 +1545,26 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
        free(devname);
        info->safe_mode_delay = 4000;  /* 4 secs like the Matrix driver */
        uuid_from_super_imsm(st, info->uuid);
-}
+
+       if (dmap) {
+               int i, j;
+               for (i=0; i<map_disks; i++) {
+                       dmap[i] = 0;
+                       if (i < info->array.raid_disks) {
+                               struct imsm_disk *dsk;
+                               j = get_imsm_disk_idx(dev, i);
+                               dsk = get_imsm_disk(super, j);
+                               if (dsk && (dsk->status & CONFIGURED_DISK))
+                                       dmap[i] = 1;
+                       }
+               }
+       }
+}                              
 
 /* check the config file to see if we can return a real uuid for this spare */
 static void fixup_container_spare_uuid(struct mdinfo *inf)
 {
-       struct mddev_ident_s *array_list;
+       struct mddev_ident *array_list;
 
        if (inf->array.level != LEVEL_CONTAINER ||
            memcmp(inf->uuid, uuid_match_any, sizeof(int[4])) != 0)
@@ -1559,13 +1606,17 @@ static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
        return NULL;
 }
 
-static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct intel_super *super = st->sb;
        struct imsm_disk *disk;
+       int map_disks = info->array.raid_disks;
+       int max_enough = -1;
+       int i;
+       struct imsm_super *mpb;
 
        if (super->current_vol >= 0) {
-               getinfo_super_imsm_volume(st, info);
+               getinfo_super_imsm_volume(st, info, map);
                return;
        }
 
@@ -1594,51 +1645,47 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
        info->recovery_start = MaxSector;
 
        /* do we have the all the insync disks that we expect? */
-       if (st->loaded_container) {
-               struct imsm_super *mpb = super->anchor;
-               int max_enough = -1, i;
+       mpb = super->anchor;
 
-               for (i = 0; i < mpb->num_raid_devs; i++) {
-                       struct imsm_dev *dev = get_imsm_dev(super, i);
-                       int failed, enough, j, missing = 0;
-                       struct imsm_map *map;
-                       __u8 state;
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               int failed, enough, j, missing = 0;
+               struct imsm_map *map;
+               __u8 state;
 
-                       failed = imsm_count_failed(super, dev);
-                       state = imsm_check_degraded(super, dev, failed);
-                       map = get_imsm_map(dev, dev->vol.migr_state);
+               failed = imsm_count_failed(super, dev);
+               state = imsm_check_degraded(super, dev, failed);
+               map = get_imsm_map(dev, dev->vol.migr_state);
 
-                       /* any newly missing disks?
-                        * (catches single-degraded vs double-degraded)
-                        */
-                       for (j = 0; j < map->num_members; j++) {
-                               __u32 ord = get_imsm_ord_tbl_ent(dev, i);
-                               __u32 idx = ord_to_idx(ord);
+               /* any newly missing disks?
+                * (catches single-degraded vs double-degraded)
+                */
+               for (j = 0; j < map->num_members; j++) {
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, i);
+                       __u32 idx = ord_to_idx(ord);
 
-                               if (!(ord & IMSM_ORD_REBUILD) &&
-                                   get_imsm_missing(super, idx)) {
-                                       missing = 1;
-                                       break;
-                               }
+                       if (!(ord & IMSM_ORD_REBUILD) &&
+                           get_imsm_missing(super, idx)) {
+                               missing = 1;
+                               break;
                        }
+               }
 
-                       if (state == IMSM_T_STATE_FAILED)
-                               enough = -1;
-                       else if (state == IMSM_T_STATE_DEGRADED &&
-                                (state != map->map_state || missing))
-                               enough = 0;
-                       else /* we're normal, or already degraded */
-                               enough = 1;
+               if (state == IMSM_T_STATE_FAILED)
+                       enough = -1;
+               else if (state == IMSM_T_STATE_DEGRADED &&
+                        (state != map->map_state || missing))
+                       enough = 0;
+               else /* we're normal, or already degraded */
+                       enough = 1;
 
-                       /* in the missing/failed disk case check to see
-                        * if at least one array is runnable
-                        */
-                       max_enough = max(max_enough, enough);
-               }
-               dprintf("%s: enough: %d\n", __func__, max_enough);
-               info->container_enough = max_enough;
-       } else
-               info->container_enough = -1;
+               /* in the missing/failed disk case check to see
+                * if at least one array is runnable
+                */
+               max_enough = max(max_enough, enough);
+       }
+       dprintf("%s: enough: %d\n", __func__, max_enough);
+       info->container_enough = max_enough;
 
        if (super->disks) {
                __u32 reserved = imsm_reserved_sectors(super, super->disks);
@@ -1664,6 +1711,59 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
                memcpy(info->uuid, uuid_match_any, sizeof(int[4]));
                fixup_container_spare_uuid(info);
        }
+
+       /* I don't know how to compute 'map' on imsm, so use safe default */
+       if (map) {
+               int i;
+               for (i = 0; i < map_disks; i++)
+                       map[i] = 1;
+       }
+
+}
+
+/* Build an mdinfo whose ->devs list holds one entry for every disk in
+ * super->disks, carrying its number, major/minor and state flags.
+ * Returns NULL if no metadata or disks are loaded, or if memory cannot
+ * be allocated. */
+struct mdinfo *getinfo_super_disks_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct mdinfo *head;
+       struct dl *d;
+       int n = 0;
+
+       if (!super || !super->disks)
+               return NULL;
+
+       head = malloc(sizeof(*head));
+       if (!head) {
+               fprintf(stderr, Name ": Failed to allocate memory.\n");
+               return NULL;
+       }
+       memset(head, 0, sizeof(*head));
+
+       for (d = super->disks; d; d = d->next) {
+               struct imsm_disk *disk = &d->disk;
+               struct mdinfo *ent = malloc(sizeof(*ent));
+               int state = 0;
+
+               if (!ent) {
+                       fprintf(stderr, Name ": Failed to allocate memory.\n");
+                       sysfs_free(head);
+                       return NULL;
+               }
+               memset(ent, 0, sizeof(*ent));
+
+               /* push the new entry onto the front of the list */
+               ent->next = head->devs;
+               head->devs = ent;
+
+               if (is_configured(disk))
+                       state |= (1 << MD_DISK_ACTIVE);
+               if (is_failed(disk))
+                       state |= (1 << MD_DISK_FAULTY);
+               if (!is_spare(disk))
+                       state |= (1 << MD_DISK_SYNC);
+
+               ent->disk.number = n++;
+               ent->disk.major = d->major;
+               ent->disk.minor = d->minor;
+               ent->disk.state = state;
+               ent->disk.raid_disk = -1;
+       }
+       return head;
 }
 
 static int update_super_imsm(struct supertype *st, struct mdinfo *info,
@@ -1705,8 +1805,7 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info,
        mpb = super->anchor;
 
        if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private)
-               fprintf(stderr,
-                       Name ": '--uuid' not supported for imsm metadata\n");
+               rv = -1;
        else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) {
                mpb->orig_family_num = *((__u32 *) info->update_private);
                rv = 0;
@@ -1727,9 +1826,7 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info,
        } else if (strcmp(update, "assemble") == 0)
                rv = 0;
        else
-               fprintf(stderr,
-                       Name ": '--update=%s' not supported for imsm metadata\n",
-                       update);
+               rv = -1;
 
        /* successful update? recompute checksum */
        if (rv == 0)
@@ -2086,7 +2183,8 @@ static void migrate(struct imsm_dev *dev, __u8 to_state, int migr_type)
 
        /* duplicate and then set the target end state in map[0] */
        memcpy(dest, src, sizeof_imsm_map(src));
-       if (migr_type == MIGR_REBUILD) {
+       if ((migr_type == MIGR_REBUILD) ||
+           (migr_type ==  MIGR_GEN_MIGR)) {
                __u32 ord;
                int i;
 
@@ -2103,18 +2201,26 @@ static void end_migration(struct imsm_dev *dev, __u8 map_state)
 {
        struct imsm_map *map = get_imsm_map(dev, 0);
        struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state);
-       int i;
+       int i, j;
 
        /* merge any IMSM_ORD_REBUILD bits that were not successfully
         * completed in the last migration.
         *
-        * FIXME add support for online capacity expansion and
-        * raid-level-migration
+        * FIXME add support for raid-level-migration
         */
        for (i = 0; i < prev->num_members; i++)
-               map->disk_ord_tbl[i] |= prev->disk_ord_tbl[i];
+               for (j = 0; j < map->num_members; j++)
+                       /* during online capacity expansion
+                        * disks position can be changed if takeover is used
+                        */
+                       if (ord_to_idx(map->disk_ord_tbl[j]) ==
+                           ord_to_idx(prev->disk_ord_tbl[i])) {
+                               map->disk_ord_tbl[j] |= prev->disk_ord_tbl[i];
+                               break;
+                       }
 
        dev->vol.migr_state = 0;
+       dev->vol.migr_type = 0;
        dev->vol.curr_migr_unit = 0;
        map->map_state = map_state;
 }
@@ -2199,6 +2305,13 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
        __u32 check_sum;
 
        get_dev_size(fd, NULL, &dsize);
+       if (dsize < 1024) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s: device to small for imsm\n",
+                               devname);
+               return 1;
+       }
 
        if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
                if (devname)
@@ -2756,7 +2869,7 @@ imsm_thunderdome(struct intel_super **super_list, int len)
 }
 
 static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
-                              char *devname, int keep_fd)
+                              char *devname)
 {
        struct mdinfo *sra;
        struct intel_super *super_list = NULL;
@@ -2792,22 +2905,20 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
 
                err = 2;
                sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
-               dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+               dfd = dev_open(nm, O_RDWR);
                if (dfd < 0)
                        goto error;
 
-               err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+               err = load_and_parse_mpb(dfd, s, NULL, 1);
 
                /* retry the load if we might have raced against mdmon */
                if (err == 3 && mdmon_running(devnum))
                        for (retry = 0; retry < 3; retry++) {
                                usleep(3000);
-                               err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+                               err = load_and_parse_mpb(dfd, s, NULL, 1);
                                if (err != 3)
                                        break;
                        }
-               if (!keep_fd)
-                       close(dfd);
                if (err)
                        goto error;
        }
@@ -2824,25 +2935,6 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
                err = 2;
                goto error;
        }
-
-       if (st->subarray[0]) {
-               unsigned long val;
-               char *ep;
-
-               err = 1;
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free_imsm(super);
-                       goto error;
-               }
-
-               if (val < super->anchor->num_raid_devs)
-                       super->current_vol = val;
-               else {
-                       free_imsm(super);
-                       goto error;
-               }
-       }
        err = 0;
 
  error:
@@ -2864,10 +2956,13 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
                st->minor_version = 0;
                st->max_devs = IMSM_MAX_DEVICES;
        }
-       st->loaded_container = 1;
-
        return 0;
 }
+
+/* Load the container's metadata through load_super_imsm_all(), storing
+ * the result in st->sb; its return value is passed straight through.
+ */
+static int load_container_imsm(struct supertype *st, int fd, char *devname)
+{
+       void **sbp = &st->sb;
+
+       return load_super_imsm_all(st, fd, sbp, devname);
+}
 #endif
 
 static int load_super_imsm(struct supertype *st, int fd, char *devname)
@@ -2876,7 +2971,7 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
        int rv;
 
 #ifndef MDASSEMBLE
-       if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0)
+       if (load_super_imsm_all(st, fd, &st->sb, devname) == 0)
                return 0;
 #endif
 
@@ -2905,32 +3000,12 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
                return rv;
        }
 
-       if (st->subarray[0]) {
-               unsigned long val;
-               char *ep;
-
-               val = strtoul(st->subarray, &ep, 10);
-               if (*ep != '\0') {
-                       free_imsm(super);
-                       return 1;
-               }
-
-               if (val < super->anchor->num_raid_devs)
-                       super->current_vol = val;
-               else {
-                       free_imsm(super);
-                       return 1;
-               }
-       }
-
        st->sb = super;
        if (st->ss == NULL) {
                st->ss = &super_imsm;
                st->minor_version = 0;
                st->max_devs = IMSM_MAX_DEVICES;
        }
-       st->loaded_container = 0;
-
        return 0;
 }
 
@@ -3083,7 +3158,6 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 
        if (!check_name(super, name, 0))
                return 0;
-       sprintf(st->subarray, "%d", idx);
        dv = malloc(sizeof(*dv));
        if (!dv) {
                fprintf(stderr, Name ": failed to allocate device list entry\n");
@@ -3410,8 +3484,9 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose)
        return 0;
 }
 
-static int write_super_imsm(struct intel_super *super, int doclose)
+static int write_super_imsm(struct supertype *st, int doclose)
 {
+       struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
        struct dl *d;
        __u32 generation;
@@ -3419,6 +3494,7 @@ static int write_super_imsm(struct intel_super *super, int doclose)
        int spares = 0;
        int i;
        __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+       int num_disks = 0;
 
        /* 'generation' is incremented everytime the metadata is written */
        generation = __le32_to_cpu(mpb->generation_num);
@@ -3431,21 +3507,28 @@ static int write_super_imsm(struct intel_super *super, int doclose)
        if (mpb->orig_family_num == 0)
                mpb->orig_family_num = mpb->family_num;
 
-       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
        for (d = super->disks; d; d = d->next) {
                if (d->index == -1)
                        spares++;
-               else
+               else {
                        mpb->disk[d->index] = d->disk;
+                       num_disks++;
+               }
        }
-       for (d = super->missing; d; d = d->next)
+       for (d = super->missing; d; d = d->next) {
                mpb->disk[d->index] = d->disk;
+               num_disks++;
+       }
+       mpb->num_disks = num_disks;
+       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
 
        for (i = 0; i < mpb->num_raid_devs; i++) {
                struct imsm_dev *dev = __get_imsm_dev(mpb, i);
-
-               imsm_copy_dev(dev, get_imsm_dev(super, i));
-               mpb_size += sizeof_imsm_dev(dev, 0);
+               struct imsm_dev *dev2 = get_imsm_dev(super, i);
+               if (dev && dev2) {
+                       imsm_copy_dev(dev, dev2);
+                       mpb_size += sizeof_imsm_dev(dev, 0);
+               }
        }
        mpb_size += __le32_to_cpu(mpb->bbm_log_size);
        mpb->mpb_size = __cpu_to_le32(mpb_size);
@@ -3565,7 +3648,7 @@ static int write_init_super_imsm(struct supertype *st)
                struct dl *d;
                for (d = super->disks; d; d = d->next)
                        Kill(d->devname, NULL, 0, 1, 1);
-               return write_super_imsm(st->sb, 1);
+               return write_super_imsm(st, 1);
        }
 }
 #endif
@@ -4097,7 +4180,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
                 */
                struct intel_super *super;
 
-               if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) {
+               if (load_super_imsm_all(st, cfd, (void **) &super, NULL) == 0) {
                        st->sb = super;
                        st->container_dev = fd2devnum(cfd);
                        close(cfd);
@@ -4115,14 +4198,19 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
        return 0;
 }
 
-static int default_chunk_imsm(struct supertype *st)
+static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk)
 {
        struct intel_super *super = st->sb;
 
-       if (!super || !super->orom)
-               return 0;
+       if (level && *level == UnSet) /* no level requested: default to a container */
+               *level = LEVEL_CONTAINER;
+
+       if (level && layout && *layout == UnSet) /* derive the layout from the level */
+               *layout = imsm_level_to_layout(*level);
 
-       return imsm_orom_default_chunk(super->orom);
+       if (chunk && (*chunk == UnSet || *chunk == 0) && 
+           super && super->orom) /* chunk default needs loaded metadata with orom data */
+               *chunk = imsm_orom_default_chunk(super->orom);
 }
 
 static void handle_missing(struct intel_super *super, struct imsm_dev *dev);
@@ -4201,19 +4289,19 @@ static int kill_subarray_imsm(struct supertype *st)
        return 0;
 }
 
-static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_t ident)
+static int update_subarray_imsm(struct supertype *st, char *subarray,
+                               char *update, struct mddev_ident *ident)
 {
        /* update the subarray currently referenced by ->current_vol */
        struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
 
-       if (super->current_vol < 0)
-               return 2;
-
        if (strcmp(update, "name") == 0) {
                char *name = ident->name;
+               char *ep;
+               int vol;
 
-               if (is_subarray_active(st->subarray, st->devname)) {
+               if (is_subarray_active(subarray, st->devname)) {
                        fprintf(stderr,
                                Name ": Unable to update name of active subarray\n");
                        return 2;
@@ -4222,20 +4310,24 @@ static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_
                if (!check_name(super, name, 0))
                        return 2;
 
+               vol = strtoul(subarray, &ep, 10);
+               if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
+                       return 2;
+
                if (st->update_tail) {
                        struct imsm_update_rename_array *u = malloc(sizeof(*u));
 
                        if (!u)
                                return 2;
                        u->type = update_rename_array;
-                       u->dev_idx = super->current_vol;
+                       u->dev_idx = vol;
                        snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name);
                        append_metadata_update(st, u, sizeof(*u));
                } else {
                        struct imsm_dev *dev;
                        int i;
 
-                       dev = get_imsm_dev(super, super->current_vol);
+                       dev = get_imsm_dev(super, vol);
                        snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name);
                        for (i = 0; i < mpb->num_raid_devs; i++) {
                                dev = get_imsm_dev(super, i);
@@ -4250,6 +4342,17 @@ static int update_subarray_imsm(struct supertype *st, char *update, mddev_ident_
 }
 #endif /* MDASSEMBLE */
 
+/* Tell whether 'dev' is in the middle of a general migration.
+ * Returns 1 only when vol.migr_state is set and migr_type()
+ * reports MIGR_GEN_MIGR; returns 0 in every other case.
+ */
+static int is_gen_migration(struct imsm_dev *dev)
+{
+       if (dev->vol.migr_state && migr_type(dev) == MIGR_GEN_MIGR)
+               return 1;
+       return 0;
+}
+
 static int is_rebuilding(struct imsm_dev *dev)
 {
        struct imsm_map *migr_map;
@@ -4299,11 +4402,12 @@ static void update_recovery_start(struct imsm_dev *dev, struct mdinfo *array)
 }
 
 
-static struct mdinfo *container_content_imsm(struct supertype *st)
+static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray)
 {
        /* Given a container loaded by load_super_imsm_all,
         * extract information about all the arrays into
         * an mdinfo tree.
+        * If 'subarray' is given, just extract info about that array.
         *
         * For each imsm_dev create an mdinfo, fill it in,
         *  then look for matching devices in super->disks
@@ -4312,7 +4416,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
        struct intel_super *super = st->sb;
        struct imsm_super *mpb = super->anchor;
        struct mdinfo *rest = NULL;
-       int i;
+       unsigned int i;
 
        /* do not assemble arrays that might have bad blocks */
        if (imsm_bbm_log_size(super->anchor)) {
@@ -4322,17 +4426,24 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
        }
 
        for (i = 0; i < mpb->num_raid_devs; i++) {
-               struct imsm_dev *dev = get_imsm_dev(super, i);
-               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct imsm_dev *dev;
+               struct imsm_map *map;
                struct mdinfo *this;
                int slot;
+               char *ep;
+
+               if (subarray &&
+                   (i != strtoul(subarray, &ep, 10) || *ep != '\0'))
+                       continue;
+
+               dev = get_imsm_dev(super, i);
+               map = get_imsm_map(dev, 0);
 
                /* do not publish arrays that are in the middle of an
                 * unsupported migration
                 */
                if (dev->vol.migr_state &&
-                   (migr_type(dev) == MIGR_GEN_MIGR ||
-                    migr_type(dev) == MIGR_STATE_CHANGE)) {
+                   (migr_type(dev) == MIGR_STATE_CHANGE)) {
                        fprintf(stderr, Name ": cannot assemble volume '%.16s':"
                                " unsupported migration in progress\n",
                                dev->volume);
@@ -4349,7 +4460,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
                this->next = rest;
 
                super->current_vol = i;
-               getinfo_super_imsm_volume(st, this);
+               getinfo_super_imsm_volume(st, this, NULL);
                for (slot = 0 ; slot <  map->num_members; slot++) {
                        unsigned long long recovery_start;
                        struct mdinfo *info_d;
@@ -4615,6 +4726,8 @@ static void handle_missing(struct intel_super *super, struct imsm_dev *dev)
        super->updates_pending++;
 }
 
+static void imsm_set_disk(struct active_array *a, int n, int state);
+
 /* Handle dirty -> clean transititions and resync.  Degraded and rebuild
  * states are handled in imsm_set_disk() with one exception, when a
  * resync is stopped due to a new failure this routine will set the
@@ -4690,6 +4803,16 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
                        dev->vol.dirty = 1;
                super->updates_pending++;
        }
+
+       /* finalize online capacity expansion/reshape */
+       if ((a->curr_action != reshape) &&
+           (a->prev_action == reshape)) {
+               struct mdinfo *mdi;
+
+               for (mdi = a->info.devs; mdi; mdi = mdi->next)
+                       imsm_set_disk(a, mdi->disk.raid_disk, mdi->curr_state);
+       }
+
        return consistent;
 }
 
@@ -4753,6 +4876,23 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
                end_migration(dev, map_state);
                super->updates_pending++;
                a->last_checkpoint = 0;
+       } else if (is_gen_migration(dev)) {
+               dprintf("imsm: Detected General Migration in state: ");
+               if (map_state == IMSM_T_STATE_NORMAL) {
+                       end_migration(dev, map_state);
+                       map = get_imsm_map(dev, 0);
+                       map->failed_disk_num = ~0;
+                       dprintf("normal\n");
+               } else {
+                       if (map_state == IMSM_T_STATE_DEGRADED) {
+                               printf("degraded\n");
+                               end_migration(dev, map_state);
+                       } else {
+                               dprintf("failed\n");
+                       }
+                       map->map_state = map_state;
+               }
+               super->updates_pending++;
        }
 }
 
@@ -4795,7 +4935,7 @@ static void imsm_sync_metadata(struct supertype *container)
        if (!super->updates_pending)
                return;
 
-       write_super_imsm(super, 0);
+       write_super_imsm(container, 0);
 
        super->updates_pending = 0;
 }
@@ -4820,7 +4960,8 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a
 }
 
 static struct dl *imsm_add_spare(struct intel_super *super, int slot,
-                                struct active_array *a, int activate_new)
+                                struct active_array *a, int activate_new,
+                                struct mdinfo *additional_test_list)
 {
        struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
        int idx = get_imsm_disk_idx(dev, slot);
@@ -4841,11 +4982,23 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot,
                        if (d->state_fd >= 0 &&
                            d->disk.major == dl->major &&
                            d->disk.minor == dl->minor) {
-                               dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                               dprintf("%x:%x already in array\n",
+                                       dl->major, dl->minor);
                                break;
                        }
                if (d)
                        continue;
+               while (additional_test_list) {
+                       if (additional_test_list->disk.major == dl->major &&
+                           additional_test_list->disk.minor == dl->minor) {
+                               dprintf("%x:%x already in additional test list\n",
+                                       dl->major, dl->minor);
+                               break;
+                       }
+                       additional_test_list = additional_test_list->next;
+               }
+               if (additional_test_list)
+                       continue;
 
                /* skip in use or failed drives */
                if (is_failed(&dl->disk) || idx == dl->index ||
@@ -4975,9 +5128,9 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
                 */
                dl = imsm_readd(super, i, a);
                if (!dl)
-                       dl = imsm_add_spare(super, i, a, 0);
+                       dl = imsm_add_spare(super, i, a, 0, NULL);
                if (!dl)
-                       dl = imsm_add_spare(super, i, a, 1);
+                       dl = imsm_add_spare(super, i, a, 1, NULL);
                if (!dl)
                        continue;
  
@@ -5557,6 +5710,41 @@ static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned ind
 }
 #endif /* MDASSEMBLE */
 
+static char disk_by_path[] = "/dev/disk/by-path/";
+
+/* Work out which controller domain the disk named by 'path' belongs to.
+ * 'path' is an entry name relative to /dev/disk/by-path/.
+ * Returns "ahci" when path_attached_to_hba() reports the disk as
+ * attached to the first ahci-driven PCI device whose vendor id is
+ * 0x8086, otherwise NULL.
+ */
+static const char *imsm_get_disk_controller_domain(const char *path)
+{
+       struct sys_dev *list, *hba = NULL;
+       char disk_path[PATH_MAX];
+       int ahci = 0;
+       char *dpath = NULL;
+
+       list = find_driver_devices("pci", "ahci");
+       for (hba = list; hba; hba = hba->next)
+               if (devpath_to_vendor(hba->path) == 0x8086)
+                       break;
+
+       if (hba) {
+               struct stat st;
+
+               /* snprintf always terminates and truncates at PATH_MAX - 1 */
+               snprintf(disk_path, sizeof(disk_path), "%s%s",
+                        disk_by_path, path);
+               if (stat(disk_path, &st) == 0) {
+                       dpath = devt_to_devpath(st.st_rdev);
+                       if (dpath)
+                               ahci = path_attached_to_hba(dpath, hba->path);
+               }
+       }
+       /* never hand a NULL pointer to a %s conversion */
+       dprintf("path: %s(%s) hba: %s attached: %d\n",
+               path, dpath ? dpath : "NULL",
+               (hba) ? hba->path : "NULL", ahci);
+       /* NOTE(review): assumes devt_to_devpath() returns malloc'd storage
+        * owned by the caller - confirm against its definition.
+        */
+       free(dpath);
+       free_sys_dev(&list);
+       if (ahci)
+               return "ahci";
+       else
+               return NULL;
+}
+
+
 struct superswitch super_imsm = {
 #ifndef        MDASSEMBLE
        .examine_super  = examine_super_imsm,
@@ -5567,18 +5755,20 @@ struct superswitch super_imsm = {
        .brief_detail_super = brief_detail_super_imsm,
        .write_init_super = write_init_super_imsm,
        .validate_geometry = validate_geometry_imsm,
-       .default_chunk  = default_chunk_imsm,
        .add_to_super   = add_to_super_imsm,
        .detail_platform = detail_platform_imsm,
        .kill_subarray = kill_subarray_imsm,
        .update_subarray = update_subarray_imsm,
+       .load_container = load_container_imsm,
 #endif
        .match_home     = match_home_imsm,
        .uuid_from_super= uuid_from_super_imsm,
        .getinfo_super  = getinfo_super_imsm,
+       .getinfo_super_disks = getinfo_super_disks_imsm,
        .update_super   = update_super_imsm,
 
        .avail_size     = avail_size_imsm,
+       .min_acceptable_spare_size = min_acceptable_spare_size_imsm,
 
        .compare_super  = compare_super_imsm,
 
@@ -5588,7 +5778,8 @@ struct superswitch super_imsm = {
        .free_super     = free_super_imsm,
        .match_metadata_desc = match_metadata_desc_imsm,
        .container_content = container_content_imsm,
-       .default_layout = imsm_level_to_layout,
+       .default_geometry = default_geometry_imsm,
+       .get_disk_controller_domain = imsm_get_disk_controller_domain,
 
        .external       = 1,
        .name = "imsm",
diff --git a/super-mbr.c b/super-mbr.c
new file mode 100644 (file)
index 0000000..0129fd6
--- /dev/null
@@ -0,0 +1,210 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'mbr' is a pseudo metadata type for devices which have a
+ * partition table in the Master Boot Record (mbr) also known
+ * as a dos partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the MBR image attached to 'st' and clear the reference. */
+static void free_mbr(struct supertype *st)
+{
+       void *image = st->sb;
+
+       st->sb = NULL;
+       free(image);
+}
+
+#ifndef MDASSEMBLE
+
+/* Print the MBR signature followed by one line for every partition
+ * slot whose sector count is non-zero.  'homehost' is not used.
+ */
+static void examine_mbr(struct supertype *st, char *homehost)
+{
+       struct MBR *table = st->sb;
+       int slot;
+
+       printf("   MBR Magic : %04x\n", table->magic);
+       for (slot = 0; slot < MBR_PARTITIONS; slot++) {
+               unsigned long nsect, start;
+
+               /* empty slots are not reported */
+               if (!table->parts[slot].blocks_num)
+                       continue;
+               nsect = (unsigned long)__le32_to_cpu(table->parts[slot].blocks_num);
+               start = (unsigned long)__le32_to_cpu(table->parts[slot].first_sect_lba);
+               printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n",
+                      slot, nsect, start, table->parts[slot].part_type);
+       }
+}
+
+#endif /*MDASSEMBLE */
+
+/* Read the first 512 bytes of 'fd' and keep them as st->sb when they
+ * carry the MBR signature.  Any superblock already held by 'st' is
+ * released first.
+ * Returns 0 on success and 1 on every failure (allocation, short
+ * read, or missing signature).  Read and signature failures are
+ * reported on stderr only when 'devname' is given.
+ */
+static int load_super_mbr(struct supertype *st, int fd, char *devname)
+{
+       void *buf = NULL;
+       struct MBR *table;
+
+       free_mbr(st);
+
+       if (posix_memalign(&buf, 512, 512) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+       table = buf;
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, SEEK_SET);
+       if (read(fd, table, sizeof(*table)) != sizeof(*table)) {
+               if (devname)
+                       fprintf(stderr, Name ": Cannot read partition table on %s\n",
+                               devname);
+               goto reject;
+       }
+       if (table->magic != MBR_SIGNATURE_MAGIC) {
+               if (devname)
+                       fprintf(stderr, Name ": No partition table found on %s\n",
+                               devname);
+               goto reject;
+       }
+
+       st->sb = table;
+
+       /* only fill in the handler details if nobody chose them yet */
+       if (st->ss == NULL) {
+               st->ss = &mbr;
+               st->minor_version = 0;
+               st->max_devs = 1;
+               st->info = NULL;
+       }
+       return 0;
+
+reject:
+       free(table);
+       return 1;
+}
+
+/* Write the MBR held in st->sb to the start of 'fd'.
+ * The 'pad' area of the in-memory image is first overwritten with the
+ * 'pad' area currently on the device, so that part of the sector is
+ * written back unchanged.
+ * Returns 0 on success, 1 when the current sector cannot be allocated
+ * or read, 4 when the write comes up short.
+ */
+static int store_mbr(struct supertype *st, int fd)
+{
+       struct MBR *image = st->sb;
+       struct MBR *ondisk;
+       void *buf = NULL;
+       int got;
+
+       if (posix_memalign(&buf, 512, 512) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
+       ondisk = buf;
+
+       ioctl(fd, BLKFLSBUF, 0); /* make sure we read current data */
+
+       lseek(fd, 0, SEEK_SET);
+       got = read(fd, ondisk, sizeof(*ondisk)) == sizeof(*ondisk);
+       if (got)
+               memcpy(image->pad, ondisk->pad, sizeof(image->pad));
+       free(ondisk);
+       if (!got)
+               return 1;
+
+       lseek(fd, 0, SEEK_SET);
+       if (write(fd, image, sizeof(*image)) != sizeof(*image))
+               return 4;
+       fsync(fd);
+       ioctl(fd, BLKRRPART, 0);
+       return 0;
+}
+
+/* Fill 'info' with the little that an MBR can tell us.
+ * info->array and info->disk are zeroed, text_version and name are
+ * set to "mbr", and component_size becomes the highest end sector
+ * (first sector + sector count) of any partition whose sector count
+ * is non-zero.  'map' is not used.
+ */
+static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map)
+{
+       struct MBR *sb = st->sb;
+       int i;
+
+       memset(&info->array, 0, sizeof(info->array));
+       memset(&info->disk, 0, sizeof(info->disk));
+       strcpy(info->text_version, "mbr");
+       strcpy(info->name, "mbr");
+       info->component_size = 0;
+
+       for (i = 0; i < MBR_PARTITIONS ; i++)
+               if (sb->parts[i].blocks_num) {
+                       /* Both fields are 32 bits wide, so their sum can
+                        * need 33 bits: 'unsigned long' would wrap on
+                        * hosts where it is only 32 bits.
+                        */
+                       unsigned long long last =
+                               (unsigned long long)__le32_to_cpu(sb->parts[i].blocks_num)
+                               + (unsigned long long)__le32_to_cpu(sb->parts[i].first_sect_lba);
+                       if (last > info->component_size)
+                               info->component_size = last;
+               }
+}
+
+/* Return a freshly allocated supertype for the "mbr" pseudo metadata.
+ * Returns NULL when 'arg' is not exactly "mbr" or when allocation
+ * fails.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+       struct supertype *st;
+
+       /* check the name before allocating so a mismatch leaks nothing */
+       if (strcmp(arg, "mbr") != 0)
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               return st;
+
+       st->ss = &mbr;
+       st->info = NULL;
+       st->minor_version = 0;
+       st->max_devs = 1;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+/* The mbr pseudo metadata cannot describe an array: always complain
+ * on stderr and return 0, whatever geometry is asked for.
+ */
+static int validate_geometry(struct supertype *st, int level,
+                            int layout, int raiddisks,
+                            int chunk, unsigned long long size,
+                            char *subdev, unsigned long long *freesize,
+                            int verbose)
+{
+       fputs(Name ": mbr metadata cannot be used this way\n", stderr);
+       return 0;
+}
+#endif
+
+/* Method table for the 'mbr' pseudo metadata type. */
+struct superswitch mbr = {
+#ifndef MDASSEMBLE
+       .examine_super = examine_mbr,
+       /* validate_geometry() is only compiled when MDASSEMBLE is unset */
+       .validate_geometry = validate_geometry,
+#endif
+       .match_metadata_desc = match_metadata_desc,
+       .load_super = load_super_mbr,
+       .store_super = store_mbr,
+       .getinfo_super = getinfo_mbr,
+       .free_super = free_mbr,
+       .name = "mbr",
+};
index ae3e8855193f9d6183776f346779b6d29ba5ef5c..d69d0c049ad81d5337a1432158c91e4d0cf08e66 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -339,11 +339,12 @@ static void uuid_from_super0(struct supertype *st, int uuid[4])
        }
 }
 
-static void getinfo_super0(struct supertype *st, struct mdinfo *info)
+static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map)
 {
        mdp_super_t *sb = st->sb;
        int working = 0;
        int i;
+       int map_disks = info->array.raid_disks;
 
        info->array.major_version = sb->major_version;
        info->array.minor_version = sb->minor_version;
@@ -391,19 +392,35 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info)
                if ((sb->disks[i].state & (1<<MD_DISK_SYNC)) &&
                    (sb->disks[i].raid_disk < (unsigned)info->array.raid_disks) &&
                    (sb->disks[i].state & (1<<MD_DISK_ACTIVE)) &&
-                   !(sb->disks[i].state & (1<<MD_DISK_FAULTY)))
+                   !(sb->disks[i].state & (1<<MD_DISK_FAULTY))) {
                        working ++;
+                       if (map && i < map_disks)
+                               map[i] = 1;
+               } else if (map && i < map_disks)
+                       map[i] = 0;
        info->array.working_disks = working;
 }
 
+/* Return a newly allocated mdinfo filled in by getinfo_super0().
+ * Returns NULL when a 'subarray' is named or when allocation fails.
+ */
+static struct mdinfo *container_content0(struct supertype *st, char *subarray)
+{
+       struct mdinfo *info;
+
+       if (subarray)
+               return NULL;
+
+       /* Zeroed so that array.raid_disks, which getinfo_super0() reads
+        * on entry, and any field it leaves untouched are well defined.
+        */
+       info = calloc(1, sizeof(*info));
+       if (!info)
+               return NULL;
+       getinfo_super0(st, info, NULL);
+       return info;
+}
 
 static int update_super0(struct supertype *st, struct mdinfo *info,
                         char *update,
                         char *devname, int verbose,
                         int uuid_set, char *homehost)
 {
-       /* NOTE: for 'assemble' and 'force' we need to return non-zero if any change was made.
-        * For others, the return value is ignored.
+       /* NOTE: for 'assemble' and 'force' we need to return non-zero
+        * if any change was made.  For others, the return value is
+        * ignored.
         */
        int rv = 0;
        mdp_super_t *sb = st->sb;
@@ -419,14 +436,12 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                if (verbose >= 0)
                        fprintf (stderr, Name ": adjusting superblock of %s for 2.2/sparc compatability.\n",
                                 devname);
-       }
-       if (strcmp(update, "super-minor") ==0) {
+       } else if (strcmp(update, "super-minor") ==0) {
                sb->md_minor = info->array.md_minor;
                if (verbose > 0)
                        fprintf(stderr, Name ": updating superblock of %s with minor number %d\n",
                                devname, info->array.md_minor);
-       }
-       if (strcmp(update, "summaries") == 0) {
+       } else if (strcmp(update, "summaries") == 0) {
                unsigned int i;
                /* set nr_disks, active_disks, working_disks,
                 * failed_disks, spare_disks based on disks[]
@@ -453,8 +468,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                                        sb->spare_disks++;
                        } else if (i >= sb->raid_disks && sb->disks[i].number == 0)
                                sb->disks[i].state = 0;
-       }
-       if (strcmp(update, "force-one")==0) {
+       } else if (strcmp(update, "force-one")==0) {
                /* Not enough devices for a working array, so
                 * bring this one up-to-date.
                 */
@@ -464,8 +478,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                if (sb->events_hi != ehi ||
                    sb->events_lo != elo)
                        rv = 1;
-       }
-       if (strcmp(update, "force-array")==0) {
+       } else if (strcmp(update, "force-array")==0) {
                /* degraded array and 'force' requested, so
                 * maybe need to mark it 'clean'
                 */
@@ -475,8 +488,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                        sb->state |= (1 << MD_SB_CLEAN);
                        rv = 1;
                }
-       }
-       if (strcmp(update, "assemble")==0) {
+       } else if (strcmp(update, "assemble")==0) {
                int d = info->disk.number;
                int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
                int mask = (1<<MD_DISK_WRITEMOSTLY);
@@ -491,8 +503,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                        sb->disks[d].state = info->disk.state | wonly;
                        rv = 1;
                }
-       }
-       if (strcmp(update, "linear-grow-new") == 0) {
+       } else if (strcmp(update, "linear-grow-new") == 0) {
                memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0]));
                sb->disks[info->disk.number].number = info->disk.number;
                sb->disks[info->disk.number].major = info->disk.major;
@@ -500,8 +511,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
                sb->disks[info->disk.number].state = info->disk.state;
                sb->this_disk = sb->disks[info->disk.number];
-       }
-       if (strcmp(update, "linear-grow-update") == 0) {
+       } else if (strcmp(update, "linear-grow-update") == 0) {
                sb->raid_disks = info->array.raid_disks;
                sb->nr_disks = info->array.nr_disks;
                sb->active_disks = info->array.active_disks;
@@ -512,20 +522,17 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                sb->disks[info->disk.number].minor = info->disk.minor;
                sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
                sb->disks[info->disk.number].state = info->disk.state;
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
                /* make sure resync happens */
                sb->state &= ~(1<<MD_SB_CLEAN);
                sb->recovery_cp = 0;
-       }
-       if (strcmp(update, "homehost") == 0 &&
+       } else if (strcmp(update, "homehost") == 0 &&
            homehost) {
                uuid_set = 0;
                update = "uuid";
                info->uuid[0] = sb->set_uuid0;
                info->uuid[1] = sb->set_uuid1;
-       }
-       if (strcmp(update, "uuid") == 0) {
+       } else if (strcmp(update, "uuid") == 0) {
                if (!uuid_set && homehost) {
                        char buf[20];
                        char *hash = sha1_buffer(homehost,
@@ -542,9 +549,12 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
                        bm = (struct bitmap_super_s*)(sb+1);
                        uuid_from_super0(st, (int*)bm->uuid);
                }
-       }
-       if (strcmp(update, "_reshape_progress")==0)
+       } else if (strcmp(update, "no-bitmap") == 0) {
+               sb->state &= ~(1<<MD_SB_BITMAP_PRESENT);
+       } else if (strcmp(update, "_reshape_progress")==0)
                sb->reshape_position = info->reshape_progress;
+       else
+               rv = -1;
 
        sb->sb_csum = calc_sb0_csum(sb);
        return rv;
@@ -813,9 +823,6 @@ static int load_super0(struct supertype *st, int fd, char *devname)
 
        free_super0(st);
 
-       if (st->subarray[0])
-               return 1;
-
        if (!get_dev_size(fd, devname, &dsize))
                return 1;
 
@@ -913,6 +920,7 @@ static struct supertype *match_metadata_desc0(char *arg)
        if (!st) return st;
 
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super0;
        st->info = NULL;
        st->minor_version = 90;
@@ -1140,6 +1148,7 @@ struct superswitch super0 = {
        .match_home = match_home0,
        .uuid_from_super = uuid_from_super0,
        .getinfo_super = getinfo_super0,
+       .container_content = container_content0,
        .update_super = update_super0,
        .init_super = init_super0,
        .store_super = store_super0,
index 0eb03230a79fd4b8bf791f2b5057be5ee0db83be..f879e669db79ee1bb3f0376d43bfc850141139fc 100644 (file)
--- a/super1.c
+++ b/super1.c
@@ -558,12 +558,13 @@ static void uuid_from_super1(struct supertype *st, int uuid[4])
                cuuid[i] = super->set_uuid[i];
 }
 
-static void getinfo_super1(struct supertype *st, struct mdinfo *info)
+static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 {
        struct mdp_superblock_1 *sb = st->sb;
        int working = 0;
        unsigned int i;
-       int role;
+       unsigned int role;
+       unsigned int map_disks = info->array.raid_disks;
 
        info->array.major_version = 1;
        info->array.minor_version = st->minor_version;
@@ -629,22 +630,41 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
        } else
                info->reshape_active = 0;
 
+       if (map)
+               for (i=0; i<map_disks; i++)
+                       map[i] = 0;
        for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
                role = __le16_to_cpu(sb->dev_roles[i]);
-               if (/*role == 0xFFFF || */role < info->array.raid_disks)
+               if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) {
                        working++;
+                       if (map && role < map_disks)
+                               map[role] = 1;
+               }
        }
 
        info->array.working_disks = working;
 }
 
+/* Return a newly allocated mdinfo filled in by getinfo_super1().
+ * Returns NULL when a 'subarray' is named or when allocation fails.
+ */
+static struct mdinfo *container_content1(struct supertype *st, char *subarray)
+{
+       struct mdinfo *info;
+
+       if (subarray)
+               return NULL;
+
+       /* Zeroed so that array.raid_disks, which getinfo_super1() reads
+        * on entry, and any field it leaves untouched are well defined.
+        */
+       info = calloc(1, sizeof(*info));
+       if (!info)
+               return NULL;
+       getinfo_super1(st, info, NULL);
+       return info;
+}
+
 static int update_super1(struct supertype *st, struct mdinfo *info,
                         char *update,
                         char *devname, int verbose,
                         int uuid_set, char *homehost)
 {
-       /* NOTE: for 'assemble' and 'force' we need to return non-zero if any change was made.
-        * For others, the return value is ignored.
+       /* NOTE: for 'assemble' and 'force' we need to return non-zero
+        * if any change was made.  For others, the return value is
+        * ignored.
         */
        int rv = 0;
        struct mdp_superblock_1 *sb = st->sb;
@@ -656,8 +676,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                if (sb->events != __cpu_to_le64(info->events))
                        rv = 1;
                sb->events = __cpu_to_le64(info->events);
-       }
-       if (strcmp(update, "force-array")==0) {
+       } else if (strcmp(update, "force-array")==0) {
                /* Degraded array and 'force' requests to
                 * maybe need to mark it 'clean'.
                 */
@@ -668,8 +687,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                                rv = 1;
                        sb->resync_offset = MaxSector;
                }
-       }
-       if (strcmp(update, "assemble")==0) {
+       } else if (strcmp(update, "assemble")==0) {
                int d = info->disk.number;
                int want;
                if (info->disk.state == 6)
@@ -680,8 +698,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        sb->dev_roles[d] = __cpu_to_le16(want);
                        rv = 1;
                }
-       }
-       if (strcmp(update, "linear-grow-new") == 0) {
+       } else if (strcmp(update, "linear-grow-new") == 0) {
                unsigned int i;
                int rfd, fd;
                unsigned int max = __le32_to_cpu(sb->max_dev);
@@ -723,17 +740,14 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                                        ds - __le64_to_cpu(sb->data_offset));
                        }
                }
-       }
-       if (strcmp(update, "linear-grow-update") == 0) {
+       } else if (strcmp(update, "linear-grow-update") == 0) {
                sb->raid_disks = __cpu_to_le32(info->array.raid_disks);
                sb->dev_roles[info->disk.number] =
                        __cpu_to_le16(info->disk.raid_disk);
-       }
-       if (strcmp(update, "resync") == 0) {
+       } else if (strcmp(update, "resync") == 0) {
                /* make sure resync happens */
                sb->resync_offset = 0ULL;
-       }
-       if (strcmp(update, "uuid") == 0) {
+       } else if (strcmp(update, "uuid") == 0) {
                copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid);
 
                if (__le32_to_cpu(sb->feature_map)&MD_FEATURE_BITMAP_OFFSET) {
@@ -741,8 +755,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        bm = (struct bitmap_super_s*)(st->sb+1024);
                        memcpy(bm->uuid, sb->set_uuid, 16);
                }
-       }
-       if (strcmp(update, "homehost") == 0 &&
+       } else if (strcmp(update, "no-bitmap") == 0) {
+               sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
+       } else if (strcmp(update, "homehost") == 0 &&
            homehost) {
                char *c;
                update = "name";
@@ -752,8 +767,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                else
                        strncpy(info->name, sb->set_name, 32);
                info->name[32] = 0;
-       }
-       if (strcmp(update, "name") == 0) {
+       } else if (strcmp(update, "name") == 0) {
                if (info->name[0] == 0)
                        sprintf(info->name, "%d", info->array.md_minor);
                memset(sb->set_name, 0, sizeof(sb->set_name));
@@ -765,8 +779,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        strcat(sb->set_name, info->name);
                } else
                        strcpy(sb->set_name, info->name);
-       }
-       if (strcmp(update, "devicesize") == 0 &&
+       } else if (strcmp(update, "devicesize") == 0 &&
            __le64_to_cpu(sb->super_offset) <
            __le64_to_cpu(sb->data_offset)) {
                /* set data_size to device size less data_offset */
@@ -778,9 +791,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
                        misc->device_size - __le64_to_cpu(sb->data_offset));
                printf("Size is %llu\n", (unsigned long long)
                       __le64_to_cpu(sb->data_size));
-       }
-       if (strcmp(update, "_reshape_progress")==0)
+       } else if (strcmp(update, "_reshape_progress")==0)
                sb->reshape_position = __cpu_to_le64(info->reshape_progress);
+       else
+               rv = -1;
 
        sb->sb_csum = calc_sb_1_csum(sb);
        return rv;
@@ -1206,9 +1220,6 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        free_super1(st);
 
-       if (st->subarray[0])
-               return 1;
-
        if (st->ss == NULL || st->minor_version == -1) {
                int bestvers = -1;
                struct supertype tst;
@@ -1363,6 +1374,7 @@ static struct supertype *match_metadata_desc1(char *arg)
        if (!st) return st;
 
        memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
        st->ss = &super1;
        st->max_devs = 384;
        st->sb = NULL;
@@ -1684,6 +1696,7 @@ struct superswitch super1 = {
        .match_home = match_home1,
        .uuid_from_super = uuid_from_super1,
        .getinfo_super = getinfo_super1,
+       .container_content = container_content1,
        .update_super = update_super1,
        .init_super = init_super1,
        .store_super = store_super1,
diff --git a/sysfs.c b/sysfs.c
index 6e1d77b313daf28189cfa38e753c022f92bdfd71..7a0403d635b7f2067fa48ecb50e9ba326f5a8037 100644 (file)
--- a/sysfs.c
+++ b/sysfs.c
@@ -435,6 +435,17 @@ int sysfs_uevent(struct mdinfo *sra, char *event)
        return 0;
 }      
 
+/* Report whether a sysfs attribute exists for an array or one of its
+ * component devices.
+ *
+ * @sra:  array whose sys_name selects the /sys/block/<name>/md directory
+ * @dev:  component device whose sys_name is used as a sub-directory, or
+ *        NULL to look directly in the array's md directory
+ * @name: attribute file name
+ *
+ * Returns non-zero if stat() succeeds on the resulting path, 0 otherwise.
+ * A path that does not fit in the local buffer is reported as unavailable
+ * rather than being written past the end of the buffer.
+ */
+int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name)
+{
+       char fname[256];
+       struct stat st;
+       int len;
+
+       len = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s/%s",
+                      sra->sys_name, dev?dev->sys_name:"", name);
+       if (len < 0 || len >= (int)sizeof(fname))
+               return 0; /* truncated path cannot name the attribute */
+
+       return stat(fname, &st) == 0;
+}
+
 int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
                       char *name)
 {
@@ -603,7 +614,8 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
                         * yet, so just ignore status for now.
                         */
                        sysfs_set_str(sra, sd, "state", "insync");
-               rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+               if (sd->disk.raid_disk >= 0)
+                       rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
                if (resume)
                        sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
        }
@@ -789,6 +801,28 @@ int sysfs_unique_holder(int devnum, long rdev)
                return found;
 }
 
+int sysfs_freeze_array(struct mdinfo *sra)
+{
+       /* Try to freeze resync/rebuild on this array/container.
+        * Return -1 if the array is busy,
+        * return -2 container cannot be frozen,
+        * return 0 if this kernel doesn't support 'frozen'
+        * return 1 if it worked.
+        *
+        * NOTE(review): no path in this function returns -2; that
+        * value is presumably produced by a caller that handles
+        * containers - confirm before relying on it here.
+        */
+       char buf[20];
+
+       /* An array without a sync_action attribute is treated as
+        * already frozen.
+        */
+       if (!sysfs_attribute_available(sra, NULL, "sync_action"))
+               return 1; /* no sync_action == frozen */
+       /* Could not read the current action: report 0. */
+       if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0)
+               return 0;
+       /* Anything other than "idle" or "frozen" counts as busy;
+        * the comparison includes the trailing newline.
+        */
+       if (strcmp(buf, "idle\n") != 0 &&
+           strcmp(buf, "frozen\n") != 0)
+               return -1;
+       /* Writing "frozen" failed: report 0. */
+       if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0)
+               return 0;
+       return 1;
+}
+
 #ifndef MDASSEMBLE
 
 static char *clean_states[] = {
diff --git a/test b/test
index a31ad40fd379470d15829c2d2bcb4a8f933698d0..d1b458263e1757dbb62ff85846f9dc6996143865 100644 (file)
--- a/test
+++ b/test
@@ -53,7 +53,8 @@ cleanup() {
        $mdadm -Ssq
        for d in 0 1 2 3 4 5 6 7  8 9 10 11 12
        do
-           losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
+           losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d
+           rm -f /dev/disk/by-path/loop*
         done
 }
 
index c3786630935741ed66a7eae5449ec2a64423577b..9f5563210a9f6768bff6c232ea5fc62cfe46077d 100644 (file)
@@ -20,6 +20,7 @@ mdadm $md0 --remove $dev2 $dev1
 check nosync
 check state UUU_
 
+mdadm --zero-superblock $dev2
 mdadm $md0 -a $dev2 
 check recovery
 check wait
index 12c38208cd37b6bd9f516742128ab7864126ecdd..25b1352944244138d6516e1cece4f6e1215fcb27 100644 (file)
@@ -7,9 +7,13 @@
 #
 # add some data, tear down the array, reassemble
 # and make sure it is still there.
+set -e
 
 mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
 mdadm -CR r0 -l0 -n5 /dev/md/ddf0 -z 5000
+if mdadm -CR r0 -l1 -n2 /dev/md/ddf0 -z 5000
+then echo >&2 create with same name should fail ; exit 1
+fi
 mdadm -CR r1 -l1 -n2 /dev/md/ddf0
 mdadm -CR r5 -l5 -n3 /dev/md/ddf0
 testdev /dev/md/r0 5 5000 512
diff --git a/tests/11spare-migration b/tests/11spare-migration
new file mode 100644 (file)
index 0000000..02d19fa
--- /dev/null
@@ -0,0 +1,383 @@
+# Set of tests for autorebuild functionality using mdadm -F
+# To be able to test ddf one must have all loop devices of bigger size, with the ones
+# above number 7 bigger again by any amount (this is not changed for now as it
+# could affect other tests)
+
+. tests/utils
+set -ex
+verbose="yes"
+sleeptime=10
+
+# if listfailed=yes then don't exit if test failed due to wrong
+# spare-migration and just print a list at the end. Other errors still
+# stop the test.
+# if listfailed=no then exit on first failure
+listfailed="yes"
+
+# start Monitor, set monitorpid
+# uses global scan variable
+# all parameters are numbers of devices to be monitored. only used when $scan="no"
+# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1
+monitor(){
+       [ -z $monitorpid ] || return
+       if [ "$scan" == "yes" ]; then
+               $mdadm -F -d 1 --scan --mail root@localhost &
+               monitorpid=$!
+               return
+       fi
+       unset mddevs
+       while [ -n "$1" ]
+       do
+               eval container=\$c$1
+               eval volumes=\$v$1
+               mddevs="$mddevs /dev/$container"
+               if [ "$container" != "$volumes" ]; then
+                       for vol in $volumes; do
+                               mddevs="$mddevs /dev/$vol"
+                       done
+               fi
+               shift
+       done
+       if [ -n "$mddevs" ]; then
+               if [ "$verbose" != "yes" ]; then
+                       $mdadm -F -d 1 $mddevs >&2 &
+                       monitorpid=$!
+               else
+                       $mdadm -F -t -d 1 $mddevs &
+                       monitorpid=$!
+               fi
+       fi
+       [ "$verbose" != "yes" ] || echo $mddevs $monitorpid
+}
+
+test1()
+{
+dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with arrays and common domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test1a()
+{
+dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev1
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test2()
+{
+dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev2
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test3()
+{
+dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with 2 domains
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2
+createconfig domain-$platform"2" $platform spare 3 4 5
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+chksparemoved $c1 $c0 $dev5 n
+tidyup
+}
+
+test4()
+{
+dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, spare loop5 should not be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+chksparemoved $c1 $c0 $dev5 n
+tidyup
+}
+
+test5()
+{
+dsc "Test 5: Two domains, two containers in each domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+setupdevs 2 5 6 $platform
+setupdevs 3 7 8 $platform
+# 2 and 9 for spares
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2 3 4
+createconfig domain-$platform"2" $platform spare 5 6 7 8 9
+monitor 0 1 2 3
+test5a
+test5b
+test5c
+tidyup
+}
+
+test5a()
+{
+dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved"
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v2 $dev5
+chksparemoved $c0 $c2 $dev2 n
+}
+
+test5b()
+{
+dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved"
+mdadm --fail /dev/$v1 $dev3
+chksparemoved $c0 $c1 $dev2
+}
+
+test5c()
+{
+dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved"
+mdadm -a /dev/$c3 $dev9
+chksparemoved $c3 $c2 $dev9
+}
+
+test6()
+{
+dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+# all devices in one domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v1 $dev8
+chksparemoved $c0 $c1 $dev2 n
+tidyup
+}
+
+test7()
+{
+dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9 10
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+
+test7a()
+{
+dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+#all $platform devices in one domain
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 8 9 10
+createconfig domain-$platform"2" $platform spare 2
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+chkspare $c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+test8()
+{
+# ddf does not have getinfo_super_disks implemented so skip this test
+return
+dsc "Test 8: imsm and ddf - spare should not be migrated"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 ddf
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12 n
+tidyup
+}
+
+test9()
+{
+dsc "Test 9: imsm and native 1.2 - spare should not be shared"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 1.2
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12 n
+tidyup
+}
+
+test10()
+{
+dsc "Test 10: Two arrays on the same devices in container"
+setupdevs 0 0 1 $platform 10000
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/md/sub0_ $dev0
+chksparemoved $c1 $c0 $dev2
+if [ $failed -eq 0 ]; then
+# now fail the spare and see if we get another one
+       mdadm --fail /dev/md/sub0_ $dev2
+       mdadm -a /dev/$c1 $dev5
+       chksparemoved $c1 $c0 $dev5
+fi
+tidyup
+}
+
+test11()
+{
+dsc "Test 11: Failed spare from other container should not be used"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev3
+#wait until recovery finishes so no degraded array in c1
+check wait
+mdadm --fail /dev/$v0 $dev0
+chksparemoved $c1 $c0 $dev3 n
+tidyup
+}
+
+test12()
+{
+dsc "Test 12: Only one spare should be taken for rebuild, second not needed"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm -a /dev/$c1 $dev5
+mdadm --fail /dev/$v0 $dev0
+sleep $sleeptime
+chkarray $dev2 n
+sc1=$c
+chkarray $dev5 n
+sc2=$c
+[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1"
+tidyup
+}
+
+test13()
+{
+dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should not be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 4 5 $platform
+# same domain but different action on 4 5 6
+createconfig a
+createconfig domain-$platform $platform spare 0 1
+createconfig domain-$platform $platform include 4 5 6
+monitor 0 1
+mdadm -a /dev/$c1 $dev6
+mdadm --fail /dev/$v0 $dev0
+chksparemoved $c1 $c0 $d6 n
+tidyup
+}
+
+test14()
+{
+dsc "Test 14: One domain, small array on big disks, check if small spare is accepted"
+setupdevs 0 8 9 $platform 10000 1
+setupdevs 1 0 1 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev9
+chksparemoved $c1 $c0 $d2
+tidyup
+}
+
+try()
+{
+test1
+test1a
+test2
+test3
+test4
+test5
+test6
+if [ "$platform" != "1.2" ]; then
+# this is because we can't have a small spare added to native array
+    test7
+    test7a
+fi
+test8
+test9
+if [ "$platform" != "1.2" ]; then
+# we can't create two subarrays on the same devices for native (without
+# partitions)
+    test10
+fi
+test11
+test12
+test13
+test14
+}
+
+try_failed()
+{
+platform="1.2"
+scan="no"
+test5
+test9
+test13
+scan="yes"
+test9
+}
+
+#try_failed
+
+# run the whole suite for every combination of monitor mode and metadata
+for scan in no yes; do
+       for platform in 1.2 imsm; do
+               try
+       done
+done
+
+# flist holds several words once a test has failed, so it must be quoted
+# for the emptiness test to be well formed
+[ "$listfailed" == "no" ] || [ -z "$flist" ] || echo -e "\n FAILED TESTS: $flist"
+
+#cat $targetdir/log
+rm -f /dev/disk/by-path/loop*
diff --git a/tests/env-11spare-migration b/tests/env-11spare-migration
new file mode 100644 (file)
index 0000000..7bf468d
--- /dev/null
@@ -0,0 +1,9 @@
+setup_env() {
+       export IMSM_DEVNAME_AS_SERIAL=1
+       export IMSM_TEST_OROM=1
+       }
+
+reset_env() {
+       unset IMSM_DEVNAME_AS_SERIAL
+       unset IMSM_TEST_OROM
+}
diff --git a/tests/utils b/tests/utils
new file mode 100644 (file)
index 0000000..1d45fa8
--- /dev/null
@@ -0,0 +1,192 @@
+# set of functions used to test policy framework with assemble, incremental and Monitor
+
+set +e
+#create links to be able to use domains
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
+do
+       eval ln -s \$dev$d /dev/disk/by-path/loop$d
+       eval d$d="loop$d"
+       eval mdadm --zero-superblock \$dev$d
+done
+
+devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12"
+
+# on failure print out a few things, then either record the failure
+# (listfailed=yes: append to flist and set failed=1) or exit 1
+# uses testdsc, platform, verbose and listfailed global variables
+err(){
+       echo >&2 "ERROR: $*"
+       cat /etc/mdadm.conf >&2 || true
+       cat /proc/mdstat >&2
+       [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; }
+       ps -e | grep mdadm >&2 || true
+       # quoted so that an empty or unset listfailed is a clean "not yes"
+       if [ "$listfailed" == "yes" ]; then
+               [ "$verbose" != "yes" ] || echo ---FAILED---
+               flist="$flist \n $platform $testdsc"
+               failed=1
+       else
+               exit 1
+       fi
+}
+
+# set test description
+dsc(){
+       failed=0
+       testdsc="$*"
+       [ "$verbose" != "yes" ] || echo $testdsc
+}
+
+killmonitor(){
+       [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; }
+}
+
+tidyup(){
+       killmonitor
+       mdadm -Ss || true
+       mdadm -Ss
+       mdadm --zero-superblock $devices || true
+       udevadm settle
+       rm -f /etc/mdadm.conf
+}
+
+trap tidyup 0 1 2 3 15
+
+# create a RAID 1 array or container and subarray(s) on 2 disks
+# if platform not specified imsm is used
+# if subsize is given, first subarray is created with given size and second one on remaining space
+ccv(){
+       # mddevno used to name created array
+       local mddevno="$1"
+       # numbers of devices to be used in array
+       local devno1="$2"
+       local devno2="$3"
+       local platform="$4"
+       local subsize="$5"
+       local onearray="$6"
+       [ -n "$platform" ] || platform="imsm"
+       if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then
+               eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2
+               udevadm settle
+               [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize
+               [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno
+       else
+               [ -z "$subsize" ] || sizepar="-z $subsize"
+               eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar
+               unset sizepar
+       fi
+}
+
+# get container and subarray using given device from mdstat
+# sets global variables c and v
+# the device name is matched together with the '[' that follows it in
+# /proc/mdstat, so that e.g. loop1 does not also match loop10
+getarray(){
+       local devname=`basename $1`
+       local member="$devname\["
+       local platformtype=`grep -A 1 "$member" /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 'END {print $1}'`
+       c=`grep "inactive.*$member" /proc/mdstat | awk -F " " '{print $1}'`
+       v=`grep " active.*$member" /proc/mdstat | awk -F " " '{print $1}'`
+       # for non-external metadata the array itself acts as the container
+       [ "$platformtype" == "external" ] || c=$v
+}
+
+# check if given device belongs to any container and subarray
+# if $2 given then only container checked
+chkarray(){
+       local devname="$1"
+       local subcheck="$2"
+       getarray $devname
+       [ -n "$c" ] || err "$devname not in any container"
+       [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray"
+}
+
+# test if two devices in the same container/subarray
+# $1 $2 - devices
+# $3 don't check subarrays, only containers
+tst(){
+       local device1=`basename $1`
+       local device2=`basename $2`
+       local subcheck="$3"
+       chkarray $device1 $subcheck
+       local x="$c"
+       local y="$v"
+       chkarray $device2 $subcheck
+       [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container"
+       [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray"
+}
+
+# same as tst, just use numbers of devices instead of names as parameters
+dtst(){
+       local devno1="$1"
+       local devno2="$2"
+       local subcheck="$3"
+       eval tst \$dev$devno1 \$dev$devno2 $subcheck
+}
+
+# create containers/subarrays, check if created properly,
+# set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc.
+setupdevs(){
+       local mddevno="$1"
+       local devno1="$2"
+       local devno2="$3"
+       local p="$4"
+       local subsize="$5"
+       local onearray="$6"
+       [ -n "$p" ] || p=$platform
+       ccv $mddevno $devno1 $devno2 $p $subsize $onearray
+       dtst $devno1 $devno2
+       eval c$mddevno=\"$c\"
+       eval v$mddevno=\"$v\"
+}
+
+# check if given spare in container
+# usage: chkspare container spare [n]  (n if spare shouldn't be in container)
+chkspare(){
+       local container=`basename $1`
+       local spare=$2
+       local expected=$3
+       getarray $spare
+       [ -n "$expected" ] || expected="y"
+       if [ "$expected" == "y" ]; then
+               [ "$c" == "$container" ] || err "$spare not in container $container"
+       else
+               [ "$c" != "$container" ] || err "$spare in container $container"
+       fi
+}
+
+#check if spare was moved from one container to another
+# args: from_container to_container spare [yn]
+# n when spare should remain in original container
+chksparemoved(){
+       sleep $sleeptime
+       from_container="$1"
+       to_container="$2"
+       spare="$3"
+       expected="$4"
+       [ -n "$expected" ] || expected="y"
+       notexpected="n"; [ "$expected" == "y" ] || notexpected="y"
+       chkspare $from_container $spare $notexpected
+       [ $failed -eq 1 ] || chkspare $to_container $spare $expected
+}
+
+
+# for domains defined through policy
+createconfig(){
+conf=/etc/mdadm.conf
+if [ "$1" != "a" ]; then
+{
+       domain=$1
+       metadata=$2
+       action=$3
+       while [ -n "$4" ]; do
+               echo="policy domain=$domain"
+               [ "$metadata" == "noplatform" ] ||  echo="$echo metadata=$metadata"
+               echo="$echo path=loop$4"
+               echo="$echo action=$action"
+               echo "$echo"
+               shift
+       done
+} >> $conf
+else
+{
+       echo "DEVICES $devlist /dev/md1*"
+       mdadm -Ebs
+} >  $conf
+fi
+#[ "$verbose" != "yes" ] || cat /etc/mdadm.conf | grep policy || true
+}
index f9607f37cd68e370fcd17702993dca8428f348cd..1d898332e4dee19c8467a077b133b88914c0beb3 100644 (file)
@@ -3,8 +3,10 @@
 SUBSYSTEM!="block", GOTO="md_end"
 
 # handle potential components of arrays
-ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name"
+ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}"
 ENV{ID_FS_TYPE}=="linux_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
+ENV{ID_FS_TYPE}=="isw_raid_member", ACTION=="remove", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}"
+ENV{ID_FS_TYPE}=="isw_raid_member", ACTION=="add", RUN+="/sbin/mdadm --incremental $env{DEVNAME}"
 
 # handle md arrays
 ACTION!="add|change", GOTO="md_end"
diff --git a/util.c b/util.c
index c9bdd6eb8e1a565734ccf91f8a4f665c13db5169..4b41e2b47b7ce50e3a4bdd23b182f0cfd2b637a3 100644 (file)
--- a/util.c
+++ b/util.c
@@ -65,55 +65,7 @@ struct blkpg_partition {
        char volname[BLKPG_VOLNAMELTH]; /* volume label */
 };
 
-/* partition table structures so we can check metadata position
- * against the end of the last partition.
- * Only handle MBR ant GPT partition tables.
- */
-struct MBR_part_record {
-  __u8 bootable;
-  __u8 first_head;
-  __u8 first_sector;
-  __u8 first_cyl;
-  __u8 part_type;
-  __u8 last_head;
-  __u8 last_sector;
-  __u8 last_cyl;
-  __u32 first_sect_lba;
-  __u32 blocks_num;
-};
-
-struct MBR {
-       __u8 pad[446];
-       struct MBR_part_record parts[4];
-       __u16 magic;
-} __attribute__((packed));
-
-struct GPT_part_entry {
-  unsigned char type_guid[16];
-  unsigned char partition_guid[16];
-  __u64 starting_lba;
-  __u64 ending_lba;
-  unsigned char attr_bits[8];
-  unsigned char name[72];
-} __attribute__((packed));
-
-struct GPT {
-       __u64 magic;
-       __u32 revision;
-       __u32 header_size;
-       __u32 crc;
-       __u32 pad1;
-       __u64 current_lba;
-       __u64 backup_lba;
-       __u64 first_lba;
-       __u64 last_lba;
-       __u8 guid[16];
-       __u64 part_start;
-       __u32 part_cnt;
-       __u32 part_size;
-       __u32 part_crc;
-       __u8 pad2[420];
-} __attribute__((packed));
+#include "part.h"
 
 /* Force a compilation error if condition is true */
 #define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
@@ -124,14 +76,6 @@ struct GPT {
    aren't permitted). */
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
 
-
-/* MBR/GPT magic numbers */
-#define        MBR_SIGNATURE_MAGIC     __cpu_to_le16(0xAA55)
-#define        GPT_SIGNATURE_MAGIC     __cpu_to_le64(0x5452415020494645ULL)
-
-#define MBR_PARTITIONS               4
-#define MBR_GPT_PARTITION_TYPE       0xEE
-
 /*
  * Parse a 128 bit uuid in 4 integers
  * format is 32 hexx nibbles with options :.<space> separator
@@ -216,6 +160,31 @@ int get_linux_version()
        return (a*1000000)+(b*1000)+c;
 }
 
+/* Convert an mdadm version string into a single comparable number.
+ *
+ * @version: string to parse, or NULL to parse the global Version string.
+ *
+ * The number is expected after the first "- v" in the string, in the form
+ * A.B or A.B.C, and must be followed by a space or a '-'.
+ * Returns A*1000000 + B*1000 + C (C is 0 when absent), or -1 if the
+ * string does not have that shape.
+ */
+int mdadm_version(char *version)
+{
+       int ver_a, ver_b;
+       int ver_c = 0;
+       char *p;
+
+       if (version == NULL)
+               version = Version;
+
+       p = strchr(version, '-');
+       if (p == NULL || p[1] != ' ' || p[2] != 'v')
+               return -1;
+       p += 3;
+
+       ver_a = strtoul(p, &p, 10);
+       if (*p != '.')
+               return -1;
+       ver_b = strtoul(p + 1, &p, 10);
+       /* the third component is optional */
+       if (*p == '.')
+               ver_c = strtoul(p + 1, &p, 10);
+
+       if (*p == ' ' || *p == '-')
+               return (ver_a * 1000000) + (ver_b * 1000) + ver_c;
+       return -1;
+}
+
 #ifndef MDASSEMBLE
 long long parse_size(char *size)
 {
@@ -376,6 +345,36 @@ int enough(int level, int raid_disks, int layout, int clean,
        }
 }
 
+/* Decide whether the running array open on @fd still has enough in-sync
+ * devices, by collecting the per-slot availability from the kernel and
+ * handing it to enough().
+ *
+ * Returns the result of enough(), or 0 if the array information cannot
+ * be read, the array reports no raid disks, or memory for the
+ * availability map cannot be allocated.
+ */
+int enough_fd(int fd)
+{
+       struct mdu_array_info_s array;
+       struct mdu_disk_info_s disk;
+       int avail_disks = 0;
+       int i;
+       int rv;
+       char *avail;
+
+       if (ioctl(fd, GET_ARRAY_INFO, &array) != 0 ||
+           array.raid_disks <= 0)
+               return 0;
+       avail = calloc(array.raid_disks, 1);
+       if (!avail)
+               return 0; /* cannot build the map: do not claim "enough" */
+       for (i=0; i<array.raid_disks + array.nr_disks; i++) {
+               disk.number = i;
+               if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+                       continue;
+               /* only devices that are in sync count */
+               if (! (disk.state & (1<<MD_DISK_SYNC)))
+                       continue;
+               /* ignore devices that do not occupy a valid slot */
+               if (disk.raid_disk < 0 || disk.raid_disk >= array.raid_disks)
+                       continue;
+               avail_disks++;
+               avail[disk.raid_disk] = 1;
+       }
+       /* This is used on an active array, so assume it is clean */
+       rv = enough(array.level, array.raid_disks, array.layout,
+                   1,
+                   avail, avail_disks);
+       free(avail);
+       return rv;
+}
+
+
 const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 };
 int same_uuid(int a[4], int b[4], int swapuuid)
 {
@@ -526,7 +525,7 @@ int check_raid(int fd, char *name)
        /* Looks like a raid array .. */
        fprintf(stderr, Name ": %s appears to be part of a raid array:\n",
                name);
-       st->ss->getinfo_super(st, &info);
+       st->ss->getinfo_super(st, &info, NULL);
        st->ss->free_super(st);
        crtime = info.array.ctime;
        level = map_num(pers, info.array.level);
@@ -1049,11 +1048,16 @@ void wait_for(char *dev, int fd)
                dprintf("%s: timeout waiting for %s\n", __func__, dev);
 }
 
-struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
+struct superswitch *superlist[] =
+{
+       &super0, &super1,
+       &super_ddf, &super_imsm,
+       &mbr, &gpt,
+       NULL };
 
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
 
-struct supertype *super_by_fd(int fd)
+struct supertype *super_by_fd(int fd, char **subarrayp)
 {
        mdu_array_info_t array;
        int vers;
@@ -1064,6 +1068,7 @@ struct supertype *super_by_fd(int fd)
        char version[20];
        int i;
        char *subarray = NULL;
+       int container = NoMdDev;
 
        sra = sysfs_read(fd, 0, GET_VERSION);
 
@@ -1085,15 +1090,15 @@ struct supertype *super_by_fd(int fd)
        }
        if (minor == -2 && is_subarray(verstr)) {
                char *dev = verstr+1;
+
                subarray = strchr(dev, '/');
-               int devnum;
                if (subarray)
                        *subarray++ = '\0';
-               devnum = devname2devnum(dev);
                subarray = strdup(subarray);
+               container = devname2devnum(dev);
                if (sra)
                        sysfs_free(sra);
-               sra = sysfs_read(-1, devnum, GET_VERSION);
+               sra = sysfs_read(-1, container, GET_VERSION);
                if (sra && sra->text_version[0])
                        verstr = sra->text_version;
                else
@@ -1107,17 +1112,33 @@ struct supertype *super_by_fd(int fd)
                sysfs_free(sra);
        if (st) {
                st->sb = NULL;
-               if (subarray) {
-                       strncpy(st->subarray, subarray, 32);
-                       st->subarray[31] = 0;
-                       free(subarray);
-               } else
-                       st->subarray[0] = 0;
-       }
+               if (subarrayp)
+                       *subarrayp = subarray;
+               st->container_dev = container;
+               st->devnum = fd2devnum(fd);
+       } else
+               free(subarray);
+
        return st;
 }
 #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
 
+/* Obtain the size of the block device identified by device number @id.
+ *
+ * The device is opened read-only through dev_open() using its
+ * "major:minor" string, and @size is passed on to get_dev_size().
+ * Returns 1 if get_dev_size() reported success, 0 if the device could
+ * not be opened or get_dev_size() failed.
+ */
+int dev_size_from_id(dev_t id, unsigned long long *size)
+{
+       char devid[20];
+       int found;
+       int fd;
+
+       sprintf(devid, "%d:%d", major(id), minor(id));
+       fd = dev_open(devid, O_RDONLY);
+       if (fd < 0)
+               return 0;
+
+       found = get_dev_size(fd, NULL, size) ? 1 : 0;
+       close(fd);
+       return found;
+}
 
 struct supertype *dup_super(struct supertype *orig)
 {
@@ -1132,13 +1153,12 @@ struct supertype *dup_super(struct supertype *orig)
        st->ss = orig->ss;
        st->max_devs = orig->max_devs;
        st->minor_version = orig->minor_version;
-       strcpy(st->subarray, orig->subarray);
        st->sb = NULL;
        st->info = NULL;
        return st;
 }
 
-struct supertype *guess_super(int fd)
+struct supertype *guess_super_type(int fd, enum guess_types guess_type)
 {
        /* try each load_super to find the best match,
         * and return the best superswitch
@@ -1150,14 +1170,21 @@ struct supertype *guess_super(int fd)
        int i;
 
        st = malloc(sizeof(*st));
+       memset(st, 0, sizeof(*st));
+       st->container_dev = NoMdDev;
+
        for (i=0 ; superlist[i]; i++) {
                int rv;
                ss = superlist[i];
+               if (guess_type == guess_array && ss->add_to_super == NULL)
+                       continue;
+               if (guess_type == guess_partitions && ss->add_to_super != NULL)
+                       continue;
                memset(st, 0, sizeof(*st));
                rv = ss->load_super(st, fd, NULL);
                if (rv == 0) {
                        struct mdinfo info;
-                       st->ss->getinfo_super(st, &info);
+                       st->ss->getinfo_super(st, &info, NULL);
                        if (bestsuper == -1 ||
                            besttime < info.array.ctime) {
                                bestsuper = i;
@@ -1207,6 +1234,20 @@ int get_dev_size(int fd, char *dname, unsigned long long *sizep)
        return 1;
 }
 
+/* Return true if fd can only be a container, not a member device,
+ * i.e. it is an md device (md_get_version() succeeds) and its size is
+ * zero or get_dev_size() cannot report one.
+ */
+int must_be_container(int fd)
+{
+       unsigned long long dev_size;
+
+       if (md_get_version(fd) < 0)
+               return 0;       /* not an md device at all */
+       if (get_dev_size(fd, NULL, &dev_size) == 0 || dev_size == 0)
+               return 1;
+       return 0;
+}
 
 /* Sets endofpart parameter to the last block used by the last GPT partition on the device.
  * Returns: 1 if successful
@@ -1437,14 +1478,11 @@ int is_subarray_active(char *subarray, char *container)
        struct mdstat_ent *mdstat = mdstat_read(0, 0);
        struct mdstat_ent *ent;
 
-       for (ent = mdstat; ent; ent = ent->next) {
-               if (is_container_member(ent, container)) {
-                       char *inst = &ent->metadata_version[10+strlen(container)+1];
-
-                       if (!subarray || strcmp(inst, subarray) == 0)
+       for (ent = mdstat; ent; ent = ent->next)
+               if (is_container_member(ent, container))
+                       if (!subarray ||
+                           strcmp(to_subarray(ent, container), subarray) == 0)
                                break;
-               }
-       }
 
        free_mdstat(mdstat);
 
@@ -1458,14 +1496,15 @@ int is_container_active(char *container)
 
 /* open_subarray - opens a subarray in a container
  * @dev: container device name
- * @st: supertype with only ->subarray set
+ * @st: empty supertype
  * @quiet: block reporting errors flag
  *
  * On success returns an fd to a container and fills in *st
  */
-int open_subarray(char *dev, struct supertype *st, int quiet)
+int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet)
 {
        struct mdinfo *mdi;
+       struct mdinfo *info;
        int fd, err = 1;
 
        fd = open(dev, O_RDWR|O_EXCL);
@@ -1515,18 +1554,27 @@ int open_subarray(char *dev, struct supertype *st, int quiet)
                goto free_sysfs;
        }
 
-       if (st->ss->load_super(st, fd, NULL)) {
+       if (!st->ss->load_container) {
                if (!quiet)
-                       fprintf(stderr, Name ": Failed to find subarray-%s in %s\n",
-                               st->subarray, dev);
+                       fprintf(stderr, Name ": %s is not a container\n", dev);
                goto free_name;
        }
 
-       if (!st->loaded_container) {
+       if (st->ss->load_container(st, fd, NULL)) {
                if (!quiet)
-                       fprintf(stderr, Name ": %s is not a container\n", dev);
+                       fprintf(stderr, Name ": Failed to load metadata for %s\n",
+                               dev);
+               goto free_name;
+       }
+
+       info = st->ss->container_content(st, subarray);
+       if (!info) {
+               if (!quiet)
+                       fprintf(stderr, Name ": Failed to find subarray-%s in %s\n",
+                               subarray, dev);
                goto free_super;
        }
+       free(info);
 
        err = 0;
 
@@ -1578,6 +1626,21 @@ int add_disk(int mdfd, struct supertype *st,
        return rv;
 }
 
+/* Remove the disk described by 'info' from the array: external metadata
+ * sets the sysfs "slot" attribute to "none"; otherwise (and always when
+ * built with MDASSEMBLE) HOT_REMOVE_DISK is issued on mdfd.
+ */
+int remove_disk(int mdfd, struct supertype *st,
+               struct mdinfo *sra, struct mdinfo *info)
+{
+#ifndef MDASSEMBLE
+       if (st->ss->external)
+               return sysfs_set_str(sra, info, "slot", "none");
+#endif
+       return ioctl(mdfd, HOT_REMOVE_DISK,
+                    makedev(info->disk.major, info->disk.minor));
+}
+
 int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
 {
        /* Initialise kernel's knowledge of array.
@@ -1833,3 +1896,13 @@ void append_metadata_update(struct supertype *st, void *buf, int len)
 unsigned int __invalid_size_argument_for_IOC = 0;
 #endif
 
+/* Returns 1 if check_env("MDADM_EXPERIMENTAL") is true; otherwise tells the
+ * user on stderr how to enable experimental features and returns 0. */
+int experimental(void)
+{
+       if (check_env("MDADM_EXPERIMENTAL"))
+               return 1;
+       fprintf(stderr, Name ": To use this feature MDADM_EXPERIMENTAL environment variable has to be defined.\n");
+       return 0;
+}
+