git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Merge branch 'master' into from-stable
author NeilBrown <neilb@suse.de>
Thu, 7 Aug 2008 04:12:25 +0000 (14:12 +1000)
committer NeilBrown <neilb@suse.de>
Thu, 7 Aug 2008 04:12:25 +0000 (14:12 +1000)
Conflicts:

Create.c
Manage.c

34 files changed:
Assemble.c
Create.c
Grow.c
Incremental.c
Kill.c
Makefile
Manage.c
Query.c
ReadMe.c
TODO
bitmap.c
crc32.c [new file with mode: 0644]
crc32.h [new file with mode: 0644]
kernel-patch-2.6.25 [new file with mode: 0644]
managemon.c [new file with mode: 0644]
mapfile.c
md.4
mdadm.8
mdadm.c
mdadm.h
mdmon.c [new file with mode: 0644]
mdmon.h [new file with mode: 0644]
mdstat.c
monitor.c [new file with mode: 0644]
msg.c [new file with mode: 0644]
msg.h [new file with mode: 0644]
sg_io.c [new file with mode: 0644]
super-ddf.c [new file with mode: 0644]
super-intel.c [new file with mode: 0644]
super0.c
super1.c
sysfs.c
test
util.c

diff --git a/Assemble.c b/Assemble.c
index 79f091269e68f843cbf9bd03978b8c739f3cf80c..7efa2b8c8bfc4d0526e819f49b207266b9e49f79 100644 (file)
--- a/Assemble.c
+++ b/Assemble.c
@@ -542,8 +542,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                            == devices[devcnt].i.events
                            && (devices[best[i]].i.disk.minor
                                != devices[devcnt].i.disk.minor)
-                           && st->ss->major == 0
-                           && info.array.level != -4) {
+                           && st->ss == &super0
+                           && info.array.level != LEVEL_MULTIPATH) {
                                /* two different devices with identical superblock.
                                 * Could be a mis-detection caused by overlapping
                                 * partitions.  fail-safe.
@@ -845,11 +845,29 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        /* Almost ready to actually *do* something */
        if (!old_linux) {
                int rv;
+
+#ifndef MDASSEMBLE
+               struct mdinfo *sra;
+               if (st->ss->external) {
+                       char ver[100];
+                       strcat(strcpy(ver, "external:"), info.text_version);
+                       sra = sysfs_read(mdfd, 0, 0);
+                       if ((vers % 100) < 2 ||
+                           sra == NULL ||
+                           sysfs_set_str(sra, NULL, "metadata_version",
+                                         ver) < 0) {
+                               fprintf(stderr, Name ": This kernel does not "
+                                       "support external metadata.\n");
+                               return 1;
+                       }
+                       rv = sysfs_set_array(sra, &info);
+               } else
+#endif
                if ((vers % 100) >= 1) { /* can use different versions */
                        mdu_array_info_t inf;
                        memset(&inf, 0, sizeof(inf));
-                       inf.major_version = st->ss->major;
-                       inf.minor_version = st->minor_version;
+                       inf.major_version = info.array.major_version;
+                       inf.minor_version = info.array.minor_version;
                        rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
                } else
                        rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
@@ -895,8 +913,14 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                j = chosen_drive;
 
                        if (j >= 0 /* && devices[j].uptodate */) {
-                               if (ioctl(mdfd, ADD_NEW_DISK,
-                                         &devices[j].i.disk)!=0) {
+#ifndef MDASSEMBLE
+                               if (st->ss->external)
+                                       rv = sysfs_add_disk(sra, &devices[j].i);
+                               else
+#endif
+                                       rv = ioctl(mdfd, ADD_NEW_DISK,
+                                         &devices[j].i.disk);
+                               if (rv) {
                                        fprintf(stderr, Name ": failed to add "
                                                        "%s to %s: %s\n",
                                                devices[j].devname,
@@ -918,6 +942,21 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                        i, mddev);
                }
 
+               if (info.array.level == LEVEL_CONTAINER) {
+                       if (verbose >= 0) {
+                               fprintf(stderr, Name ": Container %s has been "
+                                       "assembled with %d drive%s",
+                                       mddev, okcnt, okcnt==1?"":"s");
+                               if (okcnt < info.array.raid_disks)
+                                       fprintf(stderr, " (out of %d)",
+                                               info.array.raid_disks);
+                               fprintf(stderr, "\n");
+                       }
+                       if (must_close)
+                               close(mdfd);
+                       return 0;
+               }
+
                if (runstop == 1 ||
                    (runstop <= 0 &&
                     ( enough(info.array.level, info.array.raid_disks,
@@ -940,7 +979,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                        /* There is a nasty race with 'mdadm --monitor'.
                                         * If it opens this device before we close it,
                                         * it gets an incomplete open on which IO
-                                        * doesn't work and the capacity if wrong.
+                                        * doesn't work and the capacity is
+                                        * wrong.
                                         * If we reopen (to check for layered devices)
                                         * before --monitor closes, we loose.
                                         *
diff --git a/Create.c b/Create.c
index 7b1836a319b267c67484580c5d7af57cefffa26f..69192abb23b415147a19c60c898dd221d722e329 100644 (file)
--- a/Create.c
+++ b/Create.c
@@ -66,12 +66,18 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        int second_missing = subdevs * 2;
        int missing_disks = 0;
        int insert_point = subdevs * 2; /* where to insert a missing drive */
+       int total_slots;
        int pass;
        int vers;
        int rv;
        int bitmap_fd;
+       int have_container = 0;
+       int container_fd;
+       int need_mdmon = 0;
        unsigned long long bitmapsize;
-       struct mdinfo info;
+       struct mdinfo *sra;
+       struct mdinfo info, *infos;
+       int did_default = 0;
 
        int major_num = BITMAP_MAJOR_HI;
 
@@ -91,6 +97,14 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        return 1;
                }
        }
+       if (level == UnSet) {
+               /* "ddf" and "imsm" metadata only supports one level - should possibly
+                * push this into metadata handler??
+                */
+               if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
+                       level = LEVEL_CONTAINER;
+       }
+
        if (level == UnSet) {
                fprintf(stderr,
                        Name ": a RAID level is needed to create an array.\n");
@@ -116,11 +130,47 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        Name ": This level does not support spare devices\n");
                return 1;
        }
+
+       if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+               /* If given a single device, it might be a container, and we can
+                * extract a device list from there
+                */
+               mdu_array_info_t inf;
+               int fd;
+
+               memset(&inf, 0, sizeof(inf));
+               fd = open(devlist->devname, O_RDONLY);
+               if (fd >= 0 &&
+                   ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+                   inf.raid_disks == 0) {
+                       /* yep, looks like a container */
+                       if (st) {
+                               rv = st->ss->load_super(st, fd,
+                                                       devlist->devname);
+                               if (rv == 0)
+                                       have_container = 1;
+                       } else {
+                               st = guess_super(fd);
+                               if (st && !(rv = st->ss->
+                                           load_super(st, fd,
+                                                      devlist->devname)))
+                                       have_container = 1;
+                               else
+                                       st = NULL;
+                       }
+               }
+               if (fd >= 0)
+                       close(fd);
+               if (have_container) {
+                       subdevs = 0;
+                       devlist = NULL;
+               }
+       }
        if (subdevs > raiddisks+sparedisks) {
                fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
                return 1;
        }
-       if (subdevs < raiddisks+sparedisks) {
+       if (!have_container && subdevs < raiddisks+sparedisks) {
                fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n");
                return 1;
        }
@@ -182,6 +232,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        case 1:
        case LEVEL_FAULTY:
        case LEVEL_MULTIPATH:
+       case LEVEL_CONTAINER:
                if (chunk) {
                        chunk = 0;
                        if (verbose > 0)
@@ -193,14 +244,17 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                return 1;
        }
 
+       if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
+                                             chunk, size, NULL, NULL, verbose>=0))
+               return 1;
+
        /* now look at the subdevs */
        info.array.active_disks = 0;
        info.array.working_disks = 0;
        dnum = 0;
        for (dv=devlist; dv; dv=dv->next, dnum++) {
                char *dname = dv->devname;
-               unsigned long long ldsize, freesize;
-               int fd;
+               unsigned long long freesize;
                if (strcasecmp(dname, "missing")==0) {
                        if (first_missing > dnum)
                                first_missing = dnum;
@@ -212,18 +266,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                info.array.working_disks++;
                if (dnum < raiddisks)
                        info.array.active_disks++;
-               fd = open(dname, O_RDONLY|O_EXCL);
-               if (fd <0 ) {
-                       fprintf(stderr, Name ": Cannot open %s: %s\n",
-                               dname, strerror(errno));
-                       fail=1;
-                       continue;
-               }
-               if (!get_dev_size(fd, dname, &ldsize)) {
-                       fail = 1;
-                       close(fd);
-                       continue;
-               }
                if (st == NULL) {
                        struct createinfo *ci = conf_get_create_info();
                        if (ci)
@@ -231,33 +273,42 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
                if (st == NULL) {
                        /* Need to choose a default metadata, which is different
-                        * depending on the sizes of devices
+                        * depending on geometry of array.
                         */
                        int i;
                        char *name = "default";
-                       if (level >= 1 && ldsize > (0x7fffffffULL<<10))
-                               name = "default/large";
-                       for(i=0; !st && superlist[i]; i++)
+                       for(i=0; !st && superlist[i]; i++) {
                                st = superlist[i]->match_metadata_desc(name);
+                               if (st && !st->ss->validate_geometry
+                                               (st, level, layout, raiddisks,
+                                                chunk, size, dname, &freesize,
+                                                verbose > 0))
+                                       st = NULL;
+                       }
 
                        if (!st) {
-                               fprintf(stderr, Name ": internal error - no default metadata style\n");
+                               fprintf(stderr, Name ": device %s not suitable "
+                                       "for any style of array\n",
+                                       dname);
                                exit(2);
                        }
-                       if (st->ss->major != 0 ||
+                       if (st->ss != &super0 ||
                            st->minor_version != 90)
-                               fprintf(stderr, Name ": Defaulting to version"
-                                       " %d.%d metadata\n",
-                                       st->ss->major,
-                                       st->minor_version);
-               }
-               freesize = st->ss->avail_size(st, ldsize >> 9);
-               if (freesize == 0) {
-                       fprintf(stderr, Name ": %s is too small: %luK\n",
-                               dname, (unsigned long)(ldsize>>10));
-                       fail = 1;
-                       close(fd);
-                       continue;
+                               did_default = 1;
+               } else {
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, size, dname,
+                                                      &freesize,
+                                                      verbose > 0)) {
+
+                               fprintf(stderr,
+                                       Name ": %s is not suitable for "
+                                       "this array.\n",
+                                       dname);
+                               fail = 1;
+                               continue;
+                       }
                }
 
                freesize /= 2; /* convert to K */
@@ -268,9 +319,9 @@ int Create(struct supertype *st, char *mddev, int mdfd,
 
                if (size && freesize < size) {
                        fprintf(stderr, Name ": %s is smaller that given size."
-                               " %lluK < %lluK + superblock\n", dname, freesize, size);
+                               " %lluK < %lluK + metadata\n",
+                               dname, freesize, size);
                        fail = 1;
-                       close(fd);
                        continue;
                }
                if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
@@ -282,24 +333,36 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        minsize = freesize;
                }
                if (runstop != 1 || verbose >= 0) {
+                       int fd = open(dname, O_RDONLY);
+                       if (fd <0 ) {
+                               fprintf(stderr, Name ": Cannot open %s: %s\n",
+                                       dname, strerror(errno));
+                               fail=1;
+                               continue;
+                       }
                        warn |= check_ext2(fd, dname);
                        warn |= check_reiser(fd, dname);
                        warn |= check_raid(fd, dname);
+                       close(fd);
                }
-               close(fd);
        }
        if (fail) {
                fprintf(stderr, Name ": create aborted\n");
                return 1;
        }
        if (size == 0) {
-               if (mindisc == NULL) {
+               if (mindisc == NULL && !have_container) {
                        fprintf(stderr, Name ": no size and no drives given - aborting create.\n");
                        return 1;
                }
-               if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) {
+               if (level > 0 || level == LEVEL_MULTIPATH
+                   || level == LEVEL_FAULTY
+                   || st->ss->external ) {
                        /* size is meaningful */
-                       if (minsize > 0x100000000ULL && st->ss->major == 0) {
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, minsize,
+                                                      NULL, NULL, 0)) {
                                fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
                                return 1;
                        }
@@ -357,7 +420,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                missing_disks++;
        }
 
-       if (level <= 0 && first_missing != subdevs * 2) {
+       if (level <= 0 && first_missing < subdevs * 2) {
                fprintf(stderr,
                        Name ": This level does not support missing devices\n");
                return 1;
@@ -382,12 +445,16 @@ int Create(struct supertype *st, char *mddev, int mdfd,
             ( level == 6 && (insert_point < raiddisks
                              || second_missing < raiddisks))
             ||
+            ( level <= 0 )
+            ||
             assume_clean
-               )
+               ) {
                info.array.state = 1; /* clean, but one+ drive will be missing*/
-       else
+               info.resync_start = ~0ULL;
+       } else {
                info.array.state = 0; /* not clean, but no errors */
-
+               info.resync_start = 0;
+       }
        if (level == 10) {
                /* for raid10, the bitmap size is the capacity of the array,
                 * which is array.size * raid_disks / ncopies;
@@ -424,7 +491,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                + info.array.failed_disks;
        info.array.layout = layout;
        info.array.chunk_size = chunk*1024;
-       info.array.major_version = st->ss->major;
 
        if (name == NULL || *name == 0) {
                /* base name on mddev */
@@ -453,6 +519,31 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
                return 1;
 
+       total_slots = info.array.nr_disks;
+       st->ss->getinfo_super(st, &info);
+
+       if (did_default && verbose >= 0) {
+               if (info.text_version[0] == '/') {
+                       int dnum = devname2devnum(info.text_version+1);
+                       char *path;
+                       int mdp = get_mdp_major();
+                       struct mdinfo *mdi;
+                       if (dnum > 0)
+                               path = map_dev(MD_MAJOR, dnum, 1);
+                       else
+                               path = map_dev(mdp, (-1-dnum)<< 6, 1);
+
+                       mdi = sysfs_read(-1, dnum, GET_VERSION);
+
+                       fprintf(stderr, Name ": Creating array inside "
+                               "%s container %s\n", 
+                               mdi?mdi->text_version:"managed", path);
+                       sysfs_free(mdi);
+               } else
+                       fprintf(stderr, Name ": Defaulting to version"
+                               " %s metadata\n", info.text_version);
+       }
+
        if (bitmap_file && vers < 9003) {
                major_num = BITMAP_MAJOR_HOSTENDIAN;
 #ifdef __BIG_ENDIAN
@@ -476,12 +567,56 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        }
 
 
-
-       if ((vers % 100) >= 1) { /* can use different versions */
+       sra = sysfs_read(mdfd, 0, 0);
+
+       if (st->ss->external) {
+               char ver[100];
+               strcat(strcpy(ver, "external:"),
+                      info.text_version);
+               if (st->ss->external && st->subarray[0]) {
+                       /* member */
+
+                       /* When creating a member, we need to be careful
+                        * to negotiate with mdmon properly.
+                        * If it is already running, we cannot write to
+                        * the devices and must ask it to do that part.
+                        * If it isn't running, we write to the devices,
+                        * and then start it.
+                        * We hold an exclusive open on the container
+                        * device to make sure mdmon doesn't exit after
+                        * we checked that it is running.
+                        *
+                        * For now, fail if it is already running.
+                        */
+                       container_fd = open_dev_excl(st->container_dev);
+                       if (container_fd < 0) {
+                               fprintf(stderr, Name ": Cannot get exclusive "
+                                       "open on container - weird.\n");
+                               return 1;
+                       }
+                       if (mdmon_running(st->container_dev)) {
+                               if (verbose)
+                                       fprintf(stderr, Name ": reusing mdmon "
+                                               "for %s.\n",
+                                               devnum2devname(st->container_dev));
+                               st->update_tail = &st->updates;
+                       } else
+                               need_mdmon = 1;
+               }
+               if ((vers % 100) < 2 ||
+                   sra == NULL ||
+                   sysfs_set_str(sra, NULL, "metadata_version",
+                                 ver) < 0) {
+                       fprintf(stderr, Name ": This kernel does not "
+                               "support external metadata.\n");
+                       return 1;
+               }
+               rv = sysfs_set_array(sra, &info);
+       } else  if ((vers % 100) >= 1) { /* can use different versions */
                mdu_array_info_t inf;
                memset(&inf, 0, sizeof(inf));
-               inf.major_version = st->ss->major;
-               inf.minor_version = st->minor_version;
+               inf.major_version = info.array.major_version;
+               inf.minor_version = info.array.minor_version;
                rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
        } else
                rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
@@ -514,7 +649,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
        }
 
-
+       infos = malloc(sizeof(*infos) * total_slots);
 
        for (pass=1; pass <=2 ; pass++) {
                mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
@@ -523,74 +658,123 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
                        int fd;
                        struct stat stb;
+                       struct mdinfo *inf = &infos[dnum];
 
-                       info.disk.number = dnum;
+                       if (dnum >= total_slots)
+                               abort();
                        if (dnum == insert_point) {
                                moved_disk = dv;
                        }
-                       info.disk.raid_disk = info.disk.number;
-                       if (info.disk.raid_disk < raiddisks)
-                               info.disk.state = (1<<MD_DISK_ACTIVE) |
+                       if (dnum == insert_point ||
+                           strcasecmp(dv->devname, "missing")==0)
+                               continue;
+
+                       switch(pass) {
+                       case 1:
+                               *inf = info;
+
+                               inf->disk.number = dnum;
+                               inf->disk.raid_disk = dnum;
+                               if (inf->disk.raid_disk < raiddisks)
+                                       inf->disk.state = (1<<MD_DISK_ACTIVE) |
                                                (1<<MD_DISK_SYNC);
-                       else
-                               info.disk.state = 0;
-                       if (dv->writemostly)
-                               info.disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+                               else
+                                       inf->disk.state = 0;
+
+                               if (dv->writemostly)
+                                       inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+                               if (st->ss->external && st->subarray[0])
+                                       fd = open(dv->devname, O_RDWR);
+                               else
+                                       fd = open(dv->devname, O_RDWR|O_EXCL);
 
-                       if (dnum == insert_point ||
-                           strcasecmp(dv->devname, "missing")==0) {
-                               info.disk.major = 0;
-                               info.disk.minor = 0;
-                               info.disk.state = (1<<MD_DISK_FAULTY);
-                       } else {
-                               fd = open(dv->devname, O_RDONLY|O_EXCL);
                                if (fd < 0) {
-                                       fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n",
+                                       fprintf(stderr, Name ": failed to open %s "
+                                               "after earlier success - aborting\n",
                                                dv->devname);
                                        return 1;
                                }
                                fstat(fd, &stb);
-                               info.disk.major = major(stb.st_rdev);
-                               info.disk.minor = minor(stb.st_rdev);
+                               inf->disk.major = major(stb.st_rdev);
+                               inf->disk.minor = minor(stb.st_rdev);
+
                                remove_partitions(fd);
-                               close(fd);
-                       }
-                       switch(pass){
-                       case 1:
-                               st->ss->add_to_super(st, &info.disk);
+                               st->ss->add_to_super(st, &inf->disk,
+                                                    fd, dv->devname);
+                               st->ss->getinfo_super(st, inf);
+
+                               /* getinfo_super might have lost these ... */
+                               inf->disk.major = major(stb.st_rdev);
+                               inf->disk.minor = minor(stb.st_rdev);
                                break;
                        case 2:
-                               if (info.disk.state == 1) break;
-                               Kill(dv->devname, 0, 1); /* Just be sure it is clean */
-                               Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */
-                               st->ss->write_init_super(st, &info.disk,
-                                                        dv->devname);
-
-                               if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) {
-                                       fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n",
+                               inf->errors = 0;
+                               rv = 0;
+
+                               if (st->ss->external)
+                                       rv = sysfs_add_disk(sra, inf);
+                               else
+                                       rv = ioctl(mdfd, ADD_NEW_DISK,
+                                                  &inf->disk);
+
+                               if (rv) {
+                                       fprintf(stderr,
+                                               Name ": ADD_NEW_DISK for %s "
+                                               "failed: %s\n",
                                                dv->devname, strerror(errno));
                                        st->ss->free_super(st);
                                        return 1;
                                }
-
                                break;
                        }
                        if (dv == moved_disk && dnum != insert_point) break;
                }
+               if (pass == 1) {
+                       st->ss->write_init_super(st);
+                       flush_metadata_updates(st);
+               }
        }
+       free(infos);
        st->ss->free_super(st);
 
        /* param is not actually used */
-       if (runstop == 1 || subdevs >= raiddisks) {
-               mdu_param_t param;
-               if (ioctl(mdfd, RUN_ARRAY, &param)) {
-                       fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
-                               strerror(errno));
-                       Manage_runstop(mddev, mdfd, -1, 0);
-                       return 1;
+       if (level == LEVEL_CONTAINER)
+               /* No need to start */
+               ;
+       else if (runstop == 1 || subdevs >= raiddisks) {
+               if (st->ss->external) {
+                       switch(level) {
+                       case LEVEL_LINEAR:
+                       case LEVEL_MULTIPATH:
+                       case 0:
+                               sysfs_set_str(sra, NULL, "array_state",
+                                             "active");
+                               need_mdmon = 0;
+                               break;
+                       default:
+                               sysfs_set_str(sra, NULL, "array_state",
+                                             "readonly");
+                               break;
+                       }
+               } else {
+                       mdu_param_t param;
+                       if (ioctl(mdfd, RUN_ARRAY, &param)) {
+                               fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
+                                       strerror(errno));
+                               Manage_runstop(mddev, mdfd, -1, 0);
+                               return 1;
+                       }
                }
                if (verbose >= 0)
                        fprintf(stderr, Name ": array %s started.\n", mddev);
+               if (st->ss->external && st->subarray[0]) {
+                       if (need_mdmon)
+                               start_mdmon(st->container_dev);
+
+                       ping_monitor(devnum2devname(st->container_dev));
+                       close(container_fd);
+               }
        } else {
                fprintf(stderr, Name ": not starting array - not enough devices.\n");
        }
diff --git a/Grow.c b/Grow.c
index a8194bf05b69e3e86b5eefcc88241bdb837ea398..3a31ea547cc9317a1564beaf3d0caed4d5583268 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -69,7 +69,7 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
                return 1;
        }
 
-       nfd = open(newdev, O_RDWR|O_EXCL);
+       nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
        if (nfd < 0) {
                fprintf(stderr, Name ": cannot open %s\n", newdev);
                return 1;
diff --git a/Incremental.c b/Incremental.c
index 0fb9afd3c8aa55717b86e8fa7ea445fb937f82f2..9b22220604fc463f9fbba2ebc7fb493ac7d41ac7 100644 (file)
--- a/Incremental.c
+++ b/Incremental.c
@@ -40,7 +40,7 @@ int Incremental(char *devname, int verbose, int runstop,
                struct supertype *st, char *homehost, int autof)
 {
        /* Add this device to an array, creating the array if necessary
-        * and starting the array if sensibe or - if runstop>0 - if possible.
+        * and starting the array if sensible or - if runstop>0 - if possible.
         *
         * This has several steps:
         *
@@ -140,9 +140,17 @@ int Incremental(char *devname, int verbose, int runstop,
                close(dfd);
                return 1;
        }
-       st->ss->getinfo_super(st, &info);
        close (dfd);
 
+       if (st->ss->container_content) {
+               /* This is a pre-built container array, so we do something
+                * rather different.
+                */
+               return Incremental_container(st, devname, verbose, runstop,
+                                            autof);
+       }
+
+       st->ss->getinfo_super(st, &info);
        /* 3/ Check if there is a match in mdadm.conf */
 
        array_list = conf_get_ident(NULL);
@@ -229,6 +237,7 @@ int Incremental(char *devname, int verbose, int runstop,
                /* Have to guess a bit. */
                int use_partitions = 1;
                char *np, *ep;
+               char *nm, nbuf[1024];
                if ((autof&7) == 3 || (autof&7) == 5)
                        use_partitions = 0;
                np = strchr(info.name, ':');
@@ -244,6 +253,24 @@ int Incremental(char *devname, int verbose, int runstop,
                } else
                        devnum = -1;
 
+               if (match)
+                       nm = match->devname;
+               else {
+                       sprintf(nbuf, "/dev/md/%s", np);
+                       nm = nbuf;
+               }
+               if (stat(nm, &stb) == 0 &&
+                   S_ISBLK(stb.st_mode) &&
+                   major(stb.st_rdev) == (use_partitions ?
+                                          get_mdp_major() : MD_MAJOR)) {
+                       if (use_partitions)
+                               devnum = minor(stb.st_rdev) >> MdpMinorShift;
+                       else
+                               devnum = minor(stb.st_rdev);
+                       if (mddev_busy(use_partitions ? (-1-devnum) : devnum))
+                               devnum = -1;
+               }
+
                if (devnum < 0) {
                        /* Haven't found anything yet, choose something free */
                        devnum = find_free_devnum(use_partitions);
@@ -273,12 +300,11 @@ int Incremental(char *devname, int verbose, int runstop,
        /* - add the device */
                mdu_array_info_t ainf;
                mdu_disk_info_t disk;
-               char md[20];
                struct mdinfo *sra;
 
                memset(&ainf, 0, sizeof(ainf));
-               ainf.major_version = st->ss->major;
-               ainf.minor_version = st->minor_version;
+               ainf.major_version = info.array.major_version;
+               ainf.minor_version = info.array.minor_version;
                if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) {
                        fprintf(stderr, Name
                                ": SET_ARRAY_INFO failed for %s: %s\b",
@@ -286,9 +312,8 @@ int Incremental(char *devname, int verbose, int runstop,
                        close(mdfd);
                        return 2;
                }
-               sprintf(md, "%d.%d\n", st->ss->major, st->minor_version);
                sra = sysfs_read(mdfd, devnum, GET_VERSION);
-               sysfs_set_str(sra, NULL, "metadata_version", md);
+               sysfs_set_str(sra, NULL, "metadata_version", info.text_version);
                memset(&disk, 0, sizeof(disk));
                disk.major = major(stb.st_rdev);
                disk.minor = minor(stb.st_rdev);
@@ -325,29 +350,18 @@ int Incremental(char *devname, int verbose, int runstop,
                int err;
                struct mdinfo *sra;
                struct supertype *st2;
-               sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS |
-                                               GET_STATE));
+               sra = sysfs_read(mdfd, devnum, (GET_DEVS | GET_STATE));
 
-               if (sra->array.major_version != st->ss->major ||
-                   sra->array.minor_version != st->minor_version) {
-                       if (verbose >= 0)
-                               fprintf(stderr, Name
-             ": %s has different metadata to chosen array %s %d.%d %d.%d.\n",
-                                       devname, chosen_name,
-                                       sra->array.major_version,
-                                       sra->array.minor_version,
-                                       st->ss->major, st->minor_version);
-                       close(mdfd);
-                       return 1;
-               }
                sprintf(dn, "%d:%d", sra->devs->disk.major,
                        sra->devs->disk.minor);
                dfd2 = dev_open(dn, O_RDONLY);
                st2 = dup_super(st);
-               if (st2->ss->load_super(st2, dfd2, NULL)) {
+               if (st2->ss->load_super(st2, dfd2, NULL) ||
+                   st->ss->compare_super(st, st2) != 0) {
                        fprintf(stderr, Name
-                               ": Strange error loading metadata for %s.\n",
-                               chosen_name);
+                               ": metadata mismatch between %s and "
+                               "chosen array %s\n",
+                               devname, chosen_name);
                        close(mdfd);
                        close(dfd2);
                        return 2;
@@ -385,8 +399,7 @@ int Incremental(char *devname, int verbose, int runstop,
        }
        /* 6/ Make sure /var/run/mdadm.map contains this array. */
        map_update(&map, devnum,
-                  info.array.major_version,
-                  info.array.minor_version,
+                  info.text_version,
                   info.uuid, chosen_name);
 
        /* 7/ Is there enough devices to possibly start the array? */
@@ -620,8 +633,8 @@ void RebuildMap(void)
                                path = map_dev(MD_MAJOR, md->devnum, 0);
                        else
                                path = map_dev(mdp, (-1-md->devnum)<< 6, 0);
-                       map_add(&map, md->devnum, st->ss->major,
-                               st->minor_version,
+                       map_add(&map, md->devnum,
+                               info.text_version,
                                info.uuid, path ? : "/unknown");
                        st->ss->free_super(st);
                        break;
@@ -708,3 +721,182 @@ int IncrementalScan(int verbose)
        }
        return rv;
 }
+
+static char *container2devname(char *devname)
+{
+       /* Open 'devname' and return the name that devnum2devname() gives
+        * for the device number fd2devnum() reports for that descriptor.
+        * Returns NULL if 'devname' cannot be opened.
+        */
+       int fd = open(devname, O_RDONLY);
+       char *mdname = NULL;
+
+       if (fd >= 0) {
+               mdname = devnum2devname(fd2devnum(fd));
+               close(fd);
+       }
+
+       return mdname;
+}
+
+int Incremental_container(struct supertype *st, char *devname, int verbose,
+                         int runstop, int autof)
+{
+       /* Collect the contents of this container and for each
+        * array, choose a device name and assemble the array.
+        *
+        * st      - provides ss->container_content() (the arrays to
+        *           assemble) and container_dev (used for mdmon)
+        * devname - container device node; only used to check that a
+        *           name can be derived for it
+        * verbose - per-array results are printed when >= 0
+        * runstop - when > 0, start an array even if fewer than
+        *           array.working_disks disks could be added
+        * autof   - (autof&7) of 3 or 5 clears the partitionable
+        *           default; autof>>3 goes to open_mddev_devnum()
+        *
+        * Returns 0 after every listed array has been handled, or 2
+        * as soon as the container name, an md device or its sysfs
+        * information cannot be obtained.
+        */
+
+       struct mdinfo *list = st->ss->container_content(st);
+       struct mdinfo *ra;
+       char *mdname = container2devname(devname);
+
+       if (!mdname) {
+               fprintf(stderr, Name": failed to determine device name\n");
+               return 2;
+       }
+       /* The name only serves as an existence check.  It is assumed to
+        * be malloc()ed by devnum2devname() (NOTE(review): confirm in
+        * util.c), so release it here rather than leak it.
+        */
+       free(mdname);
+
+       for (ra = list ; ra ; ra = ra->next) {
+               struct mdinfo *sra;
+               struct mdinfo *dev;
+               int devnum = -1;
+               int mdfd;
+               char chosen_name[1024];
+               int usepart = 1;
+               char *n;
+               char *cname;
+               int working = 0;
+               char ver[100];
+
+               if ((autof&7) == 3 || (autof&7) == 5)
+                       usepart = 0;
+
+               /* A name of the form "N" or "dN" (optionally followed by
+                * a space) is taken as a device number; a leading 'd'
+                * sets usepart.
+                */
+               n = ra->name;
+               if (*n == 'd')
+                       n++;
+               if (*n) {
+                       devnum = strtoul(n, &n, 10);
+                       if (devnum >= 0 && (*n == 0 || *n == ' ')) {
+                               /* Use this devnum */
+                               usepart = (ra->name[0] == 'd');
+                               if (mddev_busy(usepart ? (-1-devnum) : devnum))
+                                       devnum = -1;
+                       } else
+                               devnum = -1;
+               }
+
+               if (devnum < 0) {
+                       /* Otherwise, if /dev/md/<name> is a block device
+                        * with the expected major and is not busy, reuse
+                        * its device number.
+                        */
+                       char *nm = ra->name;
+                       char nbuf[1024];
+                       struct stat stb;
+                       if (strchr(nm, ':'))
+                               nm = strchr(nm, ':')+1;
+                       snprintf(nbuf, sizeof(nbuf), "/dev/md/%s", nm);
+
+                       if (stat(nbuf, &stb) == 0 &&
+                           S_ISBLK(stb.st_mode) &&
+                           major(stb.st_rdev) == (usepart ?
+                                                  get_mdp_major() : MD_MAJOR)){
+                               if (usepart)
+                                       devnum = minor(stb.st_rdev)
+                                               >> MdpMinorShift;
+                               else
+                                       devnum = minor(stb.st_rdev);
+                               if (mddev_busy(usepart ? (-1-devnum) : devnum))
+                                       devnum = -1;
+                       }
+               }
+
+               /* With usepart set, device N is passed on as -1-N. */
+               if (devnum >= 0)
+                       devnum = usepart ? (-1-devnum) : devnum;
+               else
+                       devnum = find_free_devnum(usepart);
+               mdfd = open_mddev_devnum(NULL, devnum, ra->name,
+                                        chosen_name, autof>>3);
+
+               if (mdfd < 0) {
+                       fprintf(stderr, Name ": failed to open %s: %s.\n",
+                               chosen_name, strerror(errno));
+                       return 2;
+               }
+
+               sra = sysfs_read(mdfd, 0, 0);
+               if (!sra) {
+                       fprintf(stderr, Name
+                               ": failed to read sysfs information for %s\n",
+                               chosen_name);
+                       close(mdfd);
+                       return 2;
+               }
+
+               snprintf(ver, sizeof(ver), "external:%s", ra->text_version);
+               sysfs_set_str(sra, NULL, "metadata_version", ver);
+
+               sysfs_set_array(sra, ra);
+               for (dev = ra->devs; dev; dev = dev->next)
+                       if (sysfs_add_disk(sra, dev) == 0)
+                               working++;
+
+               if (runstop > 0 || working >= ra->array.working_disks) {
+                       switch(ra->array.level) {
+                       case LEVEL_LINEAR:
+                       case LEVEL_MULTIPATH:
+                       case 0:
+                               sysfs_set_str(sra, NULL, "array_state",
+                                             "active");
+                               break;
+                       default:
+                               sysfs_set_str(sra, NULL, "array_state",
+                                             "readonly");
+                               /* start mdmon if needed. */
+                               if (!mdmon_running(st->container_dev))
+                                       start_mdmon(st->container_dev);
+                               /* Keep the name so it can be freed after
+                                * the ping instead of being leaked.
+                                */
+                               cname = devnum2devname(st->container_dev);
+                               ping_monitor(cname);
+                               free(cname);
+                               break;
+                       }
+                       if (verbose >= 0)
+                               printf("Started %s with %d devices\n",
+                                      chosen_name, working);
+                       /* FIXME should have an O_EXCL and wait for read-auto */
+               } else
+                       if (verbose >= 0)
+                               printf("%s assembled with %d devices but "
+                                      "not started\n",
+                                      chosen_name, working);
+               sysfs_free(sra);
+               close(mdfd);
+       }
+       return 0;
+}
diff --git a/Kill.c b/Kill.c
index 0a2763eaa20615940a035ce4d7e2b3be8bd2a222..d5c1e36df4d3724e6cf060238902cb897e01bfed 100644 (file)
--- a/Kill.c
+++ b/Kill.c
@@ -34,7 +34,7 @@
 #include       "md_u.h"
 #include       "md_p.h"
 
-int Kill(char *dev, int force, int quiet)
+int Kill(char *dev, int force, int quiet, int noexcl)
 {
        /*
         * Nothing fancy about Kill.  It just zeroes out a superblock
@@ -44,7 +44,7 @@ int Kill(char *dev, int force, int quiet)
        int fd, rv = 0;
        struct supertype *st;
 
-       fd = open(dev, O_RDWR|O_EXCL);
+       fd = open(dev, O_DIRECT | (noexcl ? O_RDWR : (O_RDWR|O_EXCL)));
        if (fd < 0) {
                if (!quiet)
                        fprintf(stderr, Name ": Couldn't open %s for write - not zeroing\n",
@@ -63,10 +63,8 @@ int Kill(char *dev, int force, int quiet)
        if (force && rv >= 2)
                rv = 0; /* ignore bad data in superblock */
        if (rv== 0 || (force && rv >= 2)) {
-               mdu_array_info_t info;
-               info.major_version = -1; /* zero superblock */
                st->ss->free_super(st);
-               st->ss->init_super(st, &info, 0, "", NULL, NULL);
+               st->ss->init_super(st, NULL, 0, "", NULL, NULL);
                if (st->ss->store_super(st, fd)) {
                        if (!quiet)
                                fprintf(stderr, Name ": Could not zero superblock on %s\n",
index 52bd55051049cc470adbaf36955646f06da68adc..4a19fa9a1b3b41eb9d049f06f52c0a10ba9e3eb2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -69,19 +69,24 @@ MAN8DIR = $(MANDIR)/man8
 OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
        Incremental.o \
-       mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \
-       mapfile.o
+       mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+       restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o
 SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
        Incremental.c \
-       mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \
-       mapfile.c
+       mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+       restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+       Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+       super-ddf.o sha1.o crc32.o msg.o
+
 
 STATICSRC = pwgr.c
 STATICOBJS = pwgr.o
 
 ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
-       super0.c super1.c sha1.c
+       super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c
 ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c sysfs.c
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
 ifdef MDASSEMBLE_AUTO
@@ -89,7 +94,7 @@ ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS)
 ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
 endif
 
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man
 
 everything: all mdadm.static swap_super test_stripe \
        mdassemble mdassemble.auto mdassemble.static mdassemble.man \
@@ -119,6 +124,10 @@ mdadm.Os : $(SRCS) mdadm.h
 mdadm.O2 : $(SRCS) mdadm.h
        gcc -o mdadm.O2 $(CFLAGS)  -DHAVE_STDINT_H -O2 $(SRCS)
 
+mdmon : $(MON_OBJS)
+       $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
 test_stripe : restripe.c mdadm.h
        $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
 
@@ -161,8 +170,9 @@ $(OBJS) : mdadm.h bitmap.h
 sha1.o : sha1.c sha1.h md5.h
        $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
 
-install : mdadm install-man
+install : mdadm mdmon install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+       $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
 
 install-static : mdadm.static install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
@@ -188,7 +198,8 @@ test: mdadm test_stripe swap_super
        @echo "Please run 'sh ./test' as root"
 
 clean : 
-       rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+       rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+       mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
        mdadm.Os mdadm.O2 \
        mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
        mdassemble.klibc swap_super \
index 8297708dc39b9a885ea88ee67b52178d1fc07e6a..714a33b0a3df01430ffd561ab84ff29a813bf30a 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -78,13 +78,18 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
 {
        /* Run or stop the array. array must already be configured
         * required >= 0.90.0
+        * Only print failure messages if quiet == 0;
+        * quiet > 0 means really be quiet
+        * quiet < 0 means we will try again if it fails.
         */
        mdu_param_t param; /* unused */
 
        if (runstop == -1 && md_get_version(fd) < 9000) {
                if (ioctl(fd, STOP_MD, 0)) {
-                       if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n",
-                                           devname, strerror(errno));
+                       if (quiet == 0) fprintf(stderr,
+                                               Name ": stopping device %s "
+                                               "failed: %s\n",
+                                               devname, strerror(errno));
                        return 1;
                }
        }
@@ -111,9 +116,51 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
        } else if (runstop < 0){
                struct map_ent *map = NULL;
                struct stat stb;
-               if (ioctl(fd, STOP_ARRAY, NULL)) {
-                       if (quiet==0) {
-                               fprintf(stderr, Name ": fail to stop array %s: %s\n",
+               struct mdinfo *mdi;
+               /* If this is an mdmon managed array, just write 'inactive'
+                * to the array state and let mdmon clear up.
+                */
+               mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+               if (mdi &&
+                   mdi->array.level > 0 &&
+                   mdi->text_version[0] == '/') {
+                       char *cp;
+
+                       /* This is mdmon managed. */
+                       close(fd);
+                       if (sysfs_set_str(mdi, NULL,
+                                         "array_state", "inactive") < 0) {
+                               if (quiet == 0)
+                                       fprintf(stderr, Name
+                                               ": failed to stop array %s: %s\n",
+                                               devname, strerror(errno));
+                               return 1;
+                       }
+
+                       /* Give monitor a chance to act */
+                       cp = strchr(mdi->text_version+1, '/');
+                       if (cp) /* NULL if there is no second '/': ping with the whole string */
+                               *cp = 0;
+                       ping_monitor(mdi->text_version+1);
+
+                       fd = open(devname, O_RDONLY);
+               } else if (mdi &&
+                          mdi->array.major_version == -1 &&
+                          mdi->array.minor_version == -2 &&
+                          mdi->text_version[0] != '/') {
+                       /* container, possibly mdmon-managed.
+                        * Make sure mdmon isn't opening it, which
+                        * would interfere with the 'stop'
+                        */
+                       ping_monitor(mdi->sys_name);
+               }
+               if (mdi)
+                       sysfs_free(mdi);
+
+               if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) {
+                       if (quiet == 0) {
+                               fprintf(stderr, Name
+                                       ": failed to stop array %s: %s\n",
                                        devname, strerror(errno));
                                if (errno == EBUSY)
                                        fprintf(stderr, "Perhaps a running "
@@ -122,9 +169,10 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
                        }
                        return 1;
                }
+
                if (quiet <= 0)
                        fprintf(stderr, Name ": stopped %s\n", devname);
-               if (fstat(fd, &stb) == 0) {
+               if (fd >= 0 && fstat(fd, &stb) == 0) {
                        int devnum;
                        if (major(stb.st_rdev) == MD_MAJOR)
                                devnum = minor(stb.st_rdev);
@@ -201,6 +249,7 @@ int Manage_subdevs(char *devname, int fd,
        struct supertype *st, *tst;
        int duuid[4];
        int ouuid[4];
+       int lfd = -1;
 
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                fprintf(stderr, Name ": cannot get array info for %s\n",
@@ -227,6 +276,7 @@ int Manage_subdevs(char *devname, int fd,
                unsigned long long ldsize;
                char dvname[20];
                char *dnprintable = dv->devname;
+               int err;
 
                next = dv->next;
                jnext = 0;
@@ -311,9 +361,14 @@ int Manage_subdevs(char *devname, int fd,
                        return 1;
                case 'a':
                        /* add the device */
-
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot add disks to a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
                        /* Make sure it isn't in use (in 2.6 or later) */
-                       tfd = open(dv->devname, O_RDONLY|O_EXCL);
+                       tfd = open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT);
                        if (tfd < 0) {
                                fprintf(stderr, Name ": Cannot open %s: %s\n",
                                        dv->devname, strerror(errno));
@@ -332,7 +387,9 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        close(tfd);
 
-                       if (array.major_version == 0 &&
+
+                       if (!tst->ss->external &&
+                           array.major_version == 0 &&
                            md_get_version(fd)%100 < 2) {
                                if (ioctl(fd, HOT_ADD_DISK,
                                          (unsigned long)stb.st_rdev)==0) {
@@ -451,11 +508,14 @@ int Manage_subdevs(char *devname, int fd,
                        disc.number =j;
                        disc.state = 0;
                        if (array.not_persistent==0) {
+                               int dfd;
                                if (dv->writemostly)
                                        disc.state |= 1 << MD_DISK_WRITEMOSTLY;
-                               tst->ss->add_to_super(tst, &disc);
-                               if (tst->ss->write_init_super(tst, &disc,
-                                                             dv->devname))
+                               dfd = open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+                               tst->ss->add_to_super(tst, &disc, dfd,
+                                                     dv->devname);
+                               /* write_init_super will close 'dfd' */
+                               if (tst->ss->write_init_super(tst))
                                        return 1;
                        } else if (dv->re_add) {
                                /*  this had better be raid1.
@@ -499,13 +559,70 @@ int Manage_subdevs(char *devname, int fd,
 
                case 'r':
                        /* hot remove */
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot remove disks from a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
+                       if (tst->ss->external) {
+                               /* To remove a device from a container, we must
+                                * check that it isn't in use in an array.
+                                * This involves looking in the 'holders'
+                                * directory - there must be just one entry,
+                                * the container.
+                                * To ensure that it doesn't get used as a
+                                * hot spare while we are checking, we
+                                * get an O_EXCL open on the container
+                                */
+                               int dnum = fd2devnum(fd);
+                               lfd = open_dev_excl(dnum);
+                               if (lfd < 0) {
+                                       fprintf(stderr, Name
+                                               ": Cannot get exclusive access "
+                                               " to container - odd\n");
+                                       return 1;
+                               }
+                               if (!sysfs_unique_holder(dnum, stb.st_rdev)) {
+                                       fprintf(stderr, Name
+                                               ": %s is %s, cannot remove.\n",
+                                               dnprintable,
+                                               errno == EEXIST ? "still in use":
+                                               "not a member");
+                                       close(lfd);
+                                       return 1;
+                               }
+                       }
                        /* FIXME check that it is a current member */
-                       if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) {
+                       err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
+                       if (err && errno == ENODEV) {
+                               /* Old kernels rejected this if no personality
+                                * registered */
+                               struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+                               struct mdinfo *dv = NULL;
+                               if (sra)
+                                       dv = sra->devs;
+                               for ( ; dv ; dv=dv->next)
+                                       if (dv->disk.major == major(stb.st_rdev) &&
+                                           dv->disk.minor == minor(stb.st_rdev))
+                                               break;
+                               if (dv)
+                                       err = sysfs_set_str(sra, dv,
+                                                           "state", "remove");
+                               else
+                                       err = -1;
+                               if (sra)
+                                       sysfs_free(sra);
+                       }
+                       if (err) {
                                fprintf(stderr, Name ": hot remove failed "
                                        "for %s: %s\n", dnprintable,
                                        strerror(errno));
+                               if (lfd >= 0)
+                                       close(lfd);
                                return 1;
                        }
+                       close(lfd);
                        if (verbose >= 0)
                                fprintf(stderr, Name ": hot removed %s\n",
                                        dnprintable);
diff --git a/Query.c b/Query.c
index 190ee298834e70d9640e29fbc551bba0a5742934..dc69eb8271ec171c35a418bf883b52cfacb04b6d 100644 (file)
--- a/Query.c
+++ b/Query.c
@@ -96,7 +96,7 @@ int Query(char *dev)
        if (superror == 0) {
                /* array might be active... */
                st->ss->getinfo_super(st, &info);
-               if (st->ss->major == 0) {
+               if (st->ss == &super0) {
                        mddev = get_md_name(info.array.md_minor);
                        disc.number = info.disk.number;
                        activity = "undetected";
@@ -121,7 +121,7 @@ int Query(char *dev)
                       activity,
                       map_num(pers, info.array.level),
                       mddev);
-               if (st->ss->major == 0)
+               if (st->ss == &super0)
                        put_md_name(mddev);
        }
        return 0;
index 031889432c21e85509e2efc5daefc5126fa73f29..12ed17f93e7a869de87623b2a4487baf27feef8c 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -612,6 +612,7 @@ mapping_t pers[] = {
        { "raid10", 10},
        { "10", 10},
        { "faulty", LEVEL_FAULTY},
+       { "container", LEVEL_CONTAINER},
        { NULL, 0}
 };
 
diff --git a/TODO b/TODO
index f79163b88ca434065232034381af3a27aff23c25..279d20db99892c8e79b969a961ae38a7be5fd77c 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,3 +1,38 @@
+ - add 'name' field to metadata type and use it.
+ - use validate_geometry more
+ - metadata should be able to check/reject bitmap stuff.
+
+DDF:
+  Three new metadata types:
+    ddf - used only to create a container.
+    ddf-bvd - used to create an array in a container
+    ddf-svd - used to create a secondary array from bvds.
+
+  Usage:
+    mdadm -C /dev/ddf1 /dev/sd[abcdef]
+    mdadm -C /dev/md1 -e ddf /dev/sd[a-f]
+    mdadm -C /dev/md1 -l container /dev/sd[a-f]
+
+        Each of these creates a new ddf container using all those
+       devices.  The name 'ddf*' signals that ddf metadata should be used.
+       '-e ddf' only supports one level - 'container'.  'container' is only
+       supported by ddf.
+
+    mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ???
+    mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb
+       If exactly one device is given, and it is a container, we select
+       devices from that container.
+       If devices are given that are already in use, they must be in use by
+       a container, and the array is created in the container.
+       If devices given are bvds, we slip under the hood to make
+         the svd arrays.
+
+    mdadm -A /dev/ddf ......
+       base drives make a container.  Anything in that container is started
+        auto-read-only.
+        if /dev/ddf is already assembled, we assemble bvds and svds inside it.
+
+
 2005-dec-20
   Want an incremental assembly mode to work nicely with udev.
   Core usage would be something like
index fdf8884ddfe3d31524d6e583594adfa278425c82..86176696f957edef9d88955e869612ba6c47de85 100644 (file)
--- a/bitmap.c
+++ b/bitmap.c
@@ -122,11 +122,10 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
         */
        unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0;
        bitmap_info_t *info;
-       char *buf, *unaligned;
+       void *buf;
        int n, skip;
 
-       unaligned = malloc(8192*2);
-       buf = (char*) ((unsigned long)unaligned | 8191)+1;
+       posix_memalign(&buf, 512, 8192);
        n = read(fd, buf, 8192);
 
        info = malloc(sizeof(*info));
@@ -145,7 +144,6 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
                fprintf(stderr, Name ": failed to read superblock of bitmap "
                        "file: %s\n", strerror(errno));
                free(info);
-               free(unaligned);
                return NULL;
        }
        memcpy(&info->sb, buf, sizeof(info->sb));
diff --git a/crc32.c b/crc32.c
new file mode 100644 (file)
index 0000000..12d08e5
--- /dev/null
+++ b/crc32.c
@@ -0,0 +1,340 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a factor
+ * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+  protection on the static variables used to control the first-use generation
+  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+  first call get_crc_table() to initialize the tables before allowing more than
+  one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+/* "zutil.h" (for STDC and FAR definitions) is not included; local stand-ins for the zlib macros used in this file follow. */
+#define STDC
+#define FAR
+#define Z_NULL ((void*)0)
+#define OF(X) X
+#define ZEXPORT
+typedef long ptrdiff_t; /* NOTE(review): local stand-in for the <stddef.h> type; in this file it is only referenced inside BYFOUR code */
+#define NOBYFOUR /* keeps BYFOUR undefined below: TBLS becomes 1 and only the byte-wise loop in crc32() is compiled */
+
+#define local static
+
+/* Find a four-byte integer type (u4) for crc32_little() and crc32_big(); skipped entirely when NOBYFOUR is defined. */
+#ifndef NOBYFOUR
+#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
+#    include <limits.h>
+#    define BYFOUR
+#    if (UINT_MAX == 0xffffffffUL)
+       typedef unsigned int u4;
+#    else
+#      if (ULONG_MAX == 0xffffffffUL)
+         typedef unsigned long u4;
+#      else
+#        if (USHRT_MAX == 0xffffffffUL)
+           typedef unsigned short u4;
+#        else
+#          undef BYFOUR     /* can't find a four-byte integer type! */
+#        endif
+#      endif
+#    endif
+#  endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time; REV(w) reverses the byte order of a 32-bit value. */
+#ifdef BYFOUR
+#  define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+                (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+   local unsigned long crc32_little OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+   local unsigned long crc32_big OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+#  define TBLS 8 /* tables 0-3 for the little-endian path, 4-7 (byte-reversed) for the big-endian path */
+#else
+#  define TBLS 1 /* only the byte-wise table crc_table[0] */
+#endif /* BYFOUR */
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1; /* set to 0 by make_crc_table() once crc_table has been filled in */
+local unsigned long FAR crc_table[TBLS][256]; /* filled at run time by make_crc_table() in this DYNAMIC_CRC_TABLE build */
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+   local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+
+/*
+  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+  Polynomials over GF(2) are represented in binary, one bit per coefficient,
+  with the lowest powers in the most significant bit.  Then adding polynomials
+  is just exclusive-or, and multiplying a polynomial by x is a right shift by
+  one.  If we call the above polynomial p, and represent a byte as the
+  polynomial q, also with the lowest power in the most significant bit (so the
+  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+  where a mod b means the remainder after dividing a by b.
+
+  This calculation is done using the shift-register method of multiplying and
+  taking the remainder.  The register is initialized to zero, and for each
+  incoming bit, x^32 is added mod p to the register if the bit is a one (where
+  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+  x (which is shifting right by one and adding x^32 mod p if the bit shifted
+  out is a one).  We start with the highest power (least significant bit) of
+  q and repeat for all eight bits of q.
+
+  The first table is simply the CRC of all possible eight bit values.  This is
+  all the information needed to generate CRCs on data a byte at a time for all
+  combinations of CRC register values and incoming bytes.  The remaining tables
+  allow for word-at-a-time CRC calculation for both big-endian and little-
+  endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+    unsigned long c;
+    int n, k;
+    unsigned long poly;                 /* polynomial exclusive-or pattern */
+    /* p[] (declared after 'first' below) lists the terms of the polynomial defining this crc (except x^32) */
+    static volatile int first = 1;      /* flag to limit concurrent making */
+    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* See if another task is already doing this (not thread-safe, but better
+       than nothing -- significantly reduces duration of vulnerability in
+       case the advice about DYNAMIC_CRC_TABLE is ignored) */
+    if (first) {
+        first = 0;
+
+        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+        poly = 0UL;
+        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+            poly |= 1UL << (31 - p[n]);
+
+        /* generate a crc for every 8-bit value */
+        for (n = 0; n < 256; n++) {
+            c = (unsigned long)n;
+            for (k = 0; k < 8; k++)
+                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+            crc_table[0][n] = c;
+        }
+
+#ifdef BYFOUR
+        /* generate crc for each value followed by one, two, and three zeros,
+           and then the byte reversal of those as well as the first table */
+        for (n = 0; n < 256; n++) {
+            c = crc_table[0][n];
+            crc_table[4][n] = REV(c);
+            for (k = 1; k < 4; k++) {
+                c = crc_table[0][c & 0xff] ^ (c >> 8);
+                crc_table[k][n] = c;
+                crc_table[k + 4][n] = REV(c);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0; /* tables are complete; this also ends the busy-wait in the else branch below */
+    }
+    else {      /* not first */
+        /* wait for the other guy to finish (not efficient, but rare) */
+        while (crc_table_empty)
+            ;
+    }
+
+#ifdef MAKECRCH
+    /* write out CRC tables to "crc32.h" (opened by relative name); returns silently if the file cannot be opened */
+    {
+        FILE *out;
+
+        out = fopen("crc32.h", "w");
+        if (out == NULL) return;
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const unsigned long FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");
+        for (k = 1; k < 8; k++) {
+            fprintf(out, "  },\n  {\n");
+            write_table(out, crc_table[k]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, "  }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void write_table(out, table) /* print the 256 entries of table to out as 0x%08lxUL initializers, five per line */
+    FILE *out;
+    const unsigned long FAR *table;
+{
+    int n;
+
+    for (n = 0; n < 256; n++) /* 4-space indent starts each row; ",\n" ends a row, and the last entry gets "\n" with no comma */
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ", table[n],
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * Return crc_table viewed as a flat array (TBLS*256 entries), building it first under DYNAMIC_CRC_TABLE.
+ * This function can be used by asm versions of crc32().
+ */
+const unsigned long FAR * ZEXPORT get_crc_table(void)
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const unsigned long FAR *)crc_table;
+}
+
+/* DO1 folds one byte from *buf (post-incremented) into crc through crc_table[0]; DO8 is eight consecutive DO1 steps. */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* crc32: fold len bytes at buf into crc and return the result; returns 0UL when buf is Z_NULL.  The 0xffffffffUL XOR on entry and exit is commented out below, so crc is consumed and returned as-is (callers wanting an inverted CRC must presumably apply it themselves). */
+unsigned long ZEXPORT crc32(
+       unsigned long crc,
+       const unsigned char FAR *buf,
+       unsigned len)
+{
+    if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        u4 endian;
+
+        endian = 1; /* run-time byte-order probe: the first byte in memory is non-zero only on a little-endian host */
+        if (*((unsigned char *)(&endian)))
+            return crc32_little(crc, buf, len);
+        else
+            return crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+/*    crc = crc ^ 0xffffffffUL;*/
+    while (len >= 8) {
+        DO8;
+        len -= 8;
+    }
+    if (len) do {
+        DO1;
+    } while (--len);
+    return crc /* ^ 0xffffffffUL*/;
+}
+
+#ifdef BYFOUR
+
+/* DOLIT4 reads one 32-bit word through buf4 (post-incremented) and advances c by four bytes using tables 0-3; DOLIT32 is eight DOLIT4 steps (32 bytes). */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* crc32_little: word-at-a-time CRC used by crc32() on little-endian hosts: single bytes until buf is 4-byte aligned, then 32- and 4-byte chunks, then the tail bytes.  NOTE(review): inverts c on entry and exit (c = ~c), which the byte-wise loop in crc32() does not do. */
+local unsigned long crc32_little(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = (u4)crc;
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)buf;
+    while (len >= 32) {
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* DOBIG4 pre-increments buf4, reads one 32-bit word and advances c by four bytes using the byte-reversed tables 4-7; DOBIG32 is eight DOBIG4 steps (32 bytes). */
+#define DOBIG4 c ^= *++buf4; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* crc32_big: big-endian counterpart of crc32_little; c is byte-reversed with REV on entry and again on return, and buf4 is stepped back by one before the word loops (and forward after) because DOBIG4 pre-increments.  NOTE(review): also inverts c on entry and exit (c = ~c), unlike the byte-wise loop in crc32(). */
+local unsigned long crc32_big(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = REV((u4)crc);
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)buf;
+    buf4--;
+    while (len >= 32) {
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOBIG4;
+        len -= 4;
+    }
+    buf4++;
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
diff --git a/crc32.h b/crc32.h
new file mode 100644 (file)
index 0000000..8053b61
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,441 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+  {
+    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+    0x2d02ef8dUL
+#ifdef BYFOUR
+  },
+  {
+    0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+    0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+    0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+    0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+    0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+    0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+    0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+    0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+    0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+    0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+    0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+    0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+    0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+    0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+    0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+    0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+    0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+    0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+    0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+    0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+    0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+    0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+    0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+    0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+    0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+    0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+    0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+    0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+    0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+    0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+    0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+    0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+    0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+    0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+    0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+    0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+    0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+    0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+    0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+    0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+    0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+    0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+    0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+    0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+    0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+    0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+    0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+    0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+    0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+    0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+    0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+    0x9324fd72UL
+  },
+  {
+    0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+    0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+    0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+    0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+    0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+    0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+    0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+    0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+    0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+    0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+    0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+    0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+    0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+    0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+    0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+    0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+    0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+    0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+    0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+    0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+    0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+    0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+    0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+    0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+    0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+    0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+    0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+    0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+    0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+    0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+    0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+    0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+    0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+    0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+    0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+    0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+    0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+    0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+    0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+    0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+    0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+    0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+    0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+    0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+    0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+    0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+    0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+    0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+    0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+    0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+    0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+    0xbe9834edUL
+  },
+  {
+    0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+    0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+    0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+    0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+    0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+    0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+    0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+    0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+    0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+    0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+    0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+    0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+    0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+    0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+    0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+    0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+    0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+    0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+    0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+    0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+    0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+    0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+    0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+    0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+    0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+    0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+    0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+    0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+    0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+    0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+    0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+    0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+    0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+    0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+    0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+    0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+    0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+    0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+    0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+    0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+    0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+    0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+    0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+    0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+    0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+    0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+    0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+    0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+    0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+    0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+    0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+    0xde0506f1UL
+  },
+  {
+    0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+    0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+    0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+    0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+    0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+    0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+    0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+    0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+    0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+    0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+    0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+    0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+    0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+    0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+    0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+    0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+    0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+    0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+    0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+    0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+    0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+    0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+    0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+    0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+    0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+    0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+    0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+    0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+    0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+    0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+    0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+    0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+    0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+    0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+    0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+    0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+    0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+    0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+    0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+    0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+    0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+    0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+    0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+    0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+    0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+    0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+    0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+    0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+    0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+    0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+    0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+    0x8def022dUL
+  },
+  {
+    0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+    0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+    0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+    0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+    0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+    0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+    0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+    0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+    0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+    0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+    0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+    0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+    0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+    0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+    0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+    0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+    0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+    0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+    0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+    0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+    0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+    0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+    0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+    0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+    0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+    0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+    0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+    0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+    0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+    0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+    0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+    0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+    0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+    0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+    0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+    0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+    0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+    0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+    0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+    0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+    0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+    0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+    0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+    0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+    0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+    0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+    0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+    0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+    0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+    0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+    0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+    0x72fd2493UL
+  },
+  {
+    0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+    0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+    0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+    0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+    0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+    0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+    0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+    0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+    0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+    0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+    0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+    0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+    0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+    0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+    0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+    0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+    0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+    0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+    0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+    0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+    0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+    0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+    0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+    0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+    0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+    0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+    0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+    0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+    0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+    0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+    0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+    0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+    0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+    0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+    0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+    0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+    0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+    0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+    0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+    0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+    0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+    0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+    0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+    0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+    0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+    0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+    0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+    0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+    0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+    0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+    0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+    0xed3498beUL
+  },
+  {
+    0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+    0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+    0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+    0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+    0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+    0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+    0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+    0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+    0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+    0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+    0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+    0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+    0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+    0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+    0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+    0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+    0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+    0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+    0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+    0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+    0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+    0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+    0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+    0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+    0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+    0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+    0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+    0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+    0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+    0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+    0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+    0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+    0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+    0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+    0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+    0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+    0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+    0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+    0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+    0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+    0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+    0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+    0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+    0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+    0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+    0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+    0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+    0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+    0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+    0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+    0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+    0xf10605deUL
+#endif
+  }
+};
diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25
new file mode 100644 (file)
index 0000000..2329007
--- /dev/null
@@ -0,0 +1,199 @@
+Status: ok
+
+Support adding a spare to a live md array with external metadata.
+
+i.e. extend the 'md/dev-XXX/slot' attribute so that you can
+tell a device to fill a vacant slot in an md array.
+
+
+Signed-off-by: Neil Brown <neilb@suse.de>
+
+### Diffstat output
+ ./drivers/md/md.c        |   44 ++++++++++++++++++++++++++++++++++++++++----
+ ./drivers/md/multipath.c |    7 ++++++-
+ ./drivers/md/raid1.c     |    7 ++++++-
+ ./drivers/md/raid10.c    |   10 ++++++++--
+ ./drivers/md/raid5.c     |   10 ++++++++--
+ 5 files changed, 68 insertions(+), 10 deletions(-)
+
+diff .prev/drivers/md/md.c ./drivers/md/md.c
+--- .prev/drivers/md/md.c      2008-06-05 09:19:56.000000000 +1000
++++ ./drivers/md/md.c  2008-06-10 10:41:21.000000000 +1000
+@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char 
+               slot = -1;
+       else if (e==buf || (*e && *e!= '\n'))
+               return -EINVAL;
+-      if (rdev->mddev->pers) {
++      if (rdev->mddev->pers && slot == -1) {
+               /* Setting 'slot' on an active array requires also
+                * updating the 'rd%d' link, and communicating
+                * with the personality with ->hot_*_disk.
+@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char 
+                * failed/spare devices.  This normally happens automatically,
+                * but not when the metadata is externally managed.
+                */
+-              if (slot != -1)
+-                      return -EBUSY;
+               if (rdev->raid_disk == -1)
+                       return -EEXIST;
+               /* personality does all needed checks */
+@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char 
+               sysfs_remove_link(&rdev->mddev->kobj, nm);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
++      } else if (rdev->mddev->pers) {
++              mdk_rdev_t *rdev2;
++              struct list_head *tmp;
++              /* Activating a spare .. or possibly reactivating
++               * if we ever get bitmaps working here.
++               */
++
++              if (rdev->raid_disk != -1)
++                      return -EBUSY;
++
++              if (rdev->mddev->pers->hot_add_disk == NULL)
++                      return -EINVAL;
++
++              rdev_for_each(rdev2, tmp, rdev->mddev)
++                      if (rdev2->raid_disk == slot)
++                              return -EEXIST;
++
++              rdev->raid_disk = slot;
++              if (test_bit(In_sync, &rdev->flags))
++                      rdev->saved_raid_disk = slot;
++              else
++                      rdev->saved_raid_disk = -1;
++              err = rdev->mddev->pers->
++                      hot_add_disk(rdev->mddev, rdev);
++              if (err != 1) {
++                      rdev->raid_disk = -1;
++                      if (err == 0)
++                              return -EEXIST;
++                      return err;
++              }
++              sprintf(nm, "rd%d", rdev->raid_disk);
++              if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
++                      printk(KERN_WARNING
++                             "md: cannot register "
++                             "%s for %s\n",
++                             nm, mdname(rdev->mddev));
++
++              /* don't wakeup anyone, leave that to userspace. */
+       } else {
+               if (slot >= rdev->mddev->raid_disks)
+                       return -ENOSPC;
+@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev,
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+                       err = mddev->pers->hot_add_disk(mddev, rdev);
+-                      if (err)
++                      if (err < 0)
+                               unbind_rdev_from_array(rdev);
+               }
+               if (err)
+
+diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c
+--- .prev/drivers/md/multipath.c       2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/multipath.c   2008-06-10 10:35:03.000000000 +1000
+@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m
+       int found = 0;
+       int path;
+       struct multipath_info *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
++
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
+       print_multipath_conf(conf);
+-      for (path=0; path<mddev->raid_disks; path++) 
++      for (path = first; path <= last; path++)
+               if ((p=conf->multipaths+path)->rdev == NULL) {
+                       q = rdev->bdev->bd_disk->queue;
+                       blk_queue_stack_limits(mddev->queue, q);
+
+diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c
+--- .prev/drivers/md/raid10.c  2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid10.c      2008-06-10 10:28:53.000000000 +1000
+@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde
+       int found = 0;
+       int mirror;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+       if (mddev->recovery_cp < MaxSector)
+               /* only hot-add to in-sync arrays, as recovery is
+@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde
+       if (!enough(conf))
+               return 0;
++      if (rdev->raid_disk >= 0)       /* -1: no slot requested */
++              first = last = rdev->raid_disk;
++
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+               mirror = rdev->saved_raid_disk;
+       else
+-              mirror = 0;
+-      for ( ; mirror < mddev->raid_disks; mirror++)
++              mirror = first;
++      for ( ; mirror <= last ; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
+--- .prev/drivers/md/raid1.c   2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid1.c       2008-06-10 10:41:00.000000000 +1000
+@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev
+       int found = 0;
+       int mirror = 0;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+-      for (mirror=0; mirror < mddev->raid_disks; mirror++)
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
++      for (mirror = first; mirror <= last; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
+--- .prev/drivers/md/raid5.c   2008-05-30 14:49:35.000000000 +1000
++++ ./drivers/md/raid5.c       2008-06-10 10:27:51.000000000 +1000
+@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev
+       int found = 0;
+       int disk;
+       struct disk_info *p;
++      int first = 0;
++      int last = conf->raid_disks - 1;
+       if (mddev->degraded > conf->max_degraded)
+               /* no point adding a device */
+               return 0;
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
+       /*
+        * find the disk ... but prefer rdev->saved_raid_disk
+        * if possible.
+        */
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->disks[rdev->saved_raid_disk].rdev == NULL)
+               disk = rdev->saved_raid_disk;
+       else
+-              disk = 0;
+-      for ( ; disk < conf->raid_disks; disk++)
++              disk = first;
++      for ( ; disk <= last ; disk++)
+               if ((p=conf->disks + disk)->rdev == NULL) {
+                       clear_bit(In_sync, &rdev->flags);
+                       rdev->raid_disk = disk;
diff --git a/managemon.c b/managemon.c
new file mode 100644 (file)
index 0000000..c947552
--- /dev/null
@@ -0,0 +1,524 @@
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ *   Allocate the data structures and open the files.
+ *
+ *   For this we watch /proc/mdstat and find new arrays with
+ *   metadata type that confirms sharing. e.g. "md4"
+ *   When we find a new array we slip it into the list of
+ *   arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ *   and opening new files.
+ *
+ *   These come as a change to raid_disks.  We allocate a new
+ *   version of the data structures and slip it into the list.
+ *   'monitor' will notice and release the old version.
+ *   Changes to level, chunksize, layout.. do not need re-allocation.
+ *   Reductions in raid_disks don't really either, but we handle
+ *   them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ *   as a spare.
+ *
+ * - Deal with degraded array
+ *    We only do this when first noticing the array is degraded.
+ *    This can be when we first see the array, when sync completes or
+ *    when recovery completes.
+ *
+ *    Check if number of failed devices suggests recovery is needed, and
+ *    skip if not.
+ *    Ask metadata to allocate a spare device
+ *    Add device as not in_sync and give a role
+ *    Update metadata.
+ *    Open sysfs files and pass to monitor.
+ *    Make sure that monitor starts recovery....
+ *
+ * - Pass on metadata updates from external programs such as
+ *   mdadm creating a new array.
+ *
+ *   This is most-messy.
+ *   It might involve adding a new array or changing the status of
+ *   a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ *   The required updates are received via a named pipe.  There will
+ *   be one named pipe for each container. Each message contains a
+ *   sync marker: 0x5a5aa5a5, A byte count, and the message.  This is
+ *   passed to the metadata handler which will interpret and process it.
+ *   For 'DDF', messages are internal data blocks with the leading
+ *   'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated versions of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata.  Maybe we require
+ * the metadata to be mdX/NN where NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ *   metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ *   and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays.  This shouldn't
+ *   happen, as we should do all the adding.  Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc.  Update metadata and
+ *   start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include       "mdadm.h"
+#include       "mdmon.h"
+#include       <sys/syscall.h>
+#include       <sys/socket.h>
+#include       <signal.h>
+
+/* Close every file descriptor recorded in 'aa': the per-device
+ * state_fd of each entry on aa->info.devs, then the array-level
+ * action, state and resync_start descriptors.
+ */
+static void close_aa(struct active_array *aa)
+{
+       struct mdinfo *dev = aa->info.devs;
+
+       while (dev) {
+               close(dev->state_fd);
+               dev = dev->next;
+       }
+
+       close(aa->action_fd);
+       close(aa->info.state_fd);
+       close(aa->resync_start_fd);
+}
+
+/* Release an active_array and its list of device records.
+ * The file descriptors are closed (via close_aa) only when
+ * aa->container is NULL; otherwise they are left open, as they
+ * may still be in use by a clone of this array.
+ */
+static void free_aa(struct active_array *aa)
+{
+       struct mdinfo *dev, *following;
+
+       dprintf("%s: devnum: %d\n", __func__, aa->devnum);
+       if (!aa->container)
+               close_aa(aa);
+       /* Remember the successor before each record is freed */
+       for (dev = aa->info.devs; dev; dev = following) {
+               following = dev->next;
+               free(dev);
+       }
+       free(aa);
+}
+
+/* Create a private copy of 'aa' that can be modified and then
+ * swapped in for the original.
+ *
+ * The array structure is copied by value (so file descriptors are
+ * shared with the original) and its list linkage is cleared.  The
+ * device list is rebuilt from fresh copies of each device record,
+ * leaving out records whose state_fd is negative.
+ *
+ * Returns the new array, or NULL if memory could not be allocated,
+ * in which case nothing is left allocated.
+ */
+static struct active_array *duplicate_aa(struct active_array *aa)
+{
+       struct active_array *newa = malloc(sizeof(*newa));
+       struct mdinfo **dp1, **dp2;
+
+       if (!newa)
+               return NULL;
+
+       *newa = *aa;
+       newa->next = NULL;
+       newa->replaces = NULL;
+       newa->info.next = NULL;
+
+       /* dp2 always points at the link that the next copy must fill */
+       dp2 = &newa->info.devs;
+
+       for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) {
+               struct mdinfo *d;
+               if ((*dp1)->state_fd < 0)
+                       continue;
+
+               d = malloc(sizeof(*d));
+               if (!d) {
+                       /* Terminate the partial list first: the link at
+                        * *dp2 still points into the original's device
+                        * list, which must not be freed here.
+                        */
+                       *dp2 = NULL;
+                       while (newa->info.devs) {
+                               d = newa->info.devs;
+                               newa->info.devs = d->next;
+                               free(d);
+                       }
+                       free(newa);
+                       return NULL;
+               }
+               *d = **dp1;
+               *dp2 = d;
+               dp2 = & d->next;
+       }
+       *dp2 = NULL;
+
+       return newa;
+}
+
+/* Send SIGUSR1 to the thread whose id is mon_tid, within this process. */
+static void wakeup_monitor(void)
+{
+       /* Equivalent to tgkill(getpid(), mon_tid, SIGUSR1), issued
+        * through syscall().
+        */
+       syscall(SYS_tgkill, getpid(), mon_tid, SIGUSR1);
+}
+
+/* If an array is waiting on 'discard_this', free it, clear
+ * 'pending_discard' if it referred to the same array, empty
+ * 'discard_this' and signal the monitor.
+ */
+static void remove_old(void)
+{
+       if (!discard_this)
+               return;
+
+       discard_this->next = NULL;
+       free_aa(discard_this);
+       if (pending_discard == discard_this)
+               pending_discard = NULL;
+       discard_this = NULL;
+       wakeup_monitor();
+}
+
+/* Publish 'new' at the head of container->arrays as the replacement
+ * for 'old', then signal the monitor.
+ *
+ * container: the container whose array list is updated
+ * old:       the array being superseded; recorded in pending_discard
+ *            and in new->replaces
+ * new:       the array to insert
+ */
+static void replace_array(struct supertype *container,
+                         struct active_array *old,
+                         struct active_array *new)
+{
+       /* To replace an array, we add it to the top of the list
+        * marked with ->replaces to point to the original.
+        * 'monitor' will take the original out of the list
+        * and put it on 'discard_this'.  We take it from there
+        * and discard it.
+        */
+       remove_old();
+       /* Only one discard may be outstanding: while an earlier
+        * replacement is still pending, poll once a second until
+        * 'discard_this' is set, then free it.
+        */
+       while (pending_discard) {
+               while (discard_this == NULL)
+                       sleep(1);
+               remove_old();
+       }
+       pending_discard = old;
+       new->replaces = old;
+       new->next = container->arrays;
+       container->arrays = new;
+       wakeup_monitor();
+}
+
+/* Metadata update queues:
+ *  update_queue         - updates currently waiting to be handled;
+ *                         wait_update_handled() waits for it to be NULL
+ *  update_queue_handled - updates that check_update_queue() frees
+ *  update_queue_pending - updates appended by queue_metadata_update(),
+ *                         moved to update_queue by check_update_queue()
+ *                         once update_queue is empty
+ */
+struct metadata_update *update_queue = NULL;
+struct metadata_update *update_queue_handled = NULL;
+struct metadata_update *update_queue_pending = NULL;
+
+/* Free every entry on update_queue_handled, then, if update_queue is
+ * empty and updates are pending, move the pending list to
+ * update_queue and signal the monitor.
+ * 'container' is not used.
+ */
+void check_update_queue(struct supertype *container)
+{
+       struct metadata_update *done;
+
+       /* Unlink each entry from the list before releasing it */
+       while ((done = update_queue_handled) != NULL) {
+               update_queue_handled = done->next;
+               free(done->buf);
+               free(done->space);      /* free(NULL) is a no-op */
+               free(done);
+       }
+
+       if (update_queue != NULL || update_queue_pending == NULL)
+               return;
+
+       update_queue = update_queue_pending;
+       update_queue_pending = NULL;
+       wakeup_monitor();
+}
+
+/* Append 'mu' (which may itself be a chain, or NULL) to the end of
+ * update_queue_pending.
+ */
+static void queue_metadata_update(struct metadata_update *mu)
+{
+       struct metadata_update **tail;
+
+       /* Walk to the terminating NULL link */
+       for (tail = &update_queue_pending; *tail; tail = &(*tail)->next)
+               ;
+       *tail = mu;
+}
+
+/* Block until update_queue is NULL, i.e. until any update that was
+ * handed over has been handled by monitor.  Polls every 100ms.
+ */
+void wait_update_handled(void)
+{
+       for (;;) {
+               if (!update_queue)
+                       break;
+               usleep(100 * 1000);
+       }
+}
+
+/* Handle the mdstat entry that describes the container itself.
+ *
+ * The only thing of interest here is if a new device
+ * has been added to the container.  We add it to the
+ * array ignoring any metadata on it.
+ * FIXME should we look for compatible metadata and take hints
+ * about spare assignment.... probably not.
+ *
+ * At present a change in device count is only recorded.
+ */
+static void manage_container(struct mdstat_ent *mdstat,
+                            struct supertype *container)
+{
+       if (mdstat->devcnt == container->devcnt)
+               return;
+
+       /* read /sys/block/NAME/md/dev-??/block/dev to find out
+        * what is there, and compare with container->info.devs
+        * To see what is removed and what is added.
+        * These need to be removed from, or added to, the array
+        */
+       // FIXME
+       container->devcnt = mdstat->devcnt;
+}
+
+/* Bring the record of member array 'a' up to date with 'mdstat' and,
+ * when a->check_degraded is set, ask the metadata handler for spares
+ * and activate any that it supplies.
+ */
+static void manage_member(struct mdstat_ent *mdstat,
+                         struct active_array *a)
+{
+       /* Compare mdstat info with known state of member array.
+        * We do not need to look for device state changes here, that
+        * is dealt with by the monitor.
+        *
+        * We just look for changes which suggest that a reshape is
+        * being requested.
+        * Unfortunately decreases in raid_disks don't show up in
+        * mdstat until the reshape completes FIXME.
+        *
+        * Actually, we also want to handle degraded arrays here by
+        * trying to find and assign a spare.
+        * We do that whenever the monitor tells us to.
+        */
+       // FIXME
+       a->info.array.raid_disks = mdstat->raid_disks;
+       a->info.array.chunk_size = mdstat->chunk_size;
+       // MORE
+
+       if (a->check_degraded) {
+               struct metadata_update *updates = NULL;
+               struct mdinfo *newdev;
+               struct active_array *newa;
+               /* Let any update already handed over be handled
+                * before asking the metadata handler for spares.
+                */
+               wait_update_handled();
+               a->check_degraded = 0;
+
+               /* The array may not be degraded, this is just a good time
+                * to check.
+                */
+               newdev = a->container->ss->activate_spare(a, &updates);
+               if (newdev) {
+                       struct mdinfo *d;
+                       /* Cool, we can add a device or several. */
+                       newa = duplicate_aa(a);
+                       if (!newa) {
+                               /* Could not build the replacement array;
+                                * leave the current one in service.
+                                * NOTE(review): 'newdev' and 'updates' are
+                                * not released here - their ownership is
+                                * not visible from this file.
+                                */
+                               dprintf("%s: cannot duplicate array %d\n",
+                                       __func__, a->devnum);
+                               return;
+                       }
+                       /* suspend recovery - maybe not needed */
+
+                       /* Add device to array and set offset/size/slot.
+                        * and open files for each newdev */
+                       for (d = newdev; d ; d = d->next) {
+                               struct mdinfo *newd;
+                               if (sysfs_add_disk(&newa->info, d) < 0)
+                                       continue;
+                               /* presumably sysfs_add_disk links the new
+                                * device at the head of info.devs - confirm
+                                */
+                               newd = newa->info.devs;
+                               newd->state_fd = sysfs_open(a->devnum,
+                                                           newd->sys_name,
+                                                           "state");
+                               newd->prev_state
+                                       = read_dev_state(newd->state_fd);
+                               newd->curr_state = newd->prev_state;
+                       }
+                       queue_metadata_update(updates);
+                       replace_array(a->container, a, newa);
+                       sysfs_set_str(&a->info, NULL, "sync_action", "repair");
+               }
+       }
+}
+
+/* Build an active_array for a member array that has just appeared in
+ * 'container' and publish it with replace_array(), superseding
+ * 'victim'.
+ *
+ * If the array cannot be read from sysfs, if memory runs out while
+ * building the device list, or if the metadata handler's open_new()
+ * rejects it, the array is still published but with ->container set
+ * to NULL so that it is ignored.  If the active_array itself cannot
+ * be allocated nothing is published.
+ */
+static void manage_new(struct mdstat_ent *mdstat,
+                      struct supertype *container,
+                      struct active_array *victim)
+{
+       /* A new array has appeared in this container.
+        * Hopefully it is already recorded in the metadata.
+        * Check, then create the new array to report it to
+        * the monitor.
+        */
+
+       struct active_array *new;
+       struct mdinfo *mdi, *di;
+       char *inst;
+       int i;
+
+       new = malloc(sizeof(*new));
+       if (!new)
+               /* Nothing has been published yet, so simply give up */
+               return;
+
+       memset(new, 0, sizeof(*new));
+
+       /* The memset left these descriptors at 0.  Mark them unopened
+        * so that closing this array on an early-exit path cannot
+        * close descriptor 0.
+        */
+       new->action_fd = -1;
+       new->info.state_fd = -1;
+       new->resync_start_fd = -1;
+
+       new->devnum = mdstat->devnum;
+       /* NOTE(review): the string returned by devnum2devname() is not
+        * released and its length is not checked against sys_name -
+        * confirm ownership and size against its definition.
+        */
+       strcpy(new->info.sys_name, devnum2devname(new->devnum));
+
+       new->prev_state = new->curr_state = new->next_state = inactive;
+       new->prev_action= new->curr_action= new->next_action= idle;
+
+       new->container = container;
+
+       /* Skip "external:/" (10 chars), the container name and the
+        * following '/' to reach the instance identifier.
+        */
+       inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+
+       mdi = sysfs_read(-1, new->devnum,
+                        GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
+                        GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+       if (!mdi) {
+               /* Eeek. Cannot monitor this array.
+                * Mark it to be ignored by setting container to NULL
+                */
+               new->container = NULL;
+               replace_array(container, victim, new);
+               return;
+       }
+
+       new->info.array = mdi->array;
+       new->info.component_size = mdi->component_size;
+
+       /* One device record per slot, whether or not it is occupied */
+       for (i = 0; i < new->info.array.raid_disks; i++) {
+               /* calloc so that an empty slot has no stale fields */
+               struct mdinfo *newd = calloc(1, sizeof(*newd));
+
+               if (!newd) {
+                       /* Out of memory: publish what we have, but
+                        * marked to be ignored.
+                        */
+                       sysfs_free(mdi);
+                       new->container = NULL;
+                       replace_array(container, victim, new);
+                       return;
+               }
+
+               for (di = mdi->devs; di; di = di->next)
+                       if (i == di->disk.raid_disk)
+                               break;
+
+               if (di) {
+                       memcpy(newd, di, sizeof(*newd));
+
+                       /* sys_name still holds the name copied from
+                        * 'di' here; it is overwritten below.
+                        */
+                       newd->state_fd = sysfs_open(new->devnum,
+                                                   newd->sys_name,
+                                                   "state");
+
+                       newd->prev_state = read_dev_state(newd->state_fd);
+                       newd->curr_state = newd->prev_state;
+               } else {
+                       newd->state_fd = -1;
+                       newd->disk.raid_disk = i;
+                       newd->prev_state = DS_REMOVE;
+                       newd->curr_state = DS_REMOVE;
+               }
+               sprintf(newd->sys_name, "rd%d", i);
+               newd->next = new->info.devs;
+               new->info.devs = newd;
+       }
+       new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
+       new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
+       new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
+       get_resync_start(new);
+       dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
+               new->action_fd, new->info.state_fd);
+
+       sysfs_free(mdi);
+       // finds and compares.
+       if (container->ss->open_new(container, new, inst) < 0) {
+               // FIXME close all those files
+               new->container = NULL;
+               replace_array(container, victim, new);
+               return;
+       }
+       replace_array(container, victim, new);
+       return;
+}
+
+void manage(struct mdstat_ent *mdstat, struct supertype *container)
+{
+       /* Walk a freshly read mdstat list.  The container's own entry
+        * goes to manage_container(), a tracked member to
+        * manage_member(), and an untracked or ignored (container ==
+        * NULL) member to manage_new().  Other entries are skipped.
+        */
+       struct mdstat_ent *ent;
+
+       for (ent = mdstat; ent != NULL; ent = ent->next) {
+               struct active_array *arr;
+               const char *ver = ent->metadata_version;
+               size_t len;
+
+               if (ent->devnum == container->devnum) {
+                       manage_container(ent, container);
+                       continue;
+               }
+               if (!ver || strncmp(ver, "external:/", 10) != 0)
+                       continue; /* not external metadata */
+               len = strlen(container->devname);
+               if (strncmp(ver + 10, container->devname, len) != 0 ||
+                   ver[10 + len] != '/')
+                       continue; /* not a member of this container */
+               for (arr = container->arrays; arr; arr = arr->next)
+                       if (arr->devnum == ent->devnum)
+                               break;
+               if (arr && arr->container)
+                       manage_member(ent, arr);
+               if (!arr || !arr->container)
+                       manage_new(ent, container, arr);
+       }
+}
+
+static void handle_message(struct supertype *container, struct metadata_update *msg)
+{
+       /* Zero length: wake the monitor and wait for monitor_loop_cnt
+        * to advance.  Otherwise queue the message as a metadata update.
+        */
+       struct metadata_update *update;
+       int target;
+
+       if (msg->len != 0) {
+               update = malloc(sizeof(*update));
+               update->next = NULL;
+               update->space = NULL;
+               update->len = msg->len;
+               update->buf = msg->buf;
+               msg->buf = NULL; /* buffer now belongs to the update */
+               if (container->ss->prepare_update)
+                       container->ss->prepare_update(container, update);
+               queue_metadata_update(update);
+               return;
+       }
+       /* odd count: wait until next pselect; even: wait for 2 pselects */
+       target = monitor_loop_cnt;
+       target += (target & 1) ? 2 : 3;
+       wakeup_monitor();
+       wait_update_handled();
+       while (monitor_loop_cnt - target < 0)
+               usleep(10 * 1000);
+}
+
+void read_sock(struct supertype *container)
+{
+       /* Accept one connection on the container's socket and serve
+        * messages from it until a receive or an ack fails.
+        */
+       struct metadata_update msg;
+       const int timeout = 3; /* 3 second timeout before hanging up the socket */
+       long flags;
+       int conn = accept(container->sock, NULL, NULL);
+
+       if (conn < 0)
+               return;
+
+       /* switch the new connection to non-blocking mode */
+       flags = fcntl(conn, F_GETFL, 0);
+       flags |= O_NONBLOCK;
+       fcntl(conn, F_SETFL, flags);
+
+       for (;;) {
+               msg.buf = NULL;
+
+               /* read and validate the message */
+               if (receive_message(conn, &msg, timeout) != 0)
+                       break;
+               handle_message(container, &msg);
+               if (ack(conn, timeout) < 0)
+                       break;
+       }
+
+       close(conn);
+}
+
+int exit_now = 0;
+int manager_ready = 0;
+void do_manager(struct supertype *container)
+{
+       /* Manager main loop.  Each pass re-reads mdstat, runs manage()
+        * on it, serves the socket, calls remove_old() and
+        * check_update_queue(), then waits in mdstat_wait_fd().
+        * The only way out is exit(0) once exit_now is set.
+        */
+       struct mdstat_ent *ents;
+       sigset_t wait_mask;
+
+       /* current signal mask, minus SIGUSR1, for mdstat_wait_fd() */
+       sigprocmask(SIG_UNBLOCK, NULL, &wait_mask);
+       sigdelset(&wait_mask, SIGUSR1);
+
+       for (;;) {
+               if (exit_now)
+                       exit(0);
+
+               ents = mdstat_read(1, 0);
+               manage(ents, container);
+               read_sock(container);
+               free_mdstat(ents);
+
+               remove_old();
+               check_update_queue(container);
+
+               manager_ready = 1;
+               mdstat_wait_fd(container->sock, &wait_mask);
+       }
+}
index 746073d059c508685a3743af79b995af36b05d6a..cf2ca2bf8a5837ba28456fb7a9cd32938e4294b0 100644 (file)
--- a/mapfile.c
+++ b/mapfile.c
@@ -33,8 +33,8 @@
  * also allows the array device name to be easily found.
  *
  * The map file is line based with space separated fields.  The fields are:
- *  Device id  -  mdX or mdpX  where is a number.
- *  metadata   -  0.90 1.0 1.1 1.2
+ *  Device id  -  mdX or mdpX  where X is a number.
+ *  metadata   -  0.90 1.0 1.1 1.2 ddf ...
  *  UUID       -  uuid of the array
  *  path       -  path where device created: /dev/md/home
  *
@@ -62,7 +62,7 @@ int map_write(struct map_ent *mel)
                        fprintf(f, "mdp%d ", -1-mel->devnum);
                else
                        fprintf(f, "md%d ", mel->devnum);
-               fprintf(f, "%d.%d ", mel->major, mel->minor);
+               fprintf(f, "%s ", mel->metadata);
                fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0],
                        mel->uuid[1], mel->uuid[2], mel->uuid[3]);
                fprintf(f, "%s\n", mel->path);
@@ -87,13 +87,12 @@ int map_write(struct map_ent *mel)
 }
 
 void map_add(struct map_ent **melp,
-           int devnum, int major, int minor, int uuid[4], char *path)
+           int devnum, char *metadata, int uuid[4], char *path)
 {
        struct map_ent *me = malloc(sizeof(*me));
 
        me->devnum = devnum;
-       me->major = major;
-       me->minor = minor;
+       strcpy(me->metadata, metadata);
        memcpy(me->uuid, uuid, 16);
        me->path = strdup(path);
        me->next = *melp;
@@ -105,7 +104,8 @@ void map_read(struct map_ent **melp)
        FILE *f;
        char buf[8192];
        char path[200];
-       int devnum, major, minor, uuid[4];
+       int devnum, uuid[4];
+       char metadata[30];
        char nam[4];
 
        *melp = NULL;
@@ -117,12 +117,12 @@ void map_read(struct map_ent **melp)
                return;
 
        while (fgets(buf, sizeof(buf), f)) {
-               if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s",
-                          nam, &devnum, &major, &minor, uuid, uuid+1,
+               if (sscanf(buf, " md%1[p]%d %s %x:%x:%x:%x %200s",
+                          nam, &devnum, metadata, uuid, uuid+1,
                           uuid+2, uuid+3, path) == 9) {
                        if (nam[0] == 'p')
                                devnum = -1 - devnum;
-                       map_add(melp, devnum, major, minor, uuid, path);
+                       map_add(melp, devnum, metadata, uuid, path);
                }
        }
        fclose(f);
@@ -138,7 +138,7 @@ void map_free(struct map_ent *map)
        }
 }
 
-int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
               int *uuid, char *path)
 {
        struct map_ent *map, *mp;
@@ -151,15 +151,14 @@ int map_update(struct map_ent **mpp, int devnum, int major, int minor,
 
        for (mp = map ; mp ; mp=mp->next)
                if (mp->devnum == devnum) {
-                       mp->major = major;
-                       mp->minor = minor;
+                       strcpy(mp->metadata, metadata);
                        memcpy(mp->uuid, uuid, 16);
                        free(mp->path);
                        mp->path = strdup(path);
                        break;
                }
        if (!mp)
-               map_add(&map, devnum, major, minor, uuid, path);
+               map_add(&map, devnum, metadata, uuid, path);
        *mpp = NULL;
        rv = map_write(map);
        map_free(map);
diff --git a/md.4 b/md.4
index dfd287f1f156db44c03715fa26303c0b6f6c9011..ea12eaffaf1570f040801ff518c5a62b4b655d12 100644 (file)
--- a/md.4
+++ b/md.4
@@ -526,10 +526,22 @@ Finally, "idle" can be written to stop the check/repair process.
 .B md/stripe_cache_size
 This is only available on RAID5 and RAID6.  It records the size (in
 pages per device) of the  stripe cache which is used for synchronising
-all read and write operations to the array.  The default is 128.
+all write operations to the array and all read operations if the array
+is degraded.  The default is 256.  Valid values are 17 to 32768.
 Increasing this number can increase performance in some situations, at
-some cost in system memory.
+some cost in system memory.  Note, setting this value too high can
+result in an "out of memory" condition for the system.
 
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6.  This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading".  For fairness this defaults to
+1.  Valid values are 0 to stripe_cache_size.  Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.  
 
 .SS KERNEL PARAMETERS
 
diff --git a/mdadm.8 b/mdadm.8
index be8568d19a15b5c2aadb4ddd4f9716179748d196..3c283ca9e7b4bda5beb92b41f6b35acc499bdd59 100644 (file)
--- a/mdadm.8
+++ b/mdadm.8
@@ -1937,6 +1937,16 @@ that no metadata updates are made and no attempt at resync or recovery
 happens.  Further devices that are found before the first write can
 still be added safely.
 
+
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this value to 1 will prevent mdadm from automatically launching
+mdmon.  This variable is intended primarily for debugging mdadm/mdmon.
+
 .SH EXAMPLES
 
 .B "  mdadm \-\-query /dev/name-of-device"
diff --git a/mdadm.c b/mdadm.c
index 3aa3b132eee6d1d9eb555a834e479186a7f0388c..b7865ef73d63f08bc22e0e090ad424ae2f817f84 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -1272,7 +1272,8 @@ int main(int argc, char *argv[])
                                                     export, test, homehost);
                                        continue;
                                case 'K': /* Zero superblock */
-                                       rv |= Kill(dv->devname, force, quiet); continue;
+                                       rv |= Kill(dv->devname, force, quiet,0);
+                                       continue;
                                case 'Q':
                                        rv |= Query(dv->devname); continue;
                                case 'X':
diff --git a/mdadm.h b/mdadm.h
index 5c18d15e594b84fb47c0d003e82fa5590ddbc4d4..52d94352e8ff2a3211686fcc996b5f1220e3e14e 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -76,6 +76,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #include       "md_u.h"
 #include       "md_p.h"
 #include       "bitmap.h"
+#include       "msg.h"
 
 #include <endian.h>
 /* Redhat don't like to #include <asm/byteorder.h>, and
@@ -106,6 +107,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) (_x)
 #define __le32_to_cpu(_x) (_x)
 #define __le64_to_cpu(_x) (_x)
+
+#define        __cpu_to_be16(_x) bswap_16(_x)
+#define __cpu_to_be32(_x) bswap_32(_x)
+#define __cpu_to_be64(_x) bswap_64(_x)
+#define        __be16_to_cpu(_x) bswap_16(_x)
+#define __be32_to_cpu(_x) bswap_32(_x)
+#define __be64_to_cpu(_x) bswap_64(_x)
 #elif BYTE_ORDER == BIG_ENDIAN
 #define        __cpu_to_le16(_x) bswap_16(_x)
 #define __cpu_to_le32(_x) bswap_32(_x)
@@ -113,6 +121,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) bswap_16(_x)
 #define __le32_to_cpu(_x) bswap_32(_x)
 #define __le64_to_cpu(_x) bswap_64(_x)
+
+#define        __cpu_to_be16(_x) (_x)
+#define __cpu_to_be32(_x) (_x)
+#define __cpu_to_be64(_x) (_x)
+#define        __be16_to_cpu(_x) (_x)
+#define __be32_to_cpu(_x) (_x)
+#define __be64_to_cpu(_x) (_x)
 #else
 #  error "unknown endianness."
 #endif
@@ -128,18 +143,36 @@ struct mdinfo {
        int                     uuid[4];
        char                    name[33];
        unsigned long long      data_offset;
-       unsigned long long      component_size;
+       unsigned long long      component_size; /* same as array.size, except in
+                                                * sectors and up to 64bits.
+                                                */
        int                     reshape_active;
        unsigned long long      reshape_progress;
+       unsigned long long      resync_start;
        int                     new_level, delta_disks, new_layout, new_chunk;
        int                     errors;
        int                     cache_size; /* size of raid456 stripe cache*/
        int                     mismatch_cnt;
        char                    text_version[50];
 
+       int container_member; /* for assembling external-metadata arrays
+                              * This is to be used internally by metadata
+                              * handler only */
+
        char            sys_name[20];
        struct mdinfo *devs;
        struct mdinfo *next;
+
+       /* Device info for mdmon: */
+       int state_fd;
+       #define DS_FAULTY       1
+       #define DS_INSYNC       2
+       #define DS_WRITE_MOSTLY 4
+       #define DS_SPARE        8
+       #define DS_BLOCKED      16
+       #define DS_REMOVE       1024
+       int prev_state, curr_state, next_state;
+
 };
 
 struct createinfo {
@@ -252,22 +285,27 @@ struct mdstat_ent {
        char            *pattern; /* U or up, _ for down */
        int             percent; /* -1 if no resync */
        int             resync; /* 1 if resync, 0 if recovery */
+       int             devcnt;
+       int             raid_disks;
+       int             chunk_size;
+       char *          metadata_version;
        struct mdstat_ent *next;
 };
 
 extern struct mdstat_ent *mdstat_read(int hold, int start);
 extern void free_mdstat(struct mdstat_ent *ms);
 extern void mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
 extern int mddev_busy(int devnum);
 
 struct map_ent {
        struct map_ent *next;
        int     devnum;
-       int     major,minor;
+       char    metadata[20];
        int     uuid[4];
        char    *path;
 };
-extern int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+extern int map_update(struct map_ent **mpp, int devnum, char *metadata,
                      int uuid[4], char *path);
 extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]);
 extern void map_read(struct map_ent **melp);
@@ -275,7 +313,7 @@ extern int map_write(struct map_ent *mel);
 extern void map_delete(struct map_ent **mapp, int devnum);
 extern void map_free(struct map_ent *map);
 extern void map_add(struct map_ent **melp,
-                   int devnum, int major, int minor, int uuid[4], char *path);
+                   int devnum, char *metadata, int uuid[4], char *path);
 
 /* various details can be requested */
 #define        GET_LEVEL       1
@@ -285,6 +323,7 @@ extern void map_add(struct map_ent **melp,
 #define GET_CACHE      16
 #define        GET_MISMATCH    32
 #define        GET_VERSION     64
+#define        GET_DISKS       128
 
 #define        GET_DEVS        1024 /* gets role, major, minor */
 #define        GET_OFFSET      2048
@@ -295,6 +334,7 @@ extern void map_add(struct map_ent **melp,
 /* If fd >= 0, get the array it is open on,
  * else use devnum. >=0 -> major9. <0.....
  */
+extern int sysfs_open(int devnum, char *devname, char *attr);
 extern void sysfs_free(struct mdinfo *sra);
 extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options);
 extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
@@ -303,6 +343,11 @@ extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, unsigned long long val);
 extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                        char *name, unsigned long long *val);
+extern int sysfs_set_array(struct mdinfo *sra,
+                          struct mdinfo *info);
+extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd);
+extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
+extern int sysfs_unique_holder(int devnum, long rdev);
 
 
 extern int save_stripes(int *source, unsigned long long *offsets,
@@ -326,28 +371,126 @@ extern mapping_t r5layout[], pers[], modes[], faultylayout[];
 
 extern char *map_dev(int major, int minor, int create);
 
+struct active_array;
+struct metadata_update;
 
+/* A superswitch provides the entry points to a metadata handler.
+ *
+ * The super_switch primarily operates on some "metadata" that
+ * is accessed via the 'supertype'.
+ * This metadata has one of three possible sources.
+ * 1/ It is read from a single device.  In this case it may not completely
+ *    describe the array or arrays as some information might be on other
+ *    devices.
+ * 2/ It is read from all devices in a container.  In this case all
+ *    information is present.
+ * 3/ It is created by ->init_super / ->add_to_super.  In this case it will
+ *    be complete once enough ->add_to_super calls have completed.
+ *
+ * When creating an array inside a container, the metadata will be
+ * formed by a combination of 2 and 3.  The metadata or the array is read,
+ * then new information is added.
+ *
+ * The metadata must sometimes have a concept of a 'current' array
+ * and a 'current' device.
+ * The 'current' array is set by init_super to be the newly created array,
+ * or is set by super_by_fd when it finds it is looking at an array inside
+ * a container.
+ *
+ * The 'current' device is either the device that the metadata was read from
+ * in case 1, or the last device added by add_to_super in case 3.
+ * Case 2 does not identify a 'current' device.
+ */
 extern struct superswitch {
+
+       /* Used to report details of metadata read from a component
+        * device. ->load_super has been called.
+        */
        void (*examine_super)(struct supertype *st, char *homehost);
        void (*brief_examine_super)(struct supertype *st);
        void (*export_examine_super)(struct supertype *st);
+
+       /* Used to report details of an active array.
+        * ->load_super was possibly given a 'component' string.
+        */
        void (*detail_super)(struct supertype *st, char *homehost);
        void (*brief_detail_super)(struct supertype *st);
        void (*export_detail_super)(struct supertype *st);
+
+       /* Used:
+        *   to get the uuid for storing in bitmap metadata
+        *   and 'reshape' backup-data metadata
+        *   To see if a device is being re-added to an array it was part of.
+        */
        void (*uuid_from_super)(struct supertype *st, int uuid[4]);
+
+       /* Extra generic details from metadata.  This could be details about
+        * the container, or about an individual array within the container.
+        * The determination is made either by:
+        *   load_super being given a 'component' string.
+        *   validate_geometry determining what to create.
+        * The info includes both array information and device information.
+        * The particular device should be:
+        *   The last device added by add_to_super
+        *   The device the metadata was loaded from by load_super
+        */
        void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
+
+       /* Check if the given metadata is flagged as belonging to "this"
+        * host.  For arrays that don't determine a minor-number, this
+        * can always be true (??)
+        */
        int (*match_home)(struct supertype *st, char *homehost);
+
+       /* Make one of several generic modifications to metadata
+        * prior to assembly (or other times).
+        *   sparc2.2  - first bug in early 0.90 metadata
+        *   super-minor - change name of 0.90 metadata
+        *   summaries - 'correct' any redundant data
+        *   resync - mark array as dirty to trigger a resync.
+        *   uuid - set new uuid - only 0.90 or 1.x
+        *   name - change the name of the array (where supported)
+        *   homehost - change which host this array is tied to.
+        *   devicesize - If metadata is at start of device, change recorded
+        *               device size to match actual device size
+        *   byteorder - swap bytes for 0.90 metadata
+        *
+        *   force-one  - mark that device as uptodate, not old or failed.
+        *   force-array - mark array as clean if it would not otherwise
+        *               assemble
+        *   assemble   - not sure how this is different from force-one...
+        *   linear-grow-new - add a new device to a linear array, but don't
+        *                   change the size: so superblock still matches
+        *   linear-grow-update - now change the size of the array.
+        */
        int (*update_super)(struct supertype *st, struct mdinfo *info,
                            char *update,
                            char *devname, int verbose,
                            int uuid_set, char *homehost);
+
+       /* Create new metadata for new array as described.  This could
+        * be a new container, or an array in a pre-existing container.
+        * Also used to zero metadata prior to writing it to invalidate old
+        * metadata.
+        */
        int (*init_super)(struct supertype *st, mdu_array_info_t *info,
                          unsigned long long size, char *name,
                          char *homehost, int *uuid);
-       void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo);
+
+       /* update the metadata to include new device, either at create or
+        * when hot-adding a spare.
+        */
+       void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
+                            int fd, char *devname);
+
+       /* Write metadata to one device when fixing problems or adding
+        * a new device.
+        */
        int (*store_super)(struct supertype *st, int fd);
-       int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo,
-                               char *devname);
+
+       /*  Write all metadata for this array.
+        */
+       int (*write_init_super)(struct supertype *st);
        int (*compare_super)(struct supertype *st, struct supertype *tst);
        int (*load_super)(struct supertype *st, int fd, char *devname);
        struct supertype * (*match_metadata_desc)(char *arg);
@@ -358,15 +501,108 @@ extern struct superswitch {
        void (*locate_bitmap)(struct supertype *st, int fd);
        int (*write_bitmap)(struct supertype *st, int fd);
        void (*free_super)(struct supertype *st);
-       int major;
+
+       /* validate_geometry is called with an st returned by
+        * match_metadata_desc.
+        * It should check that the geometry described is compatible with
+        * the metadata type.  It will be called repeatedly as devices are
+        * added, to validate changing size and new devices.  If there are
+        * inter-device dependencies, it should record sufficient details
+        * so these can be validated.
+        */
+       int (*validate_geometry)(struct supertype *st, int level, int layout,
+                                int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *subdev, unsigned long long *freesize,
+                                int verbose);
+
+       struct mdinfo *(*container_content)(struct supertype *st);
+
+/* for mdmon */
+       int (*open_new)(struct supertype *c, struct active_array *a,
+                       char *inst);
+
+       /* Tell the metadata handler the current state of the array.
+        * This covers whether it is known to be consistent (no pending writes)
+        * and how far along a resync is known to have progressed
+        * (in a->resync_start).
+        * resync status is really irrelevant if the array is not consistent,
+        * but some metadata (DDF!) have a place to record the distinction.
+        */
+       void (*set_array_state)(struct active_array *a, int consistent);
+
+       /* When the state of a device might have changed, we call set_disk to
+        * tell the metadata what the current state is.
+        * Typically this happens on spare->in_sync and (spare|in_sync)->faulty
+        * transitions.
+        * set_disk might be called when the state of the particular disk has
+        * not in fact changed.
+        */
+       void (*set_disk)(struct active_array *a, int n, int state);
+       void (*sync_metadata)(struct supertype *st);
+       void (*process_update)(struct supertype *st,
+                              struct metadata_update *update);
+       void (*prepare_update)(struct supertype *st,
+                              struct metadata_update *update);
+
+       /* activate_spare will check if the array is degraded and, if it
+        * is, try to find some spare space in the container.
+        * On success, it adds appropriate updates (for process_update) to
+        * the 'updates' list and returns a list of 'mdinfo' identifying
+        * the device, or devices as there might be multiple missing
+        * devices and multiple spares available.
+        */
+       struct mdinfo *(*activate_spare)(struct active_array *a,
+                                        struct metadata_update **updates);
+
        int swapuuid; /* true if uuid is bigending rather than hostendian */
-} super0, super1, *superlist[];
+       int external;
+} super0, super1, super_ddf, *superlist[];
 
+extern struct superswitch super_imsm;
+
+struct metadata_update {
+       int     len;
+       char    *buf;
+       void    *space; /* allocated space that monitor will use */
+       struct metadata_update *next;
+};
+
+/* A supertype holds a particular collection of metadata.
+ * It identifies the metadata type by the superswitch, and the particular
+ * sub-version of that metadata type.
+ * metadata read in or created is stored in 'sb' and 'info'.
+ * There are also fields used by mdmon to track containers.
+ *
+ * A supertype is created by:
+ *   super_by_fd
+ *   guess_super
+ *   dup_super
+ */
 struct supertype {
        struct superswitch *ss;
        int minor_version;
        int max_devs;
+       int container_dev;    /* devnum of container */
+       char subarray[32];      /* name of array inside container */
        void *sb;
+       void *info;
+
+       struct metadata_update *updates;
+       struct metadata_update **update_tail;
+
+       /* extra stuff used by mdmon */
+       struct active_array *arrays;
+       int sock; /* listen to external programs */
+       int devnum;
+       char *devname; /* e.g. md0.  This appears in metadata_version:
+                       *  external:/md0/12
+                       */
+       int devcnt;
+       char *device_name; /* e.g. /dev/md/whatever */
+
+       struct mdinfo *devs;
+
 };
 
 extern struct supertype *super_by_fd(int fd);
@@ -459,11 +695,13 @@ extern int Monitor(mddev_dev_t devlist,
                   int period, int daemonise, int scan, int oneshot,
                   int dosyslog, int test, char *pidfile);
 
-extern int Kill(char *dev, int force, int quiet);
+extern int Kill(char *dev, int force, int quiet, int noexcl);
 extern int Wait(char *dev);
 
 extern int Incremental(char *devname, int verbose, int runstop,
                       struct supertype *st, char *homehost, int autof);
+extern int Incremental_container(struct supertype *st, char *devname,
+                                int verbose, int runstop, int autof);
 extern void RebuildMap(void);
 extern int IncrementalScan(int verbose);
 
@@ -484,6 +722,7 @@ extern int check_raid(int fd, char *name);
 
 extern int get_mdp_major(void);
 extern int dev_open(char *dev, int flags);
+extern int open_dev_excl(int devnum);
 extern int is_standard(char *dev, int *nump);
 
 extern int parse_auto(char *str, char *msg, int config);
@@ -509,6 +748,10 @@ extern int enough(int level, int raid_disks, int layout, int clean,
 extern int ask(char *mesg);
 extern unsigned long long get_component_size(int fd);
 extern void remove_partitions(int fd);
+extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize);
+extern int flush_metadata_updates(struct supertype *st);
+extern void append_metadata_update(struct supertype *st, void *buf, int len);
 
 
 extern char *human_size(long long bytes);
@@ -525,12 +768,45 @@ extern char DefaultConfFile[];
 extern int open_mddev(char *dev, int autof);
 extern int open_mddev_devnum(char *devname, int devnum, char *name,
                             char *chosen_name, int parts);
-
+extern int open_container(int fd);
+
+extern int mdmon_running(int devnum);
+extern int signal_mdmon(int devnum);
+extern int env_no_mdmon(void);
+extern int start_mdmon(int devnum);
+
+extern char *devnum2devname(int num);
+extern int devname2devnum(char *name);
+extern int fd2devnum(int fd);
+
+static inline int dev2major(int d)
+{
+       /* Non-negative devnums use MD_MAJOR; negative ones use the
+        * major returned by get_mdp_major().
+        */
+       return d < 0 ? get_mdp_major() : MD_MAJOR;
+}
+
+static inline int dev2minor(int d)
+{
+       /* a negative devnum -1-N maps to minor N << MdpMinorShift */
+       int part = -1 - d;
+       return d >= 0 ? d : (part << MdpMinorShift);
+}
+
+static inline int ROUND_UP(int a, int base)
+{
+       return (a + base - 1) / base * base; /* for a >= 0, base > 0: smallest multiple of base >= a */
+}
 
 #define        LEVEL_MULTIPATH         (-4)
 #define        LEVEL_LINEAR            (-1)
 #define        LEVEL_FAULTY            (-5)
 
+/* kernel module doesn't know about these */
+#define LEVEL_CONTAINER                (-100)
+#define        LEVEL_UNSUPPORTED       (-200)
+
 
 /* faulty stuff */
 
diff --git a/mdmon.c b/mdmon.c
new file mode 100644 (file)
index 0000000..85f44bc
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,348 @@
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked.  It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include       <unistd.h>
+#include       <stdlib.h>
+#include       <sys/types.h>
+#include       <sys/stat.h>
+#include       <sys/socket.h>
+#include       <sys/un.h>
+#include       <sys/mman.h>
+#include       <sys/syscall.h>
+#include       <sys/wait.h>
+#include       <stdio.h>
+#include       <errno.h>
+#include       <string.h>
+#include       <fcntl.h>
+#include       <signal.h>
+
+#include       <sched.h>
+
+#include       "mdadm.h"
+#include       "mdmon.h"
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+/* Entry point handed to clone(): 'v' is the container's supertype,
+ * which is passed straight to do_monitor().  Returns 0 when
+ * do_monitor() returns.
+ */
+int run_child(void *v)
+{
+       do_monitor((struct supertype *)v);
+       return 0;
+}
+
+/* Start the monitor with clone(), sharing address space, file table,
+ * filesystem info and signal handlers with the caller (see the CLONE_*
+ * flags below).
+ *
+ * Returns the value from clone(), which is also stored in mon_tid:
+ * the new thread's id, or a negative value on failure.
+ */
+int clone_monitor(struct supertype *container)
+{
+       static char stack[4096];
+
+       /* Record our own tid before the monitor exists: the monitor's
+        * signal_manager() reads mgr_tid, and the new thread may run
+        * before clone() has returned here.
+        */
+       mgr_tid = syscall(SYS_gettid);
+
+       /* pass an address near the end of the buffer as the child stack */
+       mon_tid = clone(run_child, stack+4096-64,
+                  CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
+                  container);
+
+       return mon_tid;
+}
+
+/* Pick the superswitch matching an external-metadata name ("ddf" or
+ * "imsm").  Returns NULL for any other name.
+ */
+static struct superswitch *find_metadata_methods(char *vers)
+{
+       struct superswitch *ss = NULL;
+
+       if (!strcmp(vers, "ddf"))
+               ss = &super_ddf;
+       else if (!strcmp(vers, "imsm"))
+               ss = &super_imsm;
+       return ss;
+}
+
+
+/* Record our pid in /var/run/mdadm/<devname>.pid.
+ * 'o_excl' is OR-ed into the open() flags, so passing O_EXCL makes the
+ * call fail when the file already exists.
+ * Returns 0 once the file has been opened and written, -1 if it could
+ * not be opened.
+ */
+static int make_pidfile(char *devname, int o_excl)
+{
+       char pidfile[100];
+       char text[10];
+       int pfd;
+
+       sprintf(pidfile, "/var/run/mdadm/%s.pid", devname);
+       pfd = open(pidfile, O_RDWR|O_CREAT|o_excl, 0600);
+       if (pfd >= 0) {
+               sprintf(text, "%d\n", getpid());
+               write(pfd, text, strlen(text));
+               close(pfd);
+               return 0;
+       }
+       return -1;
+}
+
+/* Send SIGTERM to the process named in /var/run/mdadm/<devname>.pid,
+ * but only if /proc/<pid>/cmdline contains "mdmon".
+ * Silently does nothing if the pid file or the /proc entry cannot be
+ * read, or if the pid file does not hold a positive number.
+ */
+static void try_kill_monitor(char *devname)
+{
+       char buf[100];
+       int fd;
+       int n;
+       pid_t pid;
+
+       sprintf(buf, "/var/run/mdadm/%s.pid", devname);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* leave room for, and add, the terminator strtoul() relies on */
+       n = read(fd, buf, sizeof(buf) - 1);
+       close(fd);
+       if (n <= 0)
+               return;
+       buf[n] = 0;
+
+       pid = strtoul(buf, NULL, 10);
+       /* never pass 0 or a negative value on to kill() */
+       if (pid <= 0)
+               return;
+
+       /* kill this process if it is mdmon */
+       sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       n = read(fd, buf, sizeof(buf) - 1);
+       close(fd);
+       if (n <= 0)
+               return;
+       buf[n] = 0;
+
+       if (strstr(buf, "mdmon") != NULL)
+               kill(pid, SIGTERM);
+}
+
+/* Unlink /var/run/mdadm/<devname>.pid; the result of unlink() is
+ * not checked.
+ */
+void remove_pidfile(char *devname)
+{
+       char pidfile[100];
+
+       sprintf(pidfile, "/var/run/mdadm/%s.pid", devname);
+       unlink(pidfile);
+}
+
+/* Create the non-blocking, listening Unix-domain socket
+ * /var/run/mdadm/<devname>.sock, removing any existing file of that
+ * name first.
+ * Returns the socket fd, or -1 if socket(), bind() or listen() fails.
+ */
+static int make_control_sock(char *devname)
+{
+       char path[100];
+       int sfd;
+       long fl;
+       struct sockaddr_un addr;
+
+       sprintf(path, "/var/run/mdadm/%s.sock", devname);
+       unlink(path);
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       /* sizeof(addr) is passed to bind(), so no byte may be garbage */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       strcpy(addr.sun_path, path);
+       if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               close(sfd);
+               return -1;
+       }
+       if (listen(sfd, 10) < 0) {
+               close(sfd);
+               return -1;
+       }
+       fl = fcntl(sfd, F_GETFL, 0);
+       fl |= O_NONBLOCK;
+       fcntl(sfd, F_SETFL, fl);
+       return sfd;
+}
+
+/* SIGUSR1 handler installed by main().  It intentionally has no
+ * effect of its own.
+ */
+static void wake_me(int sig)
+{
+       (void)sig;
+}
+
+/* Decide whether main() should fork.  Returns 1 (fork) except in a
+ * DEBUG build where env_no_mdmon() is non-zero, in which case it
+ * returns 0 so that a hand-started mdmon does not fork.
+ */
+static int do_fork(void)
+{
+#ifdef DEBUG
+       if (env_no_mdmon())
+               return 0;
+#endif
+       return 1;
+}
+
+
+
+int main(int argc, char *argv[])
+{
+       int mdfd;
+       struct mdinfo *mdi, *di;
+       struct supertype *container;
+       sigset_t set;
+       struct sigaction act;
+       int pfd[2];
+       int status;
+
+       if (argc != 2) {
+               fprintf(stderr, "Usage: md-manage /device/name/for/container\n");
+               exit(2);
+       }
+       mdfd = open(argv[1], O_RDWR);
+       if (mdfd < 0) {
+               fprintf(stderr, "md-manage: %s: %s\n", argv[1],
+                       strerror(errno));
+               exit(1);
+       }
+       if (md_get_version(mdfd) < 0) {
+               fprintf(stderr, "md-manage: %s: Not an md device\n",
+                       argv[1]);
+               exit(1);
+       }
+
+       /* Fork, and have the child tell us when they are ready */
+       if (do_fork()) {
+               pipe(pfd);
+               switch(fork()) {
+               case -1:
+                       fprintf(stderr, "mdmon: failed to fork: %s\n",
+                               strerror(errno));
+                       exit(1);
+               case 0: /* child */
+                       close(pfd[0]);
+                       break;
+               default: /* parent */
+                       close(pfd[1]);
+                       if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+                               wait(&status);
+                               status = WEXITSTATUS(status);
+                       }
+                       exit(status);
+               }
+       } else
+               pfd[0] = pfd[1] = -1;
+       /* hopefully it is a container - we'll check later */
+
+       container = malloc(sizeof(*container));
+       container->devnum = fd2devnum(mdfd);
+       container->devname = devnum2devname(container->devnum);
+       container->device_name = argv[1];
+
+       /* If this fails, we hope it already exists */
+       mkdir("/var/run/mdadm", 0600);
+       /* pid file lives in /var/run/mdadm/mdXX.pid */
+       if (make_pidfile(container->devname, O_EXCL) < 0) {
+               if (ping_monitor(container->devname) == 0) {
+                       fprintf(stderr, "mdmon: %s already managed\n",
+                               container->devname);
+                       exit(3);
+               } else {
+                       /* cleanup the old monitor, this one is taking over */
+                       try_kill_monitor(container->devname);
+                       if (make_pidfile(container->devname, 0) < 0) {
+                               fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
+                                       container->devname);
+                               exit(3);
+                       }
+               }
+       }
+
+       container->sock = make_control_sock(container->devname);
+       if (container->sock < 0) {
+               fprintf(stderr, "mdmon: Cannot create socket in /var/run/mdadm\n");
+               exit(3);
+       }
+       container->arrays = NULL;
+
+       mdi = sysfs_read(mdfd, container->devnum,
+                        GET_VERSION|GET_LEVEL|GET_DEVS);
+
+       if (!mdi) {
+               fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
+                       container->devname);
+               exit(3);
+       }
+       if (mdi->array.level != UnSet) {
+               fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
+                       argv[1]);
+               exit(3);
+       }
+       if (mdi->array.major_version != -1 ||
+           mdi->array.minor_version != -2) {
+               fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
+                       argv[1]);
+               exit(3);
+       }
+
+       container->ss = find_metadata_methods(mdi->text_version);
+       if (container->ss == NULL) {
+               fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
+                       argv[1], mdi->text_version);
+               exit(3);
+       }
+
+       container->devs = NULL;
+       for (di = mdi->devs; di; di = di->next) {
+               struct mdinfo *cd = malloc(sizeof(*cd));
+               cd = di;
+               cd->next = container->devs;
+               container->devs = cd;
+       }
+       sysfs_free(mdi);
+
+
+       if (container->ss->load_super(container, mdfd, argv[1])) {
+               fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
+                       argv[1]);
+               exit(3);
+       }
+
+       /* Ok, this is close enough.  We can say goodbye to our parent now.
+        */
+       status = 0;
+       write(pfd[1], &status, sizeof(status));
+       close(pfd[1]);
+
+       chdir("/");
+       setsid();
+       close(0);
+       open("/dev/null", O_RDWR);
+       close(1);
+       dup(0);
+#ifndef DEBUG
+       close(2);
+       dup(0);
+#endif
+
+       mlockall(MCL_FUTURE);
+
+       /* SIGUSR is sent between parent and child.  So both block it
+        * and enable it only with pselect.
+        */
+       sigemptyset(&set);
+       sigaddset(&set, SIGUSR1);
+       sigprocmask(SIG_BLOCK, &set, NULL);
+       act.sa_handler = wake_me;
+       act.sa_flags = 0;
+       sigaction(SIGUSR1, &act, NULL);
+       act.sa_handler = SIG_IGN;
+       sigaction(SIGPIPE, &act, NULL);
+
+       if (clone_monitor(container) < 0) {
+               fprintf(stderr, "md-manage: failed to start monitor process: %s\n",
+                       strerror(errno));
+               exit(2);
+       }
+
+       do_manager(container);
+
+       exit(0);
+}
diff --git a/mdmon.h b/mdmon.h
new file mode 100644 (file)
index 0000000..6c1961a
--- /dev/null
+++ b/mdmon.h
@@ -0,0 +1,65 @@
+/* dprintf(): debug tracing to stderr.
+ * With DEBUG defined it expands to a plain fprintf(stderr, ...).
+ * Otherwise the fprintf sits behind 'if (0)', so the call is never
+ * executed and the whole expression has the value 0.
+ */
+#ifdef DEBUG
+#define dprintf(fmt, arg...) \
+       fprintf(stderr, fmt, ##arg)
+#else
+#define dprintf(fmt, arg...) \
+        ({ if (0) fprintf(stderr, fmt, ##arg); 0; })
+#endif
+
+/* Array states.  monitor.c casts an index into its array_states[]
+ * string table to this type, so the order here has to match that
+ * table; bad_word is the index of the table's NULL terminator.
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto,
+                  clean, active, write_pending, active_idle, bad_word};
+
+/* Sync actions.  Same arrangement as above, paired with the
+ * sync_actions[] table in monitor.c; bad_action is the "no match" value.
+ */
+enum sync_action { idle, reshape, resync, recover, check, repair, bad_action };
+
+
+/* Monitoring state for one array that belongs to a container. */
+struct active_array {
+       struct mdinfo info;
+       /* owning container; the monitor sets this to NULL once it has
+        * seen the array stop
+        */
+       struct supertype *container;
+       /* 'next' links the container's array list; 'replaces', when
+        * set, names an older entry that the monitor unlinks and hands
+        * over through discard_this
+        */
+       struct active_array *next, *replaces;
+
+       /* fd the monitor reads the current sync action from and
+        * writes a requested action to
+        */
+       int action_fd;
+       /* fd that get_resync_start() reads resync_start from */
+       int resync_start_fd;
+
+       /* prev_: values seen on the previous pass, curr_: values just
+        * read, next_: values to be written on this pass
+        */
+       enum array_state prev_state, curr_state, next_state;
+       enum sync_action prev_action, curr_action, next_action;
+
+       int check_degraded; /* flag set by mon, read by manage */
+
+       int devnum;
+
+       /* last value parsed by get_resync_start() */
+       unsigned long long resync_start;
+};
+
+/*
+ * Metadata updates are handled by the monitor thread,
+ * as it has exclusive access to the metadata.
+ * When the manager wants to update metadata, either
+ * for its own reasons (e.g. committing a spare) or
+ * on behalf of mdadm, it creates a metadata_update
+ * structure and queues it to the monitor.
+ * Updates are created and processed by code under the
+ * superswitch.  All common code sees them as opaque
+ * blobs.
+ */
+extern struct metadata_update *update_queue, *update_queue_handled;
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+extern struct md_generic_cmd *active_cmd;
+
+
+void remove_pidfile(char *devname);
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+
+int read_dev_state(int fd);
+int get_resync_start(struct active_array *a);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern int exit_now, manager_ready;
+extern int mon_tid, mgr_tid;
+extern int monitor_loop_cnt;
index a8f7ce7576eb1d017acfe962a1dea48084cde5d2..4bb29d85c822237e1a0244ff477561f125e0e5c5 100644 (file)
--- a/mdstat.c
+++ b/mdstat.c
@@ -86,6 +86,7 @@
 #include       "mdadm.h"
 #include       "dlink.h"
 #include       <sys/select.h>
+#include       <ctype.h>
 
 void free_mdstat(struct mdstat_ent *ms)
 {
@@ -94,6 +95,7 @@ void free_mdstat(struct mdstat_ent *ms)
                if (ms->dev) free(ms->dev);
                if (ms->level) free(ms->level);
                if (ms->pattern) free(ms->pattern);
+               if (ms->metadata_version) free(ms->metadata_version);
                t = ms;
                ms = ms->next;
                free(t);
@@ -158,6 +160,10 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                ent->percent = -1;
                ent->active = -1;
                ent->resync = 0;
+               ent->metadata_version = NULL;
+               ent->raid_disks = 0;
+               ent->chunk_size = 0;
+               ent->devcnt = 0;
 
                ent->dev = strdup(line);
                ent->devnum = devnum;
@@ -176,22 +182,28 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                                in_devs = 1;
                        } else if (in_devs && strcmp(w, "blocks")==0)
                                in_devs = 0;
-                       else if (in_devs && strncmp(w, "md", 2)==0) {
-                               /* This has an md device as a component.
-                                * If that device is already in the list,
-                                * make sure we insert before there.
-                                */
-                               struct mdstat_ent **ih;
-                               int dn2;
-                               if (strncmp(w, "md_d", 4)==0)
-                                       dn2 = -1-strtoul(w+4, &ep, 10);
-                               else
-                                       dn2 = strtoul(w+2, &ep, 10);
-                               ih = &all;
-                               while (ih != insert_here && *ih &&
-                                      (*ih)->devnum != dn2)
-                                       ih = & (*ih)->next;
-                               insert_here = ih;
+                       else if (in_devs) {
+                               ent->devcnt++;
+                               if (strncmp(w, "md", 2)==0) {
+                                       /* This has an md device as a component.
+                                        * If that device is already in the
+                                        * list, make sure we insert before
+                                        * there.
+                                        */
+                                       struct mdstat_ent **ih;
+                                       int dn2 = devname2devnum(w);
+                                       ih = &all;
+                                       while (ih != insert_here && *ih &&
+                                              (*ih)->devnum != dn2)
+                                               ih = & (*ih)->next;
+                                       insert_here = ih;
+                               }
+                       } else if (strcmp(w, "super") == 0 &&
+                                  dl_next(w) != line) {
+                               w = dl_next(w);
+                               ent->metadata_version = strdup(w);
+                       } else if (w[0] == '[' && isdigit(w[1])) {
+                               ent->raid_disks = atoi(w+1);
                        } else if (!ent->pattern &&
                                 w[0] == '[' &&
                                 (w[1] == 'U' || w[1] == '_')) {
@@ -256,6 +268,20 @@ void mdstat_wait(int seconds)
        select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm);
 }
 
+/* Block in pselect() until 'fd' becomes readable, an exceptional
+ * condition is reported on the mdstat fd (if one is open), or a signal
+ * allowed by 'sigmask' arrives.  There is no timeout.
+ * A negative 'fd' is simply not watched.
+ */
+void mdstat_wait_fd(int fd, const sigset_t *sigmask)
+{
+       fd_set fds, rfds;
+       int maxfd = 2;
+
+       FD_ZERO(&fds);
+       FD_ZERO(&rfds);
+       if (mdstat_fd >= 0)
+               FD_SET(mdstat_fd, &fds);
+       if (fd >= 0)
+               FD_SET(fd, &rfds);
+
+       /* nfds has to cover both descriptors, not just mdstat_fd */
+       if (mdstat_fd > maxfd)
+               maxfd = mdstat_fd;
+       if (fd > maxfd)
+               maxfd = fd;
+
+       pselect(maxfd + 1, &rfds, NULL, &fds,
+               NULL, sigmask);
+}
+
 int mddev_busy(int devnum)
 {
        struct mdstat_ent *mdstat = mdstat_read(0, 0);
diff --git a/monitor.c b/monitor.c
new file mode 100644 (file)
index 0000000..7cce5a8
--- /dev/null
+++ b/monitor.c
@@ -0,0 +1,527 @@
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/select.h>
+#include <signal.h>
+
+/* Text forms of enum array_state, in enum order.  read_state() casts
+ * an index into this table to the enum, and the terminating NULL sits
+ * at the position of bad_word.
+ */
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto",
+       "clean", "active", "write-pending", "active-idle", NULL };
+/* Text forms of enum sync_action, in enum order, NULL-terminated. */
+static char *sync_actions[] = {
+       "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+/* Write the string 'attr' (without its terminator) to 'fd' and return
+ * the result of write().
+ */
+static int write_attr(char *attr, int fd)
+{
+       size_t len = strlen(attr);
+
+       return write(fd, attr, len);
+}
+
+/* Add 'fd' to the set and raise *maxfd to it if needed.
+ * Negative descriptors are ignored.
+ */
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+       if (fd >= 0) {
+               if (*maxfd < fd)
+                       *maxfd = fd;
+               FD_SET(fd, fds);
+       }
+}
+
+/* Read an attribute from the start of 'fd' into 'buf' (capacity 'len').
+ * The result is NUL-terminated and one trailing newline is removed.
+ * Returns the number of bytes read; returns 0 and leaves 'buf' empty
+ * when 'fd' is negative or nothing could be read.
+ */
+static int read_attr(char *buf, int len, int fd)
+{
+       int got = 0;
+
+       if (fd >= 0) {
+               lseek(fd, 0, 0);
+               got = read(fd, buf, len - 1);
+       }
+       if (got <= 0) {
+               buf[0] = 0;
+               return 0;
+       }
+
+       buf[got] = 0;
+       if (buf[got-1] == '\n')
+               buf[got-1] = 0;
+       return got;
+}
+
+
+/* Refresh a->resync_start from a->resync_start_fd.
+ * Returns 1 when a value was read and parsed (base 10); otherwise
+ * returns read_attr()'s result and leaves a->resync_start untouched.
+ */
+int get_resync_start(struct active_array *a)
+{
+       char text[30];
+       int got = read_attr(text, 30, a->resync_start_fd);
+
+       if (got > 0) {
+               a->resync_start = strtoull(text, NULL, 10);
+               return 1;
+       }
+       return got;
+}
+
+/* Return 1 if 'attr' (text read from a sysfs file) starts with the
+ * whole of 'str' and is then followed by end-of-string, a comma or a
+ * newline; return 0 otherwise.
+ */
+static int attr_match(const char *attr, const char *str)
+{
+       for (; *attr && *str && *attr == *str; attr++, str++)
+               ;
+
+       /* 'str' must have been consumed completely */
+       if (*str)
+               return 0;
+       return *attr == '\0' || *attr == ',' || *attr == '\n';
+}
+
+/* Return the index of the first entry in the NULL-terminated 'list'
+ * that attr_match() accepts for 'word'.  If none matches, the index of
+ * the terminating NULL is returned.
+ */
+static int match_word(const char *word, char **list)
+{
+       int idx = 0;
+
+       while (list[idx] && !attr_match(word, list[idx]))
+               idx++;
+       return idx;
+}
+
+/* Read an array state word from 'fd' and translate it through
+ * array_states[].  Returns bad_word when nothing could be read.
+ */
+static enum array_state read_state(int fd)
+{
+       char word[20];
+
+       if (read_attr(word, 20, fd) <= 0)
+               return bad_word;
+       return (enum array_state) match_word(word, array_states);
+}
+
+/* Read a sync action word from 'fd' and translate it through
+ * sync_actions[].  Returns bad_action when nothing could be read.
+ */
+static enum sync_action read_action( int fd)
+{
+       char word[20];
+
+       if (read_attr(word, 20, fd) <= 0)
+               return bad_action;
+       return (enum sync_action) match_word(word, sync_actions);
+}
+
+/* Read a comma-separated device state list from 'fd' and return the
+ * matching DS_* flags OR-ed together.  Returns 0 when nothing could be
+ * read or no known word is present.
+ */
+int read_dev_state(int fd)
+{
+       char buf[60];
+       char *cp, *comma;
+       int flags = 0;
+
+       if (read_attr(buf, 60, fd) <= 0)
+               return 0;
+
+       /* test each comma-separated word in turn */
+       for (cp = buf; cp != NULL; ) {
+               if (attr_match(cp, "faulty"))
+                       flags |= DS_FAULTY;
+               if (attr_match(cp, "in_sync"))
+                       flags |= DS_INSYNC;
+               if (attr_match(cp, "write_mostly"))
+                       flags |= DS_WRITE_MOSTLY;
+               if (attr_match(cp, "spare"))
+                       flags |= DS_SPARE;
+               if (attr_match(cp, "blocked"))
+                       flags |= DS_BLOCKED;
+
+               comma = strchr(cp, ',');
+               cp = comma ? comma + 1 : NULL;
+       }
+       return flags;
+}
+
+/* Send SIGUSR1 to the thread whose id is in mgr_tid, using the raw
+ * tgkill system call with our own pid as the thread group.
+ */
+static void signal_manager(void)
+{
+       /* tgkill(getpid(), mgr_tid, SIGUSR1); */
+       syscall(SYS_tgkill, getpid(), mgr_tid, SIGUSR1);
+}
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ *  Array stops.
+ *    This is detected by array_state going to 'clear' or 'inactive'.
+ *    while we thought it was active.
+ *    Response is to mark metadata as clean and 'clear' the array(??)
+ *  write-pending
+ *    array_state is 'write-pending'
+ *    We mark metadata as 'dirty' then set array to 'active'.
+ *  active_idle
+ *    Either ignore, or mark clean, then mark metadata as clean.
+ *
+ *  device fails
+ *    detected by rd-N/state reporting "faulty"
+ *    mark device as 'failed' in metadata, let the kernel release the
+ *    device by writing '-blocked' to rd/state, and finally write 'remove' to
+ *    rd/state.  Before a disk can be replaced it must be failed and removed
+ *    from all container members, this will be preemptive for the other
+ *    arrays... safe?
+ *
+ *  sync completes
+ *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ *    MaxSector
+ *    Notify metadata that sync is complete.
+ *
+ *  recovery completes
+ *    sync_action changes from 'recover' to 'idle'
+ *    Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ *  deal with resync
+ *    This only happens on finding a new array... mdadm will have set
+ *    'resync_start' to the correct value.  If 'resync_start' indicates that a
+ *    resync needs to occur, set the array to the 'active' state rather than the
+ *    initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything.  So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ *   - update the array_state
+ *   - set the role of some devices.
+ *   - request a sync_action
+ *
+ */
+
+/* Sample the state of one array and its member devices, record any
+ * changes in the metadata through the container's superswitch, then
+ * write the resulting state changes back.
+ *
+ * Sets a->check_degraded and signals the manager when a device change
+ * may have left the array degraded.  Clears a->container when the
+ * array is found to have stopped.  Always returns 1.
+ */
+static int read_and_act(struct active_array *a)
+{
+       int check_degraded = 0;
+       int deactivate = 0;
+       struct mdinfo *mdi;
+
+       /* bad_word / bad_action mean "nothing to write" further down */
+       a->next_state = bad_word;
+       a->next_action = bad_action;
+
+       a->curr_state = read_state(a->info.state_fd);
+       a->curr_action = read_action(a->action_fd);
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->next_state = 0;
+               if (mdi->state_fd >= 0)
+                       mdi->curr_state = read_dev_state(mdi->state_fd);
+       }
+
+       if (a->curr_state <= inactive &&
+           a->prev_state > inactive) {
+               /* array has been stopped */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 1);
+               a->next_state = clear;
+               deactivate = 1;
+       }
+       if (a->curr_state == write_pending) {
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 0);
+               a->next_state = active;
+       }
+       if (a->curr_state == active_idle) {
+               /* Set array to 'clean' FIRST, then
+                * a->ss->mark_clean(a, ~0ULL);
+                * just ignore for now.
+                */
+       }
+
+       if (a->curr_state == readonly) {
+               /* Well, I'm ready to handle things, so
+                * read-auto is OK. FIXME what if we really want
+                * readonly ???
+                */
+               get_resync_start(a);
+//             printf("Found a readonly array at %llu\n", a->resync_start);
+               if (a->resync_start == ~0ULL)
+                       a->next_state = read_auto; /* array is clean */
+               else {
+                       a->container->ss->set_array_state(a, 0);
+                       a->next_state = active;
+               }
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == resync) {
+               /* A resync has finished.  The endpoint is recorded in
+                * 'sync_start'.  We don't update the metadata
+                * until the array goes inactive or readonly though.
+                * Just check if we need to fiddle spares.
+                */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 0);
+               check_degraded = 1;
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == recover) {
+               /* A recovery has finished.  Some disks may be in sync now,
+                * and the array may no longer be degraded
+                */
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       if (! (mdi->curr_state & DS_INSYNC))
+                               check_degraded = 1;
+               }
+       }
+
+       /* record every faulty device and schedule it for removal */
+       for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+               if (mdi->curr_state & DS_FAULTY) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       check_degraded = 1;
+                       mdi->next_state = DS_REMOVE;
+               }
+       }
+
+       a->container->ss->sync_metadata(a->container);
+       dprintf("%s: update[%d]: (", __func__, a->info.container_member);
+
+       /* Effect state changes in the array */
+       if (a->next_state != bad_word) {
+               dprintf(" state:%s", array_states[a->next_state]);
+               write_attr(array_states[a->next_state], a->info.state_fd);
+       }
+       if (a->next_action != bad_action) {
+               write_attr(sync_actions[a->next_action], a->action_fd);
+               /* report the action that was written, not the state:
+                * next_state may still be bad_word here, whose table
+                * entry is NULL
+                */
+               dprintf(" action:%s", sync_actions[a->next_action]);
+       }
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               if (mdi->next_state == DS_REMOVE && mdi->state_fd >= 0) {
+                       int remove_result;
+
+                       write_attr("-blocked", mdi->state_fd);
+                       /* the kernel may not be able to immediately remove the
+                        * disk, we can simply wait until the next event to try
+                        * again.
+                        */
+                       dprintf(" %d:-blocked", mdi->disk.raid_disk);
+                       remove_result = write_attr("remove", mdi->state_fd);
+                       if (remove_result > 0) {
+                               dprintf(" %d:removed", mdi->disk.raid_disk);
+                               close(mdi->state_fd);
+                               mdi->state_fd = -1;
+                       }
+               }
+               if (mdi->next_state & DS_INSYNC) {
+                       write_attr("+in_sync", mdi->state_fd);
+                       dprintf(" %d:+in_sync", mdi->disk.raid_disk);
+               }
+       }
+       dprintf(" )\n");
+
+       /* move curr_ to prev_ */
+       a->prev_state = a->curr_state;
+
+       a->prev_action = a->curr_action;
+
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->prev_state = mdi->curr_state;
+               mdi->next_state = 0;
+       }
+
+       if (check_degraded) {
+               /* manager will do the actual check */
+               a->check_degraded = 1;
+               signal_manager();
+       }
+
+       /* a NULL container marks the array as deactivated */
+       if (deactivate)
+               a->container = NULL;
+
+       return 1;
+}
+
+/* Return the member device of 'a' with the given major/minor numbers,
+ * or NULL if the array has no such device.
+ */
+static struct mdinfo *
+find_device(struct active_array *a, int major, int minor)
+{
+       struct mdinfo *dev = a->info.devs;
+
+       while (dev) {
+               if (dev->disk.major == major && dev->disk.minor == minor)
+                       break;
+               dev = dev->next;
+       }
+       return dev;
+}
+
+/* For every array on the list starting at 'aa' that still has a
+ * container and contains the device 'failed' (matched by major/minor),
+ * write "faulty" to that device's state file unless its current state
+ * already has DS_FAULTY set.
+ */
+static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
+{
+       struct active_array *a;
+
+       for (a = aa; a; a = a->next) {
+               struct mdinfo *victim;
+
+               if (a->container == NULL)
+                       continue;
+
+               victim = find_device(a, failed->disk.major, failed->disk.minor);
+               if (victim && !(victim->curr_state & DS_FAULTY))
+                       write_attr("faulty", victim->state_fd);
+       }
+}
+
+#ifdef DEBUG
+/* Debug helper: print to stderr, for each descriptor set in 'fds', its
+ * number and the last path component of its /proc/<pid>/fd link
+ * ("unknown" if the link cannot be read).
+ */
+static void dprint_wake_reasons(fd_set *fds)
+{
+       char proc_path[256];
+       char link[256];
+       char *basename;
+       int rv;
+       int i;
+
+       fprintf(stderr, "monitor: wake ( ");
+       for (i = 0; i < FD_SETSIZE; i++) {
+               if (!FD_ISSET(i, fds))
+                       continue;
+
+               sprintf(proc_path, "/proc/%d/fd/%d",
+                       (int) getpid(), i);
+               rv = readlink(proc_path, link, sizeof(link) - 1);
+               if (rv < 0) {
+                       fprintf(stderr, "%d:unknown ", i);
+                       continue;
+               }
+               link[rv] = '\0';
+
+               basename = strrchr(link, '/');
+               fprintf(stderr, "%d:%s ",
+                       i, basename ? ++basename : link);
+       }
+       fprintf(stderr, ")\n");
+}
+#endif
+
+int monitor_loop_cnt;
+
+static int wait_and_act(struct supertype *container, int nowait)
+{
+       fd_set rfds;
+       int maxfd = 0;
+       struct active_array **aap = &container->arrays;
+       struct active_array *a, **ap;
+       int rv;
+       struct mdinfo *mdi;
+
+       FD_ZERO(&rfds);
+
+       for (ap = aap ; *ap ;) {
+               a = *ap;
+               /* once an array has been deactivated we want to
+                * ask the manager to discard it.
+                */
+               if (!a->container) {
+                       if (discard_this) {
+                               ap = &(*ap)->next;
+                               continue;
+                       }
+                       *ap = a->next;
+                       a->next = NULL;
+                       discard_this = a;
+                       signal_manager();
+                       continue;
+               }
+
+               add_fd(&rfds, &maxfd, a->info.state_fd);
+               add_fd(&rfds, &maxfd, a->action_fd);
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       add_fd(&rfds, &maxfd, mdi->state_fd);
+
+               ap = &(*ap)->next;
+       }
+
+       if (manager_ready && *aap == NULL) {
+               /* No interesting arrays. Lets see about exiting.
+                * Note that blocking at this point is not a problem
+                * as there are no active arrays, there is nothing that
+                * we need to be ready to do.
+                */
+               int fd = open(container->device_name, O_RDONLY|O_EXCL);
+               if (fd >= 0 || errno != EBUSY) {
+                       /* OK, we are safe to leave */
+                       dprintf("no arrays to monitor... exiting\n");
+                       remove_pidfile(container->devname);
+                       exit_now = 1;
+                       signal_manager();
+                       exit(0);
+               }
+       }
+
+       if (!nowait) {
+               sigset_t set;
+               sigprocmask(SIG_UNBLOCK, NULL, &set);
+               sigdelset(&set, SIGUSR1);
+               monitor_loop_cnt |= 1;
+               rv = pselect(maxfd+1, &rfds, NULL, NULL, NULL, &set);
+               monitor_loop_cnt += 1;
+               if (rv == -1 && errno == EINTR)
+                       rv = 0;
+               #ifdef DEBUG
+               dprint_wake_reasons(&rfds);
+               #endif
+
+       }
+
+       if (update_queue) {
+               struct metadata_update *this;
+
+               for (this = update_queue; this ; this = this->next)
+                       container->ss->process_update(container, this);
+
+               update_queue_handled = update_queue;
+               update_queue = NULL;
+               signal_manager();
+               container->ss->sync_metadata(container);
+       }
+
+       for (a = *aap; a ; a = a->next) {
+               if (a->replaces && !discard_this) {
+                       struct active_array **ap;
+                       for (ap = &a->next; *ap && *ap != a->replaces;
+                            ap = & (*ap)->next)
+                               ;
+                       if (*ap)
+                               *ap = (*ap)->next;
+                       discard_this = a->replaces;
+                       a->replaces = NULL;
+                       /* FIXME check if device->state_fd need to be cleared?*/
+                       signal_manager();
+               }
+               if (a->container)
+                       rv += read_and_act(a);
+       }
+
+       /* propagate failures across container members */
+       for (a = *aap; a ; a = a->next) {
+               if (!a->container)
+                       continue;
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       if (mdi->curr_state & DS_FAULTY)
+                               reconcile_failed(*aap, mdi);
+       }
+
+       return rv;
+}
+
+/* Monitor main loop: call wait_and_act() repeatedly until it returns
+ * a negative value.  Only the first call is made with 'nowait' set.
+ */
+void do_monitor(struct supertype *container)
+{
+       int nowait = 1;
+
+       while (wait_and_act(container, nowait) >= 0)
+               nowait = 0;
+}
diff --git a/msg.c b/msg.c
new file mode 100644 (file)
index 0000000..78fd7f7
--- /dev/null
+++ b/msg.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "mdadm.h"
+#include "mdmon.h"
+
+/* Framing words: send_message() writes start_magic before the length
+ * and payload and end_magic after them; receive_message() rejects a
+ * message if either word does not match.
+ */
+static const __u32 start_magic = 0x5a5aa5a5;
+static const __u32 end_magic = 0xa5a55a5a;
+
+/* Write exactly 'len' bytes from 'buf' to 'fd'.
+ * Each write() is preceded by a select() for writability.  A 'tmo' of 0
+ * means select() is called without a timeout; otherwise 'tmo' is the
+ * timeout in seconds, and the same timeval is handed to every select()
+ * call made here.
+ * Returns 0 once everything has been written, or -1 if select() or
+ * write() returns zero or an error.
+ */
+static int send_buf(int fd, const void* buf, int len, int tmo)
+{
+       struct timeval tv = { .tv_sec = tmo, .tv_usec = 0 };
+       struct timeval *wait = NULL;
+       const char *pos = buf;
+       int remaining = len;
+       fd_set wfds;
+       int n;
+
+       if (tmo)
+               wait = &tv;
+
+       while (remaining) {
+               FD_ZERO(&wfds);
+               FD_SET(fd, &wfds);
+               if (select(fd+1, NULL, &wfds, NULL, wait) <= 0)
+                       return -1;
+               n = write(fd, pos, remaining);
+               if (n <= 0)
+                       return -1;
+               remaining -= n;
+               pos += n;
+       }
+       return 0;
+}
+
+/* Read exactly 'len' bytes from 'fd' into 'buf'.
+ * Each read() is preceded by a select() for readability.  A 'tmo' of 0
+ * means select() is called without a timeout; otherwise 'tmo' is the
+ * timeout in seconds, and the same timeval is handed to every select()
+ * call made here.
+ * Returns 0 once 'len' bytes have arrived, or -1 if select() or read()
+ * returns zero or an error (so end-of-file also counts as failure).
+ */
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+       struct timeval tv = { .tv_sec = tmo, .tv_usec = 0 };
+       struct timeval *wait = NULL;
+       char *pos = buf;
+       int remaining = len;
+       fd_set rfds;
+       int n;
+
+       if (tmo)
+               wait = &tv;
+
+       while (remaining) {
+               FD_ZERO(&rfds);
+               FD_SET(fd, &rfds);
+               if (select(fd+1, &rfds, NULL, NULL, wait) <= 0)
+                       return -1;
+               n = read(fd, pos, remaining);
+               if (n <= 0)
+                       return -1;
+               remaining -= n;
+               pos += n;
+       }
+       return 0;
+}
+
+
+/* Send one framed message on 'fd': start_magic, a 4-byte length, the
+ * payload msg->buf (omitted when the length is 0), then end_magic.
+ * 'tmo' is passed through to send_buf() for every piece.
+ * Returns 0 if every piece was sent, otherwise the non-zero result of
+ * the first send_buf() that failed; nothing further is sent after a
+ * failure.
+ */
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __u32 len = msg->len;
+       int rv;
+
+       rv = send_buf(fd, &start_magic, 4, tmo);
+       rv = rv ?: send_buf(fd, &len, 4, tmo);
+       if (len)
+               rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+       /* chain on rv like the steps above, so that an earlier failure is
+        * reported instead of being overwritten by this final send */
+       rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+
+       return rv;
+}
+
+/* Receive one framed message from 'fd' into 'msg'.
+ * The frame is: start_magic, a 4-byte length, that many payload bytes,
+ * end_magic.  A length above MSG_MAX_LEN is rejected.  For a non-empty
+ * payload msg->buf is malloc()ed here and belongs to the caller on
+ * success; for an empty payload msg->buf is set to NULL.
+ * Returns 0 with msg->len and msg->buf filled in, or -1 on any receive
+ * failure, bad magic, oversized length or allocation failure.  msg->len
+ * is only written on success.
+ */
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __u32 word;
+       __u32 payload_len;
+
+       if (recv_buf(fd, &word, 4, tmo) < 0)
+               return -1;
+       if (word != start_magic)
+               return -1;
+
+       if (recv_buf(fd, &payload_len, 4, tmo) < 0)
+               return -1;
+       if (payload_len > MSG_MAX_LEN)
+               return -1;
+
+       msg->buf = NULL;
+       if (payload_len) {
+               msg->buf = malloc(payload_len);
+               if (msg->buf == NULL)
+                       return -1;
+               if (recv_buf(fd, msg->buf, payload_len, tmo) < 0) {
+                       free(msg->buf);
+                       return -1;
+               }
+       }
+
+       /* the trailer must arrive intact, otherwise drop the payload */
+       if (recv_buf(fd, &word, 4, tmo) < 0 || word != end_magic) {
+               free(msg->buf);
+               return -1;
+       }
+
+       msg->len = payload_len;
+       return 0;
+}
+
+/* Acknowledge by sending a message with a zero-length payload on 'fd'.
+ * Returns whatever send_message() returns.
+ */
+int ack(int fd, int tmo)
+{
+       struct metadata_update empty = { .len = 0 };
+       int status = send_message(fd, &empty, tmo);
+
+       return status;
+}
+
+/* Wait for one message on 'fd' and report only whether it arrived.
+ * Returns the result of receive_message(): 0 on success, -1 otherwise.
+ */
+int wait_reply(int fd, int tmo)
+{
+       struct metadata_update msg;
+       int err = receive_message(fd, &msg, tmo);
+
+       /* receive_message() mallocs a buffer for any payload (and sets
+        * msg.buf to NULL when there is none).  The caller only wants the
+        * status, so release the buffer here instead of leaking it. */
+       if (err == 0)
+               free(msg.buf);
+
+       return err;
+}
+
+/* Connect to the monitor socket for 'devname', which lives at
+ * /var/run/mdadm/<devname>.sock, and add O_NONBLOCK to the descriptor
+ * flags.
+ * Returns the connected socket, or -1 if the path does not fit in
+ * sun_path or socket()/connect() fails.
+ */
+int connect_monitor(char *devname)
+{
+       struct sockaddr_un addr;
+       int sfd;
+       int n;
+       long fl;
+
+       /* build the path in place with a bounded format, so that a long
+        * devname is rejected instead of overrunning a stack buffer */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       n = snprintf(addr.sun_path, sizeof(addr.sun_path),
+                    "/var/run/mdadm/%s.sock", devname);
+       if (n < 0 || n >= (int)sizeof(addr.sun_path))
+               return -1;
+
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       if (connect(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               close(sfd);
+               return -1;
+       }
+
+       fl = fcntl(sfd, F_GETFL, 0);
+       fl |= O_NONBLOCK;
+       fcntl(sfd, F_SETFL, fl);
+
+       return sfd;
+}
+
+/* Check that a monitor is answering on the socket for 'devname'.
+ * Connects, sends an empty message with ack() and waits for a reply;
+ * both steps pass 20 as the timeout.  The socket is always closed again.
+ * Returns 0 if a reply arrived, -1 if the ack or the reply failed, or
+ * the negative result of connect_monitor() if the connection failed.
+ */
+int ping_monitor(char *devname)
+{
+       int sock = connect_monitor(devname);
+       int result;
+
+       if (sock < 0)
+               return sock;
+
+       /* the reply is only awaited when the ping itself went out */
+       if (ack(sock, 20) == 0 && wait_reply(sock, 20) == 0)
+               result = 0;
+       else
+               result = -1;
+
+       close(sock);
+       return result;
+}
diff --git a/msg.h b/msg.h
new file mode 100644 (file)
index 0000000..4dc805e
--- /dev/null
+++ b/msg.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+/* forward declarations; only pointers to these are used below */
+struct mdinfo;
+struct metadata_update;
+
+/* Framed message transfer over a socket (implemented in msg.c).
+ * 'tmo' is a timeout in seconds handed to select(); 0 means no timeout.
+ * All of these return 0 on success.
+ */
+extern int receive_message(int fd, struct metadata_update *msg, int tmo);
+extern int send_message(int fd, struct metadata_update *msg, int tmo);
+extern int ack(int fd, int tmo);
+extern int wait_reply(int fd, int tmo);
+/* connect_monitor() returns a socket descriptor, or -1 on failure */
+extern int connect_monitor(char *devname);
+extern int ping_monitor(char *devname);
+
+/* largest payload length, in bytes, that receive_message() accepts */
+#define MSG_MAX_LEN (4*1024*1024)
diff --git a/sg_io.c b/sg_io.c
new file mode 100644 (file)
index 0000000..4ae5d92
--- /dev/null
+++ b/sg_io.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007 Intel Corporation
+ *
+ *     Retrieve drive serial numbers for scsi disks
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <string.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <sys/ioctl.h>
+
+/* Issue a SCSI INQUIRY through the SG_IO ioctl on 'fd', asking for page
+ * 0x80 (byte 1 of the command is 1, byte 2 is 0x80), and let the device
+ * transfer up to 'buf_len' bytes of the answer into 'buf'.
+ * Note that buf_len is also stored in a single byte of the command, so
+ * only its low 8 bits reach the device as the allocation length.
+ * Returns the result of ioctl().
+ */
+int scsi_get_serial(int fd, void *buf, size_t buf_len)
+{
+       unsigned char cdb[] = {INQUIRY, 1, 0x80, 0, buf_len, 0};
+       unsigned char sense_buf[32];
+       struct sg_io_hdr hdr;
+
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.interface_id = 'S';
+       hdr.timeout = 5000;
+
+       /* the command itself */
+       hdr.cmdp = cdb;
+       hdr.cmd_len = sizeof(cdb);
+
+       /* where the returned data goes */
+       hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+       hdr.dxferp = buf;
+       hdr.dxfer_len = buf_len;
+
+       /* where sense data goes if the command fails */
+       hdr.sbp = sense_buf;
+       hdr.mx_sb_len = sizeof(sense_buf);
+
+       return ioctl(fd, SG_IO, &hdr);
+}
diff --git a/super-ddf.c b/super-ddf.c
new file mode 100644 (file)
index 0000000..5d38750
--- /dev/null
@@ -0,0 +1,3227 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2007 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006).  Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add
+ * seconds-in-decade-of-seventies to convert to linux timestamps.
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+/* crc32() is not defined in this file; calc_crc() below calls it */
+unsigned long crc32(
+       unsigned long crc,
+       const unsigned char *buf,
+       unsigned len);
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ *  - all multibyte numeric fields are bigendian.
+ *  - all strings are space padded.
+ *
+ */
+
+/* Primary Raid Level (PRL) - cf. the 'prl' field of struct vd_config */
+#define        DDF_RAID0       0x00
+#define        DDF_RAID1       0x01
+#define        DDF_RAID3       0x03
+#define        DDF_RAID4       0x04
+#define        DDF_RAID5       0x05
+#define        DDF_RAID1E      0x11
+#define        DDF_JBOD        0x0f
+#define        DDF_CONCAT      0x1f
+#define        DDF_RAID5E      0x15
+#define        DDF_RAID5EE     0x25
+#define        DDF_RAID6       0x06
+
+/* Raid Level Qualifier (RLQ) - cf. the 'rlq' field of struct vd_config.
+ * The same numeric value means different things for different PRLs,
+ * hence one group of names per level.
+ */
+#define        DDF_RAID0_SIMPLE        0x00
+#define        DDF_RAID1_SIMPLE        0x00 /* just 2 devices in this plex */
+#define        DDF_RAID1_MULTI         0x01 /* exactly 3 devices in this plex */
+#define        DDF_RAID3_0             0x00 /* parity in first extent */
+#define        DDF_RAID3_N             0x01 /* parity in last extent */
+#define        DDF_RAID4_0             0x00 /* parity in first extent */
+#define        DDF_RAID4_N             0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define        DDF_RAID5_0_RESTART     0x00 /* same as 'right asymmetric' - layout 1 */
+#define        DDF_RAID6_0_RESTART     0x01 /* raid6 different from raid5 here!!! */
+#define        DDF_RAID5_N_RESTART     0x02 /* same as 'left asymmetric' - layout 0 */
+#define        DDF_RAID5_N_CONTINUE    0x03 /* same as 'left symmetric' - layout 2 */
+
+#define        DDF_RAID1E_ADJACENT     0x00 /* raid10 nearcopies==2 */
+#define        DDF_RAID1E_OFFSET       0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) - cf. the 'srl' field of struct vd_config */
+#define        DDF_2STRIPED    0x00    /* This is weirder than RAID0 !! */
+#define        DDF_2MIRRORED   0x01
+#define        DDF_2CONCAT     0x02
+#define        DDF_2SPANNED    0x03    /* This is also weird - be careful */
+
+/* Magic numbers.
+ * Each is wrapped in __cpu_to_be32() so that it can be compared directly
+ * with the big-endian 'magic' field read from disk, as load_ddf_header()
+ * does with DDF_HEADER_MAGIC.
+ */
+#define        DDF_HEADER_MAGIC        __cpu_to_be32(0xDE11DE11)
+#define        DDF_CONTROLLER_MAGIC    __cpu_to_be32(0xAD111111)
+#define        DDF_PHYS_RECORDS_MAGIC  __cpu_to_be32(0x22222222)
+#define        DDF_PHYS_DATA_MAGIC     __cpu_to_be32(0x33333333)
+#define        DDF_VIRT_RECORDS_MAGIC  __cpu_to_be32(0xDDDDDDDD)
+#define        DDF_VD_CONF_MAGIC       __cpu_to_be32(0xEEEEEEEE)
+#define        DDF_SPARE_ASSIGN_MAGIC  __cpu_to_be32(0x55555555)
+#define        DDF_VU_CONF_MAGIC       __cpu_to_be32(0x88888888)
+#define        DDF_VENDOR_LOG_MAGIC    __cpu_to_be32(0x01dBEEF0)
+#define        DDF_BBM_LOG_MAGIC       __cpu_to_be32(0xABADB10C)
+
+/* size in bytes of every guid[] field in the structures below */
+#define        DDF_GUID_LEN    24
+/* the two 8-character revision strings load_ddf_headers() accepts */
+#define DDF_REVISION_0 "01.00.00"
+#define DDF_REVISION_2 "01.02.00"
+
+/* On-disk DDF header.  load_ddf_headers() reads one 512-byte copy from
+ * the end of the device as the anchor, and load_ddf_header() reads the
+ * primary and secondary copies from the LBAs the anchor names.
+ * The running byte counts in the comments below add up to 512.
+ */
+struct ddf_header {
+       __u32   magic;          /* DDF_HEADER_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       char    revision[8];    /* 01.02.00 */
+       __u32   seq;            /* starts at '1' */
+       __u32   timestamp;
+       __u8    openflag;
+       __u8    foreignflag;
+       __u8    enforcegroups;
+       __u8    pad0;           /* 0xff */
+       __u8    pad1[12];       /* 12 * 0xff */
+       /* 64 bytes so far */
+       __u8    header_ext[32]; /* reserved: fill with 0xff */
+       __u64   primary_lba;
+       __u64   secondary_lba;
+       __u8    type;
+       __u8    pad2[3];        /* 0xff */
+       __u32   workspace_len;  /* sectors for vendor space -
+                                * at least 32768(sectors) */
+       __u64   workspace_lba;
+       __u16   max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */
+       __u16   max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+       __u16   max_partitions; /* i.e. max num of configuration
+                                  record entries per disk */
+       __u16   config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+                                                *12/512) */
+       __u16   max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+       __u8    pad3[54];       /* 0xff */
+       /* 192 bytes so far */
+       /* section offsets are in sectors relative to this header's own LBA
+        * and lengths are in sectors; see load_section() */
+       __u32   controller_section_offset;
+       __u32   controller_section_length;
+       __u32   phys_section_offset;
+       __u32   phys_section_length;
+       __u32   virt_section_offset;
+       __u32   virt_section_length;
+       __u32   config_section_offset;
+       __u32   config_section_length;
+       __u32   data_section_offset;
+       __u32   data_section_length;
+       __u32   bbm_section_offset;
+       __u32   bbm_section_length;
+       __u32   diag_space_offset;
+       __u32   diag_space_length;
+       __u32   vendor_offset;
+       __u32   vendor_length;
+       /* 256 bytes so far */
+       __u8    pad4[256];      /* 0xff */
+};
+
+/* type field */
+#define        DDF_HEADER_ANCHOR       0x00
+#define        DDF_HEADER_PRIMARY      0x01
+#define        DDF_HEADER_SECONDARY    0x02
+
+/* The content of the 'controller section' - global scope.
+ * load_ddf_global() reads it straight into ddf_super.controller, which
+ * load_section() only allows for a section exactly one sector long.
+ */
+struct ddf_controller_data {
+       __u32   magic;                  /* DDF_CONTROLLER_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       struct controller_type {
+               __u16 vendor_id;
+               __u16 device_id;
+               __u16 sub_vendor_id;
+               __u16 sub_device_id;
+       } type;
+       char    product_id[16];
+       __u8    pad[8]; /* 0xff */
+       __u8    vendor_data[448];
+};
+
+/* The content of phys_section - global scope.
+ * A fixed 64-byte head followed by a variable number of entries;
+ * load_ddf_local() searches entries[0 .. max_pd_entries-1] for the guid
+ * of the local disk.
+ */
+struct phys_disk {
+       __u32   magic;          /* DDF_PHYS_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   used_pdes;
+       __u16   max_pdes;
+       __u8    pad[52];
+       struct phys_disk_entry {
+               char    guid[DDF_GUID_LEN];
+               __u32   refnum;
+               __u16   type;
+               __u16   state;
+               __u64   config_size; /* DDF structures must be after here */
+               char    path[18];       /* another horrible structure really */
+               __u8    pad[6];
+       } entries[0];
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define        DDF_Forced_PD_GUID              1
+#define        DDF_Active_in_VD                2
+#define        DDF_Global_Spare                4 /* VD_CONF records are ignored */
+#define        DDF_Spare                       8 /* overrides Global_spare */
+#define        DDF_Foreign                     16
+#define        DDF_Legacy                      32 /* no DDF on this device */
+
+/* interface type lives in bits 8-11 of the same 'type' word */
+#define        DDF_Interface_mask              0xf00
+#define        DDF_Interface_SCSI              0x100
+#define        DDF_Interface_SAS               0x200
+#define        DDF_Interface_SATA              0x300
+#define        DDF_Interface_FC                0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define        DDF_Online                      1
+#define        DDF_Failed                      2 /* overrides  1,4,8 */
+#define        DDF_Rebuilding                  4
+#define        DDF_Transition                  8
+#define        DDF_SMART                       16
+#define        DDF_ReadErrors                  32
+#define        DDF_Missing                     64
+
+/* The content of the virt_section global scope.
+ * A fixed 64-byte head followed by a variable number of entries.
+ */
+struct virtual_disk {
+       __u32   magic;          /* DDF_VIRT_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   populated_vdes;
+       __u16   max_vdes;
+       __u8    pad[52];
+       struct virtual_entry {
+               char    guid[DDF_GUID_LEN];
+               __u16   unit;
+               __u16   pad0;   /* 0xffff */
+               __u16   guid_crc;
+               __u16   type;
+               __u8    state;
+               __u8    init_state;
+               __u8    pad1[14];
+               char    name[16];
+       } entries[0];
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define        DDF_Shared              1
+#define        DDF_Enforce_Groups      2
+#define        DDF_Unicode             4
+#define        DDF_Owner_Valid         8
+
+/* virtual_entry.state is a bigendian bitmap */
+/* the low three bits hold one of the DDF_state_* values below ... */
+#define        DDF_state_mask          0x7
+#define        DDF_state_optimal       0x0
+#define        DDF_state_degraded      0x1
+#define        DDF_state_deleted       0x2
+#define        DDF_state_missing       0x3
+#define        DDF_state_failed        0x4
+#define        DDF_state_part_optimal  0x5
+
+/* ... and these are separate flag bits above that field */
+#define        DDF_state_morphing      0x8
+#define        DDF_state_inconsistent  0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define        DDF_initstate_mask      0x03
+#define        DDF_init_not            0x00
+#define        DDF_init_quick          0x01 /* initialisation in progress.
+                                     * i.e. 'state_inconsistent' */
+#define        DDF_init_full           0x02
+
+/* access mode, in the top two bits of init_state */
+#define        DDF_access_mask         0xc0
+#define        DDF_access_rw           0x00
+#define        DDF_access_ro           0x80
+#define        DDF_access_blocked      0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+/* One virtual-disk configuration record.  The fixed part declared here
+ * is followed by the variable-length phys_refnum[] table and, after
+ * that, the lba_offset table mentioned in the comment at the end.
+ */
+struct vd_config {
+       __u32   magic;          /* DDF_VD_CONF_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       __u32   timestamp;
+       __u32   seqnum;
+       __u8    pad0[24];
+       __u16   prim_elmnt_count;
+       __u8    chunk_shift;    /* 0 == 512, 1==1024 etc */
+       __u8    prl;
+       __u8    rlq;
+       __u8    sec_elmnt_count;
+       __u8    sec_elmnt_seq;
+       __u8    srl;
+       __u64   blocks;         /* blocks per component could be different
+                                * on different component devices...(only
+                                * for concat I hope) */
+       __u64   array_blocks;   /* blocks in array */
+       __u8    pad1[8];
+       __u32   spare_refs[8];
+       __u8    cache_pol[8];
+       __u8    bg_rate;
+       __u8    pad2[3];
+       __u8    pad3[52];
+       __u8    pad4[192];
+       __u8    v0[32]; /* reserved- 0xff */
+       __u8    v1[32]; /* reserved- 0xff */
+       __u8    v2[16]; /* reserved- 0xff */
+       __u8    v3[16]; /* reserved- 0xff */
+       __u8    vendor[32];
+       __u32   phys_refnum[0]; /* refnum of each disk in sequence */
+      /*__u64  lba_offset[0];  LBA offset in each phys.  Note extents in a
+                               bvd are always the same size */
+};
+
+/* vd_config.cache_pol[7] is a bitmap */
+#define        DDF_cache_writeback     1       /* else writethrough */
+#define        DDF_cache_wadaptive     2       /* only applies if writeback */
+#define        DDF_cache_readahead     4
+#define        DDF_cache_radaptive     8       /* only if doing read-ahead */
+#define        DDF_cache_ifnobatt      16      /* even to write cache if battery is poor */
+#define        DDF_cache_wallowed      32      /* enable write caching */
+#define        DDF_cache_rallowed      64      /* enable read caching */
+
+/* Spare assignment record - the other kind of record that can appear in
+ * the config_section.  A fixed head followed by a variable number of
+ * spare_assign_entry elements.
+ */
+struct spare_assign {
+       __u32   magic;          /* DDF_SPARE_ASSIGN_MAGIC */
+       __u32   crc;
+       __u32   timestamp;
+       __u8    reserved[7];
+       __u8    type;
+       __u16   populated;      /* SAEs used */
+       __u16   max;            /* max SAEs */
+       __u8    pad[8];
+       struct spare_assign_entry {
+               char    guid[DDF_GUID_LEN];
+               __u16   secondary_element;
+               __u8    pad[6];
+       } spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define        DDF_spare_dedicated     0x1     /* else global */
+#define        DDF_spare_revertible    0x2     /* else committable */
+#define        DDF_spare_active        0x4     /* else not active */
+#define        DDF_spare_affinity      0x8     /* enclosure affinity */
+
+/* The data_section contents - local scope.
+ * load_ddf_local() reads this into the 'disk' member of a struct dl and
+ * then matches its guid against the phys_disk entries.
+ */
+struct disk_data {
+       __u32   magic;          /* DDF_PHYS_DATA_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       __u32   refnum;         /* crc of some magic drive data ... */
+       __u8    forced_ref;     /* set when above was not result of magic */
+       __u8    forced_guid;    /* set if guid was forced rather than magic */
+       __u8    vendor[32];
+       __u8    pad[442];
+};
+
+/* bbm_section content */
+/* NOTE(review): spare_count is a __u32 placed directly after a __u16, and
+ * first_spare is a __u64 after a 10-byte pad, so without a packed
+ * attribute the compiler will insert padding before both.  Confirm the
+ * intended on-disk offsets before this structure is used for I/O.
+ */
+struct bad_block_log {
+       __u32   magic;
+       __u32   crc;
+       __u16   entry_count;
+       __u32   spare_count;
+       __u8    pad[10];
+       __u64   first_spare;
+       struct mapped_block {
+               __u64   defective_start;
+               __u32   replacement_start;
+               __u16   remap_count;
+               __u8    pad[2];
+       } entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.
+ * The global data is:
+ *   - ddf header
+ *   - controller_data
+ *   - Physical disk records
+ *   - Virtual disk records
+ * The local data is:
+ *   - Configuration records
+ *   - Physical Disk data section
+ *  (  and Bad block and vendor which I don't care about yet).
+ *
+ * The local data is parsed into separate lists as it is read
+ * and reconstructed for writing.  This means that we only need
+ * to make config changes once and they are automatically
+ * propagated to all devices.
+ * Note that the ddf_super has space for the conf and disk data
+ * for this disk and also for a list of all such data.
+ * The list is only used for the superblock that is being
+ * built in Create or Assemble to describe the whole array.
+ */
+/* In-memory view of the DDF metadata.  Both list node types (vcl, dl)
+ * start with a union whose 'space' member is 512 bytes, so the on-disk
+ * structure that follows sits 512 bytes into the node - presumably so
+ * that it stays sector aligned when the node itself is allocated with
+ * 512-byte alignment, as load_ddf_local() does for struct dl.
+ */
+struct ddf_super {
+       struct ddf_header anchor, primary, secondary; /* filled by load_ddf_headers() */
+       struct ddf_controller_data controller; /* filled by load_ddf_global() */
+       struct ddf_header *active; /* &primary or &secondary, chosen by load_ddf_headers() */
+       struct phys_disk        *phys; /* allocated by load_section(), pdsize bytes */
+       struct virtual_disk     *virt; /* allocated by load_section(), vdsize bytes */
+       int pdsize, vdsize; /* section lengths converted to bytes */
+       int max_part, mppe, conf_rec_len; /* cpu-order copies of max_partitions,
+                                          * max_primary_element_entries and
+                                          * config_record_len from 'active' */
+       int currentdev;
+       int updates_pending;
+       struct vcl {
+               union {
+                       char space[512];
+                       struct {
+                               struct vcl      *next;
+                               __u64           *lba_offset; /* location in 'conf' of
+                                                             * the lba table */
+                               int     vcnum; /* index into ->virt */
+                               __u64           *block_sizes; /* NULL if all the same */
+                       };
+               };
+               struct vd_config conf;
+       } *conflist, *currentconf;
+       struct dl {
+               union {
+                       char space[512];
+                       struct {
+                               struct dl       *next;
+                               int major, minor; /* from fstat() st_rdev of the device */
+                               char *devname; /* strdup()ed copy, or NULL */
+                               int fd; /* -1 unless the caller asked to keep it */
+                               unsigned long long size; /* sectors */
+                               int pdnum;      /* index in ->phys */
+                               struct spare_assign *spare;
+                       };
+               };
+               struct disk_data disk;
+               struct vcl *vlist[0]; /* max_part in size */
+       } *dlist;
+};
+
+/* fallback for when no header included above has defined offsetof();
+ * load_ddf_header() uses it to find where 'pad2' starts in a header */
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
+
+/* Compute the checksum of the 'len'-byte DDF structure at 'buf'.
+ * Every such structure keeps its crc at the same offset as struct
+ * ddf_header does, so the buffer is viewed through that type.  The crc
+ * field is set to 0xffffffff while the sum is taken and put back to its
+ * previous value afterwards, so the buffer is unchanged on return.
+ * Returns the value produced by crc32(0, buf, len).
+ */
+static int calc_crc(void *buf, int len)
+{
+       struct ddf_header *hdr = buf;
+       __u32 saved = hdr->crc;
+       __u32 sum;
+
+       hdr->crc = 0xffffffff;
+       sum = crc32(0, buf, len);
+       hdr->crc = saved;
+
+       return sum;
+}
+
+/* Read the 512-byte DDF header at sector 'lba' of 'fd' into 'hdr' and
+ * check that it is consistent with 'anchor'.
+ * 'size' is the device size in sectors and 'type' is the value the
+ * header's type field must have.
+ * The header is accepted only if its magic and crc are good, its guid,
+ * revision, primary_lba and secondary_lba equal the anchor's, its type
+ * is the expected one, and everything from pad2 to the end of the block
+ * matches the anchor byte for byte.
+ * Returns 1 if the header is acceptable, 0 otherwise.
+ */
+static int load_ddf_header(int fd, unsigned long long lba,
+                          unsigned long long size,
+                          int type,
+                          struct ddf_header *hdr, struct ddf_header *anchor)
+{
+       /* number of bytes from pad2 to the end of the 512-byte block */
+       size_t tail = 512 - offsetof(struct ddf_header, pad2);
+
+       if (lba >= size-1)
+               return 0;
+       if (lseek64(fd, lba<<9, 0) < 0 || read(fd, hdr, 512) != 512)
+               return 0;
+
+       /* is it a DDF header at all? */
+       if (hdr->magic != DDF_HEADER_MAGIC)
+               return 0;
+       if (calc_crc(hdr, 512) != hdr->crc)
+               return 0;
+
+       /* does it belong with this anchor? */
+       if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0)
+               return 0;
+       if (memcmp(anchor->revision, hdr->revision, 8) != 0)
+               return 0;
+       if (anchor->primary_lba != hdr->primary_lba)
+               return 0;
+       if (anchor->secondary_lba != hdr->secondary_lba)
+               return 0;
+       if (hdr->type != type)
+               return 0;
+       if (memcmp(anchor->pad2, hdr->pad2, tail) != 0)
+               return 0;
+
+       return 1;
+}
+
+/* Read one section of the DDF metadata from 'fd'.
+ * 'offset_be' and 'len_be' are the big-endian offset and length fields
+ * taken from the active header, both in sectors; the offset is relative
+ * to the LBA of the active header (primary_lba or secondary_lba).
+ * If 'buf' is non-NULL the section is read into it, and must be exactly
+ * one sector long.  If 'buf' is NULL a 512-byte aligned buffer is
+ * allocated and the caller must free() it.
+ * With 'check' set, only lengths of 2, 8, 32, 128 or 512 sectors are
+ * accepted; lengths above 1024 sectors are always refused.
+ * Returns the buffer holding the section, or NULL on any failure (an
+ * allocated buffer is freed again before returning NULL).
+ */
+static void *load_section(int fd, struct ddf_super *super, void *buf,
+                         __u32 offset_be, __u32 len_be, int check)
+{
+       unsigned long long offset = __be32_to_cpu(offset_be);
+       unsigned long long len = __be32_to_cpu(len_be);
+       int dofree = (buf == NULL);
+
+       if (check)
+               if (len != 2 && len != 8 && len != 32
+                   && len != 128 && len != 512)
+                       return NULL;
+
+       if (len > 1024)
+               return NULL;
+       if (buf) {
+               /* All pre-allocated sections are a single block */
+               if (len != 1)
+                       return NULL;
+       } else {
+               /* posix_memalign() reports failure through its return
+                * value; don't rely on 'buf' having been left NULL */
+               if (posix_memalign(&buf, 512, len<<9) != 0)
+                       return NULL;
+       }
+
+       if (!buf)
+               return NULL;
+
+       if (super->active->type == DDF_HEADER_PRIMARY)
+               offset += __be64_to_cpu(super->active->primary_lba);
+       else
+               offset += __be64_to_cpu(super->active->secondary_lba);
+
+       if (lseek64(fd, offset<<9, 0) != (offset<<9)) {
+               if (dofree)
+                       free(buf);
+               return NULL;
+       }
+       if (read(fd, buf, len<<9) != (len<<9)) {
+               if (dofree)
+                       free(buf);
+               return NULL;
+       }
+       return buf;
+}
+
+/* Load the anchor, primary and (if usable) secondary DDF headers of the
+ * device open on 'fd' into 'super', and point super->active at the copy
+ * to use.
+ * The anchor is the last 512 bytes of the device.  The primary header
+ * must load; the secondary is optional and is preferred only when it is
+ * not marked open and either has a higher sequence number than the
+ * primary, or has the same sequence number while the primary is marked
+ * open.
+ * If 'devname' is non-NULL, failures are reported on stderr.
+ * Returns 0 on success, 1 if the device size cannot be determined or
+ * the anchor cannot be read, 2 if what was read is not usable DDF
+ * metadata.
+ */
+static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
+{
+       unsigned long long dsize;
+
+       /* without a size there is no way to find the anchor, and dsize
+        * would be used uninitialised below */
+       if (!get_dev_size(fd, NULL, &dsize)) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot determine size of %s\n",
+                               devname);
+               return 1;
+       }
+
+       if (lseek64(fd, dsize-512, 0) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (read(fd, &super->anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (super->anchor.magic != DDF_HEADER_MAGIC) {
+               if (devname)
+                       fprintf(stderr, Name ": no DDF anchor found on %s\n",
+                               devname);
+               return 2;
+       }
+       if (calc_crc(&super->anchor, 512) != super->anchor.crc) {
+               if (devname)
+                       fprintf(stderr, Name ": bad CRC on anchor on %s\n",
+                               devname);
+               return 2;
+       }
+       if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 &&
+           memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) {
+               if (devname)
+                       fprintf(stderr, Name ": can only support super revision"
+                               " %.8s and earlier, not %.8s on %s\n",
+                               DDF_REVISION_2, super->anchor.revision,devname);
+               return 2;
+       }
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba),
+                           dsize >> 9,  1,
+                           &super->primary, &super->anchor) == 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load primary DDF header "
+                               "on %s\n", devname);
+               return 2;
+       }
+       super->active = &super->primary;
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba),
+                           dsize >> 9,  2,
+                           &super->secondary, &super->anchor)) {
+               if ((__be32_to_cpu(super->primary.seq)
+                    < __be32_to_cpu(super->secondary.seq) &&
+                    !super->secondary.openflag)
+                   || (__be32_to_cpu(super->primary.seq)
+                       == __be32_to_cpu(super->secondary.seq) &&
+                       super->primary.openflag && !super->secondary.openflag)
+                       )
+                       super->active = &super->secondary;
+       }
+       return 0;
+}
+
+/* Load the global sections named by the active header: the controller
+ * data (into super->controller) and the physical and virtual disk
+ * records (into freshly allocated super->phys and super->virt).
+ * pdsize and vdsize are set to the byte sizes of those two sections
+ * whether or not the loads succeed.
+ * On success the config and disk lists are reset to empty, max_part,
+ * mppe and conf_rec_len are cached from the header, and 0 is returned.
+ * If any section fails to load, phys and virt are freed and set to NULL
+ * and 2 is returned.  'devname' is not used.
+ */
+static int load_ddf_global(int fd, struct ddf_super *super, char *devname)
+{
+       struct ddf_header *hdr = super->active;
+       void *ctrl;
+
+       ctrl = load_section(fd, super, &super->controller,
+                           hdr->controller_section_offset,
+                           hdr->controller_section_length, 0);
+
+       super->phys = load_section(fd, super, NULL,
+                                  hdr->phys_section_offset,
+                                  hdr->phys_section_length, 1);
+       super->pdsize = __be32_to_cpu(hdr->phys_section_length) * 512;
+
+       super->virt = load_section(fd, super, NULL,
+                                  hdr->virt_section_offset,
+                                  hdr->virt_section_length, 1);
+       super->vdsize = __be32_to_cpu(hdr->virt_section_length) * 512;
+
+       if (ctrl && super->phys && super->virt) {
+               super->conflist = NULL;
+               super->dlist = NULL;
+
+               super->max_part = __be16_to_cpu(hdr->max_partitions);
+               super->mppe = __be16_to_cpu(hdr->max_primary_element_entries);
+               super->conf_rec_len = __be16_to_cpu(hdr->config_record_len);
+               return 0;
+       }
+
+       /* something is missing: drop whatever did load */
+       free(super->phys);
+       free(super->virt);
+       super->phys = NULL;
+       super->virt = NULL;
+       return 2;
+}
+
+static int load_ddf_local(int fd, struct ddf_super *super,
+                         char *devname, int keep)
+{
+       /* Load the parts of the metadata that describe this one device:
+        * its disk-data section and the config records stored on it.
+        * A 'struct dl' for the device is pushed onto super->dlist, and
+        * each record with a known magic is either kept as the device's
+        * spare-assignment record or merged into super->conflist (for a
+        * GUID seen before, only a copy with a higher sequence number
+        * replaces the stored one).
+        * 'fd' is remembered in the dl only if 'keep' is non-zero.
+        *
+        * Returns 0 on success, 1 if the device could not be stat'ed,
+        * memory could not be allocated, or a section failed to load.
+        */
+       struct dl *dl;
+       struct stat stb;
+       char *conf;
+       int i;
+       int confsec;
+       int vnum;
+       int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries);
+       unsigned long long dsize;
+
+       /* First the local disk info */
+       if (fstat(fd, &stb) != 0)
+               return 1;
+       if (posix_memalign((void**)&dl, 512,
+                          sizeof(*dl) +
+                          (super->max_part) * sizeof(dl->vlist[0])) != 0) {
+               fprintf(stderr, Name ": malloc of device info failed.\n");
+               return 1;
+       }
+
+       if (!load_section(fd, super, &dl->disk,
+                         super->active->data_section_offset,
+                         super->active->data_section_length,
+                         0)) {
+               free(dl);
+               return 1;
+       }
+       dl->devname = devname ? strdup(devname) : NULL;
+
+       dl->major = major(stb.st_rdev);
+       dl->minor = minor(stb.st_rdev);
+       dl->next = super->dlist;
+       dl->fd = keep ? fd : -1;
+
+       dl->size = 0;
+       if (get_dev_size(fd, devname, &dsize))
+               dl->size = dsize >> 9;
+       dl->spare = NULL;
+       for (i=0 ; i < super->max_part ; i++)
+               dl->vlist[i] = NULL;
+       super->dlist = dl;
+       dl->pdnum = -1;
+       for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++)
+               if (memcmp(super->phys->entries[i].guid,
+                          dl->disk.guid, DDF_GUID_LEN) == 0)
+                       dl->pdnum = i;
+
+       /* Now the config list. */
+       /* 'conf' is an array of config entries, some of which are
+        * probably invalid.  Those which are good need to be copied into
+        * the conflist
+        */
+
+       /* A zero record length would make the walk below never advance */
+       if (!super->conf_rec_len)
+               return 1;
+
+       conf = load_section(fd, super, NULL,
+                           super->active->config_section_offset,
+                           super->active->config_section_length,
+                           0);
+       if (!conf)
+               return 1;
+
+       vnum = 0;
+       /* 'confsec' is the sector offset of the current record within the
+        * config section.  It must be a different variable from the index
+        * used for the virtual-disk lookup further down, or that lookup
+        * would clobber our position in the section.
+        */
+       for (confsec = 0;
+            confsec < __be32_to_cpu(super->active->config_section_length);
+            confsec += super->conf_rec_len) {
+               struct vd_config *vd =
+                       (struct vd_config *)((char*)conf + confsec*512);
+               struct vcl *vcl;
+
+               if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) {
+                       /* Only the first spare-assignment record is kept */
+                       if (dl->spare)
+                               continue;
+                       if (posix_memalign((void**)&dl->spare, 512,
+                                          super->conf_rec_len*512) != 0) {
+                               dl->spare = NULL;
+                               continue;
+                       }
+                       memcpy(dl->spare, vd, super->conf_rec_len*512);
+                       continue;
+               }
+               if (vd->magic != DDF_VD_CONF_MAGIC)
+                       continue;
+               for (vcl = super->conflist; vcl; vcl = vcl->next) {
+                       if (memcmp(vcl->conf.guid,
+                                  vd->guid, DDF_GUID_LEN) == 0)
+                               break;
+               }
+
+               if (vcl) {
+                       /* vlist only has room for max_part entries */
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+                       /* keep the stored copy unless this one is newer */
+                       if (__be32_to_cpu(vd->seqnum) <=
+                           __be32_to_cpu(vcl->conf.seqnum))
+                               continue;
+               } else {
+                       if (posix_memalign((void**)&vcl, 512,
+                                          (super->conf_rec_len*512 +
+                                           offsetof(struct vcl, conf))) != 0) {
+                               fprintf(stderr,
+                                       Name ": malloc of config record failed.\n");
+                               free(conf);
+                               return 1;
+                       }
+                       vcl->next = super->conflist;
+                       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+                       /* no matching virtual disk entry known yet */
+                       vcl->vcnum = -1;
+                       super->conflist = vcl;
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+               }
+               memcpy(&vcl->conf, vd, super->conf_rec_len*512);
+               /* the LBA offsets follow the mppe reference numbers */
+               vcl->lba_offset = (__u64*)
+                       &vcl->conf.phys_refnum[super->mppe];
+
+               /* Find the virtual disk table slot with the same GUID */
+               for (i=0; i < max_virt_disks ; i++)
+                       if (memcmp(super->virt->entries[i].guid,
+                                  vcl->conf.guid, DDF_GUID_LEN)==0)
+                               break;
+               if (i < max_virt_disks)
+                       vcl->vcnum = i;
+       }
+       free(conf);
+
+       return 0;
+}
+
+#ifndef MDASSEMBLE
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd);
+#endif
+static int load_super_ddf(struct supertype *st, int fd,
+                         char *devname)
+{
+       /* Load the DDF metadata found on 'fd' and attach it to st->sb.
+        * Unless built with MDASSEMBLE, load_super_ddf_all is tried first
+        * and its success is returned directly.  Otherwise the device
+        * itself is read: headers, then the global sections, then the
+        * per-device sections.
+        *
+        * 'devname' is only used for messages; when it is NULL the size
+        * checks below are silent but still reject the device.
+        * Returns 0 on success and non-zero on any failure.
+        */
+       unsigned long long dsize;
+       struct ddf_super *super;
+       int rv;
+
+#ifndef MDASSEMBLE
+       /* if 'fd' is a container, load metadata from all the devices */
+       if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
+               return 0;
+#endif
+       if (st->subarray[0])
+               return 1; /* FIXME Is this correct */
+
+       if (get_dev_size(fd, devname, &dsize) == 0)
+               return 1;
+
+       /* 32M is a lower bound */
+       if (dsize <= 32*1024*1024) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is too small for ddf: "
+                               "size is %llu sectors.\n",
+                               devname, dsize>>9);
+               /* reject the device whether or not we could report it */
+               return 1;
+       }
+       if (dsize & 511) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is an odd size for ddf: "
+                               "size is %llu bytes.\n",
+                               devname, dsize);
+               return 1;
+       }
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) {
+               fprintf(stderr, Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               return 1;
+       }
+       memset(super, 0, sizeof(*super));
+
+       rv = load_ddf_headers(fd, super, devname);
+       if (rv) {
+               free(super);
+               return rv;
+       }
+
+       /* Have valid headers and have chosen the best. Let's read in the rest*/
+
+       rv = load_ddf_global(fd, super, devname);
+
+       if (rv) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               free(super);
+               return rv;
+       }
+
+       /* The result of loading the per-device sections is not checked */
+       load_ddf_local(fd, super, devname, 0);
+
+       /* Should possibly check the sections .... */
+
+       st->sb = super;
+       if (st->ss == NULL) {
+               /* caller had no metadata handler chosen yet: claim it */
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+       }
+       return 0;
+
+}
+
+static void free_super_ddf(struct supertype *st)
+{
+       /* Release everything hanging off st->sb: the physical and virtual
+        * disk tables, every config record, every per-device entry (closing
+        * any file descriptor it holds) and finally the ddf_super itself.
+        * st->sb is reset to NULL.  Does nothing if no metadata is loaded.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *conf;
+       struct dl *disk;
+
+       if (!ddf)
+               return;
+
+       free(ddf->phys);
+       free(ddf->virt);
+
+       /* pop config records off the head of the list one at a time */
+       for (conf = ddf->conflist; conf; conf = ddf->conflist) {
+               ddf->conflist = conf->next;
+               free(conf->block_sizes);
+               free(conf);
+       }
+
+       /* likewise for the per-device entries */
+       for (disk = ddf->dlist; disk; disk = ddf->dlist) {
+               ddf->dlist = disk->next;
+               if (disk->fd >= 0)
+                       close(disk->fd);
+               free(disk->spare);
+               free(disk);
+       }
+
+       free(ddf);
+       st->sb = NULL;
+}
+
+static struct supertype *match_metadata_desc_ddf(char *arg)
+{
+       /* 'ddf' only support containers */
+       /* Return a freshly allocated supertype bound to the DDF handler
+        * when 'arg' is "ddf" or "default".  Returns NULL for any other
+        * name, and also when the allocation fails.
+        */
+       struct supertype *st;
+       if (strcmp(arg, "ddf") != 0 &&
+           strcmp(arg, "default") != 0
+               )
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               return NULL;
+       memset(st, 0, sizeof(*st));
+       st->ss = &super_ddf;
+       st->max_devs = 512;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+
+#ifndef MDASSEMBLE
+
+/* Names for the low three bits of a virtual disk's state byte */
+static mapping_t ddf_state[] = {
+       { "Optimal",            0 },
+       { "Degraded",           1 },
+       { "Deleted",            2 },
+       { "Missing",            3 },
+       { "Failed",             4 },
+       { "Partially Optimal",  5 },
+       { "-reserved-",         6 },
+       { "-reserved-",         7 },
+       { NULL, 0 }
+};
+
+/* Names for the low two bits of a virtual disk's init_state byte */
+static mapping_t ddf_init_state[] = {
+       { "Not Initialised",            0 },
+       { "QuickInit in Progress",      1 },
+       { "Fully Initialised",          2 },
+       { "*UNKNOWN*",                  3 },
+       { NULL, 0 }
+};
+
+/* Names for the access field held in bits 6-7 of init_state */
+static mapping_t ddf_access[] = {
+       { "Read/Write",                 0 },
+       { "Reserved",                   1 },
+       { "Read Only",                  2 },
+       { "Blocked (no access)",        3 },
+       { NULL, 0 }
+};
+
+/* Names for the primary RAID level of a config record */
+static mapping_t ddf_level[] = {
+       { "RAID0",      DDF_RAID0 },
+       { "RAID1",      DDF_RAID1 },
+       { "RAID3",      DDF_RAID3 },
+       { "RAID4",      DDF_RAID4 },
+       { "RAID5",      DDF_RAID5 },
+       { "RAID1E",     DDF_RAID1E },
+       { "JBOD",       DDF_JBOD },
+       { "CONCAT",     DDF_CONCAT },
+       { "RAID5E",     DDF_RAID5E },
+       { "RAID5EE",    DDF_RAID5EE },
+       { "RAID6",      DDF_RAID6 },
+       { NULL, 0 }
+};
+
+/* Names for the secondary RAID level of a config record */
+static mapping_t ddf_sec_level[] = {
+       { "Striped",    DDF_2STRIPED },
+       { "Mirrored",   DDF_2MIRRORED },
+       { "Concat",     DDF_2CONCAT },
+       { "Spanned",    DDF_2SPANNED },
+       { NULL, 0 }
+};
+#endif
+
+/* One entry of a number-to-number translation table.  A table ends with
+ * an entry whose num1 is MAXINT.
+ */
+struct num_mapping {
+       int num1;       /* key that is looked up */
+       int num2;       /* value it translates to */
+};
+
+/* DDF primary RAID level -> md level number.  Levels with no md
+ * counterpart listed here map to LEVEL_UNSUPPORTED.
+ */
+static struct num_mapping ddf_level_num[] = {
+       { DDF_RAID0,    0 },
+       { DDF_RAID1,    1 },
+       { DDF_RAID3,    LEVEL_UNSUPPORTED },
+       { DDF_RAID4,    4 },
+       { DDF_RAID5,    5 },
+       { DDF_RAID1E,   LEVEL_UNSUPPORTED },
+       { DDF_JBOD,     LEVEL_UNSUPPORTED },
+       { DDF_CONCAT,   LEVEL_LINEAR },
+       { DDF_RAID5E,   LEVEL_UNSUPPORTED },
+       { DDF_RAID5EE,  LEVEL_UNSUPPORTED },
+       { DDF_RAID6,    6 },
+       { MAXINT,       MAXINT }
+};
+
+static int map_num1(struct num_mapping *map, int num)
+{
+       /* Translate 'num' through 'map': return the num2 of the first
+        * entry whose num1 equals 'num'.  If there is none, the num2 of
+        * the terminating (num1 == MAXINT) entry is returned.
+        */
+       struct num_mapping *ent = map;
+
+       while (ent->num1 != MAXINT && ent->num1 != num)
+               ent++;
+       return ent->num2;
+}
+
+#ifndef MDASSEMBLE
+static void print_guid(char *guid, int tstamp)
+{
+       /* A GUIDs are part (or all) ASCII and part binary.
+        * They tend to be space padded.
+        * We print the GUID in HEX, then in parentheses add
+        * any initial ASCII sequence, and a possible
+        * time stamp from bytes 16-19
+        *
+        * 'guid' points at DDF_GUID_LEN bytes.  The time stamp is only
+        * printed when 'tstamp' is non-zero; it is read as a big-endian
+        * 32bit count offset by DECADE.
+        */
+       int l = DDF_GUID_LEN;
+       int i;
+
+       for (i=0 ; i<DDF_GUID_LEN ; i++) {
+               if ((i&3)==0 && i != 0) printf(":");
+               printf("%02X", guid[i]&255);
+       }
+
+       printf(" (");
+       /* ignore trailing space padding, then print the leading run of
+        * printable characters
+        */
+       while (l && guid[l-1] == ' ')
+               l--;
+       for (i=0 ; i<l ; i++) {
+               if (guid[i] >= 0x20 && guid[i] < 0x7f)
+                       fputc(guid[i], stdout);
+               else
+                       break;
+       }
+       if (tstamp) {
+               __u32 stamp;
+               time_t then;
+               char tbuf[100];
+               struct tm *tm;
+
+               /* copy rather than cast: 'guid' is a byte buffer with no
+                * alignment guarantee for a 32bit load
+                */
+               memcpy(&stamp, guid+16, sizeof(stamp));
+               then = __be32_to_cpu(stamp) + DECADE;
+               tm = localtime(&then);
+               /* localtime can fail, and strftime leaves the buffer
+                * undefined when it returns 0; print nothing then
+                */
+               if (tm && strftime(tbuf, sizeof(tbuf), " %D %T", tm) != 0)
+                       fputs(tbuf, stdout);
+       }
+       printf(")");
+}
+
+static void examine_vd(int n, struct ddf_super *sb, char *guid)
+{
+       /* Print the details of every loaded config record that has a
+        * correct checksum and whose GUID equals 'guid'.  'n' is the
+        * index shown in brackets on each output line.
+        */
+       int reclen = sb->conf_rec_len;
+       struct vcl *entry;
+
+       for (entry = sb->conflist ; entry ; entry = entry->next) {
+               struct vd_config *vc = &entry->conf;
+               const char *level;
+
+               /* skip records that are corrupt or for another VD */
+               if (calc_crc(vc, reclen*512) != vc->crc ||
+                   memcmp(vc->guid, guid, DDF_GUID_LEN) != 0)
+                       continue;
+
+               /* Ok, we know about this VD, let's give more details */
+               printf(" Raid Devices[%d] : %d\n", n,
+                      __be16_to_cpu(vc->prim_elmnt_count));
+               printf("   Chunk Size[%d] : %d sectors\n", n,
+                      1 << vc->chunk_shift);
+
+               level = map_num(ddf_level, vc->prl);
+               if (!level)
+                       level = "-unknown-";
+               printf("   Raid Level[%d] : %s\n", n, level);
+
+               if (vc->sec_elmnt_count != 1) {
+                       const char *sec_level;
+
+                       printf("  Secondary Position[%d] : %d of %d\n", n,
+                              vc->sec_elmnt_seq, vc->sec_elmnt_count);
+                       sec_level = map_num(ddf_sec_level, vc->srl);
+                       if (!sec_level)
+                               sec_level = "-unknown-";
+                       printf("  Secondary Level[%d] : %s\n", n, sec_level);
+               }
+
+               /* both sizes are halved before being printed */
+               printf("  Device Size[%d] : %llu\n", n,
+                      __be64_to_cpu(vc->blocks)/2);
+               printf("   Array Size[%d] : %llu\n", n,
+                      __be64_to_cpu(vc->array_blocks)/2);
+       }
+}
+
+static void examine_vds(struct ddf_super *sb)
+{
+       /* Print the populated entries of the virtual disk table, each
+        * followed by the details of its config records.
+        */
+       int total = __be16_to_cpu(sb->virt->populated_vdes);
+       int idx = 0;
+
+       printf("  Virtual Disks : %d\n", total);
+
+       while (idx < total) {
+               struct virtual_entry *ve = &sb->virt->entries[idx];
+               int state = ve->state & 7;
+               int init = ve->init_state & 3;
+               int access = (ve->init_state >> 6) & 3;
+
+               printf("      VD GUID[%d] : ", idx);
+               print_guid(ve->guid, 1);
+               printf("\n");
+               printf("         unit[%d] : %d\n", idx,
+                      __be16_to_cpu(ve->unit));
+               printf("        state[%d] : %s, %s%s\n", idx,
+                      map_num(ddf_state, state),
+                      (ve->state & 8) ? "Morphing, ": "",
+                      (ve->state & 16)? "Not Consistent" : "Consistent");
+               printf("   init state[%d] : %s\n", idx,
+                      map_num(ddf_init_state, init));
+               printf("       access[%d] : %s\n", idx,
+                      map_num(ddf_access, access));
+               printf("         Name[%d] : %.16s\n", idx, ve->name);
+               examine_vd(idx, sb, ve->guid);
+               idx++;
+       }
+       /* blank separator line, only if something was listed */
+       if (total)
+               printf("\n");
+}
+
+static void examine_pds(struct ddf_super *sb)
+{
+       /* Print the used entries of the physical disk table.  For each
+        * entry, also name any loaded device whose reference number
+        * matches it.
+        */
+       int used = __be16_to_cpu(sb->phys->used_pdes);
+       int idx;
+
+       printf(" Physical Disks : %d\n", used);
+
+       for (idx = 0 ; idx < used ; idx++) {
+               struct phys_disk_entry *pd = &sb->phys->entries[idx];
+               int type = __be16_to_cpu(pd->type);
+               int state = __be16_to_cpu(pd->state);
+               struct dl *dev;
+
+               printf("      PD GUID[%d] : ", idx);
+               print_guid(pd->guid, 0);
+               printf("\n");
+               printf("          ref[%d] : %08x\n", idx,
+                      __be32_to_cpu(pd->refnum));
+               /* 'type' and 'state' are bit masks, one word per bit */
+               printf("         mode[%d] : %s%s%s%s%s\n", idx,
+                      (type&2) ? "active":"",
+                      (type&4) ? "Global Spare":"",
+                      (type&8) ? "spare" : "",
+                      (type&16)? ", foreign" : "",
+                      (type&32)? "pass-through" : "");
+               printf("        state[%d] : %s%s%s%s%s%s%s\n", idx,
+                      (state&1)? "Online": "Offline",
+                      (state&2)? ", Failed": "",
+                      (state&4)? ", Rebuilding": "",
+                      (state&8)? ", in-transition": "",
+                      (state&16)? ", SMART errors": "",
+                      (state&32)? ", Unrecovered Read Errors": "",
+                      (state&64)? ", Missing" : "");
+               printf("   Avail Size[%d] : %llu K\n", idx,
+                      __be64_to_cpu(pd->config_size)>>1);
+
+               for (dev = sb->dlist; dev ; dev = dev->next) {
+                       char *path;
+
+                       /* both refnums are in on-disk byte order */
+                       if (dev->disk.refnum != pd->refnum)
+                               continue;
+                       path = map_dev(dev->major, dev->minor, 0);
+                       if (path)
+                               printf("       Device[%d] : %s\n",
+                                      idx, path);
+               }
+               printf("\n");
+       }
+}
+
+static void examine_super_ddf(struct supertype *st, char *homehost)
+{
+       /* Print a full description of the loaded DDF metadata: header
+        * fields first, then the virtual and physical disk tables.
+        * 'homehost' is not used.
+        */
+       struct ddf_super *sb = st->sb;
+       const char *redundant;
+
+       printf("          Magic : %08x\n", __be32_to_cpu(sb->anchor.magic));
+       printf("        Version : %.8s\n", sb->anchor.revision);
+
+       printf("Controller GUID : ");
+       print_guid(sb->controller.guid, 0);
+       printf("\n");
+
+       printf(" Container GUID : ");
+       print_guid(sb->anchor.guid, 1);
+       printf("\n");
+
+       printf("            Seq : %08x\n", __be32_to_cpu(sb->active->seq));
+
+       /* a secondary header counts as present if its magic is right */
+       if (sb->secondary.magic == DDF_HEADER_MAGIC)
+               redundant = "yes";
+       else
+               redundant = "no";
+       printf("  Redundant hdr : %s\n", redundant);
+
+       examine_vds(sb);
+       examine_pds(sb);
+}
+
+static void brief_examine_super_ddf(struct supertype *st)
+{
+       /* Emit a single generic "ARRAY" line for the container.
+        * The UUID is the anchor GUID in hex, with a ':' between each
+        * group of four bytes.
+        */
+       struct ddf_super *ddf = st->sb;
+       int pos = 0;
+
+       printf("ARRAY /dev/ddf metadata=ddf UUID=");
+       while (pos < DDF_GUID_LEN) {
+               /* separator before every 4-byte group except the first */
+               if (pos != 0 && (pos&3) == 0)
+                       printf(":");
+               printf("%02X", 255&ddf->anchor.guid[pos]);
+               pos++;
+       }
+       printf("\n");
+}
+
+static void detail_super_ddf(struct supertype *st, char *homehost)
+{
+       /* Not implemented yet: this function currently prints nothing
+        * and uses neither of its arguments.
+        *
+        * FIXME later
+        * Could print DDF GUID
+        * Need to find which array
+        *  If whole, briefly list all arrays
+        *  If one, give name
+        */
+}
+
+static void brief_detail_super_ddf(struct supertype *st)
+{
+       /* Not implemented yet: this function currently prints nothing.
+        *
+        * FIXME I really need to know which array we are detailing.
+        * Can that be stored in ddf_super??
+        */
+//     struct ddf_super *ddf = st->sb;
+}
+#endif
+
+static int match_home_ddf(struct supertype *st, char *homehost)
+{
+       /* It matches 'this' host if the controller is a
+        * Linux-MD controller with vendor_data matching
+        * the hostname
+        *
+        * Returns non-zero on a match and 0 otherwise.  A NULL
+        * 'homehost' never matches.
+        */
+       struct ddf_super *ddf = st->sb;
+       size_t len;
+
+       if (!homehost)
+               return 0;
+       len = strlen(homehost);
+
+       /* The name must fit in vendor_data with room for the NUL that
+        * the last test looks for.
+        */
+       return (memcmp(ddf->controller.guid, T10, 8) == 0 &&
+               len < sizeof(ddf->controller.vendor_data) &&
+               memcmp(ddf->controller.vendor_data, homehost,len) == 0 &&
+               ddf->controller.vendor_data[len] == 0);
+}
+
+static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst)
+{
+       /* Return the config record of the first entry on the config list
+        * whose vcnum equals 'inst', or NULL if there is none.
+        */
+       struct vcl *cur = ddf->conflist;
+
+       while (cur) {
+               if (cur->vcnum == inst)
+                       return &cur->conf;
+               cur = cur->next;
+       }
+       return NULL;
+}
+
+static int find_phys(struct ddf_super *ddf, __u32 phys_refnum)
+{
+       /* Find the entry in phys_disk which has the given refnum
+        * and return its index, or -1 if no entry matches.
+        * 'phys_refnum' is compared as-is with the stored value, with
+        * no byte-order conversion.
+        */
+       int idx = 0;
+
+       while (idx < __be16_to_cpu(ddf->phys->max_pdes)) {
+               if (ddf->phys->entries[idx].refnum == phys_refnum)
+                       return idx;
+               idx++;
+       }
+       return -1;
+}
+
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
+{
+       /* The uuid returned here is used for:
+        *  uuid to put into bitmap file (Create, Grow)
+        *  uuid for backup header when saving critical section (Grow)
+        *  comparing uuids when re-adding a device into an array
+        * For each of these we can make do with a truncated
+        * or hashed uuid rather than the original, as long as
+        * everyone agrees.
+        * In each case the uuid required is that of the data-array,
+        * not the device-set.
+        * In the case of SVD we assume the BVD is of interest,
+        * though that might be the case if a bitmap were made for
+        * a mirrored SVD - worry about that later.
+        * So we need to find the VD configuration record for the
+        * relevant BVD and extract the GUID and Secondary_Element_Seq.
+        * The first 16 bytes of the sha1 of these is used.
+        *
+        * If no config record is current, the uuid is set to all zeros.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *vcl = ddf->currentconf;
+       /* 'uuid' is really a pointer here, so sizeof(uuid) would be the
+        * pointer size and not the size of the 4-int array.
+        */
+       const size_t uuid_bytes = 4 * sizeof(uuid[0]);
+
+       if (!vcl)
+               memset(uuid, 0, uuid_bytes);
+       else {
+               char buf[20];
+               struct sha1_ctx ctx;
+               sha1_init_ctx(&ctx);
+               sha1_process_bytes(&vcl->conf.guid, DDF_GUID_LEN, &ctx);
+               /* the sequence byte is only hashed for multi-element VDs */
+               if (vcl->conf.sec_elmnt_count > 1)
+                       sha1_process_bytes(&vcl->conf.sec_elmnt_seq, 1, &ctx);
+               sha1_finish_ctx(&ctx, buf);
+               memcpy(uuid, buf, uuid_bytes);
+       }
+}
+
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+{
+       /* Fill in 'info' from the loaded metadata.  When a config record
+        * is current, the work is handed to getinfo_super_ddf_bvd.
+        * Otherwise the container as a whole is described, with the
+        * disk fields taken from the first device on the list, if any.
+        */
+       struct ddf_super *ddf = st->sb;
+
+       if (ddf->currentconf) {
+               getinfo_super_ddf_bvd(st, info);
+               return;
+       }
+
+       info->array.raid_disks    = __be16_to_cpu(ddf->phys->used_pdes);
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       /* creation time comes from the stamp in bytes 16-19 of the GUID */
+       info->array.ctime         = DECADE + __be32_to_cpu(*(__u32*)
+                                                        (ddf->anchor.guid+16));
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       if (ddf->dlist) {
+               int pdnum = find_phys(ddf, ddf->dlist->disk.refnum);
+
+               info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum);
+               info->disk.raid_disk = pdnum;
+
+               if (pdnum >= 0) {
+                       info->data_offset = __be64_to_cpu(ddf->phys->
+                                                 entries[pdnum].
+                                                 config_size);
+                       info->component_size = ddf->dlist->size -
+                               info->data_offset;
+               } else {
+                       /* The device is not in the physical disk table,
+                        * so there is no entry to read a size from.
+                        * Report zeros rather than index with -1.
+                        */
+                       info->data_offset = 0;
+                       info->component_size = 0;
+               }
+       } else {
+               info->disk.number = -1;
+//             info->disk.raid_disk = find refnum in the table and use index;
+       }
+       info->disk.state = (1 << MD_DISK_SYNC);
+
+
+       info->reshape_active = 0;
+
+       strcpy(info->text_version, "ddf");
+
+//     uuid_from_super_ddf(info->uuid, sbv);
+
+//     info->name[] ?? ;
+}
+
+static int rlq_to_layout(int rlq, int prl, int raiddisks);
+
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+{
+       /* Fill in 'info' for the config record that is current in the
+        * loaded metadata (ddf->currentconf must be set).
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *vc = ddf->currentconf;
+       int cd = ddf->currentdev;
+       struct virtual_entry *ve;
+
+       /* FIXME this returns BVD info - what if we want SVD ?? */
+
+       info->array.raid_disks    = __be16_to_cpu(vc->conf.prim_elmnt_count);
+       info->array.level         = map_num1(ddf_level_num, vc->conf.prl);
+       info->array.layout        = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+                                                 info->array.raid_disks);
+       info->array.md_minor      = -1;
+       /* creation time is taken from bytes 16-19 of the GUID */
+       info->array.ctime         = DECADE +
+               __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+       info->array.utime         = DECADE + __be32_to_cpu(vc->conf.timestamp);
+       info->array.chunk_size    = 512 << vc->conf.chunk_shift;
+
+       /* offset and size are only filled in for a valid device slot */
+       if (cd >= 0 && cd < ddf->mppe) {
+               info->data_offset = __be64_to_cpu(vc->lba_offset[cd]);
+               if (vc->block_sizes)
+                       info->component_size = vc->block_sizes[cd];
+               else
+                       info->component_size = __be64_to_cpu(vc->conf.blocks);
+       }
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+//     info->disk.number = __be32_to_cpu(ddf->disk.refnum);
+//     info->disk.raid_disk = find refnum in the table and use index;
+//     info->disk.state = ???;
+
+       info->container_member = vc->vcnum;
+
+       /* A consistent, fully initialised array needs no resync, which
+        * is recorded as an all-ones resync_start; otherwise start at 0.
+        */
+       ve = &ddf->virt->entries[info->container_member];
+       if ((ve->state & DDF_state_inconsistent) == 0 &&
+           (ve->init_state & DDF_initstate_mask) == DDF_init_full)
+               info->resync_start = ~0ULL;
+       else
+               info->resync_start = 0;
+
+       uuid_from_super_ddf(st, info->uuid);
+
+       /* the member number finally reported is the one in st->subarray */
+       info->container_member = atoi(st->subarray);
+       sprintf(info->text_version, "/%s/%s",
+               devnum2devname(st->container_dev),
+               st->subarray);
+
+//     info->name[] ?? ;
+}
+
+
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+                           char *update,
+                           char *devname, int verbose,
+                           int uuid_set, char *homehost)
+{
+       /* For 'assemble' and 'force' we need to return non-zero if any
+        * change was made.  For others, the return value is ignored.
+        * Update options are:
+        *  force-one : This device looks a bit old but needs to be included,
+        *        update age info appropriately.
+        *  assemble: clear any 'faulty' flag to allow this device to
+        *              be assembled.
+        *  force-array: Array is degraded but being forced, mark it clean
+        *         if that will be needed to assemble it.
+        *
+        *  newdev:  not used ????
+        *  grow:  Array has gained a new device - this is currently for
+        *              linear only
+        *  resync: mark as dirty so a resync will happen.
+        *  uuid:  Change the uuid of the array to match what is given
+        *  homehost:  update the recorded homehost
+        *  name:  update the name - preserving the homehost
+        *  _reshape_progress: record new reshape_progress position.
+        *
+        * Following are not relevant for this version:
+        *  sparc2.2 : update from old dodgy metadata
+        *  super-minor: change the preferred_minor number
+        *  summaries:  update redundant counters.
+        *
+        * As it stands, none of the branches below changes anything:
+        * the function only compares 'update' against the known names
+        * and always returns 0.
+        */
+       int rv = 0;
+//     struct ddf_super *ddf = st->sb;
+//     struct vd_config *vd = find_vdcr(ddf, info->container_member);
+//     struct virtual_entry *ve = find_ve(ddf);
+
+       /* we don't need to handle "force-*" or "assemble" as
+        * there is no need to 'trick' the kernel.  When the metadata is
+        * first updated to activate the array, all the implied modifications
+        * will just happen.
+        */
+
+       if (strcmp(update, "grow") == 0) {
+               /* FIXME */
+       }
+       if (strcmp(update, "resync") == 0) {
+//             info->resync_checkpoint = 0;
+       }
+       /* We ignore UUID updates as they make even less sense
+        * with DDF
+        */
+       if (strcmp(update, "homehost") == 0) {
+               /* homehost is stored in controller->vendor_data,
+                * or it is when we are the vendor
+                */
+//             if (info->vendor_is_local)
+//                     strcpy(ddf->controller.vendor_data, homehost);
+       }
+       if (strcmp(update, "name") == 0) {
+               /* name is stored in virtual_entry->name */
+//             memset(ve->name, ' ', 16);
+//             strncpy(ve->name, info->name, 16);
+       }
+       if (strcmp(update, "_reshape_progress") == 0) {
+               /* We don't support reshape yet */
+       }
+
+//     update_all_csum(ddf);
+
+       return rv;
+}
+
+static void make_header_guid(char *guid)
+{
+       /* Create a DDF Header of Virtual Disk GUID */
+
+       /* 24 bytes of fiction required.
+        * first 8 are a 'vendor-id'  - "Linux-MD"
+        * next 8 are controller type.. how about 0X DEAD BEEF 0000 0000
+        * Remaining 8 are a timestamp followed by a random number
+        */
+       __u32 word;
+       int urandom;
+
+       memcpy(guid, T10, sizeof(T10));
+
+       word = __cpu_to_be32(0xdeadbeef);
+       memcpy(guid+8, &word, 4);
+       word = __cpu_to_be32(0);
+       memcpy(guid+12, &word, 4);
+
+       /* timestamp is stored relative to DECADE, big-endian */
+       word = __cpu_to_be32(time(0) - DECADE);
+       memcpy(guid+16, &word, 4);
+
+       /* prefer /dev/urandom, fall back to random() if it is unusable */
+       urandom = open("/dev/urandom", O_RDONLY);
+       if (urandom < 0)
+               word = random();
+       else {
+               if (read(urandom, &word, 4) != 4)
+                       word = random();
+               close(urandom);
+       }
+       memcpy(guid+20, &word, 4);
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid);
+
+static int init_super_ddf(struct supertype *st,
+                         mdu_array_info_t *info,
+                         unsigned long long size, char *name, char *homehost,
+                         int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For DDF, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        *
+        * We need to create the entire 'ddf' structure which includes:
+        *  DDF headers - these are easy.
+        *  Controller data - a Sector describing this controller .. not that
+        *                  this is a controller exactly.
+        *  Physical Disk Record - one entry per device, so
+        *                      leave plenty of space.
+        *  Virtual Disk Records - again, just leave plenty of space.
+        *                   This just lists VDs, doesn't give details
+        *  Config records - describes the VDs that use this disk
+        *  DiskData  - describes 'this' device.
+        *  BadBlockManagement - empty
+        *  Diag Space - empty
+        *  Vendor Logs - Could we put bitmaps here?
+        *
+        */
+       struct ddf_super *ddf;
+       char hostname[17];
+       int hostlen;
+       int max_phys_disks, max_virt_disks;
+       unsigned long long sector;
+       int clen;
+       int i;
+       int pdsize, vdsize;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_ddf_bvd(st, info, size, name, homehost,
+                                         uuid);
+
+       posix_memalign((void**)&ddf, 512, sizeof(*ddf));
+       memset(ddf, 0, sizeof(*ddf));
+       ddf->dlist = NULL; /* no physical disks yet */
+       ddf->conflist = NULL; /* No virtual disks yet */
+
+       /* At least 32MB *must* be reserved for the ddf.  So let's just
+        * start 32MB from the end, and put the primary header there.
+        * Don't do secondary for now.
+        * We don't know exactly where that will be yet as it could be
+        * different on each device.  To just set up the lengths.
+        *
+        */
+
+       ddf->anchor.magic = DDF_HEADER_MAGIC;
+       make_header_guid(ddf->anchor.guid);
+
+       memcpy(ddf->anchor.revision, DDF_REVISION_2, 8);
+       ddf->anchor.seq = __cpu_to_be32(1);
+       ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE);
+       ddf->anchor.openflag = 0xFF;
+       ddf->anchor.foreignflag = 0;
+       ddf->anchor.enforcegroups = 0; /* Is this best?? */
+       ddf->anchor.pad0 = 0xff;
+       memset(ddf->anchor.pad1, 0xff, 12);
+       memset(ddf->anchor.header_ext, 0xff, 32);
+       ddf->anchor.primary_lba = ~(__u64)0;
+       ddf->anchor.secondary_lba = ~(__u64)0;
+       ddf->anchor.type = DDF_HEADER_ANCHOR;
+       memset(ddf->anchor.pad2, 0xff, 3);
+       ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */
+       ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom
+                                                 of 32M reserved.. */
+       max_phys_disks = 1023;   /* Should be enough */
+       ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks);
+       max_virt_disks = 255;
+       ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */
+       ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */
+       ddf->max_part = 64;
+       ddf->mppe = 256;
+       ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
+       ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len);
+       ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe);
+       memset(ddf->anchor.pad3, 0xff, 54);
+       /* controller sections is one sector long immediately
+        * after the ddf header */
+       sector = 1;
+       ddf->anchor.controller_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.controller_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       /* phys is 8 sectors after that */
+       pdsize = ROUND_UP(sizeof(struct phys_disk) +
+                         sizeof(struct phys_disk_entry)*max_phys_disks,
+                         512);
+       switch(pdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.phys_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.phys_section_length =
+               __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */
+       sector += pdsize/512;
+
+       /* virt is another 32 sectors */
+       vdsize = ROUND_UP(sizeof(struct virtual_disk) +
+                         sizeof(struct virtual_entry) * max_virt_disks,
+                         512);
+       switch(vdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.virt_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.virt_section_length =
+               __cpu_to_be32(vdsize/512); /* max_vd_entries/8 */
+       sector += vdsize/512;
+
+       clen = ddf->conf_rec_len * (ddf->max_part+1);
+       ddf->anchor.config_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.config_section_length = __cpu_to_be32(clen);
+       sector += clen;
+
+       ddf->anchor.data_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.data_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       ddf->anchor.bbm_section_length = __cpu_to_be32(0);
+       ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.diag_space_length = __cpu_to_be32(0);
+       ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.vendor_length = __cpu_to_be32(0);
+       ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF);
+
+       memset(ddf->anchor.pad4, 0xff, 256);
+
+       memcpy(&ddf->primary, &ddf->anchor, 512);
+       memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+       ddf->primary.openflag = 1; /* I guess.. */
+       ddf->primary.type = DDF_HEADER_PRIMARY;
+
+       ddf->secondary.openflag = 1; /* I guess.. */
+       ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+       ddf->active = &ddf->primary;
+
+       ddf->controller.magic = DDF_CONTROLLER_MAGIC;
+
+       /* 24 more bytes of fiction required.
+        * first 8 are a 'vendor-id'  - "Linux-MD"
+        * Remaining 16 are serial number.... maybe a hostname would do?
+        */
+       memcpy(ddf->controller.guid, T10, sizeof(T10));
+       gethostname(hostname, sizeof(hostname));
+       hostname[sizeof(hostname) - 1] = 0;
+       hostlen = strlen(hostname);
+       memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen);
+       for (i = strlen(T10) ; i+hostlen < 24; i++)
+               ddf->controller.guid[i] = ' ';
+
+       ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD);
+       ddf->controller.type.device_id = __cpu_to_be16(0xBEEF);
+       ddf->controller.type.sub_vendor_id = 0;
+       ddf->controller.type.sub_device_id = 0;
+       memcpy(ddf->controller.product_id, "What Is My PID??", 16);
+       memset(ddf->controller.pad, 0xff, 8);
+       memset(ddf->controller.vendor_data, 0xff, 448);
+
+       posix_memalign((void**)&pd, 512, pdsize);
+       ddf->phys = pd;
+       ddf->pdsize = pdsize;
+
+       memset(pd, 0xff, pdsize);
+       memset(pd, 0, sizeof(*pd));
+       pd->magic = DDF_PHYS_DATA_MAGIC;
+       pd->used_pdes = __cpu_to_be16(0);
+       pd->max_pdes = __cpu_to_be16(max_phys_disks);
+       memset(pd->pad, 0xff, 52);
+
+       posix_memalign((void**)&vd, 512, vdsize);
+       ddf->virt = vd;
+       ddf->vdsize = vdsize;
+       memset(vd, 0, vdsize);
+       vd->magic = DDF_VIRT_RECORDS_MAGIC;
+       vd->populated_vdes = __cpu_to_be16(0);
+       vd->max_vdes = __cpu_to_be16(max_virt_disks);
+       memset(vd->pad, 0xff, 52);
+
+       for (i=0; i<max_virt_disks; i++)
+               memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
+
+       st->sb = ddf;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+/* Return 1 when every one of the DDF_GUID_LEN bytes at 'guid' is 0xff,
+ * and 0 as soon as any other byte is seen.
+ */
+static int all_ff(char *guid)
+{
+       const char *p = guid;
+       const char *end = guid + DDF_GUID_LEN;
+
+       while (p < end) {
+               if (*p != (char)0xff)
+                       return 0;
+               p++;
+       }
+       return 1;
+}
+/* Turn a chunk size into a shift count: the 0-based position of the
+ * lowest set bit of chunksize/512.  Because ffs(0) is 0, the result is
+ * -1 when chunksize/512 is zero.
+ */
+static int chunk_to_shift(int chunksize)
+{
+       int units = chunksize / 512;
+
+       return ffs(units) - 1;
+}
+
+/* Map an md level number onto the matching DDF_* value for the 'prl'
+ * field of a vd_config.  Returns -1 for a level with no mapping here.
+ */
+static int level_to_prl(int level)
+{
+       if (level == LEVEL_LINEAR)
+               return DDF_CONCAT;
+       if (level == 0)
+               return DDF_RAID0;
+       if (level == 1)
+               return DDF_RAID1;
+       if (level == 4)
+               return DDF_RAID4;
+       if (level == 5)
+               return DDF_RAID5;
+       if (level == 6)
+               return DDF_RAID6;
+       return -1;
+}
+/* Map an md (level, layout, raiddisks) triple onto the DDF_* value for
+ * the 'rlq' field of a vd_config.  Returns -1 for any combination that
+ * has no mapping here.
+ */
+static int layout_to_rlq(int level, int layout, int raiddisks)
+{
+       if (level == 0)
+               return DDF_RAID0_SIMPLE;
+
+       if (level == 1) {
+               /* only 2-way and 3-way mirrors have a value */
+               if (raiddisks == 2)
+                       return DDF_RAID1_SIMPLE;
+               if (raiddisks == 3)
+                       return DDF_RAID1_MULTI;
+               return -1;
+       }
+
+       if (level == 4)
+               return layout == 0 ? DDF_RAID4_N : -1;
+
+       if (level == 5 || level == 6) {
+               if (layout == ALGORITHM_LEFT_ASYMMETRIC)
+                       return DDF_RAID5_N_RESTART;
+               if (layout == ALGORITHM_RIGHT_ASYMMETRIC)
+                       return level == 5 ? DDF_RAID5_0_RESTART
+                                         : DDF_RAID6_0_RESTART;
+               if (layout == ALGORITHM_LEFT_SYMMETRIC)
+                       return DDF_RAID5_N_CONTINUE;
+               /* ALGORITHM_RIGHT_SYMMETRIC: not mentioned in standard */
+       }
+       return -1;
+}
+
+/* Inverse of layout_to_rlq(): map a DDF (rlq, prl) pair back onto an md
+ * layout number.  'raiddisks' is accepted but not consulted.
+ * Returns -1 for an unsupported pair.
+ */
+static int rlq_to_layout(int rlq, int prl, int raiddisks)
+{
+       int right_asym;
+
+       /* hopefully rlq == DDF_RAID0_SIMPLE, or SIMPLE/MULTI for RAID1
+        * depending on raiddisks */
+       if (prl == DDF_RAID0 || prl == DDF_RAID1)
+               return 0;
+
+       if (prl == DDF_RAID4) {
+               /* anything else is not supported;
+                * FIXME the -1 isn't checked */
+               return rlq == DDF_RAID4_N ? 0 : -1;
+       }
+
+       /* RAID5 and RAID6 differ only in the value that stands for
+        * the right-asymmetric layout */
+       if (prl == DDF_RAID5)
+               right_asym = DDF_RAID5_0_RESTART;
+       else if (prl == DDF_RAID6)
+               right_asym = DDF_RAID6_0_RESTART;
+       else
+               return -1;
+
+       if (rlq == DDF_RAID5_N_RESTART)
+               return ALGORITHM_LEFT_ASYMMETRIC;
+       if (rlq == right_asym)
+               return ALGORITHM_RIGHT_ASYMMETRIC;
+       if (rlq == DDF_RAID5_N_CONTINUE)
+               return ALGORITHM_LEFT_SYMMETRIC;
+       return -1;
+}
+
+/* One used region of a physical device, as built by get_extents():
+ * 'start' is taken from a vd_config lba_offset entry and 'size' from its
+ * 'blocks' field.  An entry with size == 0 terminates an array of these.
+ */
+struct extent {
+       unsigned long long start, size;
+};
+/* qsort() comparison callback: order extents by ascending 'start'.
+ * Yields -1, 0 or 1.
+ */
+static int cmp_extent(const void *av, const void *bv)
+{
+       unsigned long long sa = ((const struct extent *)av)->start;
+       unsigned long long sb = ((const struct extent *)bv)->start;
+
+       return (sa > sb) - (sa < sb);
+}
+
+static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
+{
+       /* Find the list of used extents on the given physical device
+        * (dl) of the given ddf.
+        * Return a malloced array of 'struct extent', sorted by start.
+        * The array is terminated by an entry with size == 0 whose start
+        * is the config_size recorded for the device in ddf->phys.
+        * Returns NULL if the array cannot be allocated.
+        *
+        * FIXME ignore DDF_Legacy devices?
+        */
+       struct extent *rv;
+       int n = 0;
+       int i, j;
+
+       /* at most one extent per vlist slot, plus the terminator */
+       rv = malloc(sizeof(struct extent) * (ddf->max_part + 2));
+       if (!rv)
+               return NULL;
+
+       for (i = 0; i < ddf->max_part; i++) {
+               struct vcl *v = dl->vlist[i];
+               int elmnts;
+
+               if (v == NULL)
+                       continue;
+               /* prim_elmnt_count is kept big-endian in the vd_config,
+                * so it must be converted before it bounds the loop.
+                */
+               elmnts = __be16_to_cpu(v->conf.prim_elmnt_count);
+               for (j = 0; j < elmnts; j++)
+                       if (v->conf.phys_refnum[j] == dl->disk.refnum) {
+                               /* This device plays role 'j' in  'v'. */
+                               rv[n].start = __be64_to_cpu(v->lba_offset[j]);
+                               rv[n].size = __be64_to_cpu(v->conf.blocks);
+                               n++;
+                               break;
+                       }
+       }
+       qsort(rv, n, sizeof(*rv), cmp_extent);
+
+       /* terminator: marks the end of the usable space on the device */
+       rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size);
+       rv[n].size = 0;
+       return rv;
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid)
+{
+       /* We are creating a BVD inside a pre-existing container.
+        * so st->sb is already set.
+        * We need to create a new vd_config and a new virtual_entry
+        *
+        * 'info' supplies md_minor, state, level, layout, raid_disks,
+        * chunk_size and size for the new array; 'name', if not NULL,
+        * is copied into the virtual_entry.  'size', 'homehost' and
+        * 'uuid' are not used here.
+        *
+        * Returns 1 on success with ddf->currentconf pointing at the new
+        * vd_config, or 0 (after printing a message) if there is no free
+        * virtual_entry or memory cannot be allocated.
+        */
+       struct ddf_super *ddf = st->sb;
+       int venum;
+       struct virtual_entry *ve;
+       struct vcl *vcl;
+       struct vd_config *vc;
+
+       if (__be16_to_cpu(ddf->virt->populated_vdes)
+           >= __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name": This ddf already has the "
+                       "maximum of %d virtual devices\n",
+                       __be16_to_cpu(ddf->virt->max_vdes));
+               return 0;
+       }
+
+       /* An unused virtual_entry has a guid of all 0xff */
+       for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
+               if (all_ff(ddf->virt->entries[venum].guid))
+                       break;
+       if (venum == __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name ": Cannot find spare slot for "
+                       "virtual disk - DDF is corrupt\n");
+               return 0;
+       }
+       ve = &ddf->virt->entries[venum];
+
+       /* Allocate the vd_config before touching the virtual_entry so
+        * that a failure leaves the container metadata unchanged.
+        */
+       if (posix_memalign((void**)&vcl, 512,
+                          (offsetof(struct vcl, conf) +
+                           ddf->conf_rec_len * 512)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate vd_config\n",
+                       __func__);
+               return 0;
+       }
+
+       /* A Virtual Disk GUID contains the T10 Vendor ID, controller type,
+        * timestamp, random number
+        */
+       make_header_guid(ve->guid);
+       ve->unit = __cpu_to_be16(info->md_minor);
+       ve->pad0 = 0xFFFF;
+       ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN);
+       ve->type = 0;
+       ve->state = DDF_state_degraded; /* Will be modified as devices are added */
+       if (info->state & 1) /* clean */
+               ve->init_state = DDF_init_full;
+       else
+               ve->init_state = DDF_init_not;
+
+       memset(ve->pad1, 0xff, 14);
+       memset(ve->name, ' ', 16);
+       if (name)
+               strncpy(ve->name, name, 16);
+       ddf->virt->populated_vdes =
+               __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1);
+
+       /* Now fill in the new vd_config.  The lba_offset table follows
+        * directly after the mppe entries of phys_refnum.
+        */
+       vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
+       vcl->vcnum = venum;
+       sprintf(st->subarray, "%d", venum);
+       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+
+       vc = &vcl->conf;
+
+       vc->magic = DDF_VD_CONF_MAGIC;
+       memcpy(vc->guid, ve->guid, DDF_GUID_LEN);
+       vc->timestamp = __cpu_to_be32(time(0)-DECADE);
+       vc->seqnum = __cpu_to_be32(1);
+       memset(vc->pad0, 0xff, 24);
+       vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks);
+       vc->chunk_shift = chunk_to_shift(info->chunk_size);
+       vc->prl = level_to_prl(info->level);
+       vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks);
+       vc->sec_elmnt_count = 1;
+       vc->sec_elmnt_seq = 0;
+       vc->srl = 0;
+       vc->blocks = __cpu_to_be64(info->size * 2);
+       vc->array_blocks = __cpu_to_be64(
+               calc_array_size(info->level, info->raid_disks, info->layout,
+                               info->chunk_size, info->size*2));
+       memset(vc->pad1, 0xff, 8);
+       vc->spare_refs[0] = 0xffffffff;
+       vc->spare_refs[1] = 0xffffffff;
+       vc->spare_refs[2] = 0xffffffff;
+       vc->spare_refs[3] = 0xffffffff;
+       vc->spare_refs[4] = 0xffffffff;
+       vc->spare_refs[5] = 0xffffffff;
+       vc->spare_refs[6] = 0xffffffff;
+       vc->spare_refs[7] = 0xffffffff;
+       memset(vc->cache_pol, 0, 8);
+       vc->bg_rate = 0x80;
+       memset(vc->pad2, 0xff, 3);
+       memset(vc->pad3, 0xff, 52);
+       memset(vc->pad4, 0xff, 192);
+       memset(vc->v0, 0xff, 32);
+       memset(vc->v1, 0xff, 32);
+       memset(vc->v2, 0xff, 16);
+       memset(vc->v3, 0xff, 16);
+       memset(vc->vendor, 0xff, 32);
+
+       /* No devices assigned yet: every phys_refnum is 0xffffffff and
+        * every lba_offset is zero.  The offsets are addressed through
+        * vcl->lba_offset; adding a byte count to the phys_refnum pointer
+        * would be scaled by the element size and land outside the
+        * allocation.
+        */
+       memset(vc->phys_refnum, 0xff, 4*ddf->mppe);
+       memset(vcl->lba_offset, 0x00, 8*ddf->mppe);
+
+       vcl->next = ddf->conflist;
+       ddf->conflist = vcl;
+       ddf->currentconf = vcl;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+/* Place one device of the container into the BVD that is currently being
+ * created (ddf->currentconf).
+ *
+ *   st      - supertype whose ->sb is the ddf_super of the container
+ *   dk      - major/minor select the device from ddf->dlist; raid_disk is
+ *             the slot it takes in the new vd_config
+ *   fd      - descriptor recorded in the matching 'struct dl'
+ *   devname - name recorded in the matching 'struct dl'
+ *
+ * Returns without changing anything if the device is not in the
+ * container, is not flagged MD_DISK_SYNC, its extents cannot be listed,
+ * or it has no free gap of the required size.
+ */
+static void add_to_super_ddf_bvd(struct supertype *st,
+                                mdu_disk_info_t *dk, int fd, char *devname)
+{
+       /* fd and devname identify a device with-in the ddf container (st).
+        * dk identifies a location in the new BVD.
+        * We need to find suitable free space in that device and update
+        * the phys_refnum and lba_offset for the newly created vd_config.
+        * We might also want to update the type in the phys_disk
+        * section.
+        */
+       struct dl *dl;
+       struct ddf_super *ddf = st->sb;
+       struct vd_config *vc;
+       __u64 *lba_offset;
+       int working;
+       int i;
+       unsigned long long blocks, pos, esize;
+       struct extent *ex;
+
+       /* Locate the container member with the requested device number */
+       for (dl = ddf->dlist; dl ; dl = dl->next)
+               if (dl->major == dk->major &&
+                   dl->minor == dk->minor)
+                       break;
+       if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+               return;
+
+       vc = &ddf->currentconf->conf;
+       lba_offset = ddf->currentconf->lba_offset;
+
+       ex = get_extents(ddf, dl);
+       if (!ex)
+               return;
+
+       /* Required size: the vd_config 'blocks' value unless a per-device
+        * block_sizes table is present.
+        */
+       i = 0; pos = 0;
+       blocks = __be64_to_cpu(vc->blocks);
+       if (ddf->currentconf->block_sizes)
+               blocks = ddf->currentconf->block_sizes[dk->raid_disk];
+
+       /* First-fit scan: 'pos' is the end of the previous used extent and
+        * 'esize' the gap up to the next one.  The scan stops at the first
+        * gap that is big enough, or after the size == 0 terminator has
+        * been examined.
+        */
+       do {
+               esize = ex[i].start - pos;
+               if (esize >= blocks)
+                       break;
+               pos = ex[i].start + ex[i].size;
+               i++;
+       } while (ex[i-1].size);
+
+       free(ex);
+       if (esize < blocks)
+               return;
+
+       /* Record the device and its starting offset in the chosen slot */
+       ddf->currentdev = dk->raid_disk;
+       vc->phys_refnum[dk->raid_disk] = dl->disk.refnum;
+       lba_offset[dk->raid_disk] = __cpu_to_be64(pos);
+
+       /* Link the vd_config into the first free vlist slot of the device.
+        * NOTE(review): when no slot is free this returns after
+        * phys_refnum and lba_offset were already updated above and
+        * without setting updates_pending - confirm that is intended.
+        */
+       for (i=0; i < ddf->max_part ; i++)
+               if (dl->vlist[i] == NULL)
+                       break;
+       if (i == ddf->max_part)
+               return;
+       dl->vlist[i] = ddf->currentconf;
+
+       dl->fd = fd;
+       dl->devname = devname;
+
+       /* Check how many working raid_disks, and if we can mark
+        * array as optimal yet
+        */
+       working = 0;
+
+       /* A slot still holding 0xffffffff has no device assigned */
+       for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++)
+               if (vc->phys_refnum[i] != 0xffffffff)
+                       working++;
+
+       /* Find which virtual_entry */
+       i = ddf->currentconf->vcnum;
+       if (working == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_optimal;
+
+       /* RAID6 with exactly one device missing is 'partially optimal' */
+       if (vc->prl == DDF_RAID6 &&
+           working+1 == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_part_optimal;
+
+       /* The device is now part of a VD rather than a global spare */
+       ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare);
+       ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD);
+       ddf->updates_pending = 1;
+}
+
+/* add a device to a container, either while creating it or while
+ * expanding a pre-existing container
+ *
+ *   st      - supertype whose ->sb is the ddf_super being built
+ *   dk      - only consulted when a BVD is being created, in which case
+ *             the whole call is handed to add_to_super_ddf_bvd()
+ *   fd      - open descriptor of the device; kept in the new 'struct dl'
+ *   devname - name of the device; kept in the new 'struct dl'
+ *
+ * On success a new 'struct dl' heads ddf->dlist and a phys_disk_entry
+ * has been appended to ddf->phys.  The function cannot report failure
+ * to its caller; if the phys_disk table is full, the device cannot be
+ * stat'ed or memory runs out, it prints a message and leaves the
+ * superblock untouched.
+ */
+static void add_to_super_ddf(struct supertype *st,
+                            mdu_disk_info_t *dk, int fd, char *devname)
+{
+       struct ddf_super *ddf = st->sb;
+       struct dl *dd;
+       time_t now;
+       struct tm *tm;
+       unsigned long long size;
+       struct phys_disk_entry *pde;
+       int n, i;
+       struct stat stb;
+
+       if (ddf->currentconf) {
+               add_to_super_ddf_bvd(st, dk, fd, devname);
+               return;
+       }
+
+       /* Never write past the end of the phys_disk_entry table */
+       n = __be16_to_cpu(ddf->phys->used_pdes);
+       if (n >= __be16_to_cpu(ddf->phys->max_pdes)) {
+               fprintf(stderr, Name ": ddf: container already has the "
+                       "maximum of %d physical disks\n", n);
+               return;
+       }
+
+       /* This is device numbered dk->number.  We need to create
+        * a phys_disk entry and a more detailed disk_data entry.
+        */
+       if (fstat(fd, &stb) != 0) {
+               fprintf(stderr, Name ": ddf: cannot stat device: %s\n",
+                       strerror(errno));
+               return;
+       }
+       if (posix_memalign((void**)&dd, 512,
+                          sizeof(*dd) +
+                          sizeof(dd->vlist[0]) * ddf->max_part) != 0) {
+               fprintf(stderr, Name ": %s could not allocate disk info "
+                       "buffer\n", __func__);
+               return;
+       }
+       dd->major = major(stb.st_rdev);
+       dd->minor = minor(stb.st_rdev);
+       dd->devname = devname;
+       dd->next = ddf->dlist;
+       dd->fd = fd;
+       dd->spare = NULL;
+
+       /* GUID: T10 vendor id and today's date as text in the first 16
+        * bytes, then two random 32-bit words which also overwrite the
+        * NUL that sprintf leaves at offset 16.
+        */
+       dd->disk.magic = DDF_PHYS_DATA_MAGIC;
+       now = time(0);
+       tm = localtime(&now);
+       sprintf(dd->disk.guid, "%8s%04d%02d%02d",
+               T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday);
+       *(__u32*)(dd->disk.guid + 16) = random();
+       *(__u32*)(dd->disk.guid + 20) = random();
+
+       /* Pick random reference numbers until one is found that no
+        * existing phys_disk_entry uses.
+        */
+       do {
+               /* Cannot be bothered finding a CRC of some irrelevant details*/
+               dd->disk.refnum = random();
+               for (i = __be16_to_cpu(ddf->active->max_pd_entries) - 1;
+                    i >= 0; i--)
+                       if (ddf->phys->entries[i].refnum == dd->disk.refnum)
+                               break;
+       } while (i >= 0);
+
+       dd->disk.forced_ref = 1;
+       dd->disk.forced_guid = 1;
+       memset(dd->disk.vendor, ' ', 32);
+       memcpy(dd->disk.vendor, "Linux", 5);
+       memset(dd->disk.pad, 0xff, 442);
+       for (i = 0; i < ddf->max_part ; i++)
+               dd->vlist[i] = NULL;
+
+       /* Claim the next free phys_disk_entry (bounds checked above) */
+       pde = &ddf->phys->entries[n];
+       dd->pdnum = n;
+
+       n++;
+       ddf->phys->used_pdes = __cpu_to_be16(n);
+
+       memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN);
+       pde->refnum = dd->disk.refnum;
+       pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare);
+       pde->state = __cpu_to_be16(DDF_Online);
+       /* NOTE(review): the result of get_dev_size() is not checked, and
+        * a device smaller than 32Meg makes the subtraction below wrap.
+        */
+       get_dev_size(fd, NULL, &size);
+       /* We are required to reserve 32Meg, and record the size in sectors */
+       pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512);
+       sprintf(pde->path, "%17.17s","Information: nil") ;
+       memset(pde->pad, 0xff, 6);
+
+       dd->size = size >> 9;
+       ddf->dlist = dd;
+       ddf->updates_pending = 1;
+}
+
+/*
+ * This is the write_init_super method for a ddf container.  It is
+ * called when creating a container or adding another device to a
+ * container.
+ */
+
+#ifndef MDASSEMBLE
+
+/* Source of 0xff filler for unused config records.  It is 512 bytes
+ * larger than the 4096 bytes written at a time so that a 512-byte
+ * aligned window can always be found inside it.
+ */
+static unsigned char null_conf[4096+512];
+
+/* Write the DDF metadata to every device in ddf->dlist that has an open
+ * descriptor (fd >= 0).
+ *
+ * For each device the primary header is written 16*1024*2 sectors before
+ * the end, followed in order by the controller data, the phys_disk
+ * records, the virtual_disk records, max_part+1 config records (the
+ * last one being the spare assignment record), and the disk_data
+ * record.  The anchor goes into the last sector and is written last.
+ *
+ * If 'do_close' is set each descriptor is closed afterwards and the
+ * recorded fd reset to -1.
+ *
+ * Always returns 1.  NOTE(review): the results of lseek64() and write()
+ * are not checked, so I/O errors go unreported.
+ */
+static int __write_init_super_ddf(struct supertype *st, int do_close)
+{
+
+       struct ddf_super *ddf = st->sb;
+       int i;
+       struct dl *d;
+       int n_config;
+       int conf_size;
+
+       unsigned long long size, sector;
+
+       for (d = ddf->dlist; d; d=d->next) {
+               int fd = d->fd;
+
+               if (fd < 0)
+                       continue;
+
+               /* We need to fill in the primary, (secondary) and workspace
+                * lba's in the headers, set their checksums,
+                * Also checksum phys, virt....
+                *
+                * Then write everything out, finally the anchor is written.
+                */
+               get_dev_size(fd, NULL, &size);
+               size /= 512;
+               ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2);
+               ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2);
+               ddf->anchor.seq = __cpu_to_be32(1);
+               memcpy(&ddf->primary, &ddf->anchor, 512);
+               memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+               ddf->anchor.openflag = 0xFF; /* 'open' means nothing */
+               ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */
+               ddf->anchor.crc = calc_crc(&ddf->anchor, 512);
+
+               ddf->primary.openflag = 0;
+               ddf->primary.type = DDF_HEADER_PRIMARY;
+
+               ddf->secondary.openflag = 0;
+               ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+               ddf->primary.crc = calc_crc(&ddf->primary, 512);
+               ddf->secondary.crc = calc_crc(&ddf->secondary, 512);
+
+               /* Everything up to the disk_data record is written
+                * sequentially starting at the primary header.
+                */
+               sector = size - 16*1024*2;
+               lseek64(fd, sector<<9, 0);
+               write(fd, &ddf->primary, 512);
+
+               ddf->controller.crc = calc_crc(&ddf->controller, 512);
+               write(fd, &ddf->controller, 512);
+
+               ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize);
+
+               write(fd, ddf->phys, ddf->pdsize);
+
+               ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize);
+               write(fd, ddf->virt, ddf->vdsize);
+
+               /* Now write lots of config records. */
+               n_config = ddf->max_part;
+               conf_size = ddf->conf_rec_len * 512;
+               for (i = 0 ; i <= n_config ; i++) {
+                       /* vlist has only n_config entries; the extra
+                        * final record is the spare assignment, so
+                        * vlist must not be indexed for it.
+                        */
+                       struct vcl *c = (i == n_config)
+                               ? (struct vcl*)d->spare
+                               : d->vlist[i];
+
+                       if (c) {
+                               c->conf.crc = calc_crc(&c->conf, conf_size);
+                               write(fd, &c->conf, conf_size);
+                       } else {
+                               /* Unused record: write conf_size bytes of
+                                * 0xff from an aligned window of null_conf,
+                                * which is filled on first use.
+                                */
+                               char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL);
+                               if (null_conf[0] != 0xff)
+                                       memset(null_conf, 0xff, sizeof(null_conf));
+                               int togo = conf_size;
+                               while (togo > sizeof(null_conf)-512) {
+                                       write(fd, null_aligned, sizeof(null_conf)-512);
+                                       togo -= sizeof(null_conf)-512;
+                               }
+                               write(fd, null_aligned, togo);
+                       }
+               }
+               d->disk.crc = calc_crc(&d->disk, 512);
+               write(fd, &d->disk, 512);
+
+               /* Maybe do the same for secondary */
+
+               /* The anchor lives in the very last sector of the device */
+               lseek64(fd, (size-1)*512, SEEK_SET);
+               write(fd, &ddf->anchor, 512);
+               if (do_close) {
+                       close(fd);
+                       d->fd = -1;
+               }
+       }
+       return 1;
+}
+
+/* write_init_super method for ddf.
+ *
+ * When st->update_tail is set, nothing is written to disk: copies of the
+ * virtual_entry and the vd_config of ddf->currentconf are queued with
+ * append_metadata_update() and 0 is returned.  If the copies cannot be
+ * allocated nothing is queued and a non-zero value is returned.
+ *
+ * Otherwise the metadata is written to all devices, closing their
+ * descriptors, and the result of __write_init_super_ddf() is returned.
+ */
+static int write_init_super_ddf(struct supertype *st)
+{
+
+       if (st->update_tail) {
+               /* queue the virtual_disk and vd_config as metadata updates */
+               struct virtual_disk *vd;
+               struct vd_config *vc;
+               struct ddf_super *ddf = st->sb;
+               int vd_len, vc_len;
+
+               vd_len = sizeof(struct virtual_disk) +
+                       sizeof(struct virtual_entry);
+               vc_len = ddf->conf_rec_len * 512;
+
+               /* Get both buffers before queueing anything, so that a
+                * failure cannot leave half of the update queued.
+                */
+               vd = malloc(vd_len);
+               vc = malloc(vc_len);
+               if (!vd || !vc) {
+                       fprintf(stderr, Name ": %s could not allocate "
+                               "metadata update\n", __func__);
+                       free(vd);
+                       free(vc);
+                       return 1;
+               }
+
+               /* First the virtual disk.  We have a slightly fake header:
+                * it carries a single entry, and populated_vdes holds the
+                * index of that entry rather than a count.
+                */
+               *vd = *ddf->virt;
+               vd->entries[0] = ddf->virt->entries[ddf->currentconf->vcnum];
+               vd->populated_vdes = __cpu_to_be16(ddf->currentconf->vcnum);
+               append_metadata_update(st, vd, vd_len);
+
+               /* Then the vd_config */
+               memcpy(vc, &ddf->currentconf->conf, vc_len);
+               append_metadata_update(st, vc, vc_len);
+
+               /* FIXME I need to close the fds! */
+               return 0;
+       } else
+               return __write_init_super_ddf(st, 1);
+}
+
+#endif
+
+/* Space left on a device of 'devsize' once the last 32Meg
+ * (32*1024*2 units of 512 bytes) has been set aside, or 0 if the
+ * device is no larger than that.  'st' is not consulted.
+ */
+static __u64 avail_size_ddf(struct supertype *st, __u64 devsize)
+{
+       const int reserved = 32*1024*2;
+
+       return devsize > reserved ? devsize - reserved : 0;
+}
+
+#ifndef MDASSEMBLE
+/* Forward declarations: validate_geometry_ddf() dispatches to these two
+ * helpers, which are defined after it.
+ */
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose);
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose);
+
+/* validate_geometry method for ddf: decide whether an array of the given
+ * level/layout/raiddisks/chunk/size can be created, optionally on 'dev'.
+ *
+ *  - level == LEVEL_CONTAINER is handed to
+ *    validate_geometry_ddf_container().
+ *  - if st->sb is already set, the request is handed to
+ *    validate_geometry_ddf_bvd().
+ *  - with no 'dev', only the level is checked against ddf_level_num.
+ *  - otherwise 'dev' must be busy and a member of a "ddf" container;
+ *    that container is loaded into st->sb and the request is handed to
+ *    validate_geometry_ddf_bvd().
+ *
+ * Returns 1 if the geometry is acceptable and 0 if not; messages are
+ * printed only when 'verbose' is set.
+ */
+static int validate_geometry_ddf(struct supertype *st,
+                                int level, int layout, int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *dev, unsigned long long *freesize,
+                                int verbose)
+{
+       int fd;
+       struct mdinfo *sra;
+       int cfd;
+
+       /* ddf potentially supports lots of things, but it depends on
+        * what devices are offered (and maybe kernel version?)
+        * If given unused devices, we will make a container.
+        * If given devices in a container, we will make a BVD.
+        * If given BVDs, we make an SVD, changing all the GUIDs in the process.
+        */
+
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_ddf_container(st, level, layout,
+                                                      raiddisks, chunk,
+                                                      size, dev, freesize,
+                                                      verbose);
+       }
+
+       if (st->sb) {
+               /* A container has already been opened, so we are
+                * creating in there.  Maybe a BVD, maybe an SVD.
+                * Should make a distinction one day.
+                */
+               return validate_geometry_ddf_bvd(st, level, layout, raiddisks,
+                                                chunk, size, dev, freesize,
+                                                verbose);
+       }
+       if (!dev) {
+               /* Initial sanity check.  Exclude illegal levels. */
+               int i;
+               for (i=0; ddf_level_num[i].num1 != MAXINT; i++)
+                       if (ddf_level_num[i].num2 == level)
+                               break;
+               if (ddf_level_num[i].num1 == MAXINT)
+                       return 0;
+               /* Should check layout? etc */
+               return 1;
+       }
+
+       /* This is the first device for the array.
+        * If it is a container, we read it in and do automagic allocations,
+        * no other devices should be given.
+        * Otherwise it must be a member device of a container, and we
+        * do manual allocation.
+        * Later we should check for a BVD and make an SVD.
+        */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               /* The exclusive open succeeded, so the device is not in
+                * use.  Creating directly on a container device is not
+                * implemented yet, so this path always fails.
+                */
+               sra = sysfs_read(fd, 0, GET_VERSION);
+               close(fd);
+               if (sra && sra->array.major_version == -1 &&
+                   strcmp(sra->text_version, "ddf") == 0) {
+
+                       /* load super */
+                       /* find space for 'n' devices. */
+                       /* remember the devices */
+                       /* Somehow return the fact that we have enough */
+               }
+
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": ddf: Cannot create this array "
+                               "on device %s\n",
+                               dev);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe a 'ddf' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot use %s: %s\n",
+                               dev, strerror(EBUSY));
+               return 0;
+       }
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+       if (sra && sra->array.major_version == -1 &&
+           strcmp(sra->text_version, "ddf") == 0) {
+               /* This is a member of a ddf container.  Load the container
+                * and try to create a bvd
+                */
+               struct ddf_super *ddf;
+               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+                       st->sb = ddf;
+                       st->container_dev = fd2devnum(cfd);
+                       close(cfd);
+                       return validate_geometry_ddf_bvd(st, level, layout,
+                                                        raiddisks, chunk, size,
+                                                        dev, freesize,
+                                                        verbose);
+               }
+               close(cfd);
+       } else {
+               /* device may belong to a different container;
+                * the container descriptor must still be released
+                */
+               close(cfd);
+               return 0;
+       }
+
+       /* NOTE(review): reached when the container could not be loaded,
+        * yet success is reported - confirm this is intended.
+        */
+       return 1;
+}
+
+/* Check that 'dev' can be used as a member of a new ddf container.
+ *
+ * Only level == LEVEL_CONTAINER is accepted.  With no 'dev' that is the
+ * whole test.  Otherwise the device must open exclusively and report
+ * its size; *freesize is then set from avail_size_ddf() applied to the
+ * reported size shifted right by 9.
+ *
+ * Returns 1 on success and 0 on failure; an open failure is reported
+ * only when 'verbose' is set.
+ */
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose)
+{
+       unsigned long long ldsize;
+       int ok = 0;
+       int fd;
+
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       if (!dev)
+               return 1;
+
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+
+       /* the descriptor is needed only for the size query */
+       if (get_dev_size(fd, dev, &ldsize)) {
+               *freesize = avail_size_ddf(st, ldsize >> 9);
+               ok = 1;
+       }
+       close(fd);
+
+       return ok;
+}
+
+/*
+ * Geometry check for an array (BVD) inside an already loaded DDF container.
+ *
+ * With no 'dev': count the devices in ddf->dlist that have at least one
+ * free gap of 'size' or more (8 if 'size' is 0) between their extents,
+ * and require at least 'raiddisks' such devices.
+ * With a 'dev': it must be a block device that is listed in ddf->dlist;
+ * the largest free gap found on it is stored in *freesize.
+ *
+ * Extent lists come from get_extents() and are walked up to and
+ * including the first entry whose size is 0.
+ *
+ * Returns 1 if acceptable, 0 if not.
+ */
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose)
+{
+       struct stat stb;
+       struct ddf_super *ddf = st->sb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+       /* ddf/bvd supports lots of things, but not containers */
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       /* We must have the container info already read in. */
+       if (!ddf)
+               return 0;
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size'.
+                */
+               unsigned long long minsize = size;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = 8;
+               for (dl = ddf->dlist; dl ; dl = dl->next)
+               {
+                       int found = 0;
+                       pos = 0;
+
+                       i = 0;
+                       e = get_extents(ddf, dl);
+                       if (!e) continue;
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr,
+                                       Name ": ddf: Not enough devices with "
+                                       "space for this array (%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = ddf->dlist ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: %s is not in the "
+                               "same DDF set\n",
+                               dev);
+               return 0;
+       }
+       e = get_extents(ddf, dl);
+       maxsize = 0;
+       i = 0;
+       if (e) {
+               do {
+                       unsigned long long esize;
+                       esize = e[i].start - pos;
+                       if (esize >= maxsize)
+                               maxsize = esize;
+                       pos = e[i].start + e[i].size;
+                       i++;
+               } while (e[i-1].size);
+               /* The extent list is ours to release, exactly as in
+                * the general test above; it used to be leaked here.
+                */
+               free(e);
+       }
+       *freesize = maxsize;
+       // FIXME here I am
+
+       return 1;
+}
+
+/*
+ * Load DDF metadata for a whole container, given an fd on the container.
+ *
+ * The member devices are taken from sysfs_read().  Each is probed with
+ * load_ddf_headers() and the one with the highest sequence number is
+ * chosen (a header with 'openflag' set counts as one lower).  Headers
+ * and global data are then loaded from that device, followed by the
+ * device-local data of every member.  When 'keep_fd' is set the members
+ * are opened O_RDWR and their descriptors are not closed here.
+ * If st->subarray is non-empty, the matching entry of super->conflist
+ * becomes super->currentconf.
+ *
+ * Returns 0 on success with *sbp set to the new ddf_super,
+ * 2 if a member device could not be opened, and 1 on any other failure.
+ *
+ * NOTE(review): none of the failure returns release 'super', and 'sra'
+ * is never released on any path - looks like a leak, confirm against
+ * the helpers' ownership rules before changing.
+ */
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct ddf_super *super;
+       struct mdinfo *sd, *best = NULL;
+       int bestseq = 0;
+       int seq;
+       char nm[20];
+       int dfd;
+
+       sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+       if (!sra)
+               return 1;
+       /* Only accept an array whose reported version identifies it as ddf */
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "ddf") != 0)
+               return 1;
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0)
+               return 1;
+       memset(super, 0, sizeof(*super));
+
+       /* first, try each device, and choose the best ddf */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               int rv;
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, O_RDONLY);
+               if (dfd < 0)
+                       return 2;
+               rv = load_ddf_headers(dfd, super, NULL);
+               close(dfd);
+               if (rv == 0) {
+                       seq = __be32_to_cpu(super->active->seq);
+                       /* An open header ranks one below its sequence number */
+                       if (super->active->openflag)
+                               seq--;
+                       if (!best || seq > bestseq) {
+                               bestseq = seq;
+                               best = sd;
+                       }
+               }
+       }
+       if (!best)
+               return 1;
+       /* OK, load this ddf */
+       sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0)
+               return 1;
+       /* NOTE(review): the results of these two loads are not checked */
+       load_ddf_headers(dfd, super, NULL);
+       load_ddf_global(dfd, super, NULL);
+       close(dfd);
+       /* Now we need the device-local bits */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0)
+                       return 2;
+               /* The value stored in 'seq' here is not used afterwards */
+               seq = load_ddf_local(dfd, super, NULL, keep_fd);
+               if (!keep_fd) close(dfd);
+       }
+       if (st->subarray[0]) {
+               struct vcl *v;
+
+               /* Select the configuration whose vcnum matches the subarray */
+               for (v = super->conflist; v; v = v->next)
+                       if (v->vcnum == atoi(st->subarray))
+                               super->currentconf = v;
+               if (!super->currentconf)
+                       return 1;
+       }
+       *sbp = super;
+       /* Fill in the supertype only if no handler was chosen yet */
+       if (st->ss == NULL) {
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+               st->container_dev = fd2devnum(fd);
+       }
+       return 0;
+}
+#endif
+
+static struct mdinfo *container_content_ddf(struct supertype *st)
+{
+       /* Given a container loaded by load_super_ddf_all,
+        * extract information about all the arrays into
+        * an mdinfo tree.
+        *
+        * For each vcl in conflist: create an mdinfo, fill it in,
+        *  then look for matching devices (phys_refnum) in dlist
+        *  and create appropriate device mdinfo.
+        *
+        * Returns the head of the list of array mdinfo records
+        * (linked through ->next, member devices through ->devs),
+        * or NULL if conflist is empty.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct mdinfo *rest = NULL;
+       struct vcl *vc;
+
+       for (vc = ddf->conflist ; vc ; vc=vc->next)
+       {
+               int i;
+               struct mdinfo *this;
+               this = malloc(sizeof(*this));
+               memset(this, 0, sizeof(*this));
+               this->next = rest;
+               rest = this;
+
+               this->array.level = map_num1(ddf_level_num, vc->conf.prl);
+               this->array.raid_disks =
+                       __be16_to_cpu(vc->conf.prim_elmnt_count);
+               this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+                                                  this->array.raid_disks);
+               this->array.md_minor      = -1;
+               this->array.ctime         = DECADE +
+                       __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+               this->array.utime         = DECADE +
+                       __be32_to_cpu(vc->conf.timestamp);
+               this->array.chunk_size    = 512 << vc->conf.chunk_shift;
+
+               i = vc->vcnum;
+               /* An array that is inconsistent or not fully initialised
+                * is reported as dirty and needing a resync from 0.
+                */
+               if ((ddf->virt->entries[i].state & DDF_state_inconsistent) ||
+                   (ddf->virt->entries[i].init_state & DDF_initstate_mask) !=
+                   DDF_init_full) {
+                       this->array.state = 0;
+                       this->resync_start = 0;
+               } else {
+                       this->array.state = 1;
+                       this->resync_start = ~0ULL;
+               }
+               /* 32 bytes are copied, so the terminator belongs at
+                * index 32 (it was previously written at index 33).
+                */
+               memcpy(this->name, ddf->virt->entries[i].name, 32);
+               this->name[32]=0;
+
+               memset(this->uuid, 0, sizeof(this->uuid));
+               this->component_size = __be64_to_cpu(vc->conf.blocks);
+               this->array.size = this->component_size / 2;
+               this->container_member = i;
+
+               sprintf(this->text_version, "/%s/%d",
+                       devnum2devname(st->container_dev),
+                       this->container_member);
+
+               for (i=0 ; i < ddf->mppe ; i++) {
+                       struct mdinfo *dev;
+                       struct dl *d;
+
+                       /* all-ones marks an unused slot */
+                       if (vc->conf.phys_refnum[i] == 0xFFFFFFFF)
+                               continue;
+
+                       this->array.working_disks++;
+
+                       for (d = ddf->dlist; d ; d=d->next)
+                               if (d->disk.refnum == vc->conf.phys_refnum[i])
+                                       break;
+                       /* NOTE(review): a slot whose device is not in dlist
+                        * ends the scan, so later slots are not reported -
+                        * confirm whether 'continue' was intended.
+                        */
+                       if (d == NULL)
+                               break;
+
+                       dev = malloc(sizeof(*dev));
+                       memset(dev, 0, sizeof(*dev));
+                       dev->next = this->devs;
+                       this->devs = dev;
+
+                       dev->disk.number = __be32_to_cpu(d->disk.refnum);
+                       dev->disk.major = d->major;
+                       dev->disk.minor = d->minor;
+                       dev->disk.raid_disk = i;
+                       dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+
+                       dev->events = __be32_to_cpu(ddf->primary.seq);
+                       dev->data_offset = __be64_to_cpu(vc->lba_offset[i]);
+                       dev->component_size = __be64_to_cpu(vc->conf.blocks);
+                       /* Bounded copy: a device name longer than the
+                        * name field is truncated instead of overflowing it.
+                        */
+                       if (d->devname)
+                               snprintf(dev->name, sizeof(dev->name), "%s",
+                                        d->devname);
+               }
+       }
+       return rest;
+}
+
+/*
+ * Overwrite the last 512 bytes of the device open on 'fd' with zeros.
+ * Returns 0 on success, 1 if the device size cannot be read, the device
+ * is smaller than 512 bytes, the buffer cannot be allocated, or the
+ * seek or write fails.
+ */
+static int store_zero_ddf(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *buf;
+       int rv = 1;
+
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+       /* dsize-512 below would wrap for a device shorter than one block */
+       if (dsize < 512)
+               return 1;
+
+       if (posix_memalign(&buf, 512, 512) != 0)
+               return 1;
+       memset(buf, 0, 512);
+
+       /* Only report success if the zeros really reached the device */
+       if (lseek64(fd, dsize-512, SEEK_SET) >= 0 &&
+           write(fd, buf, 512) == 512)
+               rv = 0;
+       free(buf);
+       return rv;
+}
+
+static int compare_super_ddf(struct supertype *st, struct supertype *tst)
+{
+       /*
+        * Result:
+        *  0 - the two match, or 'st' had no superblock yet and has
+        *      taken over the one from 'tst'
+        *  1 - second had wrong number
+        *  2 - wrong uuid
+        *  3 - wrong other info
+        * Only 0 and 2 are produced here: the anchor guid is the
+        * sole thing compared.
+        */
+       struct ddf_super *mine = st->sb;
+       struct ddf_super *theirs = tst->sb;
+
+       if (mine == NULL) {
+               /* Nothing loaded yet: move the candidate across */
+               st->sb = theirs;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       /* FIXME should I look at anything else? */
+       return memcmp(mine->anchor.guid, theirs->anchor.guid,
+                     DDF_GUID_LEN) != 0 ? 2 : 0;
+}
+
+/*
+ * A new array 'a' has been started which claims to be instance 'inst'
+ * within container 'c'.
+ * We need to confirm that the array matches the metadata in 'c' so
+ * that we don't corrupt any metadata.
+ */
+static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
+{
+       /* 'inst' is the decimal member number of the array in the
+        * container.  No validation against the metadata is done;
+        * the number is simply recorded and 0 is returned.
+        */
+       int member = atoi(inst);
+
+       dprintf("ddf: open_new %s\n", inst);
+       a->info.container_member = member;
+       return 0;
+}
+
+/*
+ * The array 'a' is to be marked clean in the metadata.
+ * If '->resync_start' is not ~(unsigned long long)0, then the array is only
+ * clean up to that point (in sectors).  If that cannot be recorded in the
+ * metadata, then leave it as dirty.
+ *
+ * For DDF, we need to clear the DDF_state_inconsistent bit in the
+ * !global! virtual_disk.virtual_entry structure.
+ */
+static void ddf_set_array_state(struct active_array *a, int consistent)
+{
+       /* Update both the 'inconsistent' flag and the init_state of
+        * this array's virtual-disk entry, and note a pending update
+        * whenever either value actually changed.
+        */
+       struct ddf_super *ddf = a->container->sb;
+       int inst = a->info.container_member;
+       struct virtual_entry *ve = &ddf->virt->entries[inst];
+       int prev_state = ve->state;
+       int prev_init = ve->init_state;
+
+       if (consistent)
+               ve->state &= ~DDF_state_inconsistent;
+       else
+               ve->state |= DDF_state_inconsistent;
+       if (prev_state != ve->state)
+               ddf->updates_pending = 1;
+
+       /* init_state: full when resync is complete, 'not' when it has
+        * not started, 'quick' for anything in between.
+        */
+       ve->init_state &= ~DDF_initstate_mask;
+       if (a->resync_start == ~0ULL)
+               ve->init_state |= DDF_init_full;
+       else if (a->resync_start == 0)
+               ve->init_state |= DDF_init_not;
+       else
+               ve->init_state |= DDF_init_quick;
+       if (prev_init != ve->init_state)
+               ddf->updates_pending = 1;
+
+       dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty",
+               a->resync_start);
+}
+
+/*
+ * The state of each disk is stored in the global phys_disk structure
+ * in phys_disk.entries[n].state.
+ * This makes various combinations awkward.
+ * - When a device fails in any array, it must be failed in all arrays
+ *   that include a part of this device.
+ * - When a component is rebuilding, we cannot include it officially in the
+ *   array unless this is the only array that uses the device.
+ *
+ * So: when transitioning:
+ *   Online -> failed,  just set failed flag.  monitor will propagate
+ *   spare -> online,   the device might need to be added to the array.
+ *   spare -> failed,   just set failed.  Don't worry if in array or not.
+ */
+static void ddf_set_disk(struct active_array *a, int n, int state)
+{
+       /* Record the new state (DS_* flags) of slot 'n' of array 'a'
+        * in the physical-disk table, then recompute the overall
+        * state of the array's virtual-disk entry from the number of
+        * slots whose disk is Online and neither Failed nor Rebuilding.
+        * ddf->updates_pending is set whenever something changed.
+        */
+       struct ddf_super *ddf = a->container->sb;
+       int inst = a->info.container_member;
+       struct vd_config *vc = find_vdcr(ddf, inst);
+       int pd;
+       int i, st, working;
+
+       if (vc == NULL) {
+               dprintf("ddf: cannot find instance %d!!\n", inst);
+               return;
+       }
+       /* vc must be known to be valid before phys_refnum is read;
+        * this lookup used to happen ahead of the NULL check.
+        */
+       pd = find_phys(ddf, vc->phys_refnum[n]);
+       if (pd < 0) {
+               /* disk doesn't currently exist. If it is now in_sync,
+                * insert it. */
+               if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) {
+                       /* Find dev 'n' in a->info->devs, determine the
+                        * ddf refnum, and set vc->phys_refnum and update
+                        * phys->entries[]
+                        */
+                       /* FIXME */
+               }
+       } else {
+               int old = ddf->phys->entries[pd].state;
+               if (state & DS_FAULTY)
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Failed);
+               if (state & DS_INSYNC) {
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Online);
+                       ddf->phys->entries[pd].state  &= __cpu_to_be16(~DDF_Rebuilding);
+               }
+               if (old != ddf->phys->entries[pd].state)
+                       ddf->updates_pending = 1;
+       }
+
+       dprintf("ddf: set_disk %d to %x\n", n, state);
+
+       /* Now we need to check the state of the array and update
+        * virtual_disk.entries[n].state.
+        * It needs to be one of "optimal", "degraded", "failed".
+        * I don't understand 'deleted' or 'missing'.
+        */
+       working = 0;
+       for (i=0; i < a->info.array.raid_disks; i++) {
+               pd = find_phys(ddf, vc->phys_refnum[i]);
+               if (pd < 0)
+                       continue;
+               st = __be16_to_cpu(ddf->phys->entries[pd].state);
+               if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding))
+                   == DDF_Online)
+                       working++;
+       }
+       /* Degraded unless every slot works or the level cannot
+        * survive the number of missing slots.
+        */
+       state = DDF_state_degraded;
+       if (working == a->info.array.raid_disks)
+               state = DDF_state_optimal;
+       else switch(vc->prl) {
+       case DDF_RAID0:
+       case DDF_CONCAT:
+       case DDF_JBOD:
+               state = DDF_state_failed;
+               break;
+       case DDF_RAID1:
+               if (working == 0)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID4:
+       case DDF_RAID5:
+               if (working < a->info.array.raid_disks-1)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID6:
+               if (working < a->info.array.raid_disks-2)
+                       state = DDF_state_failed;
+               else if (working == a->info.array.raid_disks-1)
+                       state = DDF_state_part_optimal;
+               break;
+       }
+
+       if (ddf->virt->entries[inst].state !=
+           ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+            | state)) {
+
+               ddf->virt->entries[inst].state =
+                       (ddf->virt->entries[inst].state & ~DDF_state_mask)
+                       | state;
+               ddf->updates_pending = 1;
+       }
+
+}
+
+static void ddf_sync_metadata(struct supertype *st)
+{
+       /*
+        * Flush the metadata if anything is pending.
+        * Everything is written to every device: for now no
+        * distinction is made between local and global changes,
+        * since with ddf nearly any change ends up touching
+        * global data anyway.
+        */
+       struct ddf_super *ddf = st->sb;
+
+       if (ddf->updates_pending) {
+               /* Clear the flag first, then write out */
+               ddf->updates_pending = 0;
+               __write_init_super_ddf(st, 0);
+               dprintf("ddf: sync_metadata\n");
+       }
+}
+
+static void ddf_process_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* Apply this update to the metadata.
+        * The first 4 bytes are a DDF_*_MAGIC which guides
+        * our actions.
+        * Possible updates are:
+        *  DDF_PHYS_RECORDS_MAGIC
+        *    Add a new physical device.  Changes to this record
+        *    only happen implicitly.
+        *    used_pdes is the device number.
+        *  DDF_VIRT_RECORDS_MAGIC
+        *    Add a new VD.  Possibly also change the 'access' bits.
+        *    populated_vdes is the entry number.
+        *  DDF_VD_CONF_MAGIC
+        *    New or updated VD.  the VIRT_RECORD must already
+        *    exist.  For an update, phys_refnum and lba_offset
+        *    (at least) are updated, and the VD_CONF must
+        *    be written to precisely those devices listed with
+        *    a phys_refnum.
+        *  DDF_SPARE_ASSIGN_MAGIC
+        *    replacement Spare Assignment Record... but for which device?
+        *
+        * So, e.g.:
+        *  - to create a new array, we send a VIRT_RECORD and
+        *    a VD_CONF.  Then assemble and start the array.
+        *  - to activate a spare we send a VD_CONF to add the phys_refnum
+        *    and offset.  This will also mark the spare as active with
+        *    a spare-assignment record.
+        *
+        * An update of the wrong length, or one that targets an
+        * out-of-range or already used table entry, is silently ignored.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 *magic = (__u32*)update->buf;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+       struct vd_config *vc;
+       struct vcl *vcl;
+       struct dl *dl;
+       int mppe;
+       int ent;
+
+       dprintf("Process update %x\n", *magic);
+
+       switch (*magic) {
+       case DDF_PHYS_RECORDS_MAGIC:
+
+               /* Must be exactly a header plus one entry */
+               if (update->len != (sizeof(struct phys_disk) +
+                                   sizeof(struct phys_disk_entry)))
+                       return;
+               pd = (struct phys_disk*)update->buf;
+
+               ent = __be16_to_cpu(pd->used_pdes);
+               if (ent >= __be16_to_cpu(ddf->phys->max_pdes))
+                       return;
+               /* Only a slot whose guid is still all-ones may be filled */
+               if (!all_ff(ddf->phys->entries[ent].guid))
+                       return;
+               ddf->phys->entries[ent] = pd->entries[0];
+               ddf->phys->used_pdes = __cpu_to_be16(1 +
+                                          __be16_to_cpu(ddf->phys->used_pdes));
+               ddf->updates_pending = 1;
+               break;
+
+       case DDF_VIRT_RECORDS_MAGIC:
+
+               /* Must be exactly a header plus one entry */
+               if (update->len != (sizeof(struct virtual_disk) +
+                                   sizeof(struct virtual_entry)))
+                       return;
+               vd = (struct virtual_disk*)update->buf;
+
+               ent = __be16_to_cpu(vd->populated_vdes);
+               if (ent >= __be16_to_cpu(ddf->virt->max_vdes))
+                       return;
+               /* Only a slot whose guid is still all-ones may be filled */
+               if (!all_ff(ddf->virt->entries[ent].guid))
+                       return;
+               ddf->virt->entries[ent] = vd->entries[0];
+               ddf->virt->populated_vdes = __cpu_to_be16(1 +
+                             __be16_to_cpu(ddf->virt->populated_vdes));
+               ddf->updates_pending = 1;
+               break;
+
+       case DDF_VD_CONF_MAGIC:
+               dprintf("len %d %d\n", update->len, ddf->conf_rec_len);
+
+               mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries);
+               /* The update must be one whole configuration record */
+               if (update->len != ddf->conf_rec_len * 512)
+                       return;
+               vc = (struct vd_config*)update->buf;
+               /* Look for an existing configuration with the same guid */
+               for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+                       if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+                               break;
+               dprintf("vcl = %p\n", vcl);
+               if (vcl) {
+                       /* An update, just copy the phys_refnum and lba_offset
+                        * fields
+                        */
+                       memcpy(vcl->conf.phys_refnum, vc->phys_refnum,
+                              mppe * (sizeof(__u32) + sizeof(__u64)));
+               } else {
+                       /* A new VD_CONF */
+                       /* NOTE(review): update->space is used without a
+                        * NULL check - presumably always provided by
+                        * prepare_update; confirm.
+                        */
+                       vcl = update->space;
+                       update->space = NULL;
+                       vcl->next = ddf->conflist;
+                       memcpy(&vcl->conf, vc, update->len);
+                       vcl->lba_offset = (__u64*)
+                               &vcl->conf.phys_refnum[mppe];
+                       ddf->conflist = vcl;
+               }
+               /* Now make sure vlist is correct for each dl. */
+               for (dl = ddf->dlist; dl; dl = dl->next) {
+                       int dn;
+                       int vn = 0;
+                       for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+                               for (dn=0; dn < ddf->mppe ; dn++)
+                                       if (vcl->conf.phys_refnum[dn] ==
+                                           dl->disk.refnum) {
+                                               dprintf("dev %d has %p at %d\n",
+                                                       dl->pdnum, vcl, vn);
+                                               dl->vlist[vn++] = vcl;
+                                               break;
+                                       }
+                       /* Clear the remaining vlist slots */
+                       while (vn < ddf->max_part)
+                               dl->vlist[vn++] = NULL;
+                       /* Derive the disk type flags from its usage */
+                       if (dl->vlist[0]) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Active_in_VD);
+                       }
+                       if (dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Spare);
+                       }
+                       if (!dl->vlist[0] && !dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Spare |
+                                                      DDF_Active_in_VD);
+                       }
+               }
+               ddf->updates_pending = 1;
+               break;
+       /* Spare-assignment updates are accepted but not acted upon */
+       case DDF_SPARE_ASSIGN_MAGIC:
+       default: break;
+       }
+}
+
+static void ddf_prepare_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* This update arrived at managemon.
+        * We are about to pass it to monitor.
+        * If a malloc is needed, do it here.
+        *
+        * Only a DDF_VD_CONF_MAGIC update needs memory: room for a
+        * 'struct vcl' header followed by one full configuration
+        * record, aligned to 512 bytes.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 *magic = (__u32*)update->buf;
+
+       if (*magic != DDF_VD_CONF_MAGIC)
+               return;
+
+       /* On failure posix_memalign leaves the pointer unspecified,
+        * so make the "no space" case an explicit NULL.
+        */
+       if (posix_memalign(&update->space, 512,
+                          offsetof(struct vcl, conf)
+                          + ddf->conf_rec_len * 512) != 0)
+               update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Only worry about BVDs at the moment.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+                                        struct metadata_update **updates)
+{
+       /* Returns a list of mdinfo, one per spare chosen for a
+        * non-working slot of 'a', or NULL if the array is not
+        * degraded, has failed, is still waiting for a faulty device
+        * to be removed, or no suitable spare (or memory) is available.
+        * When spares are returned, one DDF_VD_CONF update carrying
+        * the new phys_refnum/lba_offset values is pushed on '*updates'.
+        */
+       int working = 0;
+       struct mdinfo *d;
+       struct ddf_super *ddf = a->container->sb;
+       int global_ok = 0;
+       struct mdinfo *rv = NULL;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       int i;
+       struct vd_config *vc;
+       __u64 *lba;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       working ++;
+       }
+
+       dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+               a->info.array.level);
+       if (working == a->info.array.raid_disks)
+               return NULL; /* array not degraded */
+       switch (a->info.array.level) {
+       case 1:
+               if (working == 0)
+                       return NULL; /* failed */
+               break;
+       case 4:
+       case 5:
+               if (working < a->info.array.raid_disks - 1)
+                       return NULL; /* failed */
+               break;
+       case 6:
+               if (working < a->info.array.raid_disks - 2)
+                       return NULL; /* failed */
+               break;
+       default: /* concat or stripe */
+               return NULL; /* failed */
+       }
+
+       /* For each slot, if it is not working, find a spare */
+       dl = ddf->dlist;
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /* OK, this device needs recovery.  Find a spare */
+       again:
+               for ( ; dl ; dl = dl->next) {
+                       unsigned long long esize;
+                       unsigned long long pos;
+                       struct mdinfo *d2;
+                       int is_global = 0;
+                       int is_dedicated = 0;
+                       struct extent *ex;
+                       int j;
+                       /* If in this array, skip */
+                       for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor) {
+                                       dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                                       break;
+                               }
+                       if (d2)
+                               continue;
+                       /* Likewise skip a device already picked for an
+                        * earlier slot in this call, so that one spare
+                        * is never assigned to two slots.
+                        */
+                       for (d2 = rv ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor)
+                                       break;
+                       if (d2)
+                               continue;
+                       if (ddf->phys->entries[dl->pdnum].type &
+                           __cpu_to_be16(DDF_Spare)) {
+                               /* Check spare assign record */
+                               if (dl->spare) {
+                                       if (dl->spare->type & DDF_spare_dedicated) {
+                                               /* check spare_ents for guid */
+                                               for (j = 0 ;
+                                                    j < __be16_to_cpu(dl->spare->populated);
+                                                    j++) {
+                                                       if (memcmp(dl->spare->spare_ents[j].guid,
+                                                                  ddf->virt->entries[a->info.container_member].guid,
+                                                                  DDF_GUID_LEN) == 0)
+                                                               is_dedicated = 1;
+                                               }
+                                       } else
+                                               is_global = 1;
+                               }
+                       } else if (ddf->phys->entries[dl->pdnum].type &
+                                  __cpu_to_be16(DDF_Global_Spare)) {
+                               is_global = 1;
+                       }
+                       if ( ! (is_dedicated ||
+                               (is_global && global_ok))) {
+                               dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+                                      is_dedicated, is_global);
+                               continue;
+                       }
+
+                       /* We are allowed to use this device - is there space?
+                        * We need a->info.component_size sectors */
+                       ex = get_extents(ddf, dl);
+                       if (!ex) {
+                               dprintf("cannot get extents\n");
+                               continue;
+                       }
+                       j = 0; pos = 0;
+                       esize = 0;
+
+                       /* Walk the extent list with its own index 'j';
+                        * the slot index 'i' must not be touched here.
+                        */
+                       do {
+                               esize = ex[j].start - pos;
+                               if (esize >= a->info.component_size)
+                                       break;
+                               pos = ex[j].start + ex[j].size;
+                               j++;
+                       } while (ex[j-1].size);
+
+                       free(ex);
+                       if (esize < a->info.component_size) {
+                               dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+                                       esize, a->info.component_size);
+                               /* No room */
+                               continue;
+                       }
+
+                       /* Cool, we have a device with some space at pos */
+                       di = malloc(sizeof(*di));
+                       if (!di)
+                               /* out of memory: leave this slot unfilled */
+                               break;
+                       memset(di, 0, sizeof(*di));
+                       di->disk.number = i;
+                       di->disk.raid_disk = i;
+                       di->disk.major = dl->major;
+                       di->disk.minor = dl->minor;
+                       di->disk.state = 0;
+                       di->data_offset = pos;
+                       di->component_size = a->info.component_size;
+                       /* remembers which phys entry this spare is */
+                       di->container_member = dl->pdnum;
+                       di->next = rv;
+                       rv = di;
+                       dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                               i, pos);
+
+                       break;
+               }
+               if (!dl && ! global_ok) {
+                       /* not enough dedicated spares, try global */
+                       global_ok = 1;
+                       dl = ddf->dlist;
+                       goto again;
+               }
+       }
+
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * phys_refnum and lba_offset values
+        */
+       vc = find_vdcr(ddf, a->info.container_member);
+       mu = malloc(sizeof(*mu));
+       if (mu)
+               mu->buf = malloc(ddf->conf_rec_len * 512);
+       if (!vc || !mu || !mu->buf) {
+               /* Cannot describe the change: hand back nothing */
+               free(mu);
+               while (rv) {
+                       di = rv->next;
+                       free(rv);
+                       rv = di;
+               }
+               return NULL;
+       }
+       if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0)
+               mu->space = NULL;
+       /* The length is in bytes: ddf_process_update only accepts a
+        * VD_CONF update of exactly conf_rec_len * 512.
+        */
+       mu->len = ddf->conf_rec_len * 512;
+       mu->next = *updates;
+       memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+       vc = (struct vd_config*)mu->buf;
+       lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+       for (di = rv ; di ; di = di->next) {
+               /* Use the phys entry recorded for *this* spare; 'dl'
+                * is stale (and may be NULL) once the search is over.
+                */
+               vc->phys_refnum[di->disk.raid_disk] =
+                       ddf->phys->entries[di->container_member].refnum;
+               /* lba_offset is kept big-endian in the metadata */
+               lba[di->disk.raid_disk] = __cpu_to_be64(di->data_offset);
+       }
+       *updates = mu;
+       return rv;
+}
+
+/* Method table that plugs the DDF handlers into the generic
+ * superswitch interface.
+ */
+struct superswitch super_ddf = {
+/* These handlers are left out when building with MDASSEMBLE defined */
+#ifndef        MDASSEMBLE
+       .examine_super  = examine_super_ddf,
+       .brief_examine_super = brief_examine_super_ddf,
+       .detail_super   = detail_super_ddf,
+       .brief_detail_super = brief_detail_super_ddf,
+       .validate_geometry = validate_geometry_ddf,
+       .write_init_super = write_init_super_ddf,
+#endif
+       .match_home     = match_home_ddf,
+       .uuid_from_super= uuid_from_super_ddf,
+       .getinfo_super  = getinfo_super_ddf,
+       .update_super   = update_super_ddf,
+
+       .avail_size     = avail_size_ddf,
+
+       .compare_super  = compare_super_ddf,
+
+       .load_super     = load_super_ddf,
+       .init_super     = init_super_ddf,
+       /* 'store' for ddf is the zeroing routine store_zero_ddf */
+       .store_super    = store_zero_ddf,
+       .free_super     = free_super_ddf,
+       .match_metadata_desc = match_metadata_desc_ddf,
+       .add_to_super   = add_to_super_ddf,
+       .container_content = container_content_ddf,
+
+       .external       = 1,
+
+/* for mdmon */
+       .open_new       = ddf_open_new,
+       .set_array_state= ddf_set_array_state,
+       .set_disk       = ddf_set_disk,
+       .sync_metadata  = ddf_sync_metadata,
+       .process_update = ddf_process_update,
+       .prepare_update = ddf_prepare_update,
+       .activate_spare = ddf_activate_spare,
+
+};
diff --git a/super-intel.c b/super-intel.c
new file mode 100644 (file)
index 0000000..caa3881
--- /dev/null
@@ -0,0 +1,2552 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2007 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+
+/* MPB == Metadata Parameter Block */
+/* text expected at the start of the anchor's sig[] field */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+/* length of the signature text; the version string starts at this offset
+ * inside sig[] (see get_imsm_version)
+ */
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+/* known metadata version strings */
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_RAID5 "1.2.02"
+/* size of imsm_super.sig[] */
+#define MAX_SIGNATURE_LENGTH  32
+/* size of the serial[] and volume[] byte arrays */
+#define MAX_RAID_SERIAL_LEN   16
+/* sector counts that are subtracted from the device size to obtain the
+ * usable size (see avail_size_imsm)
+ */
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+
+/* Disk configuration info.
+ * One entry of the disk table stored in the anchor (imsm_super.disk[]).
+ * The offsets in the field comments are those of the first table entry.
+ * Multi-byte fields are read through __le32_to_cpu() by the code below.
+ * Unlike the other metadata structures in this file this one is not
+ * marked packed.
+ */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+       __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+       __u32 total_blocks;              /* 0xE8 - 0xEB total blocks */
+       __u32 scsi_id;                   /* 0xEC - 0xEF scsi ID */
+       __u32 status;                    /* 0xF0 - 0xF3 */
+/* bits of 'status' */
+#define SPARE_DISK      0x01  /* Spare */
+#define CONFIGURED_DISK 0x02  /* Member of some RaidDev */
+#define FAILED_DISK     0x04  /* Permanent failure */
+#define USABLE_DISK     0x08  /* Fully usable unless FAILED_DISK is set */
+
+#define        IMSM_DISK_FILLERS       5
+       __u32 filler[IMSM_DISK_FILLERS]; /* 0xF4 - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+
+/* RAID map configuration infos.
+ * Variable length: disk_ord_tbl[] really holds num_members entries, so a
+ * map occupies sizeof(struct imsm_map) plus (num_members - 1) extra __u32.
+ */
+struct imsm_map {
+       __u32 pba_of_lba0;      /* start address of partition */
+       __u32 blocks_per_member;/* blocks per member */
+       __u32 num_data_stripes; /* number of data stripes */
+       __u16 blocks_per_strip; /* strip (chunk) size in 512-byte blocks */
+       __u8  map_state;        /* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2 /* FIXME: is this correct? */
+#define IMSM_T_STATE_FAILED 3 /* FIXME: is this correct? */
+       __u8  raid_level;       /* see get_imsm_raid_level for the md mapping */
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5         /* since metadata version 1.2.02 ? */
+       __u8  num_members;      /* number of member disks */
+       __u8  reserved[3];
+       __u32 filler[7];        /* expansion area */
+       __u32 disk_ord_tbl[1];  /* disk_ord_tbl[num_members],
+                                  top byte special; the low 24 bits are
+                                  the disk index (see get_imsm_disk_idx) */
+} __attribute__ ((packed));
+
+/* Volume state embedded in each imsm_dev.  Variable length: it ends with
+ * one imsm_map, or two when migr_state is non-zero.
+ */
+struct imsm_vol {
+       __u32 reserved[2];
+       __u8  migr_state;       /* Normal or Migrating */
+       __u8  migr_type;        /* Initializing, Rebuilding, ... */
+       __u8  dirty;            /* non-zero is reported as "dirty" by print_imsm_dev */
+       __u8  fill[1];
+       __u32 filler[5];
+       struct imsm_map map[1];
+       /* here comes another one if migr_state */
+} __attribute__ ((packed));
+
+/* One raid volume ("RaidDev") record.  Records follow the disk table in
+ * the MPB and are variable length because of the trailing imsm_vol (see
+ * sizeof_imsm_dev).
+ */
+struct imsm_dev {
+       __u8    volume[MAX_RAID_SERIAL_LEN]; /* volume name, printed with %s */
+       __u32 size_low;         /* low 32 bits of the array size in sectors */
+       __u32 size_high;        /* high 32 bits of the array size in sectors */
+       __u32 status;   /* Persistent RaidDev status */
+       __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+#define IMSM_DEV_FILLERS 12
+       __u32 filler[IMSM_DEV_FILLERS];
+       struct imsm_vol vol;
+} __attribute__ ((packed));
+
+/* The MPB header ("anchor").  It is followed in the same buffer by the
+ * remaining disk table entries and then by the imsm_dev records.
+ */
+struct imsm_super {
+       __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+       __u32 check_sum;                /* 0x20 - 0x23 MPB Checksum */
+       __u32 mpb_size;                 /* 0x24 - 0x27 Size of MPB */
+       __u32 family_num;               /* 0x28 - 0x2B Checksum from first time this config was written */
+       __u32 generation_num;           /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+       __u32 reserved[2];              /* 0x30 - 0x37 */
+       __u8 num_disks;                 /* 0x38 Number of configured disks */
+       __u8 num_raid_devs;             /* 0x39 Number of configured volumes */
+       __u8 fill[2];                   /* 0x3A - 0x3B */
+#define IMSM_FILLERS 39
+       __u32 filler[IMSM_FILLERS];     /* 0x3C - 0xD7 RAID_MPB_FILLERS */
+       struct imsm_disk disk[1];       /* 0xD8 diskTbl[numDisks] */
+       /* here comes imsm_dev[num_raid_devs] */
+} __attribute__ ((packed));
+
+#ifndef MDASSEMBLE
+/* printable names indexed by imsm_map.map_state (IMSM_T_STATE_*) */
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
+#endif
+
+/* Number of 512-byte sectors needed to hold 'bytes', counting a partial
+ * trailing sector as a whole one.
+ */
+static unsigned int sector_count(__u32 bytes)
+{
+       const __u32 sector_size = 512;
+
+       return (bytes + (sector_size - 1)) / sector_size;
+}
+
+/* Number of sectors covered by the MPB, taken from its mpb_size field. */
+static unsigned int mpb_sectors(struct imsm_super *mpb)
+{
+       __u32 mpb_bytes = __le32_to_cpu(mpb->mpb_size);
+
+       return sector_count(mpb_bytes);
+}
+
+/* internal representation of IMSM metadata
+ * 'buf' and 'anchor' alias the same allocation: the raw MPB as read from
+ * disk.  disk_tbl[] and dev_tbl[] hold separately malloc'd copies of the
+ * disk and volume records (see load_imsm_disk and parse_raid_devices), and
+ * 'disks' is the list of physical devices loaded so far.
+ */
+struct intel_super {
+       union {
+               void *buf; /* O_DIRECT buffer for reading/writing metadata */
+               struct imsm_super *anchor; /* immovable parameters */
+       };
+       size_t len; /* size of the 'buf' allocation */
+       int updates_pending; /* count of pending updates for mdmon */
+       int creating_imsm; /* flag to indicate container creation */
+       int current_vol; /* index of raid device undergoing creation */
+       /* NOTE(review): the tables below are much smaller than the __u8
+        * counters in the anchor (num_disks, num_raid_devs) can express.
+        */
+       #define IMSM_MAX_DISKS 6
+       struct imsm_disk *disk_tbl[IMSM_MAX_DISKS];
+       #define IMSM_MAX_RAID_DEVS 2
+       struct imsm_dev *dev_tbl[IMSM_MAX_RAID_DEVS];
+       struct dl {
+               struct dl *next;
+               int index; /* position in the anchor's disk table, -1 if unmatched */
+               __u8 serial[MAX_RAID_SERIAL_LEN];
+               int major, minor;
+               char *devname; /* strdup'd name, may be NULL */
+               int fd; /* kept-open descriptor, or -1 */
+       } *disks;
+};
+
+/* a used region of a physical device; filled from sector-based metadata
+ * fields by get_extents
+ */
+struct extent {
+       unsigned long long start, size;
+};
+
+/* definition of messages passed to imsm_process_update;
+ * each message structure below starts with one of these tags
+ */
+enum imsm_update_type {
+       update_activate_spare,
+       update_create_array,
+};
+
+/* update message tagged update_activate_spare; records can be chained
+ * through 'next'
+ */
+struct imsm_update_activate_spare {
+       enum imsm_update_type type;
+       int disk_idx;
+       int slot;
+       int array;
+       struct imsm_update_activate_spare *next;
+};
+
+/* update message tagged update_create_array
+ * NOTE(review): struct imsm_dev is variable length (its map ends in a
+ * one-entry disk_ord_tbl placeholder), yet dev_idx is declared directly
+ * after it - confirm how volumes with more than one member are carried.
+ */
+struct imsm_update_create_array {
+       enum imsm_update_type type;
+       struct imsm_dev dev;
+       int dev_idx;
+};
+
+/* Returns 1 when the environment variable IMSM_DEVNAME_AS_SERIAL is set
+ * to a value that atoi() reads as 1, otherwise 0.
+ */
+static int imsm_env_devname_as_serial(void)
+{
+       const char *setting = getenv("IMSM_DEVNAME_AS_SERIAL");
+
+       return (setting != NULL && atoi(setting) == 1) ? 1 : 0;
+}
+
+
+/* Return a freshly allocated supertype bound to super_imsm when 'arg' is
+ * "imsm" or "default".  Returns NULL for any other name, or when the
+ * allocation fails.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+       struct supertype *st;
+
+       if (strcmp(arg, "imsm") != 0 &&
+           strcmp(arg, "default") != 0
+               )
+               return NULL;
+
+       /* calloc so that an allocation failure is never written through */
+       st = calloc(1, sizeof(*st));
+       if (!st)
+               return NULL;
+       st->ss = &super_imsm;
+       st->max_devs = IMSM_MAX_DEVICES;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+/* The version string lives in sig[] directly after the signature text. */
+static __u8 *get_imsm_version(struct imsm_super *mpb)
+{
+       return mpb->sig + MPB_SIG_LEN;
+}
+
+/* Look a disk up in the anchor's own disk table.  Only valid while the
+ * anchor is known to be up-to-date, which is currently just at load time.
+ * Returns NULL when index is not below num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+       return index < mpb->num_disks ? &mpb->disk[index] : NULL;
+}
+
+/* Return the in-memory copy of disk 'index', or NULL when the index is
+ * outside the anchor's disk count or outside disk_tbl[].  The second test
+ * matters because num_disks comes from the metadata and may exceed the
+ * IMSM_MAX_DISKS slots of the table.
+ */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+       if (index >= super->anchor->num_disks || index >= IMSM_MAX_DISKS)
+               return NULL;
+       return super->disk_tbl[index];
+}
+
+/* generate a checksum directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load or write_super after coalescing
+ *
+ * The result is the sum of all 32-bit little-endian words of the MPB with
+ * the stored check_sum word taken out again.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+       /* mpb_size is stored little-endian like every other MPB field */
+       __u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+       __u32 *p = (__u32 *) mpb;
+       __u32 sum = 0;
+
+       while (end--)
+               sum += __le32_to_cpu(*p++);
+
+       return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* Size in bytes of a raid device record including its variable-length
+ * map(s).
+ */
+static size_t sizeof_imsm_dev(struct imsm_dev *dev)
+{
+       struct imsm_map *map = &dev->vol.map[0];
+       /* each map has disk_ord_tbl[num_members - 1] additional space */
+       size_t map_len = sizeof(*map) + sizeof(__u32) * (map->num_members - 1);
+       size_t size = sizeof(*dev) - sizeof(*map) + map_len;
+
+       /* migrating means an additional map; it starts where the first
+        * map's ordinal table ends, not at the fixed offset of map[1]
+        */
+       if (dev->vol.migr_state) {
+               struct imsm_map *second = (struct imsm_map *) ((char *) map + map_len);
+
+               size += sizeof(*second);
+               size += sizeof(__u32) * (second->num_members - 1);
+       }
+
+       return size;
+}
+
+/* Locate raid device record 'index' inside the raw MPB by stepping over
+ * the preceding variable-length records.  Returns NULL when index is not
+ * below num_raid_devs.
+ */
+static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
+{
+       char *cursor;
+       int skipped;
+
+       if (index >= mpb->num_raid_devs)
+               return NULL;
+
+       /* devices start after all disks */
+       cursor = (char *) &mpb->disk[mpb->num_disks];
+
+       for (skipped = 0; skipped < index; skipped++)
+               cursor += sizeof_imsm_dev((struct imsm_dev *) cursor);
+
+       return (struct imsm_dev *) cursor;
+}
+
+/* Return the in-memory copy of raid device 'index', or NULL when the
+ * index is outside the anchor's device count or outside dev_tbl[].  The
+ * second test matters because num_raid_devs comes from the metadata and
+ * may exceed the IMSM_MAX_RAID_DEVS slots of the table.
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+       if (index >= super->anchor->num_raid_devs ||
+           index >= IMSM_MAX_RAID_DEVS)
+               return NULL;
+       return super->dev_tbl[index];
+}
+
+/* Return the disk table index stored in ordinal-table entry 'slot'. */
+static __u32 get_imsm_disk_idx(struct imsm_map *map, int slot)
+{
+       /* convert to host order first, then drop the 'special' top byte;
+        * masking the raw value would strip the wrong byte on big-endian
+        */
+       return __le32_to_cpu(map->disk_ord_tbl[slot]) & ~(0xffU << 24);
+}
+
+/* Translate the stored raid level into an md level: level 1 with exactly
+ * two members is reported as 1, level 1 with any other member count as
+ * 10, and every other level is passed through unchanged.
+ */
+static int get_imsm_raid_level(struct imsm_map *map)
+{
+       if (map->raid_level != 1)
+               return map->raid_level;
+
+       return map->num_members == 2 ? 1 : 10;
+}
+
+/* qsort() comparator ordering extents by ascending start */
+static int cmp_extent(const void *av, const void *bv)
+{
+       const struct extent *lhs = av;
+       const struct extent *rhs = bv;
+
+       return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+       /* Return a malloc'd array of the extents that raid volumes occupy
+        * on the given physical device, sorted by start and terminated by
+        * a zero-size entry.  NULL if the disk is not in the metadata or
+        * the allocation fails.
+        */
+       struct imsm_disk *disk = get_imsm_disk(super, dl->index);
+       struct extent *list, *fill;
+       int vol, slot;
+       int memberships = 0;
+
+       if (!disk)
+               return NULL;
+
+       /* first pass: count the slots this disk fills across all volumes */
+       for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+               struct imsm_map *map = &get_imsm_dev(super, vol)->vol.map[0];
+
+               for (slot = 0; slot < map->num_members; slot++)
+                       if (get_imsm_disk_idx(map, slot) == dl->index)
+                               memberships++;
+       }
+
+       list = malloc((memberships + 1) * sizeof(*list));
+       if (!list)
+               return NULL;
+
+       /* second pass: record one extent per membership */
+       fill = list;
+       for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+               struct imsm_map *map = &get_imsm_dev(super, vol)->vol.map[0];
+
+               for (slot = 0; slot < map->num_members; slot++) {
+                       if (get_imsm_disk_idx(map, slot) != dl->index)
+                               continue;
+                       fill->start = __le32_to_cpu(map->pba_of_lba0);
+                       fill->size = __le32_to_cpu(map->blocks_per_member);
+                       fill++;
+               }
+       }
+       qsort(list, memberships, sizeof(*list), cmp_extent);
+
+       /* terminator: starts where the reserved tail of the disk begins */
+       fill->start = __le32_to_cpu(disk->total_blocks) -
+                     (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+       fill->size = 0;
+       return list;
+}
+
+#ifndef MDASSEMBLE
+/* Print a human readable description of one raid volume.  'index' is the
+ * disk table index of the device being examined and is used to report
+ * which slot of the volume it fills ("?" when it fills none).
+ */
+static void print_imsm_dev(struct imsm_dev *dev, int index)
+{
+       __u64 sz;
+       int slot;
+       struct imsm_map *map = dev->vol.map;
+
+       printf("\n");
+       printf("[%s]:\n", dev->volume);
+       printf("     RAID Level : %d\n", get_imsm_raid_level(map));
+       printf("        Members : %d\n", map->num_members);
+       for (slot = 0; slot < map->num_members; slot++)
+               if (index == get_imsm_disk_idx(map, slot))
+                       break;
+       if (slot < map->num_members)
+               printf("      This Slot : %d\n", slot);
+       else
+               printf("      This Slot : ?\n");
+       sz = __le32_to_cpu(dev->size_high);
+       sz <<= 32;
+       sz += __le32_to_cpu(dev->size_low);
+       printf("     Array Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       sz = __le32_to_cpu(map->blocks_per_member);
+       printf("   Per Dev Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       printf("  Sector Offset : %u\n",
+               __le32_to_cpu(map->pba_of_lba0));
+       printf("    Num Stripes : %u\n",
+               __le32_to_cpu(map->num_data_stripes));
+       printf("     Chunk Size : %u KiB\n",
+               __le16_to_cpu(map->blocks_per_strip) / 2);
+       printf("       Reserved : %u\n", __le32_to_cpu(dev->reserved_blocks));
+       printf("  Migrate State : %s\n", dev->vol.migr_state ? "migrating" : "idle");
+       printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+       /* map_state is a raw byte from the metadata; never index the name
+        * table with a value it has no entry for
+        */
+       if (map->map_state < sizeof(map_state_str) / sizeof(map_state_str[0]))
+               printf("      Map State : %s\n", map_state_str[map->map_state]);
+       else
+               printf("      Map State : unknown (%d)\n", map->map_state);
+}
+
+/* Print serial, state, id and usable size of disk 'index' from the
+ * anchor's disk table.  Prints nothing when index is negative or not
+ * below num_disks.
+ */
+static void print_imsm_disk(struct imsm_super *mpb, int index)
+{
+       struct imsm_disk *disk;
+       __u32 s;
+       __u64 sz;
+
+       /* validate before the index is narrowed to __u8 for the lookup */
+       if (index < 0 || index >= mpb->num_disks)
+               return;
+       disk = __get_imsm_disk(mpb, index);
+       if (!disk)
+               return;
+
+       printf("\n");
+       /* serial[] need not be NUL terminated, so bound the print */
+       printf("  Disk%02d Serial : %.*s\n", index, MAX_RAID_SERIAL_LEN,
+              (char *) disk->serial);
+       s = __le32_to_cpu(disk->status);
+       printf("          State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "",
+                                             s&CONFIGURED_DISK ? " active" : "",
+                                             s&FAILED_DISK ? " failed" : "",
+                                             s&USABLE_DISK ? " usable" : "");
+       printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+       sz = __le32_to_cpu(disk->total_blocks) -
+            (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS * mpb->num_raid_devs);
+       printf("    Usable Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+}
+
+/* Print the loaded metadata: header fields, the examined disk, every raid
+ * volume, then the remaining disks.
+ */
+static void examine_super_imsm(struct supertype *st, char *homehost)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       /* -1 when no device has been loaded; the print helpers accept it */
+       int self = super->disks ? super->disks->index : -1;
+       int i;
+       __u32 sum;
+
+       /* the signature is shown without its last character, as before */
+       printf("          Magic : %.*s\n", (int) MPB_SIG_LEN - 1,
+              (char *) mpb->sig);
+       /* the version occupies the rest of sig[] and may lack a NUL */
+       printf("        Version : %.*s\n",
+              (int) (MAX_SIGNATURE_LENGTH - MPB_SIG_LEN),
+              (char *) get_imsm_version(mpb));
+       printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
+       printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+       sum = __le32_to_cpu(mpb->check_sum);
+       printf("       Checksum : %08x %s\n", sum,
+               __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+       printf("    MPB Sectors : %u\n", mpb_sectors(mpb));
+       printf("          Disks : %d\n", mpb->num_disks);
+       printf("   RAID Devices : %d\n", mpb->num_raid_devs);
+       print_imsm_disk(mpb, self);
+       for (i = 0; i < mpb->num_raid_devs; i++)
+               print_imsm_dev(__get_imsm_dev(mpb, i), self);
+       for (i = 0; i < mpb->num_disks; i++) {
+               if (i == self)
+                       continue;
+               print_imsm_disk(mpb, i);
+       }
+}
+
+/* Print a one-line ARRAY entry identifying the container by family number. */
+static void brief_examine_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       __u32 family = __le32_to_cpu(super->anchor->family_num);
+
+       printf("ARRAY /dev/imsm family=%08x metadata=external:imsm\n", family);
+}
+
+/* Placeholder: only prints its own function name. */
+static void detail_super_imsm(struct supertype *st, char *homehost)
+{
+       puts(__func__);
+}
+
+/* Placeholder: only prints its own function name. */
+static void brief_detail_super_imsm(struct supertype *st)
+{
+       puts(__func__);
+}
+#endif
+
+/* Placeholder: prints its own function name and always returns 0. */
+static int match_home_imsm(struct supertype *st, char *homehost)
+{
+       puts(__func__);
+       return 0;
+}
+
+/* Placeholder: only prints its own function name; uuid[] is not written. */
+static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+{
+       puts(__func__);
+}
+
+#if 0
+/* Split the dotted version string that follows the signature into its
+ * components (at most two characters each) and return the second and
+ * third one through *m and *p.
+ * NOTE(review): the first component is parsed but never returned -
+ * confirm that 'm' is meant to be the minor number.
+ */
+static void
+get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+{
+       __u8 *v = get_imsm_version(mpb);
+       __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+       char major[] = { 0, 0, 0 };
+       char minor[] = { 0 ,0, 0 };
+       char patch[] = { 0, 0, 0 };
+       char *ver_parse[] = { major, minor, patch };
+       int i, j;
+
+       i = j = 0;
+       /* test the bound before reading *v, and stop once all three
+        * components are consumed so ver_parse[] is never overrun
+        */
+       while (v < end && *v != '\0' && i < 3) {
+               if (*v != '.' && j < 2)
+                       ver_parse[i][j++] = *v;
+               else {
+                       i++;
+                       j = 0;
+               }
+               v++;
+       }
+
+       *m = strtol(minor, NULL, 0);
+       *p = strtol(patch, NULL, 0);
+}
+#endif
+
+/* Layout value reported for a given md raid level; -1 for any level not
+ * listed here.
+ */
+static int imsm_level_to_layout(int level)
+{
+       if (level == 0 || level == 1)
+               return 0;
+       if (level == 5 || level == 6)
+               return ALGORITHM_LEFT_SYMMETRIC;
+       if (level == 10)
+               return 0x102; //FIXME is this correct?
+
+       return -1;
+}
+
+/* Fill 'info' with the description of the raid volume selected by
+ * super->current_vol, taken from the first map of that volume.
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+       struct imsm_map *map = &dev->vol.map[0];
+
+       info->container_member    = super->current_vol;
+       info->array.raid_disks    = map->num_members;
+       info->array.level         = get_imsm_raid_level(map);
+       info->array.layout        = imsm_level_to_layout(info->array.level);
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0;
+       info->array.utime         = 0;
+       /* convert the 16-bit field to host order first and scale to bytes
+        * afterwards; scaling inside the conversion swaps the wrong value
+        * on big-endian hosts
+        */
+       info->array.chunk_size    = __le16_to_cpu(map->blocks_per_strip) * 512;
+
+       info->data_offset         = __le32_to_cpu(map->pba_of_lba0);
+       info->component_size      = __le32_to_cpu(map->blocks_per_member);
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+
+       /* NOTE(review): if devnum2devname() returns allocated memory the
+        * string is leaked here - confirm against its definition.
+        */
+       sprintf(info->text_version, "/%s/%d",
+               devnum2devname(st->container_dev),
+               info->container_member);
+}
+
+
+/* Fill 'info' for the loaded metadata.  When a volume is selected
+ * (current_vol >= 0) the volume variant does the work; otherwise the
+ * container itself is described, plus the first disk on super->disks if
+ * there is one.
+ */
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_disk *disk;
+       __u32 s;
+       int state = 0;
+
+       if (super->current_vol >= 0) {
+               getinfo_super_imsm_volume(st, info);
+               return;
+       }
+
+       /* container-wide values */
+       info->array.raid_disks    = super->anchor->num_disks;
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0; /* N/A for imsm */
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+       info->reshape_active = 0;
+       strcpy(info->text_version, "imsm");
+
+       /* defaults for "no disk known" */
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       info->disk.raid_disk = -1;
+       info->disk.number = -1;
+       info->disk.state = 0;
+
+       if (!super->disks)
+               return;
+
+       disk = get_imsm_disk(super, super->disks->index);
+       if (!disk) {
+               info->disk.number = -1;
+               info->disk.raid_disk = -1;
+               return;
+       }
+
+       info->disk.number = super->disks->index;
+       info->disk.raid_disk = super->disks->index;
+       info->data_offset = __le32_to_cpu(disk->total_blocks) -
+                           (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+       info->component_size = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       /* translate the imsm status bits into md disk state bits */
+       s = __le32_to_cpu(disk->status);
+       if (s & CONFIGURED_DISK)
+               state |= 1 << MD_DISK_ACTIVE;
+       if (s & FAILED_DISK)
+               state |= 1 << MD_DISK_FAULTY;
+       if (s & USABLE_DISK)
+               state |= 1 << MD_DISK_SYNC;
+       info->disk.state = state;
+}
+
+/* Placeholder for metadata updates: no update is implemented yet, so the
+ * metadata is never modified and 0 ("no change") is always returned.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+                            char *update, char *devname, int verbose,
+                            int uuid_set, char *homehost)
+{
+       /* FIXME */
+
+       /* For 'assemble' and 'force' we need to return non-zero if any
+        * change was made.  For others, the return value is ignored.
+        * Update options are:
+        *  force-one : This device looks a bit old but needs to be included,
+        *        update age info appropriately.
+        *  assemble: clear any 'faulty' flag to allow this device to
+        *              be assembled.
+        *  force-array: Array is degraded but being forced, mark it clean
+        *         if that will be needed to assemble it.
+        *
+        *  newdev:  not used ????
+        *  grow:  Array has gained a new device - this is currently for
+        *              linear only
+        *  resync: mark as dirty so a resync will happen.
+        *  name:  update the name - preserving the homehost
+        *
+        * The following are not relevant for imsm:
+        *  sparc2.2 : update from old dodgy metadata
+        *  super-minor: change the preferred_minor number
+        *  summaries:  update redundant counters.
+        *  uuid:  Change the uuid of the array to match what is given
+        *  homehost:  update the recorded homehost
+        *  _reshape_progress: record new reshape_progress position.
+        */
+       int rv = 0;
+       //struct intel_super *super = st->sb;
+       //struct imsm_super *mpb = super->mpb;
+
+       /* recognised but not implemented yet */
+       if (strcmp(update, "grow") == 0) {
+       }
+       if (strcmp(update, "resync") == 0) {
+               /* dev->vol.dirty = 1; */
+       }
+
+       /* IMSM has no concept of UUID or homehost */
+
+       return rv;
+}
+
+/* Bytes needed for an MPB describing 'disks' disks: the anchor with its
+ * disk table, two raid devices with two maps each, and four ordinal
+ * tables of 'disks' entries.
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+       /* imsm_super and imsm_map already contain one table entry each */
+       const int extra = disks - 1;
+       size_t anchor_len = sizeof(struct imsm_super) +
+                           extra * sizeof(struct imsm_disk);
+       /* up to 2 maps per raid device (-2 for the imsm_maps in imsm_dev) */
+       size_t dev_len = 2 * sizeof(struct imsm_dev) +
+                        (4 - 2) * sizeof(struct imsm_map);
+       /* 4 possible disk_ord_tbl's */
+       size_t ord_len = 4 * extra * sizeof(__u32);
+
+       return anchor_len + dev_len + ord_len;
+}
+
+/* Sectors left for data on a device of 'devsize' sectors once the
+ * metadata and reserved areas are taken out; 0 if the device is smaller
+ * than those areas.
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize)
+{
+       const __u64 overhead = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       return devsize < overhead ? 0 : devsize - overhead;
+}
+
+/* Compare the metadata loaded in 'tst' against 'st'.
+ * return:
+ *  0 same, or first was empty, and second was moved into it
+ *  3 signature, family number, size or checksum differ
+ * (1 "wrong number" and 2 "wrong uuid" are not produced here)
+ */
+static int compare_super_imsm(struct supertype *st, struct supertype *tst)
+{
+       struct intel_super *first = st->sb;
+       struct intel_super *sec = tst->sb;
+       struct imsm_super *a, *b;
+
+       if (!first) {
+               /* nothing loaded yet: take ownership of the second one */
+               st->sb = tst->sb;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       a = first->anchor;
+       b = sec->anchor;
+       if (memcmp(a->sig, b->sig, MAX_SIGNATURE_LENGTH) != 0 ||
+           a->family_num != b->family_num ||
+           a->mpb_size != b->mpb_size ||
+           a->check_sum != b->check_sum)
+               return 3;
+
+       return 0;
+}
+
+/* Derive "/dev/<name>" for the block device open on 'fd' by resolving its
+ * /sys/dev/block/<major>:<minor> link.  'name' must have room for
+ * MAX_RAID_SERIAL_LEN bytes; it is set to the empty string on failure and
+ * the result is truncated to fit.
+ */
+static void fd2devname(int fd, char *name)
+{
+       struct stat st;
+       char path[256];
+       char dname[100];
+       char *nm;
+       int rv;
+
+       name[0] = '\0';
+       if (fstat(fd, &st) != 0)
+               return;
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       /* readlink() does not terminate the result; keep a byte for that */
+       rv = readlink(path, dname, sizeof(dname) - 1);
+       if (rv <= 0)
+               return;
+
+       dname[rv] = '\0';
+       /* use the last path component, or the whole target if it has no '/' */
+       nm = strrchr(dname, '/');
+       nm = nm ? nm + 1 : dname;
+       snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
+}
+
+
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* Fill 'serial' with the identity of the device open on 'fd'.
+ * With IMSM_DEVNAME_AS_SERIAL=1 in the environment the device name from
+ * fd2devname() is used.  Otherwise the serial reported by
+ * scsi_get_serial() is copied with white space removed, truncated to
+ * MAX_RAID_SERIAL_LEN - 1 characters and padded with NUL bytes.
+ * Returns 0 on success or the non-zero result of scsi_get_serial(), after
+ * reporting the failure when 'devname' is given.
+ */
+static int imsm_read_serial(int fd, char *devname,
+                           __u8 serial[MAX_RAID_SERIAL_LEN])
+{
+       unsigned char scsi_serial[255];
+       int rv;
+       int rsp_len;
+       int i, cnt;
+
+       memset(scsi_serial, 0, sizeof(scsi_serial));
+
+       if (imsm_env_devname_as_serial()) {
+               char name[MAX_RAID_SERIAL_LEN];
+
+               fd2devname(fd, name);
+               strcpy((char *) serial, name);
+               return 0;
+       }
+
+       rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+
+       if (rv != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to retrieve serial for %s\n",
+                               devname);
+               return rv;
+       }
+
+       /* byte 3 is the payload length and the payload starts at byte 4;
+        * never trust the length beyond what the buffer can hold
+        */
+       rsp_len = scsi_serial[3];
+       if (rsp_len > (int) sizeof(scsi_serial) - 4)
+               rsp_len = sizeof(scsi_serial) - 4;
+       for (i = 0, cnt = 0; i < rsp_len && cnt < MAX_RAID_SERIAL_LEN; i++)
+               if (!isspace(scsi_serial[4 + i]))
+                       serial[cnt++] = scsi_serial[4 + i];
+
+       /* clear the unused tail so whole-array comparisons are well defined */
+       if (cnt < MAX_RAID_SERIAL_LEN)
+               memset(serial + cnt, 0, MAX_RAID_SERIAL_LEN - cnt);
+       serial[MAX_RAID_SERIAL_LEN - 1] = '\0';
+
+       return 0;
+}
+
+/* Add the device open on 'fd' to super->disks and, when its serial number
+ * matches an entry of the anchor's disk table, store a copy of that entry
+ * in super->disk_tbl[].  The descriptor is remembered only if 'keep_fd'
+ * is set.
+ * Returns 0 on success (including "serial not found", which leaves the
+ * list entry with index -1) and 2 on allocation, stat or serial read
+ * failure, or when the matching entry does not fit in disk_tbl[].
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+       struct dl *dl;
+       struct stat stb;
+       struct imsm_disk *disk;
+       int rv;
+       int i;
+
+       dl = malloc(sizeof(*dl));
+       disk = malloc(sizeof(*disk));
+       if (!dl || !disk) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": failed to allocate disk buffer for %s\n",
+                               devname);
+               free(disk);
+               free(dl);
+               return 2;
+       }
+       memset(dl, 0, sizeof(*dl));
+       memset(disk, 0, sizeof(*disk));
+
+       if (fstat(fd, &stb) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": failed to stat %s: %s\n",
+                               devname, strerror(errno));
+               free(disk);
+               free(dl);
+               return 2;
+       }
+       dl->major = major(stb.st_rdev);
+       dl->minor = minor(stb.st_rdev);
+       dl->next = super->disks;
+       dl->fd = keep_fd ? fd : -1;
+       dl->devname = devname ? strdup(devname) : NULL;
+       dl->index = -1;
+       /* from here on 'dl' is owned by the list and freed with it */
+       super->disks = dl;
+       rv = imsm_read_serial(fd, devname, dl->serial);
+
+       if (rv != 0) {
+               /* 'disk' was never published, so release it here */
+               free(disk);
+               return 2;
+       }
+
+       /* look up this disk's index */
+       for (i = 0; i < super->anchor->num_disks; i++) {
+               struct imsm_disk *disk_iter;
+
+               disk_iter = __get_imsm_disk(super->anchor, i);
+
+               if (memcmp(disk_iter->serial, dl->serial,
+                          MAX_RAID_SERIAL_LEN) != 0)
+                       continue;
+
+               /* num_disks comes from the metadata; disk_tbl[] is fixed */
+               if (i >= IMSM_MAX_DISKS) {
+                       if (devname)
+                               fprintf(stderr,
+                                       Name ": disk index %d of %s exceeds"
+                                       " the supported maximum\n",
+                                       i, devname);
+                       free(disk);
+                       return 2;
+               }
+               *disk = *disk_iter;
+               super->disk_tbl[i] = disk;
+               dl->index = i;
+               break;
+       }
+
+       if (i == super->anchor->num_disks) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": failed to match serial \'%s\' for %s\n",
+                               dl->serial, devname);
+               free(disk);
+               return 0;
+       }
+
+       return 0;
+}
+
+/* Copy a complete raid device record, including its variable-length
+ * map(s), from 'src' to 'dest'.  'dest' must provide at least
+ * sizeof_imsm_dev(src) bytes.
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+       /* The record is one contiguous byte range, so copy it as such.
+        * Copying through vol.map[1] would land on top of the first map's
+        * ordinal table whenever that map has more than one member.
+        */
+       memcpy(dest, src, sizeof_imsm_dev(src));
+}
+
+/* Copy every raid device record of the anchor into its own allocation and
+ * store the copies in super->dev_tbl[].
+ * Returns 0 on success, 1 when an allocation fails and 2 when the
+ * metadata describes more devices than dev_tbl[] can hold.  Entries
+ * stored before a failure stay in the table for __free_imsm to release.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+       int i;
+       struct imsm_dev *dev_new;
+       size_t len;
+
+       /* num_raid_devs comes from the metadata; dev_tbl[] is fixed */
+       if (super->anchor->num_raid_devs > IMSM_MAX_RAID_DEVS)
+               return 2;
+
+       for (i = 0; i < super->anchor->num_raid_devs; i++) {
+               struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
+
+               len = sizeof_imsm_dev(dev_iter);
+               dev_new = malloc(len);
+               if (!dev_new)
+                       return 1;
+               imsm_copy_dev(dev_new, dev_iter);
+               super->dev_tbl[i] = dev_new;
+       }
+
+       return 0;
+}
+
+static void __free_imsm(struct intel_super *super);
+
+/* load_imsm_mpb - read matrix metadata
+ * Reads the 512-byte anchor from the second-to-last sector of the device
+ * and, when the MPB is larger than one sector, the extended part stored
+ * directly in front of the anchor.  The checksum of a multi-sector MPB is
+ * verified, then the disk and raid device tables are populated.
+ * allocates super->buf to be freed by free_super
+ * Returns 0 on success and non-zero (1 or 2, see the individual error
+ * paths) on failure.
+ * NOTE(review): a single-sector MPB is accepted without a checksum test.
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+       unsigned long long dsize = 0;
+       unsigned long long sectors;
+       struct imsm_super *anchor;
+       __u32 check_sum;
+       __u32 mpb_size;
+       int rc;
+
+       /* dsize stays 0 if the size cannot be determined */
+       get_dev_size(fd, NULL, &dsize);
+       if (dsize < 512 * 2) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is too small for an IMSM anchor\n",
+                               devname);
+               return 1;
+       }
+
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (posix_memalign((void**)&anchor, 512, 512) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to allocate imsm anchor buffer"
+                               " on %s\n", devname);
+               return 1;
+       }
+       if (read(fd, anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               free(anchor);
+               return 1;
+       }
+
+       if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": no IMSM anchor on %s\n", devname);
+               free(anchor);
+               return 2;
+       }
+
+       /* a zero size would yield an empty buffer and a wrapped sector count */
+       mpb_size = __le32_to_cpu(anchor->mpb_size);
+       if (mpb_size == 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": invalid IMSM mpb size on %s\n",
+                               devname);
+               free(anchor);
+               return 2;
+       }
+
+       __free_imsm(super);
+       /* round the host-order size up to whole sectors */
+       super->len = ROUND_UP(mpb_size, 512);
+       if (posix_memalign(&super->buf, 512, super->len) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": unable to allocate %zu byte mpb buffer\n",
+                               super->len);
+               free(anchor);
+               return 2;
+       }
+       memcpy(super->buf, anchor, 512);
+
+       sectors = mpb_sectors(anchor) - 1;
+       free(anchor);
+       if (!sectors) {
+               rc = load_imsm_disk(fd, super, devname, 0);
+               if (rc == 0)
+                       rc = parse_raid_devices(super);
+               return rc;
+       }
+
+       /* read the extended mpb */
+       if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 2;
+       }
+
+       check_sum = __gen_imsm_checksum(super->anchor);
+       if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": IMSM checksum %x != %x on %s\n",
+                               check_sum, __le32_to_cpu(super->anchor->check_sum),
+                               devname);
+               return 2;
+       }
+
+       rc = load_imsm_disk(fd, super, devname, 0);
+       if (rc == 0)
+               rc = parse_raid_devices(super);
+       return rc;
+}
+
+/* Release the per-disk state hanging off 'super': every node of the
+ * super->disks list (closing its descriptor when one is open) and every
+ * populated slot of super->disk_tbl.  Both are left empty afterwards.
+ */
+static void free_imsm_disks(struct intel_super *super)
+{
+       struct dl *node, *following;
+       int slot;
+
+       /* detach the whole list, then tear the nodes down one by one */
+       node = super->disks;
+       super->disks = NULL;
+       for (; node; node = following) {
+               following = node->next;
+               if (node->fd >= 0)
+                       close(node->fd);
+               /* free(NULL) is a no-op, so devname needs no guard */
+               free(node->devname);
+               free(node);
+       }
+
+       for (slot = 0; slot < IMSM_MAX_DISKS; slot++) {
+               if (!super->disk_tbl[slot])
+                       continue;
+               free(super->disk_tbl[slot]);
+               super->disk_tbl[slot] = NULL;
+       }
+}
+
+/* Drop everything owned by 'super' - the metadata buffer, the disk
+ * state and the dev_tbl entries - but keep the structure itself so that
+ * it can be loaded again.
+ */
+static void __free_imsm(struct intel_super *super)
+{
+       int vol;
+
+       /* free(NULL) is harmless, so the pointers need no guard */
+       free(super->buf);
+       super->buf = NULL;
+
+       free_imsm_disks(super);
+
+       for (vol = 0; vol < IMSM_MAX_RAID_DEVS; vol++) {
+               free(super->dev_tbl[vol]);
+               super->dev_tbl[vol] = NULL;
+       }
+}
+
+/* Destroy 'super' completely: release everything it owns through
+ * __free_imsm(), then the structure itself.  'super' must not be NULL
+ * because __free_imsm() dereferences it.
+ */
+static void free_imsm(struct intel_super *super)
+{
+       __free_imsm(super);
+       free(super);
+}
+
+/* Release the intel_super attached to 'st', if there is one, and clear
+ * st->sb so the stale pointer cannot be used again.
+ */
+static void free_super_imsm(struct supertype *st)
+{
+       struct intel_super *attached = st->sb;
+
+       if (attached) {
+               free_imsm(attached);
+               st->sb = NULL;
+       }
+}
+
+/* Allocate a zero-filled intel_super, record the 'creating_imsm' flag
+ * and set current_vol to -1 (add_to_super_imsm only takes the volume
+ * path when current_vol >= 0).
+ * Returns NULL when the allocation fails.
+ */
+static struct intel_super *alloc_super(int creating_imsm)
+{
+       struct intel_super *fresh = calloc(1, sizeof(*fresh));
+
+       if (!fresh)
+               return NULL;
+
+       fresh->creating_imsm = creating_imsm;
+       fresh->current_vol = -1;
+       return fresh;
+}
+
+#ifndef MDASSEMBLE
+/* Load the imsm metadata of the whole container that 'fd' refers to.
+ *
+ * The member devices are looked up through sysfs, the anchor with the
+ * highest generation number is loaded, and the disk list is rebuilt from
+ * every member.  On success the new intel_super is stored in *sbp and,
+ * when st->ss is not yet set, 'st' is initialised for imsm.
+ *
+ * keep_fd: when non-zero the members are opened O_RDWR and the
+ *          descriptors of the final pass are handed to load_imsm_disk
+ *          instead of being closed here.
+ * devname: not used by this function.
+ *
+ * Returns 0 on success and 1 or 2 on failure.  On failure nothing is
+ * stored in *sbp and everything allocated here is released again.
+ */
+static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
+                              char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct intel_super *super = NULL;
+       struct mdinfo *sd, *best = NULL;
+       __u32 bestgen = 0;
+       __u32 gen;
+       char nm[20];
+       int dfd;
+       int rv;
+       int err = 1;
+
+       /* check if this disk is a member of an active array */
+       sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+       if (!sra)
+               return 1;
+
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "imsm") != 0)
+               goto fail;
+
+       super = alloc_super(0);
+       if (!super)
+               goto fail;
+
+       /* find the most up to date disk in this array */
+       for (sd = sra->devs; sd; sd = sd->next) {
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+               /* a descriptor is only invalid when negative; 0 is valid */
+               if (dfd < 0) {
+                       err = 2;
+                       goto fail;
+               }
+               rv = load_imsm_mpb(dfd, super, NULL);
+               /* NOTE(review): with keep_fd set this descriptor is not
+                * closed here, and load_imsm_mpb passes keep_fd == 0 to
+                * load_imsm_disk, so it looks leaked - confirm against
+                * load_imsm_disk before changing.
+                */
+               if (!keep_fd)
+                       close(dfd);
+               if (rv != 0) {
+                       err = 2;
+                       goto fail;
+               }
+               gen = __le32_to_cpu(super->anchor->generation_num);
+               if (!best || gen > bestgen) {
+                       bestgen = gen;
+                       best = sd;
+               }
+       }
+
+       if (!best) {
+               err = 1;
+               goto fail;
+       }
+
+       /* load the most up to date anchor */
+       sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0) {
+               err = 1;
+               goto fail;
+       }
+       rv = load_imsm_mpb(dfd, super, NULL);
+       close(dfd);
+       if (rv != 0) {
+               err = 2;
+               goto fail;
+       }
+
+       /* reset the disk list */
+       free_imsm_disks(super);
+
+       /* populate disk list */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0) {
+                       err = 2;
+                       goto fail;
+               }
+               load_imsm_disk(dfd, super, NULL, keep_fd);
+               if (!keep_fd)
+                       close(dfd);
+       }
+
+       if (st->subarray[0]) {
+               int vol = atoi(st->subarray);
+
+               /* volumes are numbered 0 .. num_raid_devs - 1, the same
+                * bound imsm_open_new enforces
+                */
+               if (vol >= super->anchor->num_raid_devs) {
+                       err = 1;
+                       goto fail;
+               }
+               super->current_vol = vol;
+       }
+
+       *sbp = super;
+       if (st->ss == NULL) {
+               st->ss = &super_imsm;
+               st->minor_version = 0;
+               st->max_devs = IMSM_MAX_DEVICES;
+               st->container_dev = fd2devnum(fd);
+       }
+
+       /* 'best' pointed into this list, so it is only released now */
+       sysfs_free(sra);
+       return 0;
+
+fail:
+       if (super)
+               free_imsm(super);
+       sysfs_free(sra);
+       return err;
+}
+#endif
+
+/* Load imsm metadata for 'fd' into st->sb.
+ *
+ * Unless built with MDASSEMBLE, 'fd' is first tried as a container via
+ * load_super_imsm_all(); if that does not succeed the mpb is read from
+ * 'fd' itself.  A request for a specific subarray cannot be served by
+ * the single-device path and fails with 1.
+ *
+ * Returns 0 on success, otherwise 1 or the non-zero result of
+ * load_imsm_mpb().  Messages are only printed when 'devname' is set
+ * (except for the allocation failure).
+ */
+static int load_super_imsm(struct supertype *st, int fd, char *devname)
+{
+       struct intel_super *loaded;
+       int status;
+
+#ifndef MDASSEMBLE
+       if (!load_super_imsm_all(st, fd, &st->sb, devname, 1))
+               return 0;
+#endif
+       if (st->subarray[0] != '\0')
+               return 1; /* FIXME */
+
+       loaded = alloc_super(0);
+       if (loaded == NULL) {
+               fprintf(stderr,
+                       Name ": malloc of %zu failed.\n",
+                       sizeof(*loaded));
+               return 1;
+       }
+
+       status = load_imsm_mpb(fd, loaded, devname);
+       if (status == 0) {
+               st->sb = loaded;
+               if (!st->ss) {
+                       st->ss = &super_imsm;
+                       st->minor_version = 0;
+                       st->max_devs = IMSM_MAX_DEVICES;
+               }
+               return 0;
+       }
+
+       /* the read failed: report it and hand the error code back */
+       if (devname)
+               fprintf(stderr,
+                       Name ": Failed to load all information "
+                       "sections on %s\n", devname);
+       free_imsm(loaded);
+       return status;
+}
+
+/* Strip size for the array described by 'info': a fixed 128 for level 1,
+ * otherwise info->chunk_size divided by 512.
+ */
+static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
+{
+       return info->level == 1 ? 128 : info->chunk_size >> 9;
+}
+
+/* Number of data stripes: twice info->size divided by the strip size,
+ * halved again for level 1.
+ */
+static __u32 info_to_num_data_stripes(mdu_array_info_t *info)
+{
+       __u32 strips = (info->size * 2) / info_to_blocks_per_strip(info);
+
+       return info->level == 1 ? strips / 2 : strips;
+}
+
+/* Twice info->size with the low bits selected by (strip size - 1)
+ * cleared, i.e. rounded down to a strip boundary when the strip size is
+ * a power of two.
+ */
+static __u32 info_to_blocks_per_member(mdu_array_info_t *info)
+{
+       __u16 strip = info_to_blocks_per_strip(info);
+
+       return (info->size * 2) & ~(strip - 1);
+}
+
+/* Describe a new volume inside an already loaded container (st->sb).
+ *
+ * The mpb buffer is grown when info->nr_disks needs more room, a new
+ * imsm_dev is filled in from 'info' and 'name' and stored in
+ * super->dev_tbl, and the volume becomes the current one (both in
+ * super->current_vol and in st->subarray).  Disk ordinals are left at 0;
+ * they are filled in later by add_to_super.
+ *
+ * 'size', 'homehost' and 'uuid' are not used here.
+ *
+ * Returns 1 on success and 0 on failure (after printing a message); on
+ * failure the current volume selection is left untouched.
+ */
+static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
+                                 unsigned long long size, char *name,
+                                 char *homehost, int *uuid)
+{
+       /* We are creating a volume inside a pre-existing container.
+        * so st->sb is already set.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct imsm_dev *dev;
+       struct imsm_vol *vol;
+       struct imsm_map *map;
+       int idx = mpb->num_raid_devs;
+       int i;
+       unsigned long long array_blocks;
+       __u32 offset = 0;
+       size_t size_old, size_new;
+
+       if (mpb->num_raid_devs >= 2) {
+               fprintf(stderr, Name": This imsm-container already has the "
+                       "maximum of 2 volumes\n");
+               return 0;
+       }
+
+       /* reject unsupported geometry before anything is allocated or
+        * any container state is changed
+        */
+       if (info->level == 1 && info->raid_disks > 2) {
+               fprintf(stderr, Name": imsm does not support more than 2 disks"
+                               " in a raid1 volume\n");
+               return 0;
+       }
+
+       /* ensure the mpb is large enough for the new data */
+       size_old = __le32_to_cpu(mpb->mpb_size);
+       size_new = disks_to_mpb_size(info->nr_disks);
+       if (size_new > size_old) {
+               void *mpb_new;
+               size_t size_round = ROUND_UP(size_new, 512);
+
+               if (posix_memalign(&mpb_new, 512, size_round) != 0) {
+                       fprintf(stderr, Name": could not allocate new mpb\n");
+                       return 0;
+               }
+               memcpy(mpb_new, mpb, size_old);
+               free(mpb);
+               mpb = mpb_new;
+               super->anchor = mpb_new;
+               mpb->mpb_size = __cpu_to_le32(size_new);
+               memset(mpb_new + size_old, 0, size_round - size_old);
+               /* NOTE(review): super->len is not updated here - confirm
+                * whether the writer relies on it.
+                */
+       }
+
+       /* zero-filled so that any field not set below is deterministic */
+       dev = calloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
+       if (!dev) {
+               fprintf(stderr, Name": could not allocate raid device\n");
+               return 0;
+       }
+       super->current_vol = idx;
+       sprintf(st->subarray, "%d", idx);
+
+       strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
+       array_blocks = calc_array_size(info->level, info->raid_disks,
+                                      info->layout, info->chunk_size,
+                                      info->size*2);
+       dev->size_low = __cpu_to_le32((__u32) array_blocks);
+       dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32));
+       dev->status = __cpu_to_le32(0);
+       dev->reserved_blocks = __cpu_to_le32(0);
+       vol = &dev->vol;
+       vol->migr_state = 0;
+       vol->migr_type = 0;
+       vol->dirty = 0;
+
+       /* the new volume starts after every existing volume plus the
+        * reserved sectors that follow each of them
+        */
+       for (i = 0; i < idx; i++) {
+               struct imsm_dev *prev = get_imsm_dev(super, i);
+               struct imsm_map *pmap = &prev->vol.map[0];
+
+               offset += __le32_to_cpu(pmap->blocks_per_member);
+               offset += IMSM_RESERVED_SECTORS;
+       }
+       map = &vol->map[0];
+       map->pba_of_lba0 = __cpu_to_le32(offset);
+       map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info));
+       map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
+       map->num_data_stripes = __cpu_to_le32(info_to_num_data_stripes(info));
+       map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED :
+                                      IMSM_T_STATE_NORMAL;
+
+       /* md level 10 is recorded as imsm raid level 1 */
+       if (info->level == 10)
+               map->raid_level = 1;
+       else
+               map->raid_level = info->level;
+
+       map->num_members = info->raid_disks;
+       for (i = 0; i < map->num_members; i++) {
+               /* initialized in add_to_super */
+               map->disk_ord_tbl[i] = __cpu_to_le32(0);
+       }
+       mpb->num_raid_devs++;
+       super->dev_tbl[super->current_vol] = dev;
+
+       return 1;
+}
+
+/* Start a new imsm superblock.
+ *
+ * With info == NULL only st->sb is cleared and 0 is returned.  When a
+ * container is already loaded (st->sb set) the work is delegated to
+ * init_super_imsm_volume().  Otherwise a fresh intel_super with a zeroed
+ * mpb carrying the signature and version strings is allocated and
+ * attached to st->sb.
+ *
+ * Returns 1 on success and 0 on failure (or for the info == NULL case).
+ */
+static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
+                          unsigned long long size, char *name,
+                          char *homehost, int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For IMSM, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        */
+       struct intel_super *super;
+       struct imsm_super *mpb;
+       size_t mpb_size;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_imsm_volume(st, info, size, name, homehost,
+                                             uuid);
+
+       super = alloc_super(1);
+       if (!super)
+               return 0;
+       mpb_size = disks_to_mpb_size(info->nr_disks);
+       if (posix_memalign(&super->buf, 512, mpb_size) != 0) {
+               free(super);
+               return 0;
+       }
+       mpb = super->buf;
+       memset(mpb, 0, mpb_size);
+
+       memcpy(mpb->sig, MPB_SIGNATURE, strlen(MPB_SIGNATURE));
+       memcpy(mpb->sig + strlen(MPB_SIGNATURE), MPB_VERSION_RAID5,
+              strlen(MPB_VERSION_RAID5));
+       /* mpb_size is kept little-endian in the mpb; every reader goes
+        * through __le32_to_cpu(), so store it converted
+        */
+       mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+       st->sb = super;
+       return 1;
+}
+
+/* Place disk 'dk' into the currently selected volume.
+ *
+ * The disk is matched against super->disks by major/minor.  When it is
+ * known to the container and flagged MD_DISK_SYNC, its container index
+ * is written to slot dk->number of the volume's ordinal table and the
+ * disk is marked CONFIGURED | USABLE.  Otherwise nothing is changed.
+ * 'fd' and 'devname' are not used.
+ */
+static void add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
+                                    int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+       struct imsm_map *map = &dev->vol.map[0];
+       struct imsm_disk *disk;
+       struct dl *match;
+       __u32 status;
+
+       /* only in-sync disks take a slot */
+       if (!(dk->state & (1<<MD_DISK_SYNC)))
+               return;
+
+       match = super->disks;
+       while (match &&
+              (match->major != dk->major || match->minor != dk->minor))
+               match = match->next;
+       if (!match)
+               return;
+
+       map->disk_ord_tbl[dk->number] = __cpu_to_le32(match->index);
+
+       disk = get_imsm_disk(super, match->index);
+       status = CONFIGURED_DISK | USABLE_DISK;
+       disk->status = __cpu_to_le32(status);
+}
+
+/* Register the device open on 'fd' with the superblock in st->sb.
+ *
+ * When a volume is currently selected (current_vol >= 0) the call is
+ * forwarded to add_to_super_imsm_volume().  Otherwise a new 'dl' entry
+ * and a new imsm_disk are created for the container: serial number,
+ * size in 512-byte units, scsi id and a USABLE | SPARE status are
+ * recorded, the disk is stored at index dk->number, and mpb->num_disks
+ * is raised to cover that index.  While a container is being created
+ * the disk is also copied into the mpb and the family number is
+ * recomputed.
+ *
+ * The function takes over 'fd' (it is stored in the dl entry) and calls
+ * abort() when memory or the disk serial cannot be obtained.
+ */
+static void add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
+                             int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct imsm_disk *disk;
+       struct dl *dd;
+       unsigned long long size;
+       __u32 status, id;
+       int rv;
+       struct stat stb;
+
+       if (super->current_vol >= 0) {
+               add_to_super_imsm_volume(st, dk, fd, devname);
+               return;
+       }
+
+       /* NOTE(review): the fstat() result is not checked, so 'stb' is
+        * used uninitialised if it fails.
+        */
+       fstat(fd, &stb);
+       dd = malloc(sizeof(*dd));
+       disk = malloc(sizeof(*disk));
+       if (!dd || !disk) {
+               fprintf(stderr,
+                       Name ": malloc failed %s:%d.\n", __func__, __LINE__);
+               /* free(NULL) is a no-op: release whichever one succeeded */
+               free(dd);
+               free(disk);
+               abort();
+       }
+       memset(dd, 0, sizeof(*dd));
+       memset(disk, 0, sizeof(*disk));
+       dd->major = major(stb.st_rdev);
+       dd->minor = minor(stb.st_rdev);
+       dd->index = dk->number;
+       dd->devname = devname ? strdup(devname) : NULL;
+       dd->next = super->disks;
+       dd->fd = fd;
+       rv = imsm_read_serial(fd, devname, dd->serial);
+       if (rv) {
+               fprintf(stderr,
+                       Name ": failed to retrieve scsi serial, aborting\n");
+               free(dd->devname);
+               free(dd);
+               free(disk);
+               abort();
+       }
+
+       if (mpb->num_disks <= dk->number)
+               mpb->num_disks = dk->number + 1;
+
+       get_dev_size(fd, NULL, &size);
+       size /= 512;
+       status = USABLE_DISK | SPARE_DISK;
+       strcpy((char *) disk->serial, (char *) dd->serial);
+       disk->total_blocks = __cpu_to_le32(size);
+       disk->status = __cpu_to_le32(status);
+       if (sysfs_disk_to_scsi_id(fd, &id) == 0)
+               disk->scsi_id = __cpu_to_le32(id);
+       else
+               disk->scsi_id = __cpu_to_le32(0);
+       super->disk_tbl[dd->index] = disk;
+
+       /* update the family number if we are creating a container */
+       if (super->creating_imsm) {
+               disk = __get_imsm_disk(mpb, dd->index);
+               *disk = *super->disk_tbl[dd->index]; /* copy in new disk */
+               mpb->family_num = __cpu_to_le32(__gen_imsm_checksum(mpb));
+       }
+
+       super->disks = dd;
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super);
+
+/* Flush the in-memory metadata to every disk on super->disks.
+ *
+ * The generation number is bumped, the disk_tbl and dev_tbl entries are
+ * copied back into the mpb, the checksum is recomputed and the mpb is
+ * stored on each disk through store_imsm_mpb().  With 'doclose' set,
+ * each descriptor is closed (and reset to -1) once its disk is written.
+ *
+ * Returns 1 when every disk was written and 0 as soon as one store
+ * fails; disks after the failing one are not written.
+ */
+static int write_super_imsm(struct intel_super *super, int doclose)
+{
+       struct imsm_super *mpb = super->anchor;
+       struct dl *member;
+       __u32 generation;
+       int n;
+
+       /* 'generation' is incremented everytime the metadata is written */
+       generation = __le32_to_cpu(mpb->generation_num) + 1;
+       mpb->generation_num = __cpu_to_le32(generation);
+
+       /* bring the on-disk image in line with the cached tables */
+       for (n = 0; n < mpb->num_disks; n++)
+               mpb->disk[n] = *super->disk_tbl[n];
+       for (n = 0; n < mpb->num_raid_devs; n++)
+               imsm_copy_dev(__get_imsm_dev(mpb, n), super->dev_tbl[n]);
+
+       /* recalculate checksum */
+       mpb->check_sum = __cpu_to_le32(__gen_imsm_checksum(mpb));
+
+       member = super->disks;
+       while (member) {
+               if (store_imsm_mpb(member->fd, super) != 0) {
+                       fprintf(stderr, "%s: failed for device %d:%d %s\n",
+                               __func__, member->major, member->minor,
+                               strerror(errno));
+                       return 0;
+               }
+               if (doclose) {
+                       close(member->fd);
+                       member->fd = -1;
+               }
+               member = member->next;
+       }
+
+       return 1;
+}
+
+/* Commit a freshly initialised superblock.
+ *
+ * Without st->update_tail the metadata is written straight to the disks
+ * with write_super_imsm().  With it, the newly created volume is packed
+ * into an imsm_update_create_array record and queued through
+ * append_metadata_update(), and the disk descriptors are closed.
+ *
+ * NOTE(review): the queued path returns 0 on success and 1 on failure,
+ * while the direct path returns write_super_imsm()'s value, which is 1
+ * on success and 0 on failure - confirm which convention callers
+ * expect.
+ */
+static int write_init_super_imsm(struct supertype *st)
+{
+       struct imsm_update_create_array *u;
+       struct intel_super *super;
+       struct imsm_dev *dev = NULL;
+       struct dl *d;
+       size_t len;
+
+       if (!st->update_tail)
+               return write_super_imsm(st->sb, 1);
+
+       /* queue the recently created array as a metadata update */
+       super = st->sb;
+       if (super->current_vol >= 0)
+               dev = get_imsm_dev(super, super->current_vol);
+       if (!dev) {
+               fprintf(stderr, "%s: could not determine sub-array\n",
+                       __func__);
+               return 1;
+       }
+
+       /* the record embeds one imsm_dev; size it for this volume */
+       len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev);
+       u = malloc(len);
+       if (!u) {
+               fprintf(stderr, "%s: failed to allocate update buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       u->type = update_create_array;
+       u->dev_idx = super->current_vol;
+       imsm_copy_dev(&u->dev, dev);
+       append_metadata_update(st, u, len);
+
+       d = super->disks;
+       while (d) {
+               close(d->fd);
+               d->fd = -1;
+               d = d->next;
+       }
+
+       return 0;
+}
+
+/* Wipe the imsm anchor on 'fd' by overwriting the second to last
+ * 512-byte sector of the device with zeros.  'st' is not used.
+ *
+ * Returns 0 on success and 1 when the device size cannot be determined
+ * or the seek, the allocation or the write fails.
+ */
+static int store_zero_imsm(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *buf;
+       int rc;
+
+       /* get_dev_size() returns 0 on failure, leaving dsize unset */
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (posix_memalign(&buf, 512, 512) != 0)
+               return 1;
+
+       memset(buf, 0, 512);
+       rc = (write(fd, buf, 512) == 512) ? 0 : 1;
+       /* the scratch sector is ours to release on every path */
+       free(buf);
+       return rc;
+}
+
+/* Geometry check for creating an imsm container.
+ *
+ * Only level == LEVEL_CONTAINER is accepted.  Without 'dev' that alone
+ * is enough.  With 'dev', the device must open exclusively and report a
+ * size, and *freesize is set from avail_size_imsm() applied to the size
+ * in 512-byte units.  'layout', 'raiddisks', 'chunk' and 'size' are not
+ * used.
+ *
+ * Returns 1 when acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm_container(struct supertype *st, int level,
+                                           int layout, int raiddisks, int chunk,
+                                           unsigned long long size, char *dev,
+                                           unsigned long long *freesize,
+                                           int verbose)
+{
+       unsigned long long ldsize;
+       int have_size;
+       int fd;
+
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       if (dev == NULL)
+               return 1;
+
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+
+       /* the descriptor is only needed for the size query */
+       have_size = get_dev_size(fd, dev, &ldsize);
+       close(fd);
+       if (!have_size)
+               return 0;
+
+       *freesize = avail_size_imsm(st, ldsize >> 9);
+       return 1;
+}
+
+/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd
+ * FIX ME add ahci details
+ *
+ * Geometry check for creating a volume inside the container loaded in
+ * st->sb.
+ *
+ * Without 'dev' this is the general test: count the container disks
+ * that have a free gap of at least 'size'*2 blocks at a common start
+ * offset and require at least 'raiddisks' of them.
+ * With 'dev' the device must be a block device that belongs to the
+ * container, and *freesize is set to the largest free gap found between
+ * its extents.
+ *
+ * 'layout' and 'chunk' are not used.
+ * Returns 1 when acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm_volume(struct supertype *st, int level,
+                                        int layout, int raiddisks, int chunk,
+                                        unsigned long long size, char *dev,
+                                        unsigned long long *freesize,
+                                        int verbose)
+{
+       struct stat stb;
+       struct intel_super *super = st->sb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+
+       if (level == 1 && raiddisks > 2) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm does not support more "
+                               "than 2 in a raid1 configuration\n");
+               return 0;
+       }
+
+       /* We must have the container info already read in. */
+       if (!super)
+               return 0;
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size' at a given
+                * offset
+                */
+               unsigned long long minsize = size*2 /* convert to blocks */;
+               unsigned long long start_offset = ~0ULL;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+               for (dl = super->disks; dl ; dl = dl->next) {
+                       int found = 0;
+
+                       pos = 0;
+                       i = 0;
+                       e = get_extents(super, dl);
+                       if (!e) continue;
+                       /* walk the gaps in front of each extent; the list
+                        * ends with an entry whose size is 0
+                        */
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               if (found && start_offset == ~0ULL) {
+                                       /* first fit fixes the offset that
+                                        * all other disks must match
+                                        */
+                                       start_offset = pos;
+                                       break;
+                               } else if (found && pos != start_offset) {
+                                       found = 0;
+                                       break;
+                               }
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr, Name ": imsm: Not enough "
+                                       "devices with space for this array "
+                                       "(%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = super->disks ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": %s is not in the "
+                               "same imsm set\n", dev);
+               return 0;
+       }
+       e = get_extents(super, dl);
+       maxsize = 0;
+       i = 0;
+       if (e) do {
+               unsigned long long esize;
+               esize = e[i].start - pos;
+               if (esize >= maxsize)
+                       maxsize = esize;
+               pos = e[i].start + e[i].size;
+               i++;
+       } while (e[i-1].size);
+       /* the extent list is released here, as the general test above
+        * does; free(NULL) is a no-op
+        */
+       free(e);
+       *freesize = maxsize;
+
+       return 1;
+}
+
+/* Top-level geometry check for imsm.
+ *
+ * - level == LEVEL_CONTAINER: checked as a new container.
+ * - st->sb already set: checked as a volume in that container.
+ * - otherwise 'dev' must be busy (held by a container).  If that
+ *   container is an imsm one it is loaded into st->sb and the volume
+ *   check is run against it.
+ *
+ * Returns 1 when acceptable, 0 otherwise.
+ */
+static int validate_geometry_imsm(struct supertype *st, int level, int layout,
+                                 int raiddisks, int chunk, unsigned long long size,
+                                 char *dev, unsigned long long *freesize,
+                                 int verbose)
+{
+       int fd, cfd;
+       struct mdinfo *sra;
+       int is_imsm;
+       void *loaded = NULL;
+
+       /* if given unused devices create a container
+        * if given given devices in a container create a member volume
+        */
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_imsm_container(st, level, layout,
+                                                       raiddisks, chunk, size,
+                                                       dev, freesize,
+                                                       verbose);
+       }
+
+       if (st->sb) {
+               /* creating in a given container */
+               return validate_geometry_imsm_volume(st, level, layout,
+                                                    raiddisks, chunk, size,
+                                                    dev, freesize, verbose);
+       }
+
+       /* limit creation to the following levels */
+       if (!dev)
+               switch (level) {
+               case 0:
+               case 1:
+               case 10:
+               case 5:
+                       break;
+               default:
+                       return 1;
+               }
+       /* NOTE(review): for a supported level with dev == NULL control
+        * falls through to open(NULL) below, which fails and yields 0,
+        * while unsupported levels return 1 above - this looks inverted;
+        * confirm the intended result before changing it.
+        */
+
+       /* This device needs to be a device in an 'imsm' container */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": Cannot create this array on device %s\n",
+                               dev);
+               close(fd);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe an 'imsm' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot use %s: It is busy\n",
+                               dev);
+               return 0;
+       }
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+
+       /* only the verdict is needed, so the sysfs data can go at once */
+       is_imsm = sra && sra->array.major_version == -1 &&
+                 strcmp(sra->text_version, "imsm") == 0;
+       if (sra)
+               sysfs_free(sra);
+
+       if (!is_imsm) {
+               /* may belong to another container */
+               close(cfd);
+               return 0;
+       }
+
+       /* This is a member of a imsm container.  Load the container
+        * and try to create a volume
+        */
+       if (load_super_imsm_all(st, cfd, &loaded, NULL, 1) == 0) {
+               st->sb = loaded;
+               st->container_dev = fd2devnum(cfd);
+               close(cfd);
+               return validate_geometry_imsm_volume(st, level, layout,
+                                                    raiddisks, chunk,
+                                                    size, dev,
+                                                    freesize, verbose);
+       }
+       close(cfd);
+
+       return 1;
+}
+
+/* Build an mdinfo list describing every volume of the loaded container.
+ *
+ * One mdinfo is created per raid device (newest first in the returned
+ * list), each with a 'devs' list holding one entry per member slot whose
+ * disk is present in super->disks.
+ *
+ * Returns the head of the list, or NULL when the container has no
+ * volumes.  If memory runs out the entries built so far are returned.
+ */
+static struct mdinfo *container_content_imsm(struct supertype *st)
+{
+       /* Given a container loaded by load_super_imsm_all,
+        * extract information about all the arrays into
+        * an mdinfo tree.
+        *
+        * For each imsm_dev create an mdinfo, fill it in,
+        *  then look for matching devices in super->disks
+        *  and create appropriate device mdinfo.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct mdinfo *rest = NULL;
+       int i;
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_vol *vol = &dev->vol;
+               struct imsm_map *map = vol->map;
+               struct mdinfo *this;
+               int slot;
+
+               this = malloc(sizeof(*this));
+               if (!this)
+                       break; /* out of memory: report what we have */
+               memset(this, 0, sizeof(*this));
+               this->next = rest;
+               rest = this;
+
+               this->array.level = get_imsm_raid_level(map);
+               this->array.raid_disks = map->num_members;
+               this->array.layout = imsm_level_to_layout(this->array.level);
+               this->array.md_minor = -1;
+               this->array.ctime = 0;
+               this->array.utime = 0;
+               this->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9;
+               this->array.state = !vol->dirty;
+               this->container_member = i;
+               /* an uninitialized or dirty volume resyncs from the start */
+               if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty)
+                       this->resync_start = 0;
+               else
+                       this->resync_start = ~0ULL;
+
+               strncpy(this->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
+               this->name[MAX_RAID_SERIAL_LEN] = 0;
+
+               /* NOTE(review): if devnum2devname() returns allocated
+                * memory it is leaked here - confirm.
+                */
+               sprintf(this->text_version, "/%s/%d",
+                       devnum2devname(st->container_dev),
+                       this->container_member);
+
+               memset(this->uuid, 0, sizeof(this->uuid));
+
+               this->component_size = __le32_to_cpu(map->blocks_per_member);
+
+               for (slot = 0 ; slot <  map->num_members; slot++) {
+                       struct imsm_disk *disk;
+                       struct mdinfo *info_d;
+                       struct dl *d;
+                       int idx;
+                       __u32 s;
+
+                       /* convert to cpu order first, then drop the top
+                        * byte, so the result does not depend on host
+                        * endianness
+                        */
+                       idx = __le32_to_cpu(map->disk_ord_tbl[slot]) &
+                             ~(0xffU << 24);
+                       for (d = super->disks; d ; d = d->next)
+                               if (d->index == idx)
+                                        break;
+
+                       if (d == NULL)
+                               break; /* shouldn't this be continue ?? */
+
+                       info_d = malloc(sizeof(*info_d));
+                       if (!info_d)
+                               break; /* ditto ?? */
+                       memset(info_d, 0, sizeof(*info_d));
+                       info_d->next = this->devs;
+                       this->devs = info_d;
+
+                       disk = get_imsm_disk(super, idx);
+                       s = __le32_to_cpu(disk->status);
+
+                       info_d->disk.number = d->index;
+                       info_d->disk.major = d->major;
+                       info_d->disk.minor = d->minor;
+                       info_d->disk.raid_disk = slot;
+                       info_d->disk.state  = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0;
+                       info_d->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0;
+                       info_d->disk.state |= s & USABLE_DISK ? (1 << MD_DISK_SYNC) : 0;
+
+                       this->array.working_disks++;
+
+                       info_d->events = __le32_to_cpu(mpb->generation_num);
+                       info_d->data_offset = __le32_to_cpu(map->pba_of_lba0);
+                       info_d->component_size = __le32_to_cpu(map->blocks_per_member);
+                       /* bounded copy: info_d was zeroed above, so the
+                        * last byte stays a terminator even on truncation
+                        */
+                       if (d->devname)
+                               strncpy(info_d->name, d->devname,
+                                       sizeof(info_d->name) - 1);
+               }
+       }
+
+       return rest;
+}
+
+
+/* Bind a newly opened member array to its subarray slot in the container.
+ *
+ * 'inst' is the textual subarray index.  It is parsed once and must lie in
+ * 0 .. mpb->num_raid_devs - 1; a negative value is rejected as well so it
+ * can never be stored as the container_member.
+ *
+ * Returns 0 on success, -ENODEV when the index is out of range.
+ */
+static int imsm_open_new(struct supertype *c, struct active_array *a,
+                        char *inst)
+{
+       struct intel_super *super = c->sb;
+       struct imsm_super *mpb = super->anchor;
+       int idx = atoi(inst);
+
+       if (idx < 0 || idx >= mpb->num_raid_devs) {
+               fprintf(stderr, "%s: subarray index %d, out of range\n",
+                       __func__, idx);
+               return -ENODEV;
+       }
+
+       dprintf("imsm: open_new %s\n", inst);
+       a->info.container_member = idx;
+       return 0;
+}
+
+/* Derive the map state of subarray 'n' given 'failed' failed members.
+ *
+ * With no failures the state currently recorded in the map is returned
+ * unchanged.  Otherwise the answer depends on the raid level:
+ *   raid0  - any failure is fatal
+ *   raid1  - degraded while fewer than num_members have failed
+ *   raid5  - degraded with exactly one failure, failed beyond that
+ *   raid10 - failed as soon as one mirror pair has lost both disks
+ * Any other level falls back to the recorded map state.
+ */
+static __u8 imsm_check_degraded(struct intel_super *super, int n, int failed)
+{
+       struct imsm_dev *dev = get_imsm_dev(super, n);
+       struct imsm_map *map = dev->vol.map;
+       int level;
+
+       if (failed == 0)
+               return map->map_state;
+
+       level = get_imsm_raid_level(map);
+
+       if (level == 0)
+               return IMSM_T_STATE_FAILED;
+
+       if (level == 1) {
+               if (failed < map->num_members)
+                       return IMSM_T_STATE_DEGRADED;
+               return IMSM_T_STATE_FAILED;
+       }
+
+       if (level == 5) {
+               if (failed < 2)
+                       return IMSM_T_STATE_DEGRADED;
+               return IMSM_T_STATE_FAILED;
+       }
+
+       if (level == 10) {
+               /* FIXME is a mirror always two devices,
+                * and are they always adjacent?
+                */
+               int mirror_size = 2;
+               int bad_in_mirror = 0;
+               int slot;
+
+               for (slot = 0; slot < map->num_members; slot++) {
+                       struct imsm_disk *disk;
+
+                       disk = get_imsm_disk(super,
+                                            get_imsm_disk_idx(map, slot));
+                       if (__le32_to_cpu(disk->status) & FAILED_DISK)
+                               bad_in_mirror++;
+
+                       /* a whole mirror set is gone */
+                       if (bad_in_mirror >= mirror_size)
+                               return IMSM_T_STATE_FAILED;
+
+                       /* crossing into the next mirror set: start over */
+                       if ((slot + 1) % mirror_size == 0)
+                               bad_in_mirror = 0;
+               }
+
+               return IMSM_T_STATE_DEGRADED;
+       }
+
+       return map->map_state;
+}
+
+/* Return how many members of 'map' have FAILED_DISK set in their
+ * disk status.
+ */
+static int imsm_count_failed(struct intel_super *super, struct imsm_map *map)
+{
+       int count = 0;
+       int slot = 0;
+
+       while (slot < map->num_members) {
+               struct imsm_disk *member;
+
+               member = get_imsm_disk(super, get_imsm_disk_idx(map, slot));
+               if (__le32_to_cpu(member->status) & FAILED_DISK)
+                       count++;
+               slot++;
+       }
+
+       return count;
+}
+
+/* Record the array's clean/dirty state, and once resync_start reads ~0ULL
+ * also refresh the map state from the current failure count.
+ * Every change bumps super->updates_pending.
+ */
+static void imsm_set_array_state(struct active_array *a, int consistent)
+{
+       struct intel_super *super = a->container->sb;
+       int inst = a->info.container_member;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = &dev->vol.map[0];
+       int dirty = consistent ? 0 : 1;
+
+       if (a->resync_start == ~0ULL) {
+               int nfailed = imsm_count_failed(super, map);
+               __u8 new_state = imsm_check_degraded(super, inst, nfailed);
+
+               /* no failed members: the array is back to normal */
+               if (nfailed == 0)
+                       new_state = IMSM_T_STATE_NORMAL;
+
+               if (new_state != map->map_state) {
+                       dprintf("imsm: map_state %d: %d\n",
+                               inst, new_state);
+                       map->map_state = new_state;
+                       super->updates_pending++;
+               }
+       }
+
+       if (dev->vol.dirty == dirty)
+               return;
+
+       dprintf("imsm: mark '%s' (%llu)\n",
+               dirty?"dirty":"clean", a->resync_start);
+
+       dev->vol.dirty = dirty;
+       super->updates_pending++;
+}
+
+/* Reflect a state change of member slot 'n' in the metadata.
+ *
+ * 'state' is a DS_* bitmask; only DS_FAULTY is examined here.  A newly
+ * faulty disk gets FAILED_DISK set and the map state is re-evaluated.
+ * When nothing newly failed and the map is degraded, the map returns to
+ * normal once every raid disk of the array reports DS_INSYNC.
+ *
+ * Slots outside 0 .. num_members - 1 are rejected before disk_ord_tbl
+ * is indexed; a negative slot is ignored silently.
+ */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+       int inst = a->info.container_member;
+       struct intel_super *super = a->container->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = dev->vol.map;
+       struct imsm_disk *disk;
+       __u32 status;
+       int failed = 0;
+       int new_failure = 0;
+
+       if (n < 0)
+               return;
+
+       if (n >= map->num_members) {
+               fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n",
+                       n, map->num_members - 1);
+               return;
+       }
+
+       dprintf("imsm: set_disk %d:%x\n", n, state);
+
+       disk = get_imsm_disk(super, get_imsm_disk_idx(map, n));
+
+       /* check for new failures */
+       status = __le32_to_cpu(disk->status);
+       if ((state & DS_FAULTY) && !(status & FAILED_DISK)) {
+               status |= FAILED_DISK;
+               disk->status = __cpu_to_le32(status);
+               new_failure = 1;
+               super->updates_pending++;
+       }
+
+       /* the number of failures has changed, count up 'failed' to determine
+        * degraded / failed status
+        */
+       if (new_failure && map->map_state != IMSM_T_STATE_FAILED)
+               failed = imsm_count_failed(super, map);
+
+       /* determine map_state based on failed or in_sync count */
+       if (failed)
+               map->map_state = imsm_check_degraded(super, inst, failed);
+       else if (map->map_state == IMSM_T_STATE_DEGRADED) {
+               struct mdinfo *d;
+               int working = 0;
+
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->curr_state & DS_INSYNC)
+                               working++;
+
+               if (working == a->info.array.raid_disks) {
+                       map->map_state = IMSM_T_STATE_NORMAL;
+                       super->updates_pending++;
+               }
+       }
+}
+
+/* Write the in-memory mpb (super->buf) to the device open on 'fd'.
+ *
+ * The first 512 bytes (the anchor) go to the second-to-last sector of the
+ * device; when the mpb is larger than one sector the remainder is written
+ * to the sectors immediately preceding the anchor.
+ *
+ * Returns 0 on success, 1 if the device size cannot be determined or a
+ * seek/write fails.
+ */
+static int store_imsm_mpb(int fd, struct intel_super *super)
+{
+       struct imsm_super *mpb = super->anchor;
+       __u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+       unsigned long long dsize;
+       unsigned long long sectors;
+
+       /* without a valid size every offset below would be garbage */
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+
+       if (mpb_size > 512) {
+               /* -1 to account for anchor */
+               sectors = mpb_sectors(mpb) - 1;
+
+               /* write the extended mpb to the sectors preceding the anchor */
+               if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+                       return 1;
+
+               if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors)
+                       return 1;
+       }
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (write(fd, super->buf, 512) != 512)
+               return 1;
+
+       return 0;
+}
+
+/* Write the metadata out via write_super_imsm() if any update has been
+ * queued since the last sync, then reset the pending counter.
+ */
+static void imsm_sync_metadata(struct supertype *container)
+{
+       struct intel_super *super = container->sb;
+
+       if (super->updates_pending) {
+               write_super_imsm(super, 0);
+               super->updates_pending = 0;
+       }
+}
+
+/* Pick spares for the non-working slots of a degraded array.
+ *
+ * Returns a malloc'd list of mdinfo records, one per slot that received a
+ * spare, and prepends a metadata_update carrying one
+ * imsm_update_activate_spare record per chosen spare to '*updates'.
+ * Returns NULL (leaving '*updates' untouched) when a faulty device is
+ * still attached, the array is not in the degraded state, no suitable
+ * spare exists, or memory for the update cannot be allocated.
+ */
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+                                         struct metadata_update **updates)
+{
+       /**
+        * Take a device that is marked spare in the metadata and use it to
+        * replace a failed/vacant slot in an array.  There may be a case where
+        * a device is failed in one array but active in a second.
+        * imsm_process_update catches this case and does not clear the SPARE_DISK
+        * flag, allowing the second array to start using the device on failure.
+        * SPARE_DISK is cleared when all arrays are using a device.
+        *
+        * FIXME: is this a valid use of SPARE_DISK?
+        */
+
+       struct intel_super *super = a->container->sb;
+       int inst = a->info.container_member;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = dev->vol.map;
+       int failed = a->info.array.raid_disks;
+       struct mdinfo *rv = NULL;
+       struct mdinfo *d;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       struct imsm_update_activate_spare *u;
+       int num_spares = 0;
+       int i;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       failed--;
+       }
+
+       dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+               inst, failed, a->info.array.raid_disks, a->info.array.level);
+       if (imsm_check_degraded(super, inst, failed) != IMSM_T_STATE_DEGRADED)
+               return NULL;
+
+       /* For each slot, if it is not working, find a spare */
+       dl = super->disks;
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /* OK, this device needs recovery.  Find a spare */
+               for ( ; dl ; dl = dl->next) {
+                       unsigned long long pos;
+                       struct mdinfo *d2;
+                       struct extent *ex;
+                       struct imsm_disk *disk;
+                       int j;
+                       int found;
+                       __u32 array_start;
+
+                       /* If in this array, skip */
+                       for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor) {
+                                       dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                                       break;
+                               }
+                       if (d2)
+                               continue;
+
+                       /* is this unused device marked as a spare? */
+                       disk = get_imsm_disk(super, dl->index);
+                       if (!(__le32_to_cpu(disk->status) & SPARE_DISK))
+                               continue;
+
+                       /* We are allowed to use this device - is there space?
+                        * We need a->info.component_size sectors */
+                       ex = get_extents(super, dl);
+                       if (!ex) {
+                               dprintf("cannot get extents\n");
+                               continue;
+                       }
+                       found = 0;
+                       j = 0;
+                       pos = 0;
+                       array_start = __le32_to_cpu(map->pba_of_lba0);
+
+                       do {
+                               /* check that we can start at pba_of_lba0 with
+                                * a->info.component_size of space
+                                */
+                               if (array_start >= pos &&
+                                   array_start + a->info.component_size < ex[j].start) {
+                                       found = 1;
+                                       break;
+                               }
+                               pos = ex[j].start + ex[j].size;
+                               j++;
+
+                       } while (ex[j-1].size);
+
+                       free(ex);
+                       if (!found) {
+                               dprintf("%x:%x does not have %llu at %d\n",
+                                       dl->major, dl->minor,
+                                       a->info.component_size,
+                                       __le32_to_cpu(map->pba_of_lba0));
+                               /* No room */
+                               continue;
+                       }
+
+                       /* found a usable disk with enough space */
+                       di = malloc(sizeof(*di));
+                       if (!di)
+                               /* out of memory: settle for what we have */
+                               goto collected;
+                       memset(di, 0, sizeof(*di));
+                       di->disk.number = dl->index;
+                       di->disk.raid_disk = i;
+                       di->disk.major = dl->major;
+                       di->disk.minor = dl->minor;
+                       di->disk.state = 0;
+                       di->data_offset = array_start;
+                       di->component_size = a->info.component_size;
+                       di->container_member = inst;
+                       di->next = rv;
+                       rv = di;
+                       num_spares++;
+                       dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                               i, pos);
+
+                       /* step past the disk just taken so that the next
+                        * vacant slot cannot be handed the same spare
+                        */
+                       dl = dl->next;
+                       break;
+               }
+       }
+
+collected:
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * disk_ord_tbl for the array
+        */
+       mu = malloc(sizeof(*mu));
+       if (mu)
+               mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares);
+       if (!mu || !mu->buf) {
+               /* the update cannot be queued, so do not hand out spares
+                * that the metadata would never learn about
+                */
+               free(mu);
+               while (rv) {
+                       di = rv->next;
+                       free(rv);
+                       rv = di;
+               }
+               return NULL;
+       }
+       mu->space = NULL;
+       mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+       mu->next = *updates;
+       u = (struct imsm_update_activate_spare *) mu->buf;
+
+       for (di = rv ; di ; di = di->next) {
+               u->type = update_activate_spare;
+               u->disk_idx = di->disk.number;
+               u->slot = di->disk.raid_disk;
+               u->array = inst;
+               u->next = u + 1;
+               u++;
+       }
+       (u-1)->next = NULL;
+       *updates = mu;
+
+       return rv;
+}
+
+/* Return the number of bits set in 'field'. */
+static int weight(unsigned int field)
+{
+       int bits = 0;
+
+       /* each pass clears the lowest set bit */
+       while (field) {
+               field &= field - 1;
+               bits++;
+       }
+
+       return bits;
+}
+
+/* Return 1 if any disk index referenced by 'm1' is also referenced by
+ * 'm2', 0 otherwise.
+ */
+static int disks_overlap(struct imsm_map *m1, struct imsm_map *m2)
+{
+       int a;
+       int b;
+
+       for (a = 0; a < m1->num_members; a++) {
+               int wanted = get_imsm_disk_idx(m1, a);
+
+               for (b = 0; b < m2->num_members; b++) {
+                       if (wanted == get_imsm_disk_idx(m2, b))
+                               return 1;
+               }
+       }
+
+       return 0;
+}
+
+/* Apply one queued metadata update to the in-memory metadata.
+ *
+ * The update type is read from the first word of update->buf.  Updates
+ * that fail validation are dropped with a diagnostic and leave the
+ * metadata untouched; accepted updates bump super->updates_pending.
+ */
+static void imsm_process_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * crack open the metadata_update envelope to find the update record
+        * update can be one of:
+        *      update_activate_spare - a spare device has replaced a failed
+        *      device in an array, update the disk_ord_tbl.  If this disk is
+        *      present in all member arrays then also clear the SPARE_DISK
+        *      flag
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+       switch (type) {
+       case update_activate_spare: {
+               struct imsm_update_activate_spare *u = (void *) update->buf;
+               struct imsm_dev *dev = get_imsm_dev(super, u->array);
+               struct imsm_map *map = &dev->vol.map[0];
+               struct active_array *a;
+               struct imsm_disk *disk;
+               __u32 status;
+               struct dl *dl;
+               struct mdinfo *d;
+               unsigned int members;
+               unsigned int found;
+               int victim;
+               int i;
+
+               for (dl = super->disks; dl; dl = dl->next)
+                       if (dl->index == u->disk_idx)
+                               break;
+
+               if (!dl) {
+                       fprintf(stderr, "error: imsm_activate_spare passed "
+                               "an unknown disk_idx: %d\n", u->disk_idx);
+                       return;
+               }
+
+               super->updates_pending++;
+
+               /* remember which disk is being displaced from the slot */
+               victim = get_imsm_disk_idx(map, u->slot);
+               map->disk_ord_tbl[u->slot] = __cpu_to_le32(u->disk_idx);
+               disk = get_imsm_disk(super, u->disk_idx);
+               status = __le32_to_cpu(disk->status);
+               status |= CONFIGURED_DISK;
+               disk->status = __cpu_to_le32(status);
+
+               /* map unique/live arrays using the spare */
+               members = 0;
+               found = 0;
+               for (a = st->arrays; a; a = a->next) {
+                       int inst = a->info.container_member;
+
+                       dev = get_imsm_dev(super, inst);
+                       map = &dev->vol.map[0];
+                       if (map->raid_level > 0)
+                               members |= 1 << inst;
+                       for (d = a->info.devs; d; d = d->next)
+                               if (d->disk.major == dl->major &&
+                                   d->disk.minor == dl->minor)
+                                       found |= 1 << inst;
+               }
+
+               /* until all arrays that can absorb this disk have absorbed
+                * this disk it can still be considered a spare
+                */
+               if (weight(found) >= weight(members)) {
+                       status = __le32_to_cpu(disk->status);
+                       status &= ~SPARE_DISK;
+                       disk->status = __cpu_to_le32(status);
+               }
+
+               /* count arrays using the victim in the metadata */
+               found = 0;
+               for (a = st->arrays; a ; a = a->next) {
+                       dev = get_imsm_dev(super, a->info.container_member);
+                       map = &dev->vol.map[0];
+                       for (i = 0; i < map->num_members; i++)
+                               if (victim == get_imsm_disk_idx(map, i))
+                                       found++;
+               }
+
+               /* clear some flags if the victim is no longer being
+                * utilized anywhere
+                */
+               disk = get_imsm_disk(super, victim);
+               if (!found) {
+                       status = __le32_to_cpu(disk->status);
+                       status &= ~(CONFIGURED_DISK | USABLE_DISK);
+                       disk->status = __cpu_to_le32(status);
+               }
+               break;
+       }
+       case update_create_array: {
+               /* someone wants to create a new array, we need to be aware of
+                * a few races/collisions:
+                * 1/ 'Create' called by two separate instances of mdadm
+                * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+                *     devices that have since been assimilated via
+                *     activate_spare.
+                * In the event this update can not be carried out mdadm will
+                * (FIX ME) notice that its update did not take hold.
+                */
+               struct imsm_update_create_array *u = (void *) update->buf;
+               struct imsm_dev *dev;
+               struct imsm_map *map, *new_map;
+               unsigned long long start, end;
+               unsigned long long new_start, new_end;
+               int i;
+               int overlap = 0;
+
+               /* handle racing creates: first come first serve */
+               if (u->dev_idx < mpb->num_raid_devs) {
+                       dprintf("%s: subarray %d already defined\n",
+                               __func__, u->dev_idx);
+                       return;
+               }
+
+               /* check update is next in sequence */
+               if (u->dev_idx != mpb->num_raid_devs) {
+                       dprintf("%s: can not create arrays out of sequence\n",
+                               __func__);
+                       return;
+               }
+
+               new_map = &u->dev.vol.map[0];
+               new_start = __le32_to_cpu(new_map->pba_of_lba0);
+               new_end = new_start + __le32_to_cpu(new_map->blocks_per_member);
+
+               /* handle activate_spare versus create race:
+                * check to make sure that overlapping arrays do not include
+                * overlapping disks
+                */
+               for (i = 0; i < mpb->num_raid_devs; i++) {
+                       dev = get_imsm_dev(super, i);
+                       map = &dev->vol.map[0];
+                       start = __le32_to_cpu(map->pba_of_lba0);
+                       end = start + __le32_to_cpu(map->blocks_per_member);
+                       /* NOTE(review): 'overlap' is never reset, so once one
+                        * array overlaps in range every later array is also
+                        * checked for shared disks - confirm this is intended
+                        */
+                       if ((new_start >= start && new_start <= end) ||
+                           (start >= new_start && start <= new_end))
+                               overlap = 1;
+                       if (overlap && disks_overlap(map, new_map)) {
+                               dprintf("%s: arrays overlap\n", __func__);
+                               return;
+                       }
+               }
+               /* check num_members sanity */
+               if (new_map->num_members > mpb->num_disks) {
+                       dprintf("%s: num_disks out of range\n", __func__);
+                       return;
+               }
+
+               /* check that prepare update was successful */
+               if (!update->space) {
+                       dprintf("%s: prepare update failed\n", __func__);
+                       return;
+               }
+
+               super->updates_pending++;
+               dev = update->space;
+               update->space = NULL;
+               imsm_copy_dev(dev, &u->dev);
+               super->dev_tbl[u->dev_idx] = dev;
+               mpb->num_raid_devs++;
+
+               /* fix up flags, if arrays overlap then the drives can not be
+                * spares.  Walk the members of the array being created:
+                * 'map' still refers to the last pre-existing array, or to
+                * nothing at all when this is the first array.
+                */
+               for (i = 0; i < new_map->num_members; i++) {
+                       struct imsm_disk *disk;
+                       __u32 status;
+
+                       disk = get_imsm_disk(super, get_imsm_disk_idx(new_map, i));
+                       status = __le32_to_cpu(disk->status);
+                       status |= CONFIGURED_DISK;
+                       if (overlap)
+                               status &= ~SPARE_DISK;
+                       disk->status = __cpu_to_le32(status);
+               }
+               break;
+       }
+       }
+}
+
+/* Reserve memory that processing 'update' will need later on.
+ *
+ * For update_create_array a buffer sized by sizeof_imsm_dev() for the
+ * device carried in the update is malloc'd and left in update->space
+ * (NULL if the allocation failed).  Other update types are left alone.
+ */
+static void imsm_prepare_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * Allocate space to hold new disk entries, raid-device entries or a
+        * new mpb if necessary.  We currently maintain an mpb large enough to
+        * hold 2 subarrays for the given number of disks.  This may not be
+        * sufficient when reshaping.
+        *
+        * FIX ME handle the reshape case.
+        *
+        * The monitor will be able to safely change super->mpb by arranging
+        * for it to be freed in check_update_queue().  I.e. the monitor thread
+        * will start using the new pointer and the manager can continue to use
+        * the old value until check_update_queue() runs.
+        */
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+       if (type == update_create_array) {
+               struct imsm_update_create_array *u = (void *) update->buf;
+
+               update->space = malloc(sizeof_imsm_dev(&u->dev));
+       }
+}
+
+/* Method table for the imsm metadata format.  The first group of entries
+ * is compiled out when MDASSEMBLE is defined; the last group holds the
+ * hooks listed "for mdmon".
+ */
+struct superswitch super_imsm = {
+#ifndef        MDASSEMBLE
+       .examine_super  = examine_super_imsm,
+       .brief_examine_super = brief_examine_super_imsm,
+       .detail_super   = detail_super_imsm,
+       .brief_detail_super = brief_detail_super_imsm,
+       .write_init_super = write_init_super_imsm,
+#endif
+       .match_home     = match_home_imsm,
+       .uuid_from_super= uuid_from_super_imsm,
+       .getinfo_super  = getinfo_super_imsm,
+       .update_super   = update_super_imsm,
+
+       .avail_size     = avail_size_imsm,
+
+       .compare_super  = compare_super_imsm,
+
+       .load_super     = load_super_imsm,
+       .init_super     = init_super_imsm,
+       .add_to_super   = add_to_super_imsm,
+       .store_super    = store_zero_imsm,
+       .free_super     = free_super_imsm,
+       .match_metadata_desc = match_metadata_desc_imsm,
+       .container_content = container_content_imsm,
+
+       .validate_geometry = validate_geometry_imsm,
+       .external       = 1,
+
+/* for mdmon (load_super, set above, is used by mdmon as well) */
+       .open_new       = imsm_open_new,
+       .set_array_state= imsm_set_array_state,
+       .set_disk       = imsm_set_disk,
+       .sync_metadata  = imsm_sync_metadata,
+       .activate_spare = imsm_activate_spare,
+       .process_update = imsm_process_update,
+       .prepare_update = imsm_prepare_update,
+};
index 8e4c568e1e360ac2e8ab1146b1bc5b06578ad8fc..ab636605bc6ab66fbd1b8931df6a6e0eb2b97c14 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -53,7 +53,7 @@ static unsigned long calc_sb0_csum(mdp_super_t *super)
 }
 
 
-void super0_swap_endian(struct mdp_superblock_s *sb)
+static void super0_swap_endian(struct mdp_superblock_s *sb)
 {
        /* as super0 superblocks are host-endian, it is sometimes
         * useful to be able to swap the endianness
@@ -369,6 +369,8 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info)
        info->events = md_event(sb);
        info->data_offset = 0;
 
+       sprintf(info->text_version, "0.%d", sb->minor_version);
+
        uuid_from_super0(st, info->uuid);
 
        if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) {
@@ -552,12 +554,14 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *ignored_name, char *homehost,
                       int *uuid)
 {
-       mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       mdp_super_t *sb;
        int spares;
+
+       posix_memalign((void**)&sb, 512, MD_SB_BYTES + sizeof(bitmap_super_t));
        memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t));
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing the superblock */
                return 0;
        }
@@ -623,17 +627,38 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo {
+       int fd;
+       char *devname;
+       mdu_disk_info_t disk;
+       struct devinfo *next;
+};
 /* Add a device to the superblock being created */
-static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo)
+static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
+                         int fd, char *devname)
 {
        mdp_super_t *sb = st->sb;
        mdp_disk_t *dk = &sb->disks[dinfo->number];
+       struct devinfo *di, **dip;
 
        dk->number = dinfo->number;
        dk->major = dinfo->major;
        dk->minor = dinfo->minor;
        dk->raid_disk = dinfo->raid_disk;
        dk->state = dinfo->state;
+
+       sb->this_disk = sb->disks[dinfo->number];
+       sb->sb_csum = calc_sb0_csum(sb);
+
+       dip = (struct devinfo **)&st->info;
+       while (*dip)
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo));
+       di->fd = fd;
+       di->devname = devname;
+       di->disk = *dinfo;
+       di->next = NULL;
+       *dip = di;
 }
 
 static int store_super0(struct supertype *st, int fd)
@@ -661,7 +686,8 @@ static int store_super0(struct supertype *st, int fd)
        if (super->state & (1<<MD_SB_BITMAP_PRESENT)) {
                struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC)
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) != 
+                           ROUND_UP(sizeof(*bm),512))
                            return 5;
        }
 
@@ -669,32 +695,41 @@ static int store_super0(struct supertype *st, int fd)
        return 0;
 }
 
-static int write_init_super0(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super0(struct supertype *st)
 {
        mdp_super_t *sb = st->sb;
-       int fd = open(devname, O_RDWR|O_EXCL);
-       int rv;
+       int rv = 0;
+       struct devinfo *di;
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname);
-               return -1;
-       }
+       for (di = st->info ; di && ! rv ; di = di->next) {
 
-       sb->disks[dinfo->number].state &= ~(1<<MD_DISK_FAULTY);
+               if (di->disk.state == 1)
+                       continue;
+               if (di->fd == -1)
+                       continue;
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1);
 
-       sb->this_disk = sb->disks[dinfo->number];
-       sb->sb_csum = calc_sb0_csum(sb);
-       rv = store_super0(st, fd);
+               sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY);
 
-       if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
-               rv = st->ss->write_bitmap(st, fd);
+               sb->this_disk = sb->disks[di->disk.number];
+               sb->sb_csum = calc_sb0_csum(sb);
+               rv = store_super0(st, di->fd);
 
-       close(fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
+                       rv = st->ss->write_bitmap(st, di->fd);
+
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
+               close(di->fd);
+               di->fd = -1;
+       }
        return rv;
 }
+#endif
 
 static int compare_super0(struct supertype *st, struct supertype *tst)
 {
@@ -712,7 +747,8 @@ static int compare_super0(struct supertype *st, struct supertype *tst)
        if (second->md_magic != MD_SB_MAGIC)
                return 1;
        if (!first) {
-               first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s));
+               posix_memalign((void**)&first, 512, 
+                              MD_SB_BYTES + sizeof(struct bitmap_super_s));
                memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s));
                st->sb = first;
                return 0;
@@ -754,6 +790,9 @@ static int load_super0(struct supertype *st, int fd, char *devname)
 
        free_super0(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (!get_dev_size(fd, devname, &dsize))
                return 1;
 
@@ -778,7 +817,7 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       posix_memalign((void**)&super, 512, MD_SB_BYTES + sizeof(bitmap_super_t)+512);
 
        if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) {
                if (devname)
@@ -812,6 +851,7 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                st->ss = &super0;
                st->minor_version = super->minor_version;
                st->max_devs = MD_SB_DISKS;
+               st->info = NULL;
        }
 
        /* Now check on the bitmap superblock */
@@ -821,8 +861,8 @@ static int load_super0(struct supertype *st, int fd, char *devname)
         * valid.  If it doesn't clear the bit.  An --assemble --force
         * should get that written out.
         */
-       if (read(fd, super+1, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),512))
+           != ROUND_UP(sizeof(struct bitmap_super_s),512))
                goto no_bitmap;
 
        uuid_from_super0(st, uuid);
@@ -843,7 +883,9 @@ static struct supertype *match_metadata_desc0(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super0;
+       st->info = NULL;
        st->minor_version = 90;
        st->max_devs = MD_SB_DISKS;
        st->sb = NULL;
@@ -919,7 +961,7 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp,
 }
 
 
-void locate_bitmap0(struct supertype *st, int fd)
+static void locate_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -939,7 +981,7 @@ void locate_bitmap0(struct supertype *st, int fd)
        lseek64(fd, offset, 0);
 }
 
-int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -948,7 +990,8 @@ int write_bitmap0(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char abuf[4096+512];
+       char *buf = (char*)(((long)(abuf+512))&~511UL);
 
        if (!get_dev_size(fd, NULL, &dsize))
                return 1;
@@ -964,21 +1007,19 @@ int write_bitmap0(struct supertype *st, int fd)
        if (lseek64(fd, offset + 4096, 0)< 0LL)
                return 3;
 
-
-       if (write(fd, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
-       towrite = 64*1024 - MD_SB_BYTES - sizeof(bitmap_super_t);
-       memset(buf, 0xff, sizeof(buf));
+       memset(buf, 0xff, 4096);
+       memcpy(buf,  ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t));
+       towrite = 64*1024;
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -994,6 +1035,46 @@ static void free_super0(struct supertype *st)
        st->sb = NULL;
 }
 
+/* Decide whether a 0.90 array of this geometry can be created; when subdev
+ * is named, also set *freesize from its size.  Returns 1 if OK, else 0. */
+static int validate_geometry0(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       unsigned long long ldsize;
+       int fd;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       if (raiddisks > MD_SB_DISKS)
+               return 0;
+       if (size > (0x7fffffffULL<<10))
+               return 0;
+       if (!subdev)
+               return 1;
+
+       fd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super0.90 cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+
+       if (!get_dev_size(fd, subdev, &ldsize)) {
+               close(fd);
+               return 0;
+       }
+       close(fd);
+
+       if (ldsize < MD_RESERVED_SECTORS * 512)
+               return 0;
+       *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9);
+       return 1;
+}
+
 struct superswitch super0 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super0,
@@ -1002,6 +1083,7 @@ struct superswitch super0 = {
        .detail_super = detail_super0,
        .brief_detail_super = brief_detail_super0,
        .export_detail_super = export_detail_super0,
+       .write_init_super = write_init_super0,
 #endif
        .match_home = match_home0,
        .uuid_from_super = uuid_from_super0,
@@ -1010,7 +1092,6 @@ struct superswitch super0 = {
        .init_super = init_super0,
        .add_to_super = add_to_super0,
        .store_super = store_super0,
-       .write_init_super = write_init_super0,
        .compare_super = compare_super0,
        .load_super = load_super0,
        .match_metadata_desc = match_metadata_desc0,
@@ -1019,6 +1100,5 @@ struct superswitch super0 = {
        .locate_bitmap = locate_bitmap0,
        .write_bitmap = write_bitmap0,
        .free_super = free_super0,
-       .major = 0,
-       .swapuuid = 0,
+       .validate_geometry = validate_geometry0,
 };
index fe915f8d921ca498d02eef3a9cf92dfaedec8f22..06d0a1876cf1f678f27661c14dfc8649576d8c89 100644 (file)
--- a/super1.c
+++ b/super1.c
@@ -493,7 +493,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
        int role;
 
        info->array.major_version = 1;
-       info->array.minor_version = __le32_to_cpu(sb->feature_map);
+       info->array.minor_version = st->minor_version;
        info->array.patch_version = 0;
        info->array.raid_disks = __le32_to_cpu(sb->raid_disks);
        info->array.level = __le32_to_cpu(sb->level);
@@ -531,6 +531,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
                info->disk.raid_disk = role;
        }
        info->events = __le64_to_cpu(sb->events);
+       sprintf(info->text_version, "1.%d", st->minor_version);
 
        memcpy(info->uuid, sb->set_uuid, 16);
 
@@ -670,7 +671,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
            __le64_to_cpu(sb->data_offset)) {
                /* set data_size to device size less data_offset */
                struct misc_dev_info *misc = (struct misc_dev_info*)
-                       (st->sb + 1024 + sizeof(struct bitmap_super_s));
+                       (st->sb + 1024 + 512);
                printf("Size was %llu\n", (unsigned long long)
                       __le64_to_cpu(sb->data_size));
                sb->data_size = __cpu_to_le64(
@@ -688,15 +689,17 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 static int init_super1(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *name, char *homehost, int *uuid)
 {
-       struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) +
-                                            sizeof(struct misc_dev_info));
+       struct mdp_superblock_1 *sb;
        int spares;
        int rfd;
        char defname[10];
+
+       posix_memalign((void**)&sb, 512, (1024 + 512 + 
+                                         sizeof(struct misc_dev_info)));
        memset(sb, 0, 1024);
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing superblock */
                return 0;
        }
@@ -767,17 +770,39 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo {
+       int fd;                 /* descriptor given to add_to_super1() */
+       char *devname;          /* device name; the pointer is stored, not copied */
+       mdu_disk_info_t disk;   /* copy of the disk info given to add_to_super1() */
+       struct devinfo *next;   /* next entry; the list head lives in st->info */
+};
 /* Add a device to the superblock being created */
-static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk)
+static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
+                         int fd, char *devname)
 {
        struct mdp_superblock_1 *sb = st->sb;
        __u16 *rp = sb->dev_roles + dk->number;
+       struct devinfo *di, **dip;
+
        if ((dk->state & 6) == 6) /* active, sync */
                *rp = __cpu_to_le16(dk->raid_disk);
        else if ((dk->state & ~2) == 0) /* active or idle -> spare */
                *rp = 0xffff;
        else
                *rp = 0xfffe;
+
+       sb->dev_number = __cpu_to_le32(dk->number);
+       sb->sb_csum = calc_sb_1_csum(sb);
+
+       dip = (struct devinfo **)&st->info;  /* st->info holds the list head */
+       while (*dip)                         /* walk to the tail: entries keep call order */
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo)); /* NOTE(review): result is not checked for NULL */
+       di->fd = fd;
+       di->devname = devname;               /* pointer kept as-is, string not duplicated */
+       di->disk = *dk;
+       di->next = NULL;
+       *dip = di;
 }
 
 static void locate_bitmap1(struct supertype *st, int fd);
@@ -834,6 +859,7 @@ static int store_super1(struct supertype *st, int fd)
                return 3;
 
        sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev);
+       sbsize = (sbsize+511)&(~511UL);
 
        if (write(fd, sb, sbsize) != sbsize)
                return 4;
@@ -843,7 +869,8 @@ static int store_super1(struct supertype *st, int fd)
                        (((char*)sb)+1024);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) {
                        locate_bitmap1(st, fd);
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) !=
+                           ROUND_UP(sizeof(*bm),512))
                            return 5;
                }
        }
@@ -866,123 +893,133 @@ static unsigned long choose_bm_space(unsigned long devsize)
        return 4*2;
 }
 
-static int write_init_super1(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+/* Write st->sb (and any bitmap) to every device on st->info; 0 on success. */
+static int write_init_super1(struct supertype *st)
 {
        struct mdp_superblock_1 *sb = st->sb;
        struct supertype refst;
-       int fd = open(devname, O_RDWR | O_EXCL);
        int rfd;
-       int rv;
+       int rv = 0;
        int bm_space;
-
+       struct devinfo *di;
        unsigned long long dsize, array_size;
        long long sb_offset;
 
+       for (di = st->info; di && ! rv ; di = di->next) { /* stops at first failure */
+               if (di->disk.state == 1)
+                       continue;
+               if (di->fd < 0) /* no open descriptor: nothing to write */
+                       continue;
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n",
-                       devname);
-               return -1;
-       }
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1); /* NOTE(review): second call - confirm intended */
 
-       sb->dev_number = __cpu_to_le32(dinfo->number);
-       if (dinfo->state & (1<<MD_DISK_WRITEMOSTLY))
-               sb->devflags |= __cpu_to_le32(WriteMostly1);
+               sb->dev_number = __cpu_to_le32(di->disk.number);
+               sb->devflags &= ~(__cpu_to_le32(WriteMostly1)); /* sb is shared: reset per device */
+               if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY))
+                       sb->devflags |= __cpu_to_le32(WriteMostly1);
 
-       if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
-           read(rfd, sb->device_uuid, 16) != 16) {
-               *(__u32*)(sb->device_uuid) = random();
-               *(__u32*)(sb->device_uuid+4) = random();
-               *(__u32*)(sb->device_uuid+8) = random();
-               *(__u32*)(sb->device_uuid+12) = random();
-       }
-       if (rfd >= 0) close(rfd);
-       sb->events = 0;
-
-       refst =*st;
-       refst.sb = NULL;
-       if (load_super1(&refst, fd, NULL)==0) {
-               struct mdp_superblock_1 *refsb = refst.sb;
-
-               memcpy(sb->device_uuid, refsb->device_uuid, 16);
-               if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
-                       /* same array, so preserve events and dev_number */
-                       sb->events = refsb->events;
-                       /* bugs in 2.6.17 and earlier mean the dev_number
-                        * chosen in Manage must be preserved
-                        */
-                       if (get_linux_version() >= 2006018)
-                               sb->dev_number = refsb->dev_number;
+               if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
+                   read(rfd, sb->device_uuid, 16) != 16) {
+                       *(__u32*)(sb->device_uuid) = random();
+                       *(__u32*)(sb->device_uuid+4) = random();
+                       *(__u32*)(sb->device_uuid+8) = random();
+                       *(__u32*)(sb->device_uuid+12) = random();
+               }
+               if (rfd >= 0) close(rfd);
+               sb->events = 0;
+
+               refst =*st;
+               refst.sb = NULL;
+               if (load_super1(&refst, di->fd, NULL)==0) {
+                       struct mdp_superblock_1 *refsb = refst.sb;
+                       memcpy(sb->device_uuid, refsb->device_uuid, 16);
+                       if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
+                               /* same array, so preserve events and
+                                * dev_number */
+                               sb->events = refsb->events;
+                               /* bugs in 2.6.17 and earlier mean the
+                                * dev_number chosen in Manage must be preserved
+                                */
+                               if (get_linux_version() >= 2006018)
+                                       sb->dev_number = refsb->dev_number;
+                       }
+                       free(refsb);
                }
-               free(refsb);
-       }
-
-       if (!get_dev_size(fd, NULL, &dsize))
-               return 1;
-       dsize >>= 9;
 
-       if (dsize < 24) {
-               close(fd);
-               return 2;
-       }
+               if (!get_dev_size(di->fd, NULL, &dsize))
+                       return 1; /* NOTE(review): di->fd is left open on this path */
+               dsize >>= 9;
 
+               if (dsize < 24) {
+                       close(di->fd);
+                       return 2;
+               }
 
-       /*
-        * Calculate the position of the superblock.
-        * It is always aligned to a 4K boundary and
-        * depending on minor_version, it can be:
-        * 0: At least 8K, but less than 12K, from end of device
-        * 1: At start of device
-        * 2: 4K from start of device.
-        * Depending on the array size, we might leave extra space
-        * for a bitmap.
-        */
-       array_size = __le64_to_cpu(sb->size);
-       /* work out how much space we left for a bitmap */
-       bm_space = choose_bm_space(array_size);
 
-       switch(st->minor_version) {
-       case 0:
-               sb_offset = dsize;
-               sb_offset -= 8*2;
-               sb_offset &= ~(4*2-1);
-               sb->super_offset = __cpu_to_le64(sb_offset);
-               sb->data_offset = __cpu_to_le64(0);
+               /*
+                * Calculate the position of the superblock.
+                * It is always aligned to a 4K boundary and
+                * depending on minor_version, it can be:
+                * 0: At least 8K, but less than 12K, from end of device
+                * 1: At start of device
+                * 2: 4K from start of device.
+                * Depending on the array size, we might leave extra space
+                * for a bitmap.
+                */
+               array_size = __le64_to_cpu(sb->size);
+               /* work out how much space we left for a bitmap */
+               bm_space = choose_bm_space(array_size);
+               switch(st->minor_version) {
+               case 0:
+                       sb_offset = dsize;
+                       sb_offset -= 8*2;
+                       sb_offset &= ~(4*2-1);
+                       sb->super_offset = __cpu_to_le64(sb_offset);
+                       sb->data_offset = __cpu_to_le64(0);
                if (sb_offset - bm_space < array_size)
                        bm_space = sb_offset - array_size;
-               sb->data_size = __cpu_to_le64(sb_offset - bm_space);
-               break;
-       case 1:
-               sb->super_offset = __cpu_to_le64(0);
-               if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2;
-               sb->data_offset = __cpu_to_le64(bm_space + 4*2);
-               sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
-               break;
-       case 2:
-               sb_offset = 4*2;
-               sb->super_offset = __cpu_to_le64(4*2);
-               if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2;
-               sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
-               sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space );
-               break;
-       default:
-               return -EINVAL;
-       }
+                       sb->data_size = __cpu_to_le64(sb_offset - bm_space);
+                       break;
+               case 1:
+                       sb->super_offset = __cpu_to_le64(0);
+                       if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size) -4*2;
+                       sb->data_offset = __cpu_to_le64(bm_space + 4*2);
+                       sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
+                       break;
+               case 2:
+                       sb_offset = 4*2;
+                       sb->super_offset = __cpu_to_le64(4*2);
+                       if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size)
+                           > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size)
+                                       - 4*2 - 4*2;
+                       sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
+                       sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2
+                                                     - bm_space );
+                       break;
+               default:
+                       return -EINVAL; /* NOTE(review): di->fd is left open on this path */
+               }
 
 
-       sb->sb_csum = calc_sb_1_csum(sb);
-       rv = store_super1(st, fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               sb->sb_csum = calc_sb_1_csum(sb);
+               rv = store_super1(st, di->fd);
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
 
-       if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
-               rv = st->ss->write_bitmap(st, fd);
-       close(fd);
+               if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
+                       rv = st->ss->write_bitmap(st, di->fd);
+               close(di->fd);
+               di->fd = -1; /* mark the descriptor as closed */
+       }
        return rv;
 }
+#endif
 
 static int compare_super1(struct supertype *st, struct supertype *tst)
 {
@@ -1002,9 +1039,10 @@ static int compare_super1(struct supertype *st, struct supertype *tst)
                return 1;
 
        if (!first) {
-               first = malloc(1024+sizeof(bitmap_super_t) +
+               posix_memalign((void**)&first, 512,
+                              1024 + 512 +
                               sizeof(struct misc_dev_info));
-               memcpy(first, second, 1024+sizeof(bitmap_super_t) +
+               memcpy(first, second, 1024 + 512 + 
                       sizeof(struct misc_dev_info));
                st->sb = first;
                return 0;
@@ -1035,13 +1073,16 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        free_super1(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (st->ss == NULL || st->minor_version == -1) {
                int bestvers = -1;
                struct supertype tst;
                __u64 bestctime = 0;
                /* guess... choose latest ctime */
+               memset(&tst, 0, sizeof(tst));
                tst.ss = &super1;
-               tst.sb = NULL;
                for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) {
                        switch(load_super1(&tst, fd, devname)) {
                        case 0: super = tst.sb;
@@ -1114,7 +1155,8 @@ static int load_super1(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(1024 + sizeof(bitmap_super_t) +
+       posix_memalign((void**)&super, 512,
+                      1024 + 512 +
                       sizeof(struct misc_dev_info));
 
        if (read(fd, super, 1024) != 1024) {
@@ -1151,7 +1193,7 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        bsb = (struct bitmap_super_s *)(((char*)super)+1024);
 
-       misc = (struct misc_dev_info*) (bsb+1);
+       misc = (struct misc_dev_info*) (((char*)super)+1024+512);
        misc->device_size = dsize;
 
        /* Now check on the bitmap superblock */
@@ -1162,8 +1204,8 @@ static int load_super1(struct supertype *st, int fd, char *devname)
         * should get that written out.
         */
        locate_bitmap1(st, fd);
-       if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (read(fd, ((char*)super)+1024, 512)
+           != 512)
                goto no_bitmap;
 
        uuid_from_super1(st, uuid);
@@ -1183,6 +1225,7 @@ static struct supertype *match_metadata_desc1(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super1;
        st->max_devs = 384;
        st->sb = NULL;
@@ -1199,7 +1242,7 @@ static struct supertype *match_metadata_desc1(char *arg)
                return st;
        }
        if (strcmp(arg, "1") == 0 ||
-           strcmp(arg, "default/large") == 0) {
+           strcmp(arg, "default") == 0) {
                st->minor_version = -1;
                return st;
        }
@@ -1382,25 +1425,28 @@ static int write_bitmap1(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char abuf[4096+512];
+       char *buf = (char*)(((long)(abuf+512))&~511UL);
 
        locate_bitmap1(st, fd);
 
-       if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
+       memset(buf, 0xff, 4096);
+       memcpy(buf, ((char*)sb)+1024, sizeof(bitmap_super_t));
+
        towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
        towrite = (towrite+7) >> 3; /* bits to bytes */
-       memset(buf, 0xff, sizeof(buf));
+       towrite += sizeof(bitmap_super_t);
+       towrite = ROUND_UP(towrite, 512);
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -1416,6 +1462,38 @@ static void free_super1(struct supertype *st)
        st->sb = NULL;
 }
 
+/* Decide whether a v1.x array of this geometry can be created; when subdev
+ * is named, also set *freesize from its size.  Returns 1 if OK, else 0. */
+static int validate_geometry1(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       unsigned long long devbytes;
+       int fd, sized;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       if (subdev == NULL)
+               return 1;
+
+       fd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super1.x cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+       sized = get_dev_size(fd, subdev, &devbytes);
+       close(fd);      /* the descriptor is only needed for the size query */
+       if (!sized)
+               return 0;
+
+       *freesize = avail_size1(st, devbytes >> 9);
+       return 1;
+}
+
 struct superswitch super1 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super1,
@@ -1424,6 +1502,7 @@ struct superswitch super1 = {
        .detail_super = detail_super1,
        .brief_detail_super = brief_detail_super1,
        .export_detail_super = export_detail_super1,
+       .write_init_super = write_init_super1,
 #endif
        .match_home = match_home1,
        .uuid_from_super = uuid_from_super1,
@@ -1432,7 +1511,6 @@ struct superswitch super1 = {
        .init_super = init_super1,
        .add_to_super = add_to_super1,
        .store_super = store_super1,
-       .write_init_super = write_init_super1,
        .compare_super = compare_super1,
        .load_super = load_super1,
        .match_metadata_desc = match_metadata_desc1,
@@ -1441,7 +1519,7 @@ struct superswitch super1 = {
        .locate_bitmap = locate_bitmap1,
        .write_bitmap = write_bitmap1,
        .free_super = free_super1,
-       .major = 1,
+       .validate_geometry = validate_geometry1,
 #if __BYTE_ORDER == BIG_ENDIAN
        .swapuuid = 0,
 #else
diff --git a/sysfs.c b/sysfs.c
index 0255f8825e9ef12af2f485a1e0ee247066bec388..0ea17eb9d00b0020192930bae809eab6ad830c15 100644 (file)
--- a/sysfs.c
+++ b/sysfs.c
@@ -34,10 +34,10 @@ int load_sys(char *path, char *buf)
                return -1;
        n = read(fd, buf, 1024);
        close(fd);
-       if (n <=0 || n >= 1024)
+       if (n <0 || n >= 1024)
                return -1;
        buf[n] = 0;
-       if (buf[n-1] == '\n')
+       if (n && buf[n-1] == '\n')
                buf[n-1] = 0;
        return 0;
 }
@@ -56,6 +56,23 @@ void sysfs_free(struct mdinfo *sra)
        }
 }
 
+/* Open /sys/block/<array>/md/[<devname>/]<attr> read-write, retrying read-only
+ * on EACCES.  Returns the fd, or -1 on failure or if the path is too long. */
+int sysfs_open(int devnum, char *devname, char *attr)
+{
+       char fname[256];
+       int fd, n;
+
+       n = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s%s%s",
+                    devnum2devname(devnum), devname ? devname : "",
+                    devname ? "/" : "", attr);
+       if (n < 0 || n >= (int)sizeof(fname))
+               return -1;
+       if ((fd = open(fname, O_RDWR)) < 0 && errno == EACCES)
+               fd = open(fname, O_RDONLY);
+       return fd;
+}
+
 struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
 {
        /* Longest possible name in sysfs, mounted at /sys, is
@@ -69,7 +86,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
        char *dbase;
        struct mdinfo *sra;
        struct mdinfo *dev;
-       DIR *dir;
+       DIR *dir = NULL;
        struct dirent *de;
 
        sra = malloc(sizeof(*sra));
@@ -111,10 +128,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        sra->array.major_version = -1;
                        sra->array.minor_version = -2;
                        strcpy(sra->text_version, buf+9);
-               } else
+               } else {
                        sscanf(buf, "%d.%d",
                               &sra->array.major_version,
                               &sra->array.minor_version);
+                       strcpy(sra->text_version, buf);
+               }
        }
        if (options & GET_LEVEL) {
                strcpy(base, "level");
@@ -128,6 +147,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        goto abort;
                sra->array.layout = strtoul(buf, NULL, 0);
        }
+       if (options & GET_DISKS) {
+               strcpy(base, "raid_disks");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->array.raid_disks = strtoul(buf, NULL, 0);
+       }
        if (options & GET_COMPONENT) {
                strcpy(base, "component_size");
                if (load_sys(fname, buf))
@@ -203,7 +228,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        strcpy(dbase, "size");
                        if (load_sys(fname, buf))
                                goto abort;
-                       dev->component_size = strtoull(buf, NULL, 0);
+                       dev->component_size = strtoull(buf, NULL, 0) * 2;
                }
                if (options & GET_STATE) {
                        dev->disk.state = 0;
@@ -224,9 +249,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        dev->errors = strtoul(buf, NULL, 0);
                }
        }
+       closedir(dir);
        return sra;
 
  abort:
+       if (dir)
+               closedir(dir);
        sysfs_free(sra);
        return NULL;
 }
@@ -267,6 +295,7 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
        char fname[50];
        int n;
        int fd;
+
        sprintf(fname, "/sys/block/%s/md/%s/%s",
                sra->sys_name, dev?dev->sys_name:"", name);
        fd = open(fname, O_WRONLY);
@@ -310,3 +339,240 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                return -1;
        return 0;
 }
+
+/* Copy the array description in info->array into sra->array and push
+ * the geometry to the array's md sysfs attributes (level, raid_disks,
+ * chunk_size, layout, component_size, resync_start).
+ *
+ * Returns 0 when every write reported success, otherwise the OR of
+ * the non-zero results.  Arrays with a negative level are recorded in
+ * sra->array but nothing is written for them.
+ */
+int sysfs_set_array(struct mdinfo *sra,
+                   struct mdinfo *info)
+{
+       int rv = 0;
+       sra->array = info->array;
+
+       /* negative levels are not handled yet: report success without
+        * touching sysfs */
+       if (info->array.level < 0)
+               return 0; /* FIXME */
+       /* the level is written by name, as looked up in the 'pers' table */
+       rv |= sysfs_set_str(sra, NULL, "level",
+                           map_num(pers, info->array.level));
+       rv |= sysfs_set_num(sra, NULL, "raid_disks", info->array.raid_disks);
+       rv |= sysfs_set_num(sra, NULL, "chunk_size", info->array.chunk_size);
+       rv |= sysfs_set_num(sra, NULL, "layout", info->array.layout);
+       /* the size is halved on the way out; presumably info holds
+        * 512-byte sectors and sysfs wants KiB - TODO confirm */
+       rv |= sysfs_set_num(sra, NULL, "component_size", info->component_size/2);
+       rv |= sysfs_set_num(sra, NULL, "resync_start", info->resync_start);
+       /* NOTE(review): repeats the assignment made at the top of the
+        * function */
+       sra->array = info->array;
+       return rv;
+}
+
+/* Add the component device described by 'sd' to the array 'sra'.
+ *
+ * "major:minor" is written to the array's new_dev attribute, the
+ * kernel name of the device is taken from the last component of the
+ * /sys/dev/block/<major>:<minor> symlink to build sd->sys_name
+ * ("dev-<name>"), and then the per-device offset and size attributes
+ * are set (plus slot, unless the array level is LEVEL_CONTAINER).
+ * On success a copy of 'sd' is pushed onto the front of sra->devs.
+ *
+ * Returns 0 on success and non-zero on any failure.
+ */
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd)
+{
+       char dv[100];
+       char nm[100];
+       struct mdinfo *sd2;
+       char *dname;
+       int rv;
+
+       sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor);
+       rv = sysfs_set_str(sra, NULL, "new_dev", dv);
+       if (rv)
+               return rv;
+
+       memset(nm, 0, sizeof(nm));
+       sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor);
+       /* readlink() does not NUL-terminate and may fill the whole
+        * buffer, so keep one byte back for the terminator */
+       rv = readlink(dv, nm, sizeof(nm) - 1);
+       if (rv <= 0)
+               return -1;
+       nm[rv] = '\0';
+       dname = strrchr(nm, '/');
+       if (!dname)
+               return -1;
+       dname++;
+       /* bounded copy: a truncated name would address the wrong sysfs
+        * directory, so treat it as a failure */
+       rv = snprintf(sd->sys_name, sizeof(sd->sys_name), "dev-%s", dname);
+       if (rv < 0 || (size_t)rv >= sizeof(sd->sys_name))
+               return -1;
+
+       rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
+       /* size is halved, rounding up, before it is written */
+       rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
+       if (sra->array.level != LEVEL_CONTAINER) {
+               rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+//             rv |= sysfs_set_str(sra, sd, "state", "in_sync");
+       }
+       if (! rv) {
+               sd2 = malloc(sizeof(*sd2));
+               if (!sd2)
+                       return -1;
+               *sd2 = *sd;
+               sd2->next = sra->devs;
+               sra->devs = sd2;
+       }
+       return rv;
+}
+
+#if 0
+/* Given an open block device, locate the "scsi_generic:*" entry under
+ * /sys/dev/block/<major>:<minor>/device, read the major:minor stored
+ * in its "dev" file, create a temporary character-device node for it
+ * under /dev, open that node read-only and unlink it again.
+ * Returns the new fd, or a negative value on failure.
+ *
+ * NOTE(review): this function sits inside an "#if 0" region, so it is
+ * not compiled.  The defects marked below need fixing before it is
+ * enabled.
+ */
+int sysfs_disk_to_sg(int fd)
+{
+       /* from an open block device, try find and open its corresponding
+        * scsi_generic interface
+        */
+       struct stat st;
+       char path[256];
+       char sg_path[256];
+       char sg_major_minor[8];
+       char *c;
+       DIR *dir;
+       struct dirent *de;
+       int major, minor, rv;
+
+       if (fstat(fd, &st))
+               return -1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+
+       /* stop at the first entry whose name starts with "scsi_generic:" */
+       de = readdir(dir);
+       while (de) {
+               if (strncmp("scsi_generic:", de->d_name,
+                           strlen("scsi_generic:")) == 0)
+                       break;
+               de = readdir(dir);
+       }
+       closedir(dir);
+
+       if (!de)
+               return -1;
+
+       /* NOTE(review): de->d_name is read here after closedir(dir) */
+       snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name);
+       fd = open(sg_path, O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       rv = read(fd, sg_major_minor, sizeof(sg_major_minor));
+       close(fd);
+       if (rv < 0)
+               return -1;
+       else
+               /* overwrites the last byte read (presumably a newline).
+                * NOTE(review): rv == 0 makes this index -1 */
+               sg_major_minor[rv - 1] = '\0';
+
+       /* split "major:minor" at the colon.
+        * NOTE(review): the strchr() result is not checked for NULL */
+       c = strchr(sg_major_minor, ':');
+       *c = '\0';
+       c++;
+       major = strtol(sg_major_minor, NULL, 10);
+       minor = strtol(c, NULL, 10);
+       /* the pid is part of the node name */
+       snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d",
+                (int) getpid(), major, minor);
+       if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) {
+                       fd = open(path, O_RDONLY);
+                       unlink(path);
+                       return fd;
+       }
+
+       return -1;
+}
+#endif
+
+/* From an open block device, try to retrieve its scsi_id.
+ *
+ * The "scsi_disk:a:b:c:d" entry under
+ * /sys/dev/block/<major>:<minor>/device is located and its four
+ * numeric fields are packed into *id as a<<24 | b<<16 | c<<8 | d.
+ *
+ * Returns 0 on success.  Returns 1 if the device cannot be examined,
+ * no such entry exists, or the name does not have four fields; *id is
+ * left untouched in that case.
+ */
+int sysfs_disk_to_scsi_id(int fd, __u32 *id)
+{
+       struct stat st;
+       char path[256];
+       char name[256];
+       char *c1, *c2;
+       DIR *dir;
+       struct dirent *de;
+       __u32 val;
+
+       if (fstat(fd, &st))
+               return 1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return 1;
+
+       name[0] = '\0';
+       while ((de = readdir(dir)) != NULL) {
+               if (strncmp("scsi_disk:", de->d_name,
+                           strlen("scsi_disk:")) == 0) {
+                       /* keep a private copy: the dirent must not be
+                        * used once the directory has been closed */
+                       snprintf(name, sizeof(name), "%s", de->d_name);
+                       break;
+               }
+       }
+       closedir(dir);
+
+       if (!name[0])
+               return 1;
+
+       /* the prefix match above guarantees there is a first ':' */
+       c1 = strchr(name, ':');
+       c1++;
+       c2 = strchr(c1, ':');
+       if (!c2)
+               return 1;
+       *c2 = '\0';
+       val = strtol(c1, NULL, 10) << 24; /* host */
+       c1 = c2 + 1;
+       c2 = strchr(c1, ':');
+       if (!c2)
+               return 1;
+       *c2 = '\0';
+       val |= strtol(c1, NULL, 10) << 16; /* channel */
+       c1 = c2 + 1;
+       c2 = strchr(c1, ':');
+       if (!c2)
+               return 1;
+       *c2 = '\0';
+       /* NOTE(review): sysfs names are usually host:channel:target:lun,
+        * so the "lun"/"id" labels on the next two fields may be
+        * swapped - confirm.  The packing itself is unchanged. */
+       val |= strtol(c1, NULL, 10) << 8; /* lun */
+       c1 = c2 + 1;
+       val |= strtol(c1, NULL, 10); /* id */
+
+       *id = val;
+       return 0;
+}
+
+
+/* Check that devnum is a holder of rdev, and is the only holder.
+ * We should be locked against races by an O_EXCL on devnum.
+ *
+ * Every entry in /sys/dev/block/<major>:<minor>/holders has its "dev"
+ * file read and converted to a device number (the minor for MD_MAJOR,
+ * -1-(minor>>6) for any other major), which must equal devnum.
+ *
+ * Returns 1 if at least one holder was found and all of them are
+ * devnum, otherwise 0.  errno is set to EEXIST when a different
+ * holder was seen, and to ENOENT when the holders directory or a
+ * holder's "dev" file could not be read or parsed.
+ */
+int sysfs_unique_holder(int devnum, long rdev)
+{
+       DIR *dir;
+       struct dirent *de;
+       char dirname[100];
+       size_t l;
+       int found = 0;
+       sprintf(dirname, "/sys/dev/block/%d:%d/holders",
+               major(rdev), minor(rdev));
+       dir = opendir(dirname);
+       errno = ENOENT;
+       if (!dir)
+               return 0;
+       l = strlen(dirname);
+       while ((de = readdir(dir)) != NULL) {
+               char buf[10];
+               int n;
+               int mj, mn;
+               char c;
+               int fd;
+
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* bounded append of "/<holder>/dev" after the directory */
+               n = snprintf(dirname+l, sizeof(dirname)-l, "/%s/dev",
+                            de->d_name);
+               if (n < 0 || (size_t)n >= sizeof(dirname)-l) {
+                       errno = ENOENT;
+                       break;
+               }
+               fd = open(dirname, O_RDONLY);
+               if (fd < 0) {
+                       errno = ENOENT;
+                       break;
+               }
+               n = read(fd, buf, sizeof(buf)-1);
+               close(fd);
+               /* a failed read must not be used as an index into buf */
+               if (n < 0) {
+                       errno = ENOENT;
+                       break;
+               }
+               buf[n] = 0;
+               /* expect exactly "major:minor\n" */
+               if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 ||
+                   c != '\n') {
+                       errno = ENOENT;
+                       break;
+               }
+               if (mj != MD_MAJOR)
+                       mn = -1-(mn>>6);
+
+               if (devnum != mn) {
+                       errno = EEXIST;
+                       break;
+               }
+               found = 1;
+       }
+       closedir(dir);
+       /* de is non-NULL only when the loop was left early via 'break' */
+       if (de)
+               return 0;
+       else
+               return found;
+}
diff --git a/test b/test
index 1a79bab42295102a8731d854d75e3997b32f1bf1..bd8d279360d6a83aa910ab7f9276a0a66bb39931 100644 (file)
--- a/test
+++ b/test
@@ -174,6 +174,8 @@ do
   if [ -f "$script" ]
   then
    rm -f $targetdir/stderr
+   # stop all arrays, just in case some script left an array active.
+   mdadm -Ssq
    # source script in a subshell, so it has access to our
    # namespace, but cannot change it.
    if ( set -ex ; . $script )  2> $targetdir/log
diff --git a/util.c b/util.c
index 75f370644124351c6279d3a0b194d7416745604f..3bf4cbe31d832eb124f5bf37315ec67f70ce912c 100644 (file)
--- a/util.c
+++ b/util.c
 
 #include       "mdadm.h"
 #include       "md_p.h"
+#include       <sys/socket.h>
 #include       <sys/utsname.h>
+#include       <sys/wait.h>
+#include       <sys/un.h>
 #include       <ctype.h>
+#include       <dirent.h>
+#include       <signal.h>
 
 /*
  * following taken from linux/blkpg.h because they aren't
@@ -389,6 +394,9 @@ int is_standard(char *dev, int *nump)
        /* tests if dev is a "standard" md dev name.
         * i.e if the last component is "/dNN" or "/mdNN",
         * where NN is a string of digits
+        * Returns 1 if a partitionable standard,
+        *   -1 if non-partitionable,
+        *   0 if not a standard name.
         */
        char *d = strrchr(dev, '/');
        int type=0;
@@ -608,6 +616,23 @@ char *human_size_brief(long long bytes)
 }
 #endif
 
+/* Work out the usable size of an array from its geometry.
+ *
+ * devsize is first rounded down to a multiple of (chunksize >> 9) and
+ * then multiplied by the number of data-bearing devices for the given
+ * level:
+ *   0: raid_disks      1: 1      4,5: raid_disks-1      6: raid_disks-2
+ *   10: raid_disks divided by the low byte of layout and by the next
+ *       byte of layout
+ * The rounding is exact only when (chunksize >> 9) is a power of two.
+ * devsize is presumably in 512-byte sectors and chunksize in bytes -
+ * confirm against callers.
+ *
+ * Returns 0 for any other level, and for level 10 when either layout
+ * byte is zero.
+ */
+unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize)
+{
+       int data_disks = 0;
+       int chunk_sectors = chunksize >> 9;
+       int copies_lo = layout & 255;
+       int copies_hi = (layout>>8) & 255;
+
+       switch (level) {
+       case 0: data_disks = raid_disks; break;
+       case 1: data_disks = 1; break;
+       case 4:
+       case 5: data_disks = raid_disks - 1; break;
+       case 6: data_disks = raid_disks - 2; break;
+       case 10:
+               /* avoid dividing by zero on a malformed layout */
+               if (copies_lo && copies_hi)
+                       data_disks = raid_disks / copies_lo / copies_hi;
+               break;
+       }
+       /* Without a chunk size (e.g. chunksize == 0) the mask would be
+        * ~(0-1) == 0 and wipe devsize, so only round when there is at
+        * least one sector per chunk. */
+       if (chunk_sectors > 0)
+               devsize &= ~(unsigned long long)(chunk_sectors - 1);
+       return data_disks * devsize;
+}
+
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
 int get_mdp_major(void)
 {
@@ -693,21 +718,6 @@ void put_md_name(char *name)
                unlink(name);
 }
 
-static int dev2major(int d)
-{
-       if (d >= 0)
-               return MD_MAJOR;
-       else
-               return get_mdp_major();
-}
-
-static int dev2minor(int d)
-{
-       if (d >= 0)
-               return d;
-       return (-1-d) << MdpMinorShift;
-}
-
 int find_free_devnum(int use_partitions)
 {
        int devnum;
@@ -749,19 +759,38 @@ int dev_open(char *dev, int flags)
        if (e > dev && *e == ':' && e[1] &&
            (minor = strtoul(e+1, &e, 0)) >= 0 &&
            *e == 0) {
-               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor);
+               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
+                        (int)getpid(), major, minor);
                if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) {
-                       fd = open(devname, flags);
+                       fd = open(devname, flags|O_DIRECT);
                        unlink(devname);
                }
        } else
-               fd = open(dev, flags);
+               fd = open(dev, flags|O_DIRECT);
        return fd;
 }
 
-struct superswitch *superlist[] = { &super0, &super1, NULL };
+/* Open the md device 'devnum' for exclusive read/write access.
+ *
+ * The device is named as "major:minor" and opened through dev_open().
+ * While the open fails with EBUSY it is retried, for at most 25
+ * attempts with a 200ms pause after each failure.
+ *
+ * Returns the open fd on success, the failing result of dev_open()
+ * for any error other than EBUSY, or -1 once the retries run out.
+ */
+int open_dev_excl(int devnum)
+{
+       char buf[20];
+       int i;
+
+       sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum));
+       for (i=0 ; i<25 ; i++) {
+               int fd = dev_open(buf, O_RDWR|O_EXCL);
+               if (fd >= 0)
+                       return fd;
+               /* only a busy device is worth waiting for */
+               if (errno != EBUSY)
+                       return fd;
+               usleep(200000);
+       }
+       return -1;
+}
+
+struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
 
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+
 struct supertype *super_by_fd(int fd)
 {
        mdu_array_info_t array;
@@ -772,6 +801,7 @@ struct supertype *super_by_fd(int fd)
        char *verstr;
        char version[20];
        int i;
+       char *subarray = NULL;
 
        sra = sysfs_read(fd, 0, GET_VERSION);
 
@@ -791,40 +821,56 @@ struct supertype *super_by_fd(int fd)
                sprintf(version, "%d.%d", vers, minor);
                verstr = version;
        }
+       if (minor == -2 && verstr[0] == '/') {
+               char *dev = verstr+1;
+               subarray = strchr(dev, '/');
+               int devnum;
+               if (subarray)
+                       *subarray++ = '\0';
+               devnum = devname2devnum(dev);
+               subarray = strdup(subarray);
+               if (sra)
+                       sysfs_free(sra);
+               sra = sysfs_read(-1, devnum, GET_VERSION);
+               verstr = sra->text_version ? : "-no-metadata-";
+       }
+
        for (i = 0; st == NULL && superlist[i] ; i++)
                st = superlist[i]->match_metadata_desc(verstr);
 
        if (sra)
                sysfs_free(sra);
-       if (st)
+       if (st) {
                st->sb = NULL;
+               if (subarray) {
+                       strncpy(st->subarray, subarray, 32);
+                       st->subarray[31] = 0;
+                       free(subarray);
+               } else
+                       st->subarray[0] = 0;
+       }
        return st;
 }
 #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
 
 
-struct supertype *dup_super(struct supertype *st)
+struct supertype *dup_super(struct supertype *orig)
 {
-       struct supertype *stnew = NULL;
-       char *verstr = NULL;
-       char version[20];
-       int i;
+       struct supertype *st;
 
+       if (!orig)
+               return orig;
+       st = malloc(sizeof(*st));
        if (!st)
                return st;
-
-       if (st->minor_version == -1)
-               sprintf(version, "%d", st->ss->major);
-       else
-               sprintf(version, "%d.%d", st->ss->major, st->minor_version);
-       verstr = version;
-
-       for (i = 0; stnew == NULL && superlist[i] ; i++)
-               stnew = superlist[i]->match_metadata_desc(verstr);
-
-       if (stnew)
-               stnew->sb = NULL;
-       return stnew;
+       memset(st, 0, sizeof(*st));
+       st->ss = orig->ss;
+       st->max_devs = orig->max_devs;
+       st->minor_version = orig->minor_version;
+       strcpy(st->subarray, orig->subarray);
+       st->sb = NULL;
+       st->info = NULL;
+       return st;
 }
 
 struct supertype *guess_super(int fd)
@@ -839,11 +885,10 @@ struct supertype *guess_super(int fd)
        int i;
 
        st = malloc(sizeof(*st));
-       memset(st, 0, sizeof(*st));
        for (i=0 ; superlist[i]; i++) {
                int rv;
                ss = superlist[i];
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = ss->load_super(st, fd, NULL);
                if (rv == 0) {
                        struct mdinfo info;
@@ -858,7 +903,7 @@ struct supertype *guess_super(int fd)
        }
        if (bestsuper != -1) {
                int rv;
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = superlist[bestsuper]->load_super(st, fd, NULL);
                if (rv == 0) {
                        superlist[bestsuper]->free_super(st);
@@ -906,6 +951,236 @@ void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk)
                        return;
 }
 
+/* 'fd' is a block device.  Find out if it is in use
+ * by a container, and return an open fd on that container.
+ *
+ * The entries of /sys/dev/block/<major>:<minor>/holders are walked;
+ * for each one the "dev" file is read and the device it names is
+ * opened read-only via dev_open().  The first holder that opens
+ * successfully is returned.
+ * NOTE(review): nothing here checks that the holder really is a
+ * container.
+ *
+ * Returns an open fd, or -1 if fd cannot be examined or no holder
+ * could be opened.
+ */
+int open_container(int fd)
+{
+       char path[256];
+       char *e;
+       size_t room;
+       DIR *dir;
+       struct dirent *de;
+       int dfd, n;
+       char buf[200];
+       int major, minor;
+       struct stat st;
+
+       if (fstat(fd, &st) != 0)
+               return -1;
+       sprintf(path, "/sys/dev/block/%d:%d/holders",
+               (int)major(st.st_rdev), (int)minor(st.st_rdev));
+       /* 'e' marks where each holder's "/<name>/dev" gets appended */
+       e = path + strlen(path);
+       room = sizeof(path) - (size_t)(e - path);
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+       while ((de = readdir(dir))) {
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* skip names that would not fit rather than overrun path */
+               n = snprintf(e, room, "/%s/dev", de->d_name);
+               if (n < 0 || (size_t)n >= room)
+                       continue;
+               dfd = open(path, O_RDONLY);
+               if (dfd < 0)
+                       continue;
+               n = read(dfd, buf, sizeof(buf));
+               close(dfd);
+               /* a completely full buffer leaves no room for the NUL */
+               if (n <= 0 || n >= sizeof(buf))
+                       continue;
+               buf[n] = 0;
+               if (sscanf(buf, "%d:%d", &major, &minor) != 2)
+                       continue;
+               /* rebuild a clean "major:minor" string for dev_open() */
+               sprintf(buf, "%d:%d", major, minor);
+               dfd = dev_open(buf, O_RDONLY);
+               if (dfd >= 0) {
+                       closedir(dir);
+                       return dfd;
+               }
+       }
+       closedir(dir);
+       return -1;
+}
+
+/* Convert an md device number to its kernel name.
+ * Non-negative numbers map to "mdN"; negative numbers map to "md_dM"
+ * with M = -1-num.  This is the inverse of devname2devnum().
+ *
+ * Returns a newly allocated string which the caller must free(), or
+ * NULL if the allocation fails.
+ */
+char *devnum2devname(int num)
+{
+       char name[100];
+       /* device 0 is "md0": only negative numbers use the md_d form */
+       if (num >= 0)
+               sprintf(name, "md%d", num);
+       else
+               sprintf(name, "md_d%d", -1-num);
+       return strdup(name);
+}
+
+/* Convert an md device name to a device number.
+ * A name starting with "md_d" gives -1-N, where N is the decimal
+ * number that follows; any other name is parsed as a decimal number
+ * starting at name+2 (i.e. after an assumed two-character "md"
+ * prefix) and gives N.
+ *
+ * NOTE(review): the "md" prefix is not verified, 'ep' is never
+ * examined so trailing junk is accepted, and a name shorter than two
+ * characters is read past its end.
+ */
+int devname2devnum(char *name)
+{
+       char *ep;
+       int num;
+       if (strncmp(name, "md_d", 4)==0)
+               num = -1-strtoul(name+4, &ep, 10);
+       else
+               num = strtoul(name+2, &ep, 10);
+       return num;
+}
+
+/* Map an open file descriptor to an md device number.
+ * For a block device whose major is MD_MAJOR this is the minor
+ * number; for any other block device it is -1-(minor>>6).
+ *
+ * Returns -1 if fstat() fails or fd is not a block device.
+ * NOTE(review): -1 is also the result for a non-MD_MAJOR device with
+ * a minor below 64, so callers cannot tell that case from failure.
+ * The major of a non-MD_MAJOR device is not checked either.
+ */
+int fd2devnum(int fd)
+{
+       struct stat stb;
+       if (fstat(fd, &stb) == 0 &&
+           (S_IFMT&stb.st_mode)==S_IFBLK) {
+               if (major(stb.st_rdev) == MD_MAJOR)
+                       return minor(stb.st_rdev);
+               else
+                       return -1- (minor(stb.st_rdev)>>6);
+       }
+       return -1;
+}
+
+/* Report whether an mdmon process is running for the array 'devnum'.
+ *
+ * The pid is read from /var/run/mdadm/<devname>.pid and probed with
+ * kill(pid, 0).
+ *
+ * Returns 1 if the probe succeeds.  Returns 0 if the pid file is
+ * missing, empty or does not hold a positive pid, or if the probe
+ * fails.
+ */
+int mdmon_running(int devnum)
+{
+       char path[100];
+       char pid[10];
+       char *name;
+       int fd;
+       int n;
+
+       /* devnum2devname() returns allocated memory: free it once used */
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, sizeof(pid) - 1);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       /* read() does not terminate the buffer */
+       pid[n] = '\0';
+       /* pid 0 or a negative pid would address process groups */
+       if (atoi(pid) <= 0)
+               return 0;
+       if (kill(atoi(pid), 0) == 0)
+               return 1;
+       return 0;
+}
+
+/* Send SIGUSR1 to the mdmon process managing the array 'devnum'.
+ *
+ * The pid is read from /var/run/mdadm/<devname>.pid.
+ *
+ * Returns 1 if the signal was delivered.  Returns 0 if the pid file
+ * is missing, empty or does not hold a positive pid, or if kill()
+ * fails.
+ */
+int signal_mdmon(int devnum)
+{
+       char path[100];
+       char pid[10];
+       char *name;
+       int fd;
+       int n;
+
+       /* devnum2devname() returns allocated memory: free it once used */
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, sizeof(pid) - 1);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       /* read() does not terminate the buffer */
+       pid[n] = '\0';
+       /* never signal pid 0 (our own process group) or a negative pid
+        * because of a corrupt pid file */
+       if (atoi(pid) <= 0)
+               return 0;
+       if (kill(atoi(pid), SIGUSR1) == 0)
+               return 1;
+       return 0;
+}
+
+/* Start an mdmon process for the array 'devnum'.
+ *
+ * Nothing is done (and 0 is returned) when env_no_mdmon() says mdmon
+ * is disabled.  Otherwise a child is forked which closes descriptors
+ * 3..99 and tries to exec, in order: "mdmon" in the directory of the
+ * running executable, "/sbin/mdmon", and "mdmon".  The device name
+ * from map_dev() is passed as the only argument.  The parent waits
+ * for that child.
+ *
+ * Returns 0 on success (or when disabled), -1 if fork() fails or the
+ * child does not report a zero status.
+ */
+int start_mdmon(int devnum)
+{
+       int i;
+       int len;
+       pid_t pid;
+       int status;
+       char pathbuf[1024];
+       char *paths[4] = {
+               pathbuf,
+               "/sbin/mdmon",
+               "mdmon",
+               NULL
+       };
+
+       if (env_no_mdmon())
+               return 0;
+
+       /* readlink() does not NUL-terminate and may fill the buffer:
+        * keep one byte back for the terminator */
+       len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf) - 1);
+       if (len > 0) {
+               char *sl;
+               pathbuf[len] = 0;
+               sl = strrchr(pathbuf, '/');
+               if (sl)
+                       sl++;
+               else
+                       sl = pathbuf;
+               /* replace our own basename with "mdmon" if it fits */
+               if ((size_t)(sl - pathbuf) + sizeof("mdmon") <= sizeof(pathbuf))
+                       strcpy(sl, "mdmon");
+               else
+                       pathbuf[0] = '\0';
+       } else
+               pathbuf[0] = '\0';
+
+       pid = fork();
+       switch(pid) {
+       case 0:
+               /* FIXME yuk. CLOSE_EXEC?? */
+               for (i=3; i < 100; i++)
+                       close(i);
+               /* an empty pathbuf entry is skipped; execl() only
+                * returns when it failed, so fall through to the next */
+               for (i=0; paths[i]; i++)
+                       if (paths[i][0])
+                               execl(paths[i], "mdmon",
+                                     map_dev(dev2major(devnum),
+                                             dev2minor(devnum),
+                                             1), NULL);
+               exit(1);
+       case -1: fprintf(stderr, Name ": cannot run mdmon. "
+                        "Array remains readonly\n");
+               return -1;
+       default: /* parent - good */
+               /* wait for this child only, not whichever exits first */
+               if (waitpid(pid, &status, 0) < 0 || status != 0)
+                       return -1;
+       }
+       return 0;
+}
+
+/* Report whether starting mdmon has been disabled through the
+ * environment.
+ * Returns 1 when MDADM_NO_MDMON is set and atoi() of its value is 1,
+ * otherwise 0.
+ */
+int env_no_mdmon(void)
+{
+       char *val = getenv("MDADM_NO_MDMON");
+
+       if (val && atoi(val) == 1)
+               return 1;
+
+       return 0;
+}
+
+
+/* Send every queued metadata update in st->updates to the monitor of
+ * the container st->container_dev, then reset the queue.
+ *
+ * Each update is sent with send_message() and followed by
+ * wait_reply(); the entry and its buffer are freed once sent.  After
+ * the last update an ack() is sent and one more reply is awaited.
+ *
+ * Returns 0 once the queue has been sent.  Returns -1 if there was
+ * nothing queued (st->update_tail is reset to NULL in that case too)
+ * or if the monitor could not be reached (the queue is left
+ * untouched).
+ */
+int flush_metadata_updates(struct supertype *st)
+{
+       int sfd;
+       char *name;
+
+       if (!st->updates) {
+               st->update_tail = NULL;
+               return -1;
+       }
+
+       /* devnum2devname() returns allocated memory: release it once
+        * the connection attempt is over (assumes connect_monitor()
+        * does not keep the pointer - confirm) */
+       name = devnum2devname(st->container_dev);
+       if (!name)
+               return -1;
+       sfd = connect_monitor(name);
+       free(name);
+       if (sfd < 0)
+               return -1;
+
+       while (st->updates) {
+               struct metadata_update *mu = st->updates;
+               st->updates = mu->next;
+
+               send_message(sfd, mu, 0);
+               wait_reply(sfd, 0);
+               free(mu->buf);
+               free(mu);
+       }
+       ack(sfd, 0);
+       wait_reply(sfd, 0);
+       close(sfd);
+       st->update_tail = NULL;
+       return 0;
+}
+
+/* Queue a metadata update at the tail of st's pending-update list.
+ * A new struct metadata_update is allocated to carry 'buf' and 'len'
+ * and linked in through st->update_tail, which is then advanced to
+ * the new entry's 'next' field.  'buf' is stored, not copied.
+ *
+ * NOTE(review): the malloc() result is not checked, and
+ * st->update_tail is dereferenced without a NULL check, so callers
+ * presumably must have pointed it at the list first - confirm.
+ */
+void append_metadata_update(struct supertype *st, void *buf, int len)
+{
+
+       struct metadata_update *mu = malloc(sizeof(*mu));
+
+       mu->buf = buf;
+       mu->len = len;
+       mu->space = NULL;
+       mu->next = NULL;
+       /* link the new entry in, then move the tail pointer past it */
+       *st->update_tail = mu;
+       st->update_tail = &mu->next;
+}
+
+
 #ifdef __TINYC__
 /* tinyc doesn't optimize this check in ioctl.h out ... */
 unsigned int __invalid_size_argument_for_IOC = 0;