]> git.ipfire.org Git - thirdparty/mdadm.git/commitdiff
Merge branch 'master' into scratch-3.0
authorNeilBrown <neilb@suse.de>
Sun, 2 Nov 2008 19:40:12 +0000 (06:40 +1100)
committerNeilBrown <neilb@suse.de>
Sun, 2 Nov 2008 19:40:12 +0000 (06:40 +1100)
47 files changed:
ANNOUNCE-3.0-devel1 [new file with mode: 0644]
Assemble.c
Create.c
Detail.c
Examine.c
Grow.c
Incremental.c
Kill.c
Makefile
Manage.c
Monitor.c
Query.c
ReadMe.c
TODO
bitmap.c
config.c
crc32.c [new file with mode: 0644]
crc32.h [new file with mode: 0644]
inventory
kernel-patch-2.6.25 [new file with mode: 0644]
kernel-patch-2.6.27 [new file with mode: 0644]
managemon.c [new file with mode: 0644]
mapfile.c
md.4
mdadm.8
mdadm.c
mdadm.h
mdadm.spec
mdassemble.8
mdmon.c [new file with mode: 0644]
mdmon.h [new file with mode: 0644]
mdopen.c
mdstat.c
monitor.c [new file with mode: 0644]
msg.c [new file with mode: 0644]
msg.h [new file with mode: 0644]
restripe.c
sg_io.c [new file with mode: 0644]
super-ddf.c [new file with mode: 0644]
super-intel.c [new file with mode: 0644]
super0.c
super1.c
sysfs.c
test
tests/03r0assem
tests/03r5assemV1
util.c

diff --git a/ANNOUNCE-3.0-devel1 b/ANNOUNCE-3.0-devel1
new file mode 100644 (file)
index 0000000..89ed2e3
--- /dev/null
@@ -0,0 +1,84 @@
+Subject:  ANNOUNCE: mdadm 3.0-devel1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.0-devel1
+
+It is available at the usual places:
+   countrycode=xx.
+   http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://neil.brown.name/mdadm
+   http://neil.brown.name/git?p=mdadm
+
+Note that this is a "devel" release.  It is not intended for
+production use yet, but rather for testing and ongoing development.
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+  - DDF  - The SNIA standard format
+  - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+The manual pages have not yet been updated, but here is a brief outline.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata.  A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays.  These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5-device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+   mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+   mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+   mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+  for i in /dev/sd[abcde]
+  do mdadm -I $i
+  done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+   mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+   mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed.  The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions of mdadm are still supported and the kernel still performs
+the same updates it used to.  The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+Any testing and feedback will be greatly appreciated.
+
+NeilBrown  18th September 2008
+
index 79f091269e68f843cbf9bd03978b8c739f3cf80c..526b1d5a1aa67a9616f89b5c8f77bc3441d04d33 100644 (file)
@@ -139,6 +139,9 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        struct mdinfo info;
        char *avail;
        int nextspare = 0;
+       int uuid_for_name = 0;
+
+       memset(&info, 0, sizeof(info));
 
        if (get_linux_version() < 2004000)
                old_linux = 1;
@@ -178,6 +181,52 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        mddev ? mddev : "further assembly");
                return 1;
        }
+
+       /* if the configuration specifies a container then we use that to
+        * determine the devices and retrieve the array configuration
+        */
+#ifndef MDASSEMBLE
+       if (ident->container && ident->member) {
+               int cfd = open(ident->container, O_RDWR);
+               struct mdinfo *mdi;
+               struct supertype container;
+
+               if (verbose>0)
+                       fprintf(stderr, Name ": looking to assemble member array %s"
+                               " inside container %s\n", ident->member, ident->container);
+               if (cfd < 0) {
+                       if (verbose>0)
+                               fprintf(stderr, Name ": unable to open container %s: %s\n",
+                                       ident->container, strerror(errno));
+                       return 1;
+               }
+
+               mdi = sysfs_read(cfd, fd2devnum(cfd), GET_VERSION);
+               if (!mdi) {
+                       close(cfd);
+                       if (verbose>0)
+                               fprintf(stderr, Name ": unable to read container %s\n",
+                                       ident->container);
+                       return 1;
+               }
+               container.ss = find_metadata_methods(mdi->text_version);
+               sysfs_free(mdi);
+               if (!container.ss) {
+                       close(cfd);
+                       fprintf(stderr, Name ": %s uses unknown metadata: %s\n",
+                               ident->container, mdi->text_version);
+                       return 1;
+               }
+               if (container.ss->load_super(&container, cfd, ident->container)) {
+                       fprintf(stderr, Name ": Cannot load metadata for container %s\n",
+                               ident->container);
+                       return 1;
+               }
+
+               return Incremental_container(&container, ident->container,
+                                            verbose, runstop, ident->autof);
+       }
+#endif
        if (devlist == NULL)
                devlist = conf_get_devs();
        else if (mdfd >= 0)
@@ -294,15 +343,28 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                if (mdfd < 0) {
                        if (tst == NULL || tst->sb == NULL)
                                continue;
-                       if (update == NULL &&
-                           tst->ss->match_home(tst, homehost)==0) {
+                       switch(tst->ss->match_home(tst, homehost))
+                       {
+                       case 1: /* happy with match. */
+                               break;
+                       case -1: /* cannot match */
+                               uuid_for_name = 1;
+                               break;
+                       case 0: /* Doesn't match */
+                               if (update)
+                                       /* We are changing the name*/
+                                       break;
                                if ((inargv && verbose >= 0) || verbose > 0)
-                                       fprintf(stderr, Name ": %s is not built for host %s.\n",
+                                       fprintf(stderr, Name ": %s is not built for "
+                                               "host %s - using UUID for "
+                                               "device name.\n",
                                                devname, homehost);
+                               
                                /* Auto-assemble, and this is not a usable host */
                                /* if update != NULL, we are updating the host
                                 * name... */
-                               goto loop;
+                               uuid_for_name = 1;
+                               break;
                        }
                }
                /* If we are this far, then we are nearly commited to this device.
@@ -337,7 +399,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                        if (homehost) {
                                int first = st->ss->match_home(st, homehost);
                                int last = tst->ss->match_home(tst, homehost);
-                               if (first+last == 1) {
+                               if (first != last &&
+                                   (first == 1 || last == 1)) {
                                        /* We can do something */
                                        if (first) {/* just ignore this one */
                                                if ((inargv && verbose >= 0) || verbose > 0)
@@ -378,18 +441,28 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                 */
                mdu_array_info_t inf;
                char *c;
+               char nbuf[64];
+               int rc;
+
                if (!st || !st->sb) {
                        return 2;
                }
                st->ss->getinfo_super(st, &info);
-               c = strchr(info.name, ':');
-               if (c) c++; else c= info.name;
+               if (uuid_for_name)
+                       c = fname_from_uuid(st, &info, nbuf, '-');
+               else {
+                       c = strchr(info.name, ':');
+                       if (c) c++; else c= info.name;
+               }
                if (isdigit(*c) && ((ident->autof & 7)==4 || (ident->autof&7)==6))
                        /* /dev/md/d0 style for partitionable */
-                       asprintf(&mddev, "/dev/md/d%s", c);
+                       rc = asprintf(&mddev, "/dev/md/d%s", c);
+               else
+                       rc = asprintf(&mddev, "/dev/md/%s", c);
+               if (rc < 0)
+                       mdfd = -1;
                else
-                       asprintf(&mddev, "/dev/md/%s", c);
-               mdfd = open_mddev(mddev, ident->autof);
+                       mdfd = open_mddev(mddev, ident->autof);
                if (mdfd < 0) {
                        st->ss->free_super(st);
                        free(devices);
@@ -542,8 +615,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                            == devices[devcnt].i.events
                            && (devices[best[i]].i.disk.minor
                                != devices[devcnt].i.disk.minor)
-                           && st->ss->major == 0
-                           && info.array.level != -4) {
+                           && st->ss == &super0
+                           && info.array.level != LEVEL_MULTIPATH) {
                                /* two different devices with identical superblock.
                                 * Could be a mis-detection caused by overlapping
                                 * partitions.  fail-safe.
@@ -736,6 +809,9 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                return 1;
        }
        st->ss->getinfo_super(st, &info);
+#ifndef MDASSEMBLE
+       sysfs_init(&info, mdfd, 0);
+#endif
        for (i=0; i<bestcnt; i++) {
                int j = best[i];
                unsigned int desired_state;
@@ -845,17 +921,10 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
        /* Almost ready to actually *do* something */
        if (!old_linux) {
                int rv;
-               if ((vers % 100) >= 1) { /* can use different versions */
-                       mdu_array_info_t inf;
-                       memset(&inf, 0, sizeof(inf));
-                       inf.major_version = st->ss->major;
-                       inf.minor_version = st->minor_version;
-                       rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
-               } else
-                       rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
 
+               rv = set_array_info(mdfd, st, &info);
                if (rv) {
-                       fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+                       fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                mddev, strerror(errno));
                        if (must_close) close(mdfd);
                        return 1;
@@ -895,8 +964,9 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                j = chosen_drive;
 
                        if (j >= 0 /* && devices[j].uptodate */) {
-                               if (ioctl(mdfd, ADD_NEW_DISK,
-                                         &devices[j].i.disk)!=0) {
+                               rv = add_disk(mdfd, st, &info, &devices[j].i);
+
+                               if (rv) {
                                        fprintf(stderr, Name ": failed to add "
                                                        "%s to %s: %s\n",
                                                devices[j].devname,
@@ -918,6 +988,21 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                        i, mddev);
                }
 
+               if (info.array.level == LEVEL_CONTAINER) {
+                       if (verbose >= 0) {
+                               fprintf(stderr, Name ": Container %s has been "
+                                       "assembled with %d drive%s",
+                                       mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s");
+                               if (okcnt < info.array.raid_disks)
+                                       fprintf(stderr, " (out of %d)",
+                                               info.array.raid_disks);
+                               fprintf(stderr, "\n");
+                       }
+                       if (must_close)
+                               close(mdfd);
+                       return 0;
+               }
+
                if (runstop == 1 ||
                    (runstop <= 0 &&
                     ( enough(info.array.level, info.array.raid_disks,
@@ -940,7 +1025,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd,
                                        /* There is a nasty race with 'mdadm --monitor'.
                                         * If it opens this device before we close it,
                                         * it gets an incomplete open on which IO
-                                        * doesn't work and the capacity if wrong.
+                                        * doesn't work and the capacity is
+                                        * wrong.
                                         * If we reopen (to check for layered devices)
                                         * before --monitor closes, we loose.
                                         *
index 9e65d0a9f6516d125cfb5f1c96efb7106a678f46..8bcdc3bfc9de7e8067b9320102c24aa971e26bf1 100644 (file)
--- a/Create.c
+++ b/Create.c
@@ -66,12 +66,18 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        int second_missing = subdevs * 2;
        int missing_disks = 0;
        int insert_point = subdevs * 2; /* where to insert a missing drive */
+       int total_slots;
        int pass;
        int vers;
        int rv;
        int bitmap_fd;
+       int have_container = 0;
+       int container_fd = -1;
+       int need_mdmon = 0;
        unsigned long long bitmapsize;
-       struct mdinfo info;
+       struct mdinfo info, *infos;
+       int did_default = 0;
+       unsigned long safe_mode_delay = 0;
 
        int major_num = BITMAP_MAJOR_HI;
 
@@ -91,6 +97,14 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        return 1;
                }
        }
+       if (level == UnSet) {
+               /* "ddf" and "imsm" metadata only supports one level - should possibly
+                * push this into metadata handler??
+                */
+               if (st && (st->ss == &super_ddf || st->ss == &super_imsm))
+                       level = LEVEL_CONTAINER;
+       }
+
        if (level == UnSet) {
                fprintf(stderr,
                        Name ": a RAID level is needed to create an array.\n");
@@ -116,11 +130,53 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        Name ": This level does not support spare devices\n");
                return 1;
        }
+
+       if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+               /* If given a single device, it might be a container, and we can
+                * extract a device list from there
+                */
+               mdu_array_info_t inf;
+               int fd;
+
+               memset(&inf, 0, sizeof(inf));
+               fd = open(devlist->devname, O_RDONLY);
+               if (fd >= 0 &&
+                   ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+                   inf.raid_disks == 0) {
+                       /* yep, looks like a container */
+                       if (st) {
+                               rv = st->ss->load_super(st, fd,
+                                                       devlist->devname);
+                               if (rv == 0)
+                                       have_container = 1;
+                       } else {
+                               st = guess_super(fd);
+                               if (st && !(rv = st->ss->
+                                           load_super(st, fd,
+                                                      devlist->devname)))
+                                       have_container = 1;
+                               else
+                                       st = NULL;
+                       }
+               }
+               if (fd >= 0)
+                       close(fd);
+               if (have_container) {
+                       subdevs = 0;
+                       devlist = NULL;
+               }
+       }
+       if (st && st->ss->external && sparedisks) {
+               fprintf(stderr,
+                       Name ": This metadata type does not support "
+                       "spare disks are create time\n");
+               return 1;
+       }
        if (subdevs > raiddisks+sparedisks) {
                fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks);
                return 1;
        }
-       if (subdevs < raiddisks+sparedisks) {
+       if (!have_container && subdevs < raiddisks+sparedisks) {
                fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n");
                return 1;
        }
@@ -182,6 +238,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        case 1:
        case LEVEL_FAULTY:
        case LEVEL_MULTIPATH:
+       case LEVEL_CONTAINER:
                if (chunk) {
                        chunk = 0;
                        if (verbose > 0)
@@ -193,14 +250,17 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                return 1;
        }
 
+       if (st && ! st->ss->validate_geometry(st, level, layout, raiddisks,
+                                             chunk, size, NULL, NULL, verbose>=0))
+               return 1;
+
        /* now look at the subdevs */
        info.array.active_disks = 0;
        info.array.working_disks = 0;
        dnum = 0;
        for (dv=devlist; dv; dv=dv->next, dnum++) {
                char *dname = dv->devname;
-               unsigned long long ldsize, freesize;
-               int fd;
+               unsigned long long freesize;
                if (strcasecmp(dname, "missing")==0) {
                        if (first_missing > dnum)
                                first_missing = dnum;
@@ -212,18 +272,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                info.array.working_disks++;
                if (dnum < raiddisks)
                        info.array.active_disks++;
-               fd = open(dname, O_RDONLY|O_EXCL);
-               if (fd <0 ) {
-                       fprintf(stderr, Name ": Cannot open %s: %s\n",
-                               dname, strerror(errno));
-                       fail=1;
-                       continue;
-               }
-               if (!get_dev_size(fd, dname, &ldsize)) {
-                       fail = 1;
-                       close(fd);
-                       continue;
-               }
                if (st == NULL) {
                        struct createinfo *ci = conf_get_create_info();
                        if (ci)
@@ -231,33 +279,42 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
                if (st == NULL) {
                        /* Need to choose a default metadata, which is different
-                        * depending on the sizes of devices
+                        * depending on geometry of array.
                         */
                        int i;
                        char *name = "default";
-                       if (level >= 1 && ldsize > (0x7fffffffULL<<10))
-                               name = "default/large";
-                       for(i=0; !st && superlist[i]; i++)
+                       for(i=0; !st && superlist[i]; i++) {
                                st = superlist[i]->match_metadata_desc(name);
+                               if (st && !st->ss->validate_geometry
+                                               (st, level, layout, raiddisks,
+                                                chunk, size, dname, &freesize,
+                                                verbose > 0))
+                                       st = NULL;
+                       }
 
                        if (!st) {
-                               fprintf(stderr, Name ": internal error - no default metadata style\n");
+                               fprintf(stderr, Name ": device %s not suitable "
+                                       "for any style of array\n",
+                                       dname);
                                exit(2);
                        }
-                       if (st->ss->major != 0 ||
+                       if (st->ss != &super0 ||
                            st->minor_version != 90)
-                               fprintf(stderr, Name ": Defaulting to version"
-                                       " %d.%d metadata\n",
-                                       st->ss->major,
-                                       st->minor_version);
-               }
-               freesize = st->ss->avail_size(st, ldsize >> 9);
-               if (freesize == 0) {
-                       fprintf(stderr, Name ": %s is too small: %luK\n",
-                               dname, (unsigned long)(ldsize>>10));
-                       fail = 1;
-                       close(fd);
-                       continue;
+                               did_default = 1;
+               } else {
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, size, dname,
+                                                      &freesize,
+                                                      verbose > 0)) {
+
+                               fprintf(stderr,
+                                       Name ": %s is not suitable for "
+                                       "this array.\n",
+                                       dname);
+                               fail = 1;
+                               continue;
+                       }
                }
 
                freesize /= 2; /* convert to K */
@@ -268,9 +325,9 @@ int Create(struct supertype *st, char *mddev, int mdfd,
 
                if (size && freesize < size) {
                        fprintf(stderr, Name ": %s is smaller that given size."
-                               " %lluK < %lluK + superblock\n", dname, freesize, size);
+                               " %lluK < %lluK + metadata\n",
+                               dname, freesize, size);
                        fail = 1;
-                       close(fd);
                        continue;
                }
                if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
@@ -282,24 +339,36 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                        minsize = freesize;
                }
                if (runstop != 1 || verbose >= 0) {
+                       int fd = open(dname, O_RDONLY);
+                       if (fd <0 ) {
+                               fprintf(stderr, Name ": Cannot open %s: %s\n",
+                                       dname, strerror(errno));
+                               fail=1;
+                               continue;
+                       }
                        warn |= check_ext2(fd, dname);
                        warn |= check_reiser(fd, dname);
                        warn |= check_raid(fd, dname);
+                       close(fd);
                }
-               close(fd);
        }
        if (fail) {
                fprintf(stderr, Name ": create aborted\n");
                return 1;
        }
        if (size == 0) {
-               if (mindisc == NULL) {
+               if (mindisc == NULL && !have_container) {
                        fprintf(stderr, Name ": no size and no drives given - aborting create.\n");
                        return 1;
                }
-               if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) {
+               if (level > 0 || level == LEVEL_MULTIPATH
+                   || level == LEVEL_FAULTY
+                   || st->ss->external ) {
                        /* size is meaningful */
-                       if (minsize > 0x100000000ULL && st->ss->major == 0) {
+                       if (!st->ss->validate_geometry(st, level, layout,
+                                                      raiddisks,
+                                                      chunk, minsize,
+                                                      NULL, NULL, 0)) {
                                fprintf(stderr, Name ": devices too large for RAID level %d\n", level);
                                return 1;
                        }
@@ -348,6 +417,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
         * into a spare, else the create will fail
         */
        if (assume_clean == 0 && force == 0 && first_missing < raiddisks &&
+           st->ss->external == 0 &&
            second_missing >= raiddisks && level == 6) {
                insert_point = raiddisks - 1;
                if (insert_point == first_missing)
@@ -357,7 +427,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                missing_disks++;
        }
 
-       if (level <= 0 && first_missing != subdevs * 2) {
+       if (level <= 0 && first_missing < subdevs * 2) {
                fprintf(stderr,
                        Name ": This level does not support missing devices\n");
                return 1;
@@ -382,12 +452,16 @@ int Create(struct supertype *st, char *mddev, int mdfd,
             ( level == 6 && (insert_point < raiddisks
                              || second_missing < raiddisks))
             ||
+            ( level <= 0 )
+            ||
             assume_clean
-               )
+               ) {
                info.array.state = 1; /* clean, but one+ drive will be missing*/
-       else
+               info.resync_start = ~0ULL;
+       } else {
                info.array.state = 0; /* not clean, but no errors */
-
+               info.resync_start = 0;
+       }
        if (level == 10) {
                /* for raid10, the bitmap size is the capacity of the array,
                 * which is array.size * raid_disks / ncopies;
@@ -424,7 +498,6 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                + info.array.failed_disks;
        info.array.layout = layout;
        info.array.chunk_size = chunk*1024;
-       info.array.major_version = st->ss->major;
 
        if (name == NULL || *name == 0) {
                /* base name on mddev */
@@ -453,6 +526,32 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid))
                return 1;
 
+       total_slots = info.array.nr_disks;
+       sysfs_init(&info, mdfd, 0);
+       st->ss->getinfo_super(st, &info);
+
+       if (did_default && verbose >= 0) {
+               if (is_subarray(info.text_version)) {
+                       int dnum = devname2devnum(info.text_version+1);
+                       char *path;
+                       int mdp = get_mdp_major();
+                       struct mdinfo *mdi;
+                       if (dnum > 0)
+                               path = map_dev(MD_MAJOR, dnum, 1);
+                       else
+                               path = map_dev(mdp, (-1-dnum)<< 6, 1);
+
+                       mdi = sysfs_read(-1, dnum, GET_VERSION);
+
+                       fprintf(stderr, Name ": Creating array inside "
+                               "%s container %s\n", 
+                               mdi?mdi->text_version:"managed", path);
+                       sysfs_free(mdi);
+               } else
+                       fprintf(stderr, Name ": Defaulting to version"
+                               " %s metadata\n", info.text_version);
+       }
+
        if (bitmap_file && vers < 9003) {
                major_num = BITMAP_MAJOR_HOSTENDIAN;
 #ifdef __BIG_ENDIAN
@@ -476,17 +575,41 @@ int Create(struct supertype *st, char *mddev, int mdfd,
        }
 
 
+       sysfs_init(&info, mdfd, 0);
 
-       if ((vers % 100) >= 1) { /* can use different versions */
-               mdu_array_info_t inf;
-               memset(&inf, 0, sizeof(inf));
-               inf.major_version = st->ss->major;
-               inf.minor_version = st->minor_version;
-               rv = ioctl(mdfd, SET_ARRAY_INFO, &inf);
-       } else
-               rv = ioctl(mdfd, SET_ARRAY_INFO, NULL);
+       if (st->ss->external && st->subarray[0]) {
+               /* member */
+
+               /* When creating a member, we need to be careful
+                * to negotiate with mdmon properly.
+                * If it is already running, we cannot write to
+                * the devices and must ask it to do that part.
+                * If it isn't running, we write to the devices,
+                * and then start it.
+                * We hold an exclusive open on the container
+                * device to make sure mdmon doesn't exit after
+                * we checked that it is running.
+                *
+                * For now, fail if it is already running.
+                */
+               container_fd = open_dev_excl(st->container_dev);
+               if (container_fd < 0) {
+                       fprintf(stderr, Name ": Cannot get exclusive "
+                               "open on container - weird.\n");
+                       return 1;
+               }
+               if (mdmon_running(st->container_dev)) {
+                       if (verbose)
+                               fprintf(stderr, Name ": reusing mdmon "
+                                       "for %s.\n",
+                                       devnum2devname(st->container_dev));
+                       st->update_tail = &st->updates;
+               } else
+                       need_mdmon = 1;
+       }
+       rv = set_array_info(mdfd, st, &info);
        if (rv) {
-               fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\n",
+               fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                        mddev, strerror(errno));
                return 1;
        }
@@ -514,7 +637,7 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                }
        }
 
-
+       infos = malloc(sizeof(*infos) * total_slots);
 
        for (pass=1; pass <=2 ; pass++) {
                mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */
@@ -523,74 +646,121 @@ int Create(struct supertype *st, char *mddev, int mdfd,
                     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
                        int fd;
                        struct stat stb;
+                       struct mdinfo *inf = &infos[dnum];
 
-                       info.disk.number = dnum;
+                       if (dnum >= total_slots)
+                               abort();
                        if (dnum == insert_point) {
                                moved_disk = dv;
                        }
-                       info.disk.raid_disk = info.disk.number;
-                       if (info.disk.raid_disk < raiddisks)
-                               info.disk.state = (1<<MD_DISK_ACTIVE) |
+                       if (dnum == insert_point ||
+                           strcasecmp(dv->devname, "missing")==0)
+                               continue;
+
+                       switch(pass) {
+                       case 1:
+                               *inf = info;
+
+                               inf->disk.number = dnum;
+                               inf->disk.raid_disk = dnum;
+                               if (inf->disk.raid_disk < raiddisks)
+                                       inf->disk.state = (1<<MD_DISK_ACTIVE) |
                                                (1<<MD_DISK_SYNC);
-                       else
-                               info.disk.state = 0;
-                       if (dv->writemostly == 1)
-                               info.disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+                               else
+                                       inf->disk.state = 0;
+
+                               if (dv->writemostly == 1)
+                                       inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+                               if (st->ss->external && st->subarray[0])
+                                       fd = open(dv->devname, O_RDWR);
+                               else
+                                       fd = open(dv->devname, O_RDWR|O_EXCL);
 
-                       if (dnum == insert_point ||
-                           strcasecmp(dv->devname, "missing")==0) {
-                               info.disk.major = 0;
-                               info.disk.minor = 0;
-                               info.disk.state = (1<<MD_DISK_FAULTY);
-                       } else {
-                               fd = open(dv->devname, O_RDONLY|O_EXCL);
                                if (fd < 0) {
-                                       fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n",
+                                       fprintf(stderr, Name ": failed to open %s "
+                                               "after earlier success - aborting\n",
                                                dv->devname);
                                        return 1;
                                }
                                fstat(fd, &stb);
-                               info.disk.major = major(stb.st_rdev);
-                               info.disk.minor = minor(stb.st_rdev);
+                               inf->disk.major = major(stb.st_rdev);
+                               inf->disk.minor = minor(stb.st_rdev);
+
                                remove_partitions(fd);
-                               close(fd);
-                       }
-                       switch(pass){
-                       case 1:
-                               st->ss->add_to_super(st, &info.disk);
+                               st->ss->add_to_super(st, &inf->disk,
+                                                    fd, dv->devname);
+                               st->ss->getinfo_super(st, inf);
+                               safe_mode_delay = inf->safe_mode_delay;
+
+                               /* getinfo_super might have lost these ... */
+                               inf->disk.major = major(stb.st_rdev);
+                               inf->disk.minor = minor(stb.st_rdev);
                                break;
                        case 2:
-                               if (info.disk.state == 1) break;
-                               Kill(dv->devname, 0, 1); /* Just be sure it is clean */
-                               Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */
-                               st->ss->write_init_super(st, &info.disk,
-                                                        dv->devname);
-
-                               if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) {
-                                       fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n",
+                               inf->errors = 0;
+                               rv = 0;
+
+                               rv = add_disk(mdfd, st, &info, inf);
+
+                               if (rv) {
+                                       fprintf(stderr,
+                                               Name ": ADD_NEW_DISK for %s "
+                                               "failed: %s\n",
                                                dv->devname, strerror(errno));
                                        st->ss->free_super(st);
                                        return 1;
                                }
-
                                break;
                        }
                        if (dv == moved_disk && dnum != insert_point) break;
                }
+               if (pass == 1) {
+                       st->ss->write_init_super(st);
+                       flush_metadata_updates(st);
+               }
        }
+       free(infos);
        st->ss->free_super(st);
 
        /* param is not actually used */
-       if (runstop == 1 || subdevs >= raiddisks) {
-               mdu_param_t param;
-               if (ioctl(mdfd, RUN_ARRAY, &param)) {
-                       fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
-                               strerror(errno));
-                       Manage_runstop(mddev, mdfd, -1, 0);
-                       return 1;
+       if (level == LEVEL_CONTAINER)
+               /* No need to start */
+               ;
+       else if (runstop == 1 || subdevs >= raiddisks) {
+               if (st->ss->external) {
+                       switch(level) {
+                       case LEVEL_LINEAR:
+                       case LEVEL_MULTIPATH:
+                       case 0:
+                               sysfs_set_str(&info, NULL, "array_state",
+                                             "active");
+                               need_mdmon = 0;
+                               break;
+                       default:
+                               sysfs_set_str(&info, NULL, "array_state",
+                                             "readonly");
+                               break;
+                       }
+                       sysfs_set_safemode(&info, safe_mode_delay);
+               } else {
+                       mdu_param_t param;
+                       if (ioctl(mdfd, RUN_ARRAY, &param)) {
+                               fprintf(stderr, Name ": RUN_ARRAY failed: %s\n",
+                                       strerror(errno));
+                               Manage_runstop(mddev, mdfd, -1, 0);
+                               return 1;
+                       }
                }
                if (verbose >= 0)
                        fprintf(stderr, Name ": array %s started.\n", mddev);
+               if (st->ss->external && st->subarray[0]) {
+                       if (need_mdmon)
+                               start_mdmon(st->container_dev);
+
+                       ping_monitor(devnum2devname(st->container_dev));
+                       close(container_fd);
+               }
        } else {
                fprintf(stderr, Name ": not starting array - not enough devices.\n");
        }
index 3cee66fed89c8c6a315fa60efd3ce3ae7d196bd4..ed05cefae3b908dbee429bd0186e7fe6a302dc3d 100644 (file)
--- a/Detail.c
+++ b/Detail.c
@@ -30,6 +30,7 @@
 #include       "mdadm.h"
 #include       "md_p.h"
 #include       "md_u.h"
+#include       <dirent.h>
 
 int Detail(char *dev, int brief, int export, int test, char *homehost)
 {
@@ -56,6 +57,8 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        int max_disks = MD_SB_DISKS; /* just a default */
        struct mdinfo info;
        struct mdinfo *sra;
+       char *member = NULL;
+       char *container = NULL;
 
        int rv = test ? 4 : 1;
        int avail_disks = 0;
@@ -96,7 +99,21 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                stb.st_rdev = 0;
        rv = 0;
 
-       if (st) max_disks = st->max_devs;
+       if (st)
+               max_disks = st->max_devs;
+
+       if (sra && is_subarray(sra->text_version) &&
+               strchr(sra->text_version+1, '/')) {
+               /* This is a subarray of some container.
+                * We want the name of the container, and the member
+                */
+               char *s = strchr(sra->text_version+1, '/');
+               int dn;
+               *s++ = '\0';
+               member = s;
+               dn = devname2devnum(sra->text_version+1);
+               container = map_dev(dev2major(dn), dev2minor(dn), 1);
+       }
 
        /* try to load a superblock */
        for (d= 0; d<max_disks; d++) {
@@ -111,7 +128,8 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                        continue;
                if ((dv=map_dev(disk.major, disk.minor, 1))) {
                        if ((!st || !st->sb) &&
-                           (disk.state & (1<<MD_DISK_ACTIVE))) {
+                           (array.raid_disks == 0 || 
+                            (disk.state & (1<<MD_DISK_ACTIVE)))) {
                                /* try to read the superblock from this device
                                 * to get more info
                                 */
@@ -119,8 +137,9 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                if (fd2 >=0 && st &&
                                    st->ss->load_super(st, fd2, NULL) == 0) {
                                        st->ss->getinfo_super(st, &info);
-                                       if (info.array.ctime != array.ctime ||
-                                           info.array.level != array.level)
+                                       if (array.raid_disks != 0 && /* container */
+                                           (info.array.ctime != array.ctime ||
+                                            info.array.level != array.level))
                                                st->ss->free_super(st);
                                }
                                if (fd2 >= 0) close(fd2);
@@ -132,30 +151,58 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
        c = map_num(pers, array.level);
 
        if (export) {
-               if (c)
-                       printf("MD_LEVEL=%s\n", c);
-               printf("MD_DEVICES=%d\n", array.raid_disks);
-               if (sra && sra->array.major_version < 0)
-                       printf("MD_METADATA=%s\n", sra->text_version);
-               else
-                       printf("MD_METADATA=%d.%02d\n",
-                              array.major_version, array.minor_version);
+               if (array.raid_disks) {
+                       if (c)
+                               printf("MD_LEVEL=%s\n", c);
+                       printf("MD_DEVICES=%d\n", array.raid_disks);
+               } else {
+                       printf("MD_LEVEL=container\n");
+                       printf("MD_DEVICES=%d\n", array.nr_disks);
+               }
+               if (container) {
+                       printf("MD_CONTAINER=%s\n", container);
+                       printf("MD_MEMBER=%s\n", member);
+               } else {
+                       if (sra && sra->array.major_version < 0)
+                               printf("MD_METADATA=%s\n", sra->text_version);
+                       else
+                               printf("MD_METADATA=%d.%02d\n",
+                                      array.major_version, array.minor_version);
+               }
+               
+               if (st && st->sb) {
+                       struct mdinfo info;
+                       char nbuf[64];
+                       st->ss->getinfo_super(st, &info);
+                       fname_from_uuid(st, &info, nbuf, ':');
+                       printf("MD_UUID=%s\n", nbuf+5);
 
-               if (st && st->sb)
-                       st->ss->export_detail_super(st);
+                       if (st->ss->export_detail_super)
+                               st->ss->export_detail_super(st);
+               }
                goto out;
        }
 
        if (brief) {
                mdu_bitmap_file_t bmf;
-               printf("ARRAY %s level=%s num-devices=%d", dev,
-                      c?c:"-unknown-",
-                      array.raid_disks );
-               if (sra && sra->array.major_version < 0)
-                       printf(" metadata=%s", sra->text_version);
+               if (array.raid_disks)
+                       printf("ARRAY %s level=%s num-devices=%d", dev,
+                              c?c:"-unknown-",
+                              array.raid_disks );
                else
-                       printf(" metadata=%d.%02d",
-                              array.major_version, array.minor_version);
+                       printf("ARRAY %s level=container num-devices=%d",
+                              dev, array.nr_disks);
+
+               if (container) {
+                       printf(" container=%s", container);
+                       printf(" member=%s", member);
+               } else {
+                       if (sra && sra->array.major_version < 0)
+                               printf(" metadata=%s", sra->text_version);
+                       else
+                               printf(" metadata=%d.%02d",
+                                      array.major_version, array.minor_version);
+               }
 
                /* Only try GET_BITMAP_FILE for 0.90.01 and later */
                if (vers >= 9001 &&
@@ -180,14 +227,19 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
 
                printf("%s:\n", dev);
 
+               if (container)
+                       printf("      Container : %s, member %s\n", container, member);
+               else {
                if (sra && sra->array.major_version < 0)
                        printf("        Version : %s\n", sra->text_version);
                else
                        printf("        Version : %d.%02d\n",
                               array.major_version, array.minor_version);
+               }
 
                atime = array.ctime;
-               printf("  Creation Time : %.24s\n", ctime(&atime));
+               if (atime)
+                       printf("  Creation Time : %.24s\n", ctime(&atime));
                if (array.raid_disks == 0) c = "container";
                printf("     Raid Level : %s\n", c?c:"-unknown-");
                if (larray_size)
@@ -206,9 +258,13 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                                printf("  Used Dev Size : %d%s\n", array.size,
                                       human_size((long long)array.size<<10));
                }
-               printf("   Raid Devices : %d\n", array.raid_disks);
+               if (array.raid_disks)
+                       printf("   Raid Devices : %d\n", array.raid_disks);
                printf("  Total Devices : %d\n", array.nr_disks);
-               printf("Preferred Minor : %d\n", array.md_minor);
+               if (!container && 
+                   ((sra == NULL && array.major_version == 0) ||
+                    (sra && sra->array.major_version == 0)))
+                       printf("Preferred Minor : %d\n", array.md_minor);
                if (sra == NULL || sra->array.major_version >= 0)
                        printf("    Persistence : Superblock is %spersistent\n",
                               array.not_persistent?"not ":"");
@@ -222,17 +278,22 @@ int Detail(char *dev, int brief, int export, int test, char *homehost)
                } else if (array.state & (1<<MD_SB_BITMAP_PRESENT))
                        printf("  Intent Bitmap : Internal\n\n");
                atime = array.utime;
-               printf("    Update Time : %.24s\n", ctime(&atime));
-               printf("          State : %s%s%s%s\n",
-                      (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
-                      array.active_disks < array.raid_disks? ", degraded":"",
-                      (!e || e->percent < 0) ? "" :
-                       (e->resync) ? ", resyncing": ", recovering",
-                      larray_size ? "": ", Not Started");
-               printf(" Active Devices : %d\n", array.active_disks);
+               if (atime)
+                       printf("    Update Time : %.24s\n", ctime(&atime));
+               if (array.raid_disks)
+                       printf("          State : %s%s%s%s\n",
+                              (array.state&(1<<MD_SB_CLEAN))?"clean":"active",
+                              array.active_disks < array.raid_disks? ", degraded":"",
+                              (!e || e->percent < 0) ? "" :
+                              (e->resync) ? ", resyncing": ", recovering",
+                              larray_size ? "": ", Not Started");
+               if (array.raid_disks)
+                       printf(" Active Devices : %d\n", array.active_disks);
                printf("Working Devices : %d\n", array.working_disks);
-               printf(" Failed Devices : %d\n", array.failed_disks);
-               printf("  Spare Devices : %d\n", array.spare_disks);
+               if (array.raid_disks) {
+                       printf(" Failed Devices : %d\n", array.failed_disks);
+                       printf("  Spare Devices : %d\n", array.spare_disks);
+               }
                printf("\n");
                if (array.level == 5) {
                        c = map_num(r5layout, array.layout);
@@ -306,7 +367,45 @@ This is pretty boring
                if (st && st->sb)
                        st->ss->detail_super(st, homehost);
 
-               printf("    Number   Major   Minor   RaidDevice State\n");
+               if (array.raid_disks == 0 && sra && sra->array.major_version == -1
+                   && sra->array.minor_version == -2 && sra->text_version[0] != '/') {
+                       /* This looks like a container.  Find any active arrays
+                        * That claim to be a member.
+                        */
+                       DIR *dir = opendir("/sys/block");
+                       struct dirent *de;
+
+                       printf("  Member Arrays :");
+
+                       while (dir && (de = readdir(dir)) != NULL) {
+                               char path[200];
+                               char vbuf[1024];
+                               int nlen = strlen(sra->sys_name);
+                               int dn;
+                               if (de->d_name[0] == '.')
+                                       continue;
+                               sprintf(path, "/sys/block/%s/md/metadata_version",
+                                       de->d_name);
+                               if (load_sys(path, vbuf) < 0)
+                                       continue;
+                               if (strncmp(vbuf, "external:", 9) != 0 ||
+                                   !is_subarray(sra->sys_name+9) ||
+                                   strncmp(vbuf+10, sra->sys_name, nlen) != 0 ||
+                                   vbuf[10+nlen] != '/')
+                                       continue;
+                               dn = devname2devnum(de->d_name);
+                               printf(" %s", map_dev(dev2major(dn),
+                                                     dev2minor(dn), 1));
+                       }
+                       if (dir)
+                               closedir(dir);
+                       printf("\n\n");
+               }
+
+               if (array.raid_disks)
+                       printf("    Number   Major   Minor   RaidDevice State\n");
+               else
+                       printf("    Number   Major   Minor   RaidDevice\n");
        }
        disks = malloc(max_disks * sizeof(mdu_disk_info_t));
        for (d=0; d<max_disks; d++) {
@@ -350,6 +449,9 @@ This is pretty boring
                        else
                                printf("   %5d   %5d    %5d    %5d     ",
                                       disk.number, disk.major, disk.minor, disk.raid_disk);
+               }
+               if (!brief && array.raid_disks) {
+
                        if (disk.state & (1<<MD_DISK_FAULTY)) {
                                printf(" faulty");
                                if (disk.raid_disk < array.raid_disks &&
@@ -401,7 +503,7 @@ This is pretty boring
                }
                if (!brief) printf("\n");
        }
-       if (spares && brief) printf(" spares=%d", spares);
+       if (spares && brief && array.raid_disks) printf(" spares=%d", spares);
        if (brief && st && st->sb)
                st->ss->brief_detail_super(st);
        st->ss->free_super(st);
index 5de92028acd4c0bf971a859203bd363ca371ddae..d213664f965eaa1b14d4666ff8e5bf0b6caa4275 100644 (file)
--- a/Examine.c
+++ b/Examine.c
@@ -123,7 +123,7 @@ int Examine(mddev_dev_t devlist, int brief, int export, int scan,
                                st->ss->getinfo_super(st, &ap->info);
                                st->ss->free_super(st);
                        }
-                       if (!(ap->info.disk.state & MD_DISK_SYNC))
+                       if (!(ap->info.disk.state & (1<<MD_DISK_SYNC)))
                                ap->spares++;
                        d = dl_strdup(devlist->devname);
                        dl_add(ap->devs, d);
diff --git a/Grow.c b/Grow.c
index a8194bf05b69e3e86b5eefcc88241bdb837ea398..14e48f5696cb0c93fb35eb843a4a5fbd99e40b3e 100644 (file)
--- a/Grow.c
+++ b/Grow.c
@@ -69,7 +69,7 @@ int Grow_Add_device(char *devname, int fd, char *newdev)
                return 1;
        }
 
-       nfd = open(newdev, O_RDWR|O_EXCL);
+       nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
        if (nfd < 0) {
                fprintf(stderr, Name ": cannot open %s\n", newdev);
                return 1;
@@ -396,7 +396,8 @@ struct mdp_backup_super {
        __u64   arraystart;
        __u64   length;
        __u32   sb_csum;        /* csum of preceeding bytes. */
-};
+       __u8 pad[512-68];
+} __attribute__((aligned(512))) bsb;
 
 int bsb_csum(char *buf, int len)
 {
@@ -420,7 +421,6 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
        struct mdu_array_info_s array;
        char *c;
 
-       struct mdp_backup_super bsb;
        struct supertype *st;
 
        int nlevel, olevel;
@@ -720,7 +720,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                 * a leading superblock 4K earlier.
                 */
                for (i=array.raid_disks; i<d; i++) {
-                       char buf[4096];
+                       char abuf[4096+512];
+                       char *buf = (char*)(((unsigned long)abuf+511)& ~511);
                        if (i==d-1 && backup_file) {
                                /* This is the backup file */
                                offsets[i] = 8;
@@ -731,7 +732,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                                fprintf(stderr, Name ": could not seek...\n");
                                goto abort;
                        }
-                       memset(buf, 0, sizeof(buf));
+                       memset(buf, 0, 4096);
                        bsb.devstart = __cpu_to_le64(offsets[i]);
                        bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
                        memcpy(buf, &bsb, sizeof(bsb));
@@ -793,7 +794,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
                        if (lseek64(fdlist[i], (offsets[i]+last_block)<<9, 0) < 0 ||
                            write(fdlist[i], &bsb, sizeof(bsb)) != sizeof(bsb) ||
                            fsync(fdlist[i]) != 0) {
-                               fprintf(stderr, Name ": %s: fail to save metadata for critical region backups.\n",
+                               fprintf(stderr, Name ": %s: failed to save metadata for critical region backups.\n",
                                        devname);
                                goto abort_resume;
                        }
@@ -882,7 +883,6 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 
        for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
                struct mdinfo dinfo;
-               struct mdp_backup_super bsb;
                char buf[4096];
                int fd;
 
index 1d326fdabf4769fa165c5b6435d146cc8e09a48c..707fd05e9bb6219e866fca49f770ef78c3ecc5fb 100644 (file)
@@ -56,6 +56,7 @@ int Incremental(char *devname, int verbose, int runstop,
         * - Choose a free, high number.
         * - Use a partitioned device unless strong suggestion not to.
         *         e.g. auto=md
+        *   Don't choose partitioned for containers.
         * 5/ Find out if array already exists
         * 5a/ if it does not
         * - choose a name, from mdadm.conf or 'name' field in array.
@@ -67,6 +68,7 @@ int Incremental(char *devname, int verbose, int runstop,
         * - add the device
         * 6/ Make sure /var/run/mdadm.map contains this array.
         * 7/ Is there enough devices to possibly start the array?
+        *     For a container, this means running Incremental_container.
         * 7a/ if not, finish with success.
         * 7b/ if yes,
         * - read all metadata and arrange devices like -A does
@@ -74,7 +76,7 @@ int Incremental(char *devname, int verbose, int runstop,
         *   start the array (auto-readonly).
         */
        struct stat stb;
-       struct mdinfo info, info2;
+       struct mdinfo info;
        struct mddev_ident_s *array_list, *match;
        char chosen_name[1024];
        int rv;
@@ -83,11 +85,14 @@ int Incremental(char *devname, int verbose, int runstop,
        int dfd, mdfd;
        char *avail;
        int active_disks;
+       int uuid_for_name = 0;
+       char *name_to_use;
+       char nbuf[64];
+
        struct createinfo *ci = conf_get_create_info();
-       char *name;
 
 
-       /* 1/ Check if devices is permitted by mdadm.conf */
+       /* 1/ Check if device is permitted by mdadm.conf */
 
        if (!conf_test_dev(devname)) {
                if (verbose >= 0)
@@ -137,9 +142,18 @@ int Incremental(char *devname, int verbose, int runstop,
                close(dfd);
                return 1;
        }
-       st->ss->getinfo_super(st, &info);
        close (dfd);
 
+       if (st->ss->container_content && st->loaded_container) {
+               /* This is a pre-built container array, so we do something
+                * rather different.
+                */
+               return Incremental_container(st, devname, verbose, runstop,
+                                            autof);
+       }
+
+       memset(&info, 0, sizeof(info));
+       st->ss->getinfo_super(st, &info);
        /* 3/ Check if there is a match in mdadm.conf */
 
        array_list = conf_get_ident(NULL);
@@ -204,15 +218,10 @@ int Incremental(char *devname, int verbose, int runstop,
         * but don't trust the 'name' in the array. Thus a 'random' minor
         * number will be assigned, and the device name will be based
         * on that. */
-       name = info.name;
        if (!match) {
                if (homehost == NULL ||
-                   st->ss->match_home(st, homehost) == 0) {
-                       if (verbose >= 0)
-                               fprintf(stderr, Name
-             ": not found in mdadm.conf and not identified by homehost.\n");
-                       name = NULL;
-               }
+                      st->ss->match_home(st, homehost) != 1)
+                       uuid_for_name = 1;
        }
        /* 4/ Determine device number. */
        /* - If in mdadm.conf with std name, get number from name. */
@@ -222,6 +231,16 @@ int Incremental(char *devname, int verbose, int runstop,
        /* - Choose a free, high number. */
        /* - Use a partitioned device unless strong suggestion not to. */
        /*         e.g. auto=md */
+       mp = map_by_uuid(&map, info.uuid);
+
+       if (uuid_for_name && ! mp) {
+               name_to_use = fname_from_uuid(st, &info, nbuf, '-');
+               if (verbose >= 0)
+                       fprintf(stderr, Name
+               ": not found in mdadm.conf and not identified by homehost"
+                               " - using uuid based name\n");
+       } else
+               name_to_use = info.name;
 
        /* There are three possible sources for 'autof':  command line,
         * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf.
@@ -235,19 +254,24 @@ int Incremental(char *devname, int verbose, int runstop,
 
        if (match && (rv = is_standard(match->devname, &devnum))) {
                devnum = (rv > 0) ? (-1-devnum) : devnum;
-       } else if ((mp = map_by_uuid(&map, info.uuid)) != NULL)
+       } else if (mp != NULL)
                devnum = mp->devnum;
        else {
                /* Have to guess a bit. */
                int use_partitions = 1;
                char *np, *ep;
+               char *nm, nbuf[1024];
+               struct stat stb2;
+
                if ((autof&7) == 3 || (autof&7) == 5)
                        use_partitions = 0;
-               np = name ? strchr(name, ':') : ":NONAME";
+               if (st->ss->external)
+                       use_partitions = 0;
+               np = strchr(name_to_use, ':');
                if (np)
                        np++;
                else
-                       np = name;
+                       np = name_to_use;
                devnum = strtoul(np, &ep, 10);
                if (ep > np && *ep == 0) {
                        /* This is a number.  Let check that it is unused. */
@@ -256,6 +280,24 @@ int Incremental(char *devname, int verbose, int runstop,
                } else
                        devnum = -1;
 
+               if (match)
+                       nm = match->devname;
+               else {
+                       sprintf(nbuf, "/dev/md/%s", np);
+                       nm = nbuf;
+               }
+               if (stat(nm, &stb2) == 0 &&
+                   S_ISBLK(stb2.st_mode) &&
+                   major(stb2.st_rdev) == (use_partitions ?
+                                           get_mdp_major() : MD_MAJOR)) {
+                       if (use_partitions)
+                               devnum = minor(stb2.st_rdev) >> MdpMinorShift;
+                       else
+                               devnum = minor(stb2.st_rdev);
+                       if (mddev_busy(use_partitions ? (-1-devnum) : devnum))
+                               devnum = -1;
+               }
+
                if (devnum < 0) {
                        /* Haven't found anything yet, choose something free */
                        devnum = find_free_devnum(use_partitions);
@@ -268,44 +310,38 @@ int Incremental(char *devname, int verbose, int runstop,
                } else
                        devnum = use_partitions ? (-1-devnum) : devnum;
        }
-       mdfd = open_mddev_devnum(match ? match->devname : NULL,
+
+       mdfd = open_mddev_devnum(match ? match->devname : mp ? mp->path : NULL,
                                 devnum,
-                                name,
+                                name_to_use,
                                 chosen_name, autof >> 3);
        if (mdfd < 0) {
                fprintf(stderr, Name ": failed to open %s: %s.\n",
                        chosen_name, strerror(errno));
                return 2;
        }
+       sysfs_init(&info, mdfd, 0);
+
        /* 5/ Find out if array already exists */
        if (! mddev_busy(devnum)) {
        /* 5a/ if it does not */
        /* - choose a name, from mdadm.conf or 'name' field in array. */
        /* - create the array */
        /* - add the device */
-               mdu_array_info_t ainf;
-               mdu_disk_info_t disk;
-               char md[20];
                struct mdinfo *sra;
+               struct mdinfo dinfo;
 
-               memset(&ainf, 0, sizeof(ainf));
-               ainf.major_version = st->ss->major;
-               ainf.minor_version = st->minor_version;
-               if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) {
-                       fprintf(stderr, Name
-                               ": SET_ARRAY_INFO failed for %s: %s\b",
+               if (set_array_info(mdfd, st, &info) != 0) {
+                       fprintf(stderr, Name ": failed to set array info for %s: %s\n",
                                chosen_name, strerror(errno));
                        close(mdfd);
                        return 2;
                }
-               sprintf(md, "%d.%d\n", st->ss->major, st->minor_version);
-               sra = sysfs_read(mdfd, devnum, GET_VERSION);
-               sysfs_set_str(sra, NULL, "metadata_version", md);
-               memset(&disk, 0, sizeof(disk));
-               disk.major = major(stb.st_rdev);
-               disk.minor = minor(stb.st_rdev);
-               sysfs_free(sra);
-               if (ioctl(mdfd, ADD_NEW_DISK, &disk) != 0) {
+
+               dinfo = info;
+               dinfo.disk.major = major(stb.st_rdev);
+               dinfo.disk.minor = minor(stb.st_rdev);
+               if (add_disk(mdfd, st, &info, &dinfo) != 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
                                devname, chosen_name, strerror(errno));
                        ioctl(mdfd, STOP_ARRAY, 0);
@@ -326,6 +362,8 @@ int Incremental(char *devname, int verbose, int runstop,
                        sysfs_free(sra);
                        return 2;
                }
+               info.array.working_disks = 1;
+               sysfs_free(sra);
        } else {
        /* 5b/ if it does */
        /* - check one drive in array to make sure metadata is a reasonably */
@@ -333,38 +371,28 @@ int Incremental(char *devname, int verbose, int runstop,
        /* - add the device */
                char dn[20];
                int dfd2;
-               mdu_disk_info_t disk;
                int err;
                struct mdinfo *sra;
                struct supertype *st2;
-               sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS |
-                                               GET_STATE));
+               struct mdinfo info2, *d;
+               sra = sysfs_read(mdfd, devnum, (GET_DEVS | GET_STATE));
 
-               if (sra->array.major_version != st->ss->major ||
-                   sra->array.minor_version != st->minor_version) {
-                       if (verbose >= 0)
-                               fprintf(stderr, Name
-             ": %s has different metadata to chosen array %s %d.%d %d.%d.\n",
-                                       devname, chosen_name,
-                                       sra->array.major_version,
-                                       sra->array.minor_version,
-                                       st->ss->major, st->minor_version);
-                       close(mdfd);
-                       return 1;
-               }
                sprintf(dn, "%d:%d", sra->devs->disk.major,
                        sra->devs->disk.minor);
                dfd2 = dev_open(dn, O_RDONLY);
                st2 = dup_super(st);
-               if (st2->ss->load_super(st2, dfd2, NULL)) {
+               if (st2->ss->load_super(st2, dfd2, NULL) ||
+                   st->ss->compare_super(st, st2) != 0) {
                        fprintf(stderr, Name
-                               ": Strange error loading metadata for %s.\n",
-                               chosen_name);
+                               ": metadata mismatch between %s and "
+                               "chosen array %s\n",
+                               devname, chosen_name);
                        close(mdfd);
                        close(dfd2);
                        return 2;
                }
                close(dfd2);
+               memset(&info2, 0, sizeof(info2));
                st2->ss->getinfo_super(st2, &info2);
                st2->ss->free_super(st2);
                if (info.array.level != info2.array.level ||
@@ -376,17 +404,19 @@ int Incremental(char *devname, int verbose, int runstop,
                        close(mdfd);
                        return 2;
                }
-               memset(&disk, 0, sizeof(disk));
-               disk.major = major(stb.st_rdev);
-               disk.minor = minor(stb.st_rdev);
-               err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+               info2.disk.major = major(stb.st_rdev);
+               info2.disk.minor = minor(stb.st_rdev);
+               /* add disk needs to know about containers */
+               if (st->ss->external)
+                       sra->array.level = LEVEL_CONTAINER;
+               err = add_disk(mdfd, st2, sra, &info2);
                if (err < 0 && errno == EBUSY) {
                        /* could be another device present with the same
                         * disk.number. Find and reject any such
                         */
                        find_reject(mdfd, st, sra, info.disk.number,
                                    info.events, verbose, chosen_name);
-                       err = ioctl(mdfd, ADD_NEW_DISK, &disk);
+                       err = add_disk(mdfd, st2, sra, &info2);
                }
                if (err < 0) {
                        fprintf(stderr, Name ": failed to add %s to %s: %s.\n",
@@ -394,15 +424,28 @@ int Incremental(char *devname, int verbose, int runstop,
                        close(mdfd);
                        return 2;
                }
+               info.array.working_disks = 0;
+               for (d = sra->devs; d; d=d->next)
+                       info.array.working_disks ++;
+                       
        }
        /* 6/ Make sure /var/run/mdadm.map contains this array. */
        map_update(&map, devnum,
-                  info.array.major_version,
-                  info.array.minor_version,
+                  info.text_version,
                   info.uuid, chosen_name);
 
        /* 7/ Is there enough devices to possibly start the array? */
        /* 7a/ if not, finish with success. */
+       if (info.array.level == LEVEL_CONTAINER) {
+               /* Try to assemble within the container */
+               close(mdfd);
+               if (verbose >= 0)
+                       fprintf(stderr, Name
+                               ": container %s now has %d devices\n",
+                               chosen_name, info.array.working_disks);
+               return Incremental(chosen_name, verbose, runstop,
+                                  NULL, homehost, autof);
+       }
        avail = NULL;
        active_disks = count_active(st, mdfd, &avail, &info);
        if (enough(info.array.level, info.array.raid_disks,
@@ -459,7 +502,7 @@ int Incremental(char *devname, int verbose, int runstop,
                }
                sra = sysfs_read(mdfd, devnum, 0);
                if ((sra == NULL || active_disks >= info.array.working_disks)
-                   && name != NULL)
+                   && uuid_for_name == 0)
                        rv = ioctl(mdfd, RUN_ARRAY, NULL);
                else
                        rv = sysfs_set_str(sra, NULL,
@@ -639,8 +682,8 @@ void RebuildMap(void)
                                path = map_dev(MD_MAJOR, md->devnum, 0);
                        else
                                path = map_dev(mdp, (-1-md->devnum)<< 6, 0);
-                       map_add(&map, md->devnum, st->ss->major,
-                               st->minor_version,
+                       map_add(&map, md->devnum,
+                               info.text_version,
                                info.uuid, path ? : "/unknown");
                        st->ss->free_super(st);
                        break;
@@ -727,3 +770,218 @@ int IncrementalScan(int verbose)
        }
        return rv;
 }
+
+/* Map the path of a container device node to its md device name.
+ * Opens devname read-only and returns devnum2devname(fd2devnum(fd)),
+ * or NULL when the node cannot be opened.
+ * NOTE(review): the result is presumably heap-allocated and owned by
+ * the caller - confirm against devnum2devname().
+ */
+static char *container2devname(char *devname)
+{
+       char *result;
+       int cfd = open(devname, O_RDONLY);
+
+       if (cfd < 0)
+               return NULL;
+
+       result = devnum2devname(fd2devnum(cfd));
+       close(cfd);
+       return result;
+}
+
+int Incremental_container(struct supertype *st, char *devname, int verbose,
+                         int runstop, int autof)
+{
+       /* Collect the contents of this container and for each
+        * array, choose a device name and assemble the array.
+        *
+        * st       - supertype whose ->ss->container_content() lists the
+        *            member arrays
+        * devname  - path of the container device
+        * verbose  - >= 0 prints progress, > 0 also reports mdadm.conf matches
+        * runstop  - > 0 starts an array even if not all devices are present
+        * autof    - low 3 bits choose partitionable vs. plain md devices,
+        *            the remaining bits are passed to open_mddev_devnum()
+        *
+        * Returns 0 on success, 2 if the container name cannot be determined
+        * or an md device cannot be opened, 1 if the array cannot be
+        * configured through sysfs.
+        */
+
+       struct mdinfo *list = st->ss->container_content(st);
+       struct mdinfo *ra;
+       char *mdname = container2devname(devname);
+
+       if (!mdname) {
+               fprintf(stderr, Name": failed to determine device name\n");
+               return 2;
+       }
+       /* The name was only needed to check that the container can be
+        * resolved; devnum2devname() results are freed by their caller
+        * (see 'dn' below), so release it rather than leaking it.
+        */
+       free(mdname);
+
+       for (ra = list ; ra ; ra = ra->next) {
+               struct mdinfo *dev, *sra;
+               int devnum = -1;
+               int mdfd;
+               char chosen_name[1024];
+               int usepart = 1;
+               char *n;
+               int working = 0, preexist = 0;
+               struct map_ent *mp, *map = NULL;
+               char nbuf[64];
+               char *name_to_use;
+               char *cdev;
+               struct mddev_ident_s *match = NULL;
+
+               if ((autof&7) == 3 || (autof&7) == 5)
+                       usepart = 0;
+
+               mp = map_by_uuid(&map, ra->uuid);
+
+               /* Fall back to a uuid-derived name when the metadata gives
+                * no name, or when the container itself was not given as an
+                * absolute path or is named "UUID-...".
+                */
+               name_to_use = ra->name;
+               if (! name_to_use ||
+                   ! *name_to_use ||
+                   (*devname != '/' || strncmp("UUID-", strrchr(devname,'/')+1,5) == 0)
+                       )
+                       name_to_use = fname_from_uuid(st, ra, nbuf, '-');
+
+               if (!mp) {
+
+                       /* Check in mdadm.conf for devices == devname and
+                        * member == ra->text_version after second slash.
+                        */
+                       char *sub = strchr(ra->text_version+1, '/');
+                       struct mddev_ident_s *array_list;
+                       if (sub) {
+                               sub++;
+                               array_list = conf_get_ident(NULL);
+                       } else
+                               array_list = NULL;
+                       for(; array_list ; array_list = array_list->next) {
+                               int fd;
+                               char *dn;
+                               if (array_list->member == NULL ||
+                                   array_list->container == NULL)
+                                       continue;
+                               if (strcmp(array_list->member, sub) != 0)
+                                       continue;
+                               if (array_list->uuid_set &&
+                                   !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid))
+                                       continue;
+                               fd = open(array_list->container, O_RDONLY);
+                               if (fd < 0)
+                                       continue;
+                               dn = devnum2devname(fd2devnum(fd));
+                               close(fd);
+                               /* no name means no possible match; also
+                                * avoids handing NULL to strncmp/strlen
+                                */
+                               if (dn == NULL)
+                                       continue;
+                               if (strncmp(dn, ra->text_version+1,
+                                           strlen(dn)) != 0 ||
+                                   ra->text_version[strlen(dn)+1] != '/') {
+                                       free(dn);
+                                       continue;
+                               }
+                               free(dn);
+                               /* we have a match */
+                               match = array_list;
+                               if (verbose>0)
+                                       fprintf(stderr, Name ": match found for member %s\n",
+                                               array_list->member);
+                               break;
+                       }
+               }
+
+               if (match && is_standard(match->devname, &devnum))
+                       /* we have devnum now */;
+               else if (mp)
+                       devnum = mp->devnum;
+               else if (is_standard(name_to_use, &devnum))
+                       /* have devnum */;
+               else {
+                       /* Try a name of the form "N" or "dN" first */
+                       n = name_to_use;
+                       if (*n == 'd')
+                               n++;
+                       if (*n && devnum < 0) {
+                               devnum = strtoul(n, &n, 10);
+                               if (devnum >= 0 && (*n == 0 || *n == ' ')) {
+                                       /* Use this devnum */
+                                       usepart = (name_to_use[0] == 'd');
+                                       if (mddev_busy(usepart ? (-1-devnum) : devnum))
+                                               devnum = -1;
+                               } else
+                                       devnum = -1;
+                       }
+
+                       if (devnum < 0) {
+                               /* See whether /dev/md/<name> already exists
+                                * as an md block device we could reuse.
+                                * 'path' is deliberately distinct from the
+                                * outer nbuf, which name_to_use may point at.
+                                */
+                               char *nm = name_to_use;
+                               char path[1024];
+                               struct stat stb;
+                               if (strchr(nm, ':'))
+                                       nm = strchr(nm, ':')+1;
+                               snprintf(path, sizeof(path), "/dev/md/%s", nm);
+
+                               if (stat(path, &stb) == 0 &&
+                                   S_ISBLK(stb.st_mode) &&
+                                   major(stb.st_rdev) == (usepart ?
+                                                          get_mdp_major() : MD_MAJOR)){
+                                       if (usepart)
+                                               devnum = minor(stb.st_rdev)
+                                                       >> MdpMinorShift;
+                                       else
+                                               devnum = minor(stb.st_rdev);
+                                       if (mddev_busy(usepart ? (-1-devnum) : devnum))
+                                               devnum = -1;
+                               }
+                       }
+
+                       /* partitionable devices are encoded as -1-devnum */
+                       if (devnum >= 0)
+                               devnum = usepart ? (-1-devnum) : devnum;
+                       else
+                               devnum = find_free_devnum(usepart);
+               }
+               mdfd = open_mddev_devnum(mp ? mp->path : match ? match->devname : NULL,
+                                        devnum, name_to_use,
+                                        chosen_name, autof>>3);
+
+               if (mdfd < 0) {
+                       fprintf(stderr, Name ": failed to open %s: %s.\n",
+                               chosen_name, strerror(errno));
+                       return 2;
+               }
+
+
+               sysfs_init(ra, mdfd, 0);
+
+               /* Only (re)configure the array if sysfs does not already
+                * report the metadata version we expect.
+                */
+               sra = sysfs_read(mdfd, 0, GET_VERSION);
+               if (sra == NULL || strcmp(sra->text_version, ra->text_version) != 0) {
+                       if (sysfs_set_array(ra, md_get_version(mdfd)) != 0) {
+                               /* don't leak the sysfs info or the open
+                                * md device on the error path
+                                */
+                               if (sra)
+                                       sysfs_free(sra);
+                               close(mdfd);
+                               return 1;
+                       }
+               }
+               if (sra)
+                       sysfs_free(sra);
+
+               for (dev = ra->devs; dev; dev = dev->next)
+                       if (sysfs_add_disk(ra, dev) == 0)
+                               working++;
+                       else if (errno == EEXIST)
+                               preexist++;
+               if (working == 0)
+                       /* Nothing new, don't try to start */ ;
+               else if (runstop > 0 ||
+                        (working + preexist) >= ra->array.working_disks) {
+                       switch(ra->array.level) {
+                       case LEVEL_LINEAR:
+                       case LEVEL_MULTIPATH:
+                       case 0:
+                               sysfs_set_str(ra, NULL, "array_state",
+                                             "active");
+                               break;
+                       default:
+                               sysfs_set_str(ra, NULL, "array_state",
+                                             "readonly");
+                               /* start mdmon if needed. */
+                               if (!mdmon_running(st->container_dev))
+                                       start_mdmon(st->container_dev);
+                               /* the name is ours to free once the
+                                * monitor has been pinged
+                                */
+                               cdev = devnum2devname(st->container_dev);
+                               if (cdev) {
+                                       ping_monitor(cdev);
+                                       free(cdev);
+                               }
+                               break;
+                       }
+                       sysfs_set_safemode(ra, ra->safe_mode_delay);
+                       if (verbose >= 0) {
+                               fprintf(stderr, Name
+                                       ": Started %s with %d devices",
+                                       chosen_name, working + preexist);
+                               if (preexist)
+                                       fprintf(stderr, " (%d new)", working);
+                               fprintf(stderr, "\n");
+                       }
+                       /* FIXME should have an O_EXCL and wait for read-auto */
+               } else
+                       if (verbose >= 0)
+                               fprintf(stderr, Name
+                                       ": %s assembled with %d devices but "
+                                       "not started\n",
+                                       chosen_name, working);
+               close(mdfd);
+               map_update(&map, devnum,
+                          ra->text_version,
+                          ra->uuid, chosen_name);
+       }
+       return 0;
+}
diff --git a/Kill.c b/Kill.c
index 0a2763eaa20615940a035ce4d7e2b3be8bd2a222..d5c1e36df4d3724e6cf060238902cb897e01bfed 100644 (file)
--- a/Kill.c
+++ b/Kill.c
@@ -34,7 +34,7 @@
 #include       "md_u.h"
 #include       "md_p.h"
 
-int Kill(char *dev, int force, int quiet)
+int Kill(char *dev, int force, int quiet, int noexcl)
 {
        /*
         * Nothing fancy about Kill.  It just zeroes out a superblock
@@ -44,7 +44,7 @@ int Kill(char *dev, int force, int quiet)
        int fd, rv = 0;
        struct supertype *st;
 
-       fd = open(dev, O_RDWR|O_EXCL);
+       fd = open(dev, O_DIRECT | (noexcl ? O_RDWR : (O_RDWR|O_EXCL)));
        if (fd < 0) {
                if (!quiet)
                        fprintf(stderr, Name ": Couldn't open %s for write - not zeroing\n",
@@ -63,10 +63,8 @@ int Kill(char *dev, int force, int quiet)
        if (force && rv >= 2)
                rv = 0; /* ignore bad data in superblock */
        if (rv== 0 || (force && rv >= 2)) {
-               mdu_array_info_t info;
-               info.major_version = -1; /* zero superblock */
                st->ss->free_super(st);
-               st->ss->init_super(st, &info, 0, "", NULL, NULL);
+               st->ss->init_super(st, NULL, 0, "", NULL, NULL);
                if (st->ss->store_super(st, fd)) {
                        if (!quiet)
                                fprintf(stderr, Name ": Could not zero superblock on %s\n",
index 52bd55051049cc470adbaf36955646f06da68adc..826fd2f8e5dcfc59bd7ebcfd1bba9d156bd3cd78 100644 (file)
--- a/Makefile
+++ b/Makefile
 # e.g.  make CXFLAGS=-O to optimise
 TCC = tcc
 UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found )
-DIET_GCC = diet gcc
+#DIET_GCC = diet gcc
+# sorry, but diet-libc doesn't know about posix_memalign, 
+# so we cannot use it any more.
+DIET_GCC = gcc -DHAVE_STDINT_H
 
 KLIBC=/home/src/klibc/klibc-0.77
 
@@ -40,6 +43,9 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB
 CC = $(CROSS_COMPILE)gcc
 CXFLAGS = -ggdb
 CWFLAGS = -Wall -Werror -Wstrict-prototypes
+ifdef WARN_UNUSED
+CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O
+endif
 
 ifdef DEBIAN
 CPPFLAGS= -DDEBIAN
@@ -69,19 +75,24 @@ MAN8DIR = $(MANDIR)/man8
 OBJS =  mdadm.o config.o mdstat.o  ReadMe.o util.o Manage.o Assemble.o Build.o \
        Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
        Incremental.o \
-       mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \
-       mapfile.o
+       mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+       restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o
 SRCS =  mdadm.c config.c mdstat.c  ReadMe.c util.c Manage.c Assemble.c Build.c \
        Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \
        Incremental.c \
-       mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \
-       mapfile.c
+       mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \
+       restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \
+       Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+       super-ddf.o sha1.o crc32.o msg.o Monitor.o bitmap.o
+
 
 STATICSRC = pwgr.c
 STATICOBJS = pwgr.o
 
 ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \
-       super0.c super1.c sha1.c
+       super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c
 ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c sysfs.c
 ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
 ifdef MDASSEMBLE_AUTO
@@ -89,7 +100,7 @@ ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS)
 ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
 endif
 
-all : mdadm mdadm.man md.man mdadm.conf.man
+all : mdadm mdmon mdadm.man md.man mdadm.conf.man
 
 everything: all mdadm.static swap_super test_stripe \
        mdassemble mdassemble.auto mdassemble.static mdassemble.man \
@@ -119,6 +130,10 @@ mdadm.Os : $(SRCS) mdadm.h
 mdadm.O2 : $(SRCS) mdadm.h
        gcc -o mdadm.O2 $(CFLAGS)  -DHAVE_STDINT_H -O2 $(SRCS)
 
+mdmon : $(MON_OBJS)
+       $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
 test_stripe : restripe.c mdadm.h
        $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c
 
@@ -156,13 +171,15 @@ mdadm.conf.man : mdadm.conf.5
 mdassemble.man : mdassemble.8
        nroff -man mdassemble.8 > mdassemble.man
 
-$(OBJS) : mdadm.h bitmap.h
+$(OBJS) : mdadm.h mdmon.h bitmap.h
+$(MON_OBJS) : mdadm.h mdmon.h bitmap.h
 
 sha1.o : sha1.c sha1.h md5.h
        $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
 
-install : mdadm install-man
+install : mdadm mdmon install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+       $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
 
 install-static : mdadm.static install-man
        $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
@@ -188,7 +205,8 @@ test: mdadm test_stripe swap_super
        @echo "Please run 'sh ./test' as root"
 
 clean : 
-       rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+       rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+       mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
        mdadm.Os mdadm.O2 \
        mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
        mdassemble.klibc swap_super \
index 160778ed03125d4fd80fb9793bc81214619d9445..6b9825b41b0689065eb19f04afdd5cbb3cfd184d 100644 (file)
--- a/Manage.c
+++ b/Manage.c
@@ -45,11 +45,57 @@ int Manage_ro(char *devname, int fd, int readonly)
         *
         */
        mdu_array_info_t array;
+#ifndef MDASSEMBLE
+       struct mdinfo *mdi;
+#endif
 
        if (md_get_version(fd) < 9000) {
                fprintf(stderr, Name ": need md driver version 0.90.0 or later\n");
                return 1;
        }
+#ifndef MDASSEMBLE
+       /* If this is an externally-managed array, we need to modify the
+        * metadata_version so that mdmon doesn't undo our change.
+        */
+       mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+       if (mdi &&
+           mdi->array.major_version == -1 &&
+           mdi->array.level > 0 &&
+           is_subarray(mdi->text_version)) {
+               char vers[64];
+               strcpy(vers, "external:");
+               strcat(vers, mdi->text_version);
+               if (readonly > 0) {
+                       int rv;
+                       /* We set readonly ourselves. */
+                       vers[9] = '-';
+                       sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+                       close(fd);
+                       rv = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+                       if (rv < 0) {
+                               fprintf(stderr, Name ": failed to set readonly for %s: %s\n",
+                                       devname, strerror(errno));
+
+                               vers[9] = mdi->text_version[0];
+                               sysfs_set_str(mdi, NULL, "metadata_version", vers);
+                               return 1;
+                       }
+               } else {
+                       char *cp;
+                       /* We cannot set read/write - must signal mdmon */
+                       vers[9] = '/';
+                       sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+                       cp = strchr(vers+10, '/');
+                       if (*cp)
+                               *cp = 0;
+                       ping_monitor(vers+10);
+               }
+               return 0;
+       }
+#endif
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                fprintf(stderr, Name ": %s does not appear to be active.\n",
                        devname);
@@ -78,13 +124,18 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
 {
        /* Run or stop the array. array must already be configured
         * required >= 0.90.0
+        * Only print failure messages if quiet == 0;
+        * quiet > 0 means really be quiet
+        * quiet < 0 means we will try again if it fails.
         */
        mdu_param_t param; /* unused */
 
        if (runstop == -1 && md_get_version(fd) < 9000) {
                if (ioctl(fd, STOP_MD, 0)) {
-                       if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n",
-                                           devname, strerror(errno));
+                       if (quiet == 0) fprintf(stderr,
+                                               Name ": stopping device %s "
+                                               "failed: %s\n",
+                                               devname, strerror(errno));
                        return 1;
                }
        }
@@ -111,9 +162,46 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
        } else if (runstop < 0){
                struct map_ent *map = NULL;
                struct stat stb;
-               if (ioctl(fd, STOP_ARRAY, NULL)) {
-                       if (quiet==0) {
-                               fprintf(stderr, Name ": fail to stop array %s: %s\n",
+               struct mdinfo *mdi;
+               /* If this is an mdmon managed array, just write 'inactive'
+                * to the array state and let mdmon clear up.
+                */
+               mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION);
+               if (mdi &&
+                   mdi->array.level > 0 &&
+                   is_subarray(mdi->text_version)) {
+                       /* This is mdmon managed. */
+                       close(fd);
+                       if (sysfs_set_str(mdi, NULL,
+                                         "array_state", "inactive") < 0) {
+                               if (quiet == 0)
+                                       fprintf(stderr, Name
+                                               ": failed to stop array %s: %s\n",
+                                               devname, strerror(errno));
+                               return 1;
+                       }
+
+                       /* Give monitor a chance to act */
+                       ping_monitor(mdi->text_version);
+
+                       fd = open(devname, O_RDONLY);
+               } else if (mdi &&
+                          mdi->array.major_version == -1 &&
+                          mdi->array.minor_version == -2 &&
+                          !is_subarray(mdi->text_version)) {
+                       /* container, possibly mdmon-managed.
+                        * Make sure mdmon isn't opening it, which
+                        * would interfere with the 'stop'
+                        */
+                       ping_monitor(mdi->sys_name);
+               }
+               if (mdi)
+                       sysfs_free(mdi);
+
+               if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) {
+                       if (quiet == 0) {
+                               fprintf(stderr, Name
+                                       ": failed to stop array %s: %s\n",
                                        devname, strerror(errno));
                                if (errno == EBUSY)
                                        fprintf(stderr, "Perhaps a running "
@@ -122,9 +210,10 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet)
                        }
                        return 1;
                }
+
                if (quiet <= 0)
                        fprintf(stderr, Name ": stopped %s\n", devname);
-               if (fstat(fd, &stb) == 0) {
+               if (fd >= 0 && fstat(fd, &stb) == 0) {
                        int devnum;
                        if (major(stb.st_rdev) == MD_MAJOR)
                                devnum = minor(stb.st_rdev);
@@ -201,6 +290,7 @@ int Manage_subdevs(char *devname, int fd,
        struct supertype *st, *tst;
        int duuid[4];
        int ouuid[4];
+       int lfd = -1;
 
        if (ioctl(fd, GET_ARRAY_INFO, &array)) {
                fprintf(stderr, Name ": cannot get array info for %s\n",
@@ -227,6 +317,7 @@ int Manage_subdevs(char *devname, int fd,
                unsigned long long ldsize;
                char dvname[20];
                char *dnprintable = dv->devname;
+               int err;
 
                next = dv->next;
                jnext = 0;
@@ -311,9 +402,14 @@ int Manage_subdevs(char *devname, int fd,
                        return 1;
                case 'a':
                        /* add the device */
-
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot add disks to a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
                        /* Make sure it isn't in use (in 2.6 or later) */
-                       tfd = open(dv->devname, O_RDONLY|O_EXCL);
+                       tfd = open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT);
                        if (tfd < 0) {
                                fprintf(stderr, Name ": Cannot open %s: %s\n",
                                        dv->devname, strerror(errno));
@@ -332,7 +428,9 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        close(tfd);
 
-                       if (array.major_version == 0 &&
+
+                       if (!tst->ss->external &&
+                           array.major_version == 0 &&
                            md_get_version(fd)%100 < 2) {
                                if (ioctl(fd, HOT_ADD_DISK,
                                          (unsigned long)stb.st_rdev)==0) {
@@ -347,12 +445,16 @@ int Manage_subdevs(char *devname, int fd,
                                return 1;
                        }
 
-                       if (array.not_persistent == 0) {
+                       if (array.not_persistent == 0 || tst->ss->external) {
 
                                /* need to find a sample superblock to copy, and
-                                * a spare slot to use
+                                * a spare slot to use.
+                                * For 'external' array (well, container based),
+                                * We can just load the metadata for the array.
                                 */
-                               for (j = 0; j < tst->max_devs; j++) {
+                               if (tst->ss->external) {
+                                       tst->ss->load_super(tst, fd, NULL);
+                               } else for (j = 0; j < tst->max_devs; j++) {
                                        char *dev;
                                        int dfd;
                                        disc.number = j;
@@ -374,6 +476,7 @@ int Manage_subdevs(char *devname, int fd,
                                        close(dfd);
                                        break;
                                }
+                               /* FIXME this is a bad test to be using */
                                if (!tst->sb) {
                                        fprintf(stderr, Name ": cannot find valid superblock in this array - HELP\n");
                                        return 1;
@@ -453,12 +556,18 @@ int Manage_subdevs(char *devname, int fd,
                        disc.minor = minor(stb.st_rdev);
                        disc.number =j;
                        disc.state = 0;
-                       if (array.not_persistent==0) {
+                       if (array.not_persistent==0 || tst->ss->external) {
+                               int dfd;
                                if (dv->writemostly == 1)
                                        disc.state |= 1 << MD_DISK_WRITEMOSTLY;
-                               tst->ss->add_to_super(tst, &disc);
-                               if (tst->ss->write_init_super(tst, &disc,
-                                                             dv->devname))
+                               dfd = open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+                               tst->ss->add_to_super(tst, &disc, dfd,
+                                                     dv->devname);
+                               /* write_init_super will close 'dfd' */
+                               if (tst->ss->external)
+                                       /* mdmon will write the metadata */
+                                       close(dfd);
+                               else if (tst->ss->write_init_super(tst))
                                        return 1;
                        } else if (dv->re_add) {
                                /*  this had better be raid1.
@@ -491,7 +600,52 @@ int Manage_subdevs(char *devname, int fd,
                        }
                        if (dv->writemostly == 1)
                                disc.state |= (1 << MD_DISK_WRITEMOSTLY);
-                       if (ioctl(fd,ADD_NEW_DISK, &disc)) {
+                       if (tst->ss->external) {
+                               /* add a disk to an external metadata container
+                                * only if mdmon is around to see it
+                                */
+                               struct mdinfo new_mdi;
+                               struct mdinfo *sra;
+                               int container_fd;
+                               int devnum = fd2devnum(fd);
+
+                               container_fd = open_dev_excl(devnum);
+                               if (container_fd < 0) {
+                                       fprintf(stderr, Name ": add failed for %s:"
+                                               " could not get exclusive access to container\n",
+                                               dv->devname);
+                                       return 1;
+                               }
+
+                               if (!mdmon_running(devnum)) {
+                                       fprintf(stderr, Name ": add failed for %s: mdmon not running\n",
+                                               dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+
+                               sra = sysfs_read(container_fd, -1, 0);
+                               if (!sra) {
+                                       fprintf(stderr, Name ": add failed for %s: sysfs_read failed\n",
+                                               dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+                               sra->array.level = LEVEL_CONTAINER;
+                               /* Need to set data_offset and component_size */
+                               tst->ss->getinfo_super(tst, &new_mdi);
+                               new_mdi.disk.major = disc.major;
+                               new_mdi.disk.minor = disc.minor;
+                               if (sysfs_add_disk(sra, &new_mdi) != 0) {
+                                       fprintf(stderr, Name ": add new device to external metadata"
+                                               " failed for %s\n", dv->devname);
+                                       close(container_fd);
+                                       return 1;
+                               }
+                               ping_monitor(devnum2devname(devnum));
+                               sysfs_free(sra);
+                               close(container_fd);
+                       } else if (ioctl(fd, ADD_NEW_DISK, &disc)) {
                                fprintf(stderr, Name ": add new device failed for %s as %d: %s\n",
                                        dv->devname, j, strerror(errno));
                                return 1;
@@ -502,13 +656,87 @@ int Manage_subdevs(char *devname, int fd,
 
                case 'r':
                        /* hot remove */
+                       if (tst->subarray[0]) {
+                               fprintf(stderr, Name ": Cannot remove disks from a"
+                                       " \'member\' array, perform this"
+                                       " operation on the parent container\n");
+                               return 1;
+                       }
+                       if (tst->ss->external) {
+                               /* To remove a device from a container, we must
+                                * check that it isn't in use in an array.
+                                * This involves looking in the 'holders'
+                                * directory - there must be just one entry,
+                                * the container.
+                                * To ensure that it doesn't get used as a
+                                * hold spare while we are checking, we
+                                * get an O_EXCL open on the container
+                                */
+                               int dnum = fd2devnum(fd);
+                               lfd = open_dev_excl(dnum);
+                               if (lfd < 0) {
+                                       fprintf(stderr, Name
+                                               ": Cannot get exclusive access "
+                                               " to container - odd\n");
+                                       return 1;
+                               }
+                               if (!sysfs_unique_holder(dnum, stb.st_rdev)) {
+                                       fprintf(stderr, Name
+                                               ": %s is %s, cannot remove.\n",
+                                               dnprintable,
+                                               errno == EEXIST ? "still in use":
+                                               "not a member");
+                                       close(lfd);
+                                       return 1;
+                               }
+                       }
                        /* FIXME check that it is a current member */
-                       if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) {
+                       err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev);
+                       if (err && errno == ENODEV) {
+                               /* Old kernels rejected this if no personality
+                                * registered */
+                               struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS);
+                               struct mdinfo *dv = NULL;
+                               if (sra)
+                                       dv = sra->devs;
+                               for ( ; dv ; dv=dv->next)
+                                       if (dv->disk.major == major(stb.st_rdev) &&
+                                           dv->disk.minor == minor(stb.st_rdev))
+                                               break;
+                               if (dv)
+                                       err = sysfs_set_str(sra, dv,
+                                                           "state", "remove");
+                               else
+                                       err = -1;
+                               if (sra)
+                                       sysfs_free(sra);
+                       }
+                       if (err) {
                                fprintf(stderr, Name ": hot remove failed "
                                        "for %s: %s\n", dnprintable,
                                        strerror(errno));
+                               if (lfd >= 0)
+                                       close(lfd);
                                return 1;
                        }
+                       if (tst->ss->external) {
+                               /*
+                                * Before dropping our exclusive open we make an
+                                * attempt at preventing mdmon from seeing an
+                                * 'add' event before reconciling this 'remove'
+                                * event.
+                                */
+                               char *name = devnum2devname(fd2devnum(fd));
+
+                               if (!name) {
+                                       fprintf(stderr, Name ": unable to get container name\n");
+                                       return 1;
+                               }
+
+                               ping_manager(name);
+                               free(name);
+                       }
+                       close(lfd);
                        if (verbose >= 0)
                                fprintf(stderr, Name ": hot removed %s\n",
                                        dnprintable);
index abc2dbd684b24cc578932bcef8bfbedc4e6c458b..1c190577ab04417473d345ce89047e5896e99674 100644 (file)
--- a/Monitor.c
+++ b/Monitor.c
@@ -602,10 +602,7 @@ int Wait(char *dev)
                        strerror(errno));
                return 2;
        }
-       if (major(stb.st_rdev) == MD_MAJOR)
-               devnum = minor(stb.st_rdev);
-       else
-               devnum = -1-(minor(stb.st_rdev)/64);
+       devnum = stat2devnum(&stb);
 
        while(1) {
                struct mdstat_ent *ms = mdstat_read(1, 0);
@@ -616,6 +613,13 @@ int Wait(char *dev)
                                break;
 
                if (!e || e->percent < 0) {
+                       if (e &&
+                           strncmp(e->metadata_version, "external:", 9) == 0) {
+                               if (is_subarray(&e->metadata_version[9]))
+                                       ping_monitor(&e->metadata_version[9]);
+                               else
+                                       ping_monitor(devnum2devname(devnum));
+                       }
                        free_mdstat(ms);
                        return rv;
                }
@@ -624,3 +628,137 @@ int Wait(char *dev)
                mdstat_wait(5);
        }
 }
+
+/* array_state values that WaitClean() accepts as "clean".  The list is
+ * NULL terminated; the indexes of the real entries are 0..4.
+ */
+static char *clean_states[] = {
+       "clear", "inactive", "readonly", "read-auto", "clean", NULL };
+
+/*
+ * WaitClean - wait for an array to reach a clean array_state.
+ *
+ * dev:     path of the md device node to open.
+ * verbose: when non-zero, failures are reported on stderr.
+ *
+ * Returns 0 when there is nothing to wait for or the array became clean
+ * and mdmon answered the ping; returns 1 if the device cannot be opened,
+ * the wait fails or times out, or the ping fails.
+ * NOTE(review): a sysfs_read() failure returns 0, i.e. it is reported the
+ * same way as success - confirm that callers rely on this.
+ */
+int WaitClean(char *dev, int verbose)
+{
+       int fd;
+       struct mdinfo *mdi;
+       int rv = 1;
+       int devnum;
+
+       fd = open(dev, O_RDONLY);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": Couldn't open %s: %s\n", dev, strerror(errno));
+               return 1;
+       }
+
+       devnum = fd2devnum(fd);
+       mdi = sysfs_read(fd, devnum, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+       if (!mdi) {
+               if (verbose)
+                       fprintf(stderr, Name ": Failed to read sysfs attributes for "
+                               "%s\n", dev);
+               close(fd);
+               return 0;
+       }
+
+       switch(mdi->array.level) {
+       case LEVEL_LINEAR:
+       case LEVEL_MULTIPATH:
+       case 0:
+               /* safemode delay is irrelevant for these levels */
+               rv = 0;
+               break;
+       }
+
+       /* for internal metadata the kernel handles the final clean
+        * transition, containers can never be dirty
+        */
+       if (!is_subarray(mdi->text_version))
+               rv = 0;
+
+       /* safemode disabled ? */
+       if (mdi->safe_mode_delay == 0)
+               rv = 0;
+
+       if (rv) {
+               int state_fd = sysfs_open(devnum, NULL, "array_state");
+               char buf[20];
+               fd_set fds;
+               struct timeval tm;
+
+               /* minimize the safe_mode_delay and prepare to wait up to 5s
+                * for writes to quiesce
+                */
+               sysfs_set_safemode(mdi, 1);
+               tm.tv_sec = 5;
+               tm.tv_usec = 0;
+
+               /* give mdmon a chance to checkpoint resync */
+               sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+               FD_ZERO(&fds);
+
+               /* wait for array_state to be clean */
+               while (1) {
+                       /* leave room for the terminator so that the match
+                        * below never reads stale bytes
+                        */
+                       rv = read(state_fd, buf, sizeof(buf) - 1);
+                       if (rv < 0)
+                               break;
+                       buf[rv] = 0;
+                       if (sysfs_match_word(buf, clean_states) <= 4)
+                               break;
+                       /* sysfs attributes always poll as readable; a
+                        * change is signalled as an exceptional condition,
+                        * so wait on exceptfds or select() never sleeps
+                        */
+                       FD_SET(state_fd, &fds);
+                       rv = select(state_fd + 1, NULL, NULL, &fds, &tm);
+                       if (rv < 0 && errno != EINTR)
+                               break;
+                       if (rv == 0) {
+                               /* the 5s budget is used up: give up rather
+                                * than spin on a zero timeout
+                                */
+                               rv = -1;
+                               break;
+                       }
+                       lseek(state_fd, 0, SEEK_SET);
+               }
+               if (rv < 0)
+                       rv = 1;
+               else if (ping_monitor(mdi->text_version) == 0) {
+                       /* we need to ping to close the window between array
+                        * state transitioning to clean and the metadata being
+                        * marked clean
+                        */
+                       rv = 0;
+               } else
+                       rv = 1;
+               if (rv && verbose)
+                       fprintf(stderr, Name ": Error waiting for %s to be clean\n",
+                               dev);
+
+               /* restore the original safe_mode_delay */
+               sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+               close(state_fd);
+       }
+
+       sysfs_free(mdi);
+       close(fd);
+
+       return rv;
+}
+
+
diff --git a/Query.c b/Query.c
index 190ee298834e70d9640e29fbc551bba0a5742934..dc69eb8271ec171c35a418bf883b52cfacb04b6d 100644 (file)
--- a/Query.c
+++ b/Query.c
@@ -96,7 +96,7 @@ int Query(char *dev)
        if (superror == 0) {
                /* array might be active... */
                st->ss->getinfo_super(st, &info);
-               if (st->ss->major == 0) {
+               if (st->ss == &super0) {
                        mddev = get_md_name(info.array.md_minor);
                        disc.number = info.disk.number;
                        activity = "undetected";
@@ -121,7 +121,7 @@ int Query(char *dev)
                       activity,
                       map_num(pers, info.array.level),
                       mddev);
-               if (st->ss->major == 0)
+               if (st->ss == &super0)
                        put_md_name(mddev);
        }
        return 0;
index 031889432c21e85509e2efc5daefc5126fa73f29..e361f1dcbf3a83c9e262b42cb59099726a47ac4b 100644 (file)
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -24,7 +24,7 @@
 
 #include "mdadm.h"
 
-char Version[] = Name " - v2.6.7 - 6th June 2008\n";
+char Version[] = Name " - v3.0-devel1 - 18th September 2008\n";
 
 /*
  * File: ReadMe.c
@@ -161,6 +161,7 @@ struct option long_options[] = {
     {"readwrite", 0, 0, 'w'},
     {"no-degraded",0,0,  NoDegraded },
     {"wait",     0, 0, 'W'},
+    {"wait-clean", 0, 0, Waitclean },
 
     /* For Detail/Examine */
     {"brief",    0, 0, 'b'},
@@ -612,6 +613,7 @@ mapping_t pers[] = {
        { "raid10", 10},
        { "10", 10},
        { "faulty", LEVEL_FAULTY},
+       { "container", LEVEL_CONTAINER},
        { NULL, 0}
 };
 
diff --git a/TODO b/TODO
index f79163b88ca434065232034381af3a27aff23c25..279d20db99892c8e79b969a961ae38a7be5fd77c 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,3 +1,38 @@
+ - add 'name' field to metadata type and use it.
+ - use validate_geometry more
+ - metadata should be able to check/reject bitmap stuff.
+
+DDF:
+  Three new metadata types:
+    ddf - used only to create a container.
+    ddf-bvd - used to create an array in a container
+    ddf-svd - used to create a secondary array from bvds.
+
+  Usage:
+    mdadm -C /dev/ddf1 /dev/sd[abcdef]
+    mdadm -C /dev/md1 -e ddf /dev/sd[a-f]
+    mdadm -C /dev/md1 -l container /dev/sd[a-f]
+
+        Each of these creates a new ddf container using all those
+       devices.  The name 'ddf*' signals that ddf metadata should be used.
+       '-e ddf' only supports one level - 'container'.  'container' is only
+       supported by ddf.
+
+    mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ???
+    mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb
+       If exactly one device is given, and it is a container, we select
+       devices from that container.
+       If devices are given that are already in use, they must be in use by
+       a container, and the array is created in the container.
+       If devices given are bvds, we slip under the hood to make
+         the svd arrays.
+
+    mdadm -A /dev/ddf ......
+       base drives make a container.  Anything in that container is started
+        auto-read-only.
+        if /dev/ddf is already assembled, we assemble bvds and svds inside it.
+
+
 2005-dec-20
   Want an incremental assembly mode to work nicely with udev.
   Core usage would be something like
index b64793924bd04c01e870e147f359af05468f5e9a..18101664fa1728e997bdb27826141d5a1f40cca7 100644 (file)
--- a/bitmap.c
+++ b/bitmap.c
@@ -131,11 +131,13 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
         */
        unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0;
        bitmap_info_t *info;
-       char *buf, *unaligned;
+       void *buf;
        int n, skip;
 
-       unaligned = malloc(8192*2);
-       buf = (char*) ((unsigned long)unaligned | 8191)+1;
+       if (posix_memalign(&buf, 512, 8192) != 0) {
+               fprintf(stderr, Name ": failed to allocate 8192 bytes\n");
+               return NULL;
+       }
        n = read(fd, buf, 8192);
 
        info = malloc(sizeof(*info));
@@ -154,7 +156,6 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief)
                fprintf(stderr, Name ": failed to read superblock of bitmap "
                        "file: %s\n", strerror(errno));
                free(info);
-               free(unaligned);
                return NULL;
        }
        memcpy(&info->sb, buf, sizeof(info->sb));
index 121b3373554076212b844823847bce18e7d96eff..02ab3e00cadf43a3eed2ff429ac17ac4926b6933 100644 (file)
--- a/config.c
+++ b/config.c
@@ -434,6 +434,8 @@ void arrayline(char *line)
        mis.bitmap_fd = -1;
        mis.bitmap_file = NULL;
        mis.name[0] = 0;
+       mis.container = NULL;
+       mis.member = NULL;
 
        for (w=dl_next(line); w!=line; w=dl_next(w)) {
                if (w[0] == '/') {
@@ -516,6 +518,12 @@ void arrayline(char *line)
                } else if (strncasecmp(w, "auto=", 5) == 0 ) {
                        /* whether to create device special files as needed */
                        mis.autof = parse_auto(w+5, "auto type", 0);
+               } else if (strncasecmp(w, "member=", 7) == 0) {
+                       /* subarray within a container */
+                       mis.member = strdup(w+7);
+               } else if (strncasecmp(w, "container=", 10) == 0) {
+                       /* the container holding this subarray */
+                       mis.container = strdup(w+10);
                } else {
                        fprintf(stderr, Name ": unrecognised word on ARRAY line: %s\n",
                                w);
@@ -558,10 +566,12 @@ void mailfromline(char *line)
                if (alert_mail_from == NULL)
                        alert_mail_from = strdup(w);
                else {
-                       char *t= NULL;
-                       asprintf(&t, "%s %s", alert_mail_from, w);
-                       free(alert_mail_from);
-                       alert_mail_from = t;
+                       char *t = NULL;
+
+                       if (asprintf(&t, "%s %s", alert_mail_from, w) > 0) {
+                               free(alert_mail_from);
+                               alert_mail_from = t;
+                       }
                }
        }
 }
diff --git a/crc32.c b/crc32.c
new file mode 100644 (file)
index 0000000..12d08e5
--- /dev/null
+++ b/crc32.c
@@ -0,0 +1,340 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results about a factor
+ * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+  protection on the static variables used to control the first-use generation
+  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+  first call get_crc_table() to initialize the tables before allowing more than
+  one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+/* Stand-ins for zutil.h (not included), which would supply STDC, FAR, etc. */
+#define STDC
+#define FAR
+#define Z_NULL ((void*)0)
+#define OF(X) X
+#define ZEXPORT
+typedef long ptrdiff_t;         /* local definition; <stddef.h> is not included */
+#define NOBYFOUR                /* keeps BYFOUR undefined: byte-wise CRC only */
+
+#define local static
+
+/* Find a four-byte integer type for crc32_little() and crc32_big(). */
+#ifndef NOBYFOUR
+#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
+#    include <limits.h>
+#    define BYFOUR
+#    if (UINT_MAX == 0xffffffffUL)
+       typedef unsigned int u4;
+#    else
+#      if (ULONG_MAX == 0xffffffffUL)
+         typedef unsigned long u4;
+#      else
+#        if (USHRT_MAX == 0xffffffffUL)
+           typedef unsigned short u4;
+#        else
+#          undef BYFOUR     /* can't find a four-byte integer type! */
+#        endif
+#      endif
+#    endif
+#  endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#ifdef BYFOUR
+#  define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+                (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+   local unsigned long crc32_little OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+   local unsigned long crc32_big OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+#  define TBLS 8
+#else
+#  define TBLS 1
+#endif /* BYFOUR */
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;
+local unsigned long FAR crc_table[TBLS][256];
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+   local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+
+/*
+  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+  Polynomials over GF(2) are represented in binary, one bit per coefficient,
+  with the lowest powers in the most significant bit.  Then adding polynomials
+  is just exclusive-or, and multiplying a polynomial by x is a right shift by
+  one.  If we call the above polynomial p, and represent a byte as the
+  polynomial q, also with the lowest power in the most significant bit (so the
+  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+  where a mod b means the remainder after dividing a by b.
+
+  This calculation is done using the shift-register method of multiplying and
+  taking the remainder.  The register is initialized to zero, and for each
+  incoming bit, x^32 is added mod p to the register if the bit is a one (where
+  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+  x (which is shifting right by one and adding x^32 mod p if the bit shifted
+  out is a one).  We start with the highest power (least significant bit) of
+  q and repeat for all eight bits of q.
+
+  The first table is simply the CRC of all possible eight bit values.  This is
+  all the information needed to generate CRCs on data a byte at a time for all
+  combinations of CRC register values and incoming bytes.  The remaining tables
+  allow for word-at-a-time CRC calculation for both big-endian and little-
+  endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+    unsigned long c;                    /* CRC value for the current entry */
+    int n, k;                           /* n: entry/term index, k: bit or table */
+    unsigned long poly;                 /* polynomial exclusive-or pattern */
+    /* p[] below holds the terms of the polynomial for this crc (except x^32) */
+    static volatile int first = 1;      /* flag to limit concurrent making */
+    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* See if another task is already doing this (not thread-safe, but better
+       than nothing -- significantly reduces duration of vulnerability in
+       case the advice about DYNAMIC_CRC_TABLE is ignored) */
+    if (first) {
+        first = 0;
+
+        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+        poly = 0UL;
+        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+            poly |= 1UL << (31 - p[n]);
+
+        /* generate a crc for every 8-bit value */
+        for (n = 0; n < 256; n++) {
+            c = (unsigned long)n;
+            for (k = 0; k < 8; k++)
+                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+            crc_table[0][n] = c;
+        }
+
+#ifdef BYFOUR
+        /* generate crc for each value followed by one, two, and three zeros,
+           and then the byte reversal of those as well as the first table */
+        for (n = 0; n < 256; n++) {
+            c = crc_table[0][n];
+            crc_table[4][n] = REV(c);
+            for (k = 1; k < 4; k++) {
+                c = crc_table[0][c & 0xff] ^ (c >> 8);
+                crc_table[k][n] = c;
+                crc_table[k + 4][n] = REV(c);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0;            /* tables done; ends the wait loop below */
+    }
+    else {      /* not first */
+        /* wait for the other guy to finish (not efficient, but rare) */
+        while (crc_table_empty)
+            ;
+    }
+
+#ifdef MAKECRCH
+    /* write out CRC tables to crc32.h */
+    {
+        FILE *out;
+
+        out = fopen("crc32.h", "w");
+        if (out == NULL) return;        /* cannot create crc32.h: skip the dump */
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const unsigned long FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");
+        for (k = 1; k < 8; k++) {
+            fprintf(out, "  },\n  {\n");
+            write_table(out, crc_table[k]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, "  }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void write_table(out, table)
+    FILE *out;                          /* stream the initializer is written to */
+    const unsigned long FAR *table;     /* the 256 entries to print */
+{
+    int n;
+
+    for (n = 0; n < 256; n++)           /* five entries per output line */
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ", table[n],
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* Return crc_table (built on first use when DYNAMIC_CRC_TABLE is defined).
+ * This function can be used by asm versions of crc32()
+ */
+const unsigned long FAR * ZEXPORT get_crc_table(void)
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const unsigned long FAR *)crc_table;
+}
+
+/* DO1 folds one byte from *buf into crc (advancing buf); DO8 does eight. */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* Fold len bytes at buf into the running value crc; a NULL buf returns 0. */
+unsigned long ZEXPORT crc32(
+       unsigned long crc,              /* CRC value to continue from */
+       const unsigned char FAR *buf,   /* data to process */
+       unsigned len)                   /* number of bytes at buf */
+{
+    if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR   /* never defined in this file, see NOBYFOUR above */
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        u4 endian;
+
+        endian = 1;
+        if (*((unsigned char *)(&endian)))
+            return crc32_little(crc, buf, len);
+        else
+            return crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+/* the initial inversion is commented out here:  crc = crc ^ 0xffffffffUL; */
+    while (len >= 8) {
+        DO8;
+        len -= 8;
+    }
+    if (len) do {
+        DO1;
+    } while (--len);
+    return crc /* final inversion commented out too: ^ 0xffffffffUL */;
+}
+
+#ifdef BYFOUR
+
+/* DOLIT4 folds one 32-bit word from *buf4 into c; DOLIT32 does eight words. */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* Word-at-a-time CRC; crc32() selects it when the host is little-endian. */
+local unsigned long crc32_little(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = (u4)crc;
+    c = ~c;     /* NOTE(review): inverts, unlike the byte-wise loop in crc32() */
+    while (len && ((ptrdiff_t)buf & 3)) {       /* bytes up to 4-byte alignment */
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)buf;
+    while (len >= 32) {
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {                               /* remaining tail bytes */
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* DOBIG4 folds one 32-bit word (pre-incrementing buf4) into c; DOBIG32: eight. */
+#define DOBIG4 c ^= *++buf4; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* Word-at-a-time CRC; crc32() selects it when the host is not little-endian. */
+local unsigned long crc32_big(crc, buf, len)
+    unsigned long crc;
+    const unsigned char FAR *buf;
+    unsigned len;
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = REV((u4)crc);   /* work on the byte-reversed register */
+    c = ~c;     /* NOTE(review): inverts, unlike the byte-wise loop in crc32() */
+    while (len && ((ptrdiff_t)buf & 3)) {       /* bytes up to 4-byte alignment */
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)buf;
+    buf4--;             /* DOBIG4 pre-increments, so start one word back */
+    while (len >= 32) {
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOBIG4;
+        len -= 4;
+    }
+    buf4++;             /* step forward to the first unread word */
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {                               /* remaining tail bytes */
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
diff --git a/crc32.h b/crc32.h
new file mode 100644 (file)
index 0000000..8053b61
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,441 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+  {
+    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+    0x2d02ef8dUL
+#ifdef BYFOUR
+  },
+  {
+    0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+    0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+    0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+    0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+    0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+    0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+    0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+    0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+    0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+    0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+    0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+    0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+    0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+    0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+    0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+    0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+    0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+    0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+    0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+    0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+    0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+    0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+    0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+    0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+    0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+    0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+    0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+    0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+    0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+    0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+    0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+    0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+    0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+    0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+    0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+    0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+    0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+    0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+    0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+    0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+    0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+    0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+    0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+    0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+    0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+    0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+    0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+    0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+    0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+    0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+    0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+    0x9324fd72UL
+  },
+  {
+    0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+    0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+    0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+    0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+    0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+    0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+    0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+    0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+    0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+    0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+    0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+    0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+    0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+    0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+    0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+    0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+    0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+    0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+    0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+    0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+    0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+    0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+    0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+    0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+    0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+    0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+    0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+    0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+    0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+    0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+    0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+    0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+    0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+    0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+    0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+    0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+    0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+    0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+    0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+    0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+    0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+    0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+    0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+    0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+    0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+    0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+    0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+    0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+    0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+    0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+    0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+    0xbe9834edUL
+  },
+  {
+    0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+    0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+    0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+    0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+    0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+    0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+    0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+    0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+    0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+    0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+    0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+    0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+    0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+    0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+    0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+    0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+    0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+    0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+    0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+    0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+    0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+    0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+    0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+    0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+    0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+    0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+    0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+    0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+    0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+    0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+    0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+    0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+    0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+    0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+    0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+    0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+    0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+    0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+    0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+    0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+    0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+    0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+    0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+    0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+    0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+    0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+    0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+    0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+    0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+    0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+    0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+    0xde0506f1UL
+  },
+  {
+    0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+    0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+    0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+    0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+    0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+    0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+    0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+    0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+    0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+    0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+    0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+    0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+    0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+    0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+    0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+    0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+    0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+    0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+    0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+    0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+    0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+    0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+    0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+    0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+    0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+    0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+    0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+    0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+    0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+    0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+    0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+    0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+    0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+    0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+    0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+    0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+    0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+    0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+    0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+    0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+    0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+    0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+    0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+    0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+    0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+    0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+    0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+    0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+    0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+    0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+    0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+    0x8def022dUL
+  },
+  {
+    0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+    0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+    0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+    0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+    0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+    0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+    0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+    0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+    0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+    0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+    0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+    0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+    0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+    0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+    0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+    0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+    0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+    0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+    0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+    0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+    0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+    0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+    0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+    0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+    0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+    0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+    0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+    0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+    0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+    0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+    0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+    0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+    0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+    0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+    0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+    0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+    0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+    0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+    0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+    0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+    0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+    0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+    0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+    0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+    0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+    0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+    0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+    0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+    0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+    0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+    0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+    0x72fd2493UL
+  },
+  {
+    0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+    0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+    0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+    0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+    0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+    0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+    0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+    0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+    0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+    0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+    0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+    0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+    0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+    0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+    0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+    0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+    0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+    0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+    0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+    0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+    0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+    0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+    0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+    0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+    0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+    0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+    0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+    0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+    0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+    0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+    0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+    0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+    0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+    0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+    0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+    0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+    0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+    0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+    0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+    0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+    0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+    0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+    0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+    0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+    0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+    0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+    0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+    0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+    0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+    0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+    0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+    0xed3498beUL
+  },
+  {
+    0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+    0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+    0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+    0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+    0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+    0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+    0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+    0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+    0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+    0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+    0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+    0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+    0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+    0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+    0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+    0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+    0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+    0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+    0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+    0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+    0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+    0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+    0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+    0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+    0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+    0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+    0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+    0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+    0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+    0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+    0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+    0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+    0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+    0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+    0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+    0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+    0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+    0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+    0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+    0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+    0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+    0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+    0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+    0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+    0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+    0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+    0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+    0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+    0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+    0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+    0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+    0xf10605deUL
+#endif
+  }
+};
index 8c0a8d1894ce01ccf5e90488b28ed1486a5ede2c..1bf9550d51dd886f6039a15325373587d7e82e00 100755 (executable)
--- a/inventory
+++ b/inventory
@@ -22,6 +22,7 @@ ANNOUNCE-2.6.4
 ANNOUNCE-2.6.5
 ANNOUNCE-2.6.6
 ANNOUNCE-2.6.7
+ANNOUNCE-3.0-devel1
 Assemble.c
 bitmap.c
 bitmap.h
@@ -29,6 +30,8 @@ Build.c
 ChangeLog
 config.c
 COPYING
+crc32.c
+crc32.h
 Create.c
 Detail.c
 dlink.c
@@ -42,10 +45,12 @@ inventory
 kernel-patch-2.6.18
 kernel-patch-2.6.18.6
 kernel-patch-2.6.19
+kernel-patch-2.6.25
 Kill.c
 makedist
 Makefile
 Manage.c
+managemon.c
 mapfile.c
 md.4
 md5.h
@@ -57,6 +62,8 @@ mdadm.h
 mdadm.spec
 mdassemble.8
 mdassemble.c
+mdmon.c
+mdmon.h
 mdopen.c
 md_p.h
 mdstat.c
@@ -64,17 +71,23 @@ md_u.h
 misc/
 misc/syslog-events
 mkinitramfs
+monitor.c
 Monitor.c
+msg.c
+msg.h
 pwgr.c
 Query.c
 raid5extend.c
 ReadMe.c
 README.initramfs
 restripe.c
+sg_io.c
 sha1.c
 sha1.h
 super0.c
 super1.c
+super-ddf.c
+super-intel.c
 swap_super.c
 sysfs.c
 test
@@ -120,6 +133,7 @@ tests/06update-uuid
 tests/06wrmostly
 tests/07autoassemble
 tests/07autodetect
+tests/07reshape5intr
 tests/07testreshape5
 tests/check
 tests/testdev
diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25
new file mode 100644 (file)
index 0000000..2329007
--- /dev/null
@@ -0,0 +1,199 @@
+Status: ok
+
+Support adding a spare to a live md array with external metadata.
+
+i.e. extend the 'md/dev-XXX/slot' attribute so that you can
+tell a device to fill a vacant slot in an md array.
+
+
+Signed-off-by: Neil Brown <neilb@suse.de>
+
+### Diffstat output
+ ./drivers/md/md.c        |   44 ++++++++++++++++++++++++++++++++++++++++----
+ ./drivers/md/multipath.c |    7 ++++++-
+ ./drivers/md/raid1.c     |    7 ++++++-
+ ./drivers/md/raid10.c    |   10 ++++++++--
+ ./drivers/md/raid5.c     |   10 ++++++++--
+ 5 files changed, 68 insertions(+), 10 deletions(-)
+
+diff .prev/drivers/md/md.c ./drivers/md/md.c
+--- .prev/drivers/md/md.c      2008-06-05 09:19:56.000000000 +1000
++++ ./drivers/md/md.c  2008-06-10 10:41:21.000000000 +1000
+@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char 
+               slot = -1;
+       else if (e==buf || (*e && *e!= '\n'))
+               return -EINVAL;
+-      if (rdev->mddev->pers) {
++      if (rdev->mddev->pers && slot == -1) {
+               /* Setting 'slot' on an active array requires also
+                * updating the 'rd%d' link, and communicating
+                * with the personality with ->hot_*_disk.
+@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char 
+                * failed/spare devices.  This normally happens automatically,
+                * but not when the metadata is externally managed.
+                */
+-              if (slot != -1)
+-                      return -EBUSY;
+               if (rdev->raid_disk == -1)
+                       return -EEXIST;
+               /* personality does all needed checks */
+@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char 
+               sysfs_remove_link(&rdev->mddev->kobj, nm);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
++      } else if (rdev->mddev->pers) {
++              mdk_rdev_t *rdev2;
++              struct list_head *tmp;
++              /* Activating a spare .. or possibly reactivating
++               * if we ever get bitmaps working here.
++               */
++
++              if (rdev->raid_disk != -1)
++                      return -EBUSY;
++
++              if (rdev->mddev->pers->hot_add_disk == NULL)
++                      return -EINVAL;
++
++              rdev_for_each(rdev2, tmp, rdev->mddev)
++                      if (rdev2->raid_disk == slot)
++                              return -EEXIST;
++
++              rdev->raid_disk = slot;
++              if (test_bit(In_sync, &rdev->flags))
++                      rdev->saved_raid_disk = slot;
++              else
++                      rdev->saved_raid_disk = -1;
++              err = rdev->mddev->pers->
++                      hot_add_disk(rdev->mddev, rdev);
++              if (err != 1) {
++                      rdev->raid_disk = -1;
++                      if (err == 0)
++                              return -EEXIST;
++                      return err;
++              }
++              sprintf(nm, "rd%d", rdev->raid_disk);
++              if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
++                      printk(KERN_WARNING
++                             "md: cannot register "
++                             "%s for %s\n",
++                             nm, mdname(rdev->mddev));
++
++              /* don't wakeup anyone, leave that to userspace. */
+       } else {
+               if (slot >= rdev->mddev->raid_disks)
+                       return -ENOSPC;
+@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev,
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+                       err = mddev->pers->hot_add_disk(mddev, rdev);
+-                      if (err)
++                      if (err < 0)
+                               unbind_rdev_from_array(rdev);
+               }
+               if (err)
+
+diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c
+--- .prev/drivers/md/multipath.c       2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/multipath.c   2008-06-10 10:35:03.000000000 +1000
+@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m
+       int found = 0;
+       int path;
+       struct multipath_info *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
++
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
+       print_multipath_conf(conf);
+-      for (path=0; path<mddev->raid_disks; path++) 
++      for (path = first; path <= last; path++)
+               if ((p=conf->multipaths+path)->rdev == NULL) {
+                       q = rdev->bdev->bd_disk->queue;
+                       blk_queue_stack_limits(mddev->queue, q);
+
+diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c
+--- .prev/drivers/md/raid10.c  2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid10.c      2008-06-10 10:28:53.000000000 +1000
+@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde
+       int found = 0;
+       int mirror;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+       if (mddev->recovery_cp < MaxSector)
+               /* only hot-add to in-sync arrays, as recovery is
+@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde
+       if (!enough(conf))
+               return 0;
++      if (rdev->raid_disk)
++              first = last = rdev->raid_disk;
++
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+               mirror = rdev->saved_raid_disk;
+       else
+-              mirror = 0;
+-      for ( ; mirror < mddev->raid_disks; mirror++)
++              mirror = first;
++      for ( ; mirror <= last ; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
+--- .prev/drivers/md/raid1.c   2008-05-30 14:49:31.000000000 +1000
++++ ./drivers/md/raid1.c       2008-06-10 10:41:00.000000000 +1000
+@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev
+       int found = 0;
+       int mirror = 0;
+       mirror_info_t *p;
++      int first = 0;
++      int last = mddev->raid_disks - 1;
+-      for (mirror=0; mirror < mddev->raid_disks; mirror++)
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
++      for (mirror = first; mirror <= last; mirror++)
+               if ( !(p=conf->mirrors+mirror)->rdev) {
+                       blk_queue_stack_limits(mddev->queue,
+
+diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c
+--- .prev/drivers/md/raid5.c   2008-05-30 14:49:35.000000000 +1000
++++ ./drivers/md/raid5.c       2008-06-10 10:27:51.000000000 +1000
+@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev
+       int found = 0;
+       int disk;
+       struct disk_info *p;
++      int first = 0;
++      int last = conf->raid_disks - 1;
+       if (mddev->degraded > conf->max_degraded)
+               /* no point adding a device */
+               return 0;
++      if (rdev->raid_disk >= 0)
++              first = last = rdev->raid_disk;
++
+       /*
+        * find the disk ... but prefer rdev->saved_raid_disk
+        * if possible.
+        */
+       if (rdev->saved_raid_disk >= 0 &&
++          rdev->saved_raid_disk >= first &&
+           conf->disks[rdev->saved_raid_disk].rdev == NULL)
+               disk = rdev->saved_raid_disk;
+       else
+-              disk = 0;
+-      for ( ; disk < conf->raid_disks; disk++)
++              disk = first;
++      for ( ; disk <= last ; disk++)
+               if ((p=conf->disks + disk)->rdev == NULL) {
+                       clear_bit(In_sync, &rdev->flags);
+                       rdev->raid_disk = disk;
diff --git a/kernel-patch-2.6.27 b/kernel-patch-2.6.27
new file mode 100644 (file)
index 0000000..8d0785d
--- /dev/null
@@ -0,0 +1,36 @@
+touch_mnt_namespace when the mount flags change
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+Daemons that need to be launched while the rootfs is read-only can now
+poll /proc/mounts to be notified when their O_RDWR requests may no
+longer end in EROFS.
+
+Cc: Kay Sievers <kay.sievers@vrfy.org>
+Cc: Neil Brown <neilb@suse.de>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+---
+
+ fs/namespace.c |    7 ++++++-
+ 1 files changed, 6 insertions(+), 1 deletions(-)
+
+
+diff --git a/fs/namespace.c b/fs/namespace.c
+index 6e283c9..1bd5ba2 100644
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+       if (!err)
+               nd->path.mnt->mnt_flags = mnt_flags;
+       up_write(&sb->s_umount);
+-      if (!err)
++      if (!err) {
+               security_sb_post_remount(nd->path.mnt, flags, data);
++
++              spin_lock(&vfsmount_lock);
++              touch_mnt_namespace(nd->path.mnt->mnt_ns);
++              spin_unlock(&vfsmount_lock);
++      }
+       return err;
+ }
diff --git a/managemon.c b/managemon.c
new file mode 100644 (file)
index 0000000..c9b054f
--- /dev/null
@@ -0,0 +1,676 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2008 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ *   Allocate the data structures and open the files.
+ *
+ *   For this we watch /proc/mdstat and find new arrays with
+ *   metadata type that confirms sharing. e.g. "md4"
+ *   When we find a new array we slip it into the list of
+ *   arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ *   and opening new files.
+ *
+ *   These come as a change to raid_disks.  We allocate a new
+ *   version of the data structures and slip it into the list.
+ *   'monitor' will notice and release the old version.
+ *   Changes to level, chunksize, layout.. do not need re-allocation.
+ *   Reductions in raid_disks don't really either, but we handle
+ *   them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ *   as a spare.
+ *
+ * - Deal with degraded array
+ *    We only do this when first noticing the array is degraded.
+ *    This can be when we first see the array, when sync completes or
+ *    when recovery completes.
+ *
+ *    Check if number of failed devices suggests recovery is needed, and
+ *    skip if not.
+ *    Ask metadata to allocate a spare device
+ *    Add device as not in_sync and give a role
+ *    Update metadata.
+ *    Open sysfs files and pass to monitor.
+ *    Make sure that monitor Starts recovery....
+ *
+ * - Pass on metadata updates from external programs such as
+ *   mdadm creating a new array.
+ *
+ *   This is most-messy.
+ *   It might involve adding a new array or changing the status of
+ *   a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ *   The required updates are received via a named pipe.  There will
+ *   be one named pipe for each container. Each message contains a
+ *   sync marker: 0x5a5aa5a5, A byte count, and the message.  This is
+ *   passed to the metadata handler which will interpret and process it.
+ *   For 'DDF', messages are internal data blocks with a leading
+ *   'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated versions of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata.  Maybe we require
+ * the metadata to be mdX/NN  when NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ *   metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ *   and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays.  This shouldn't
+ *   happen, as we should do all the adding.  Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc.  Update metadata and
+ *   start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include       "mdadm.h"
+#include       "mdmon.h"
+#include       <sys/syscall.h>
+#include       <sys/socket.h>
+#include       <signal.h>
+
+/*
+ * Close every sysfs file descriptor held by an active array: the
+ * per-device 'state' fds and the array-level sync_action, array_state,
+ * resync_start and metadata_version fds.  The structure itself is
+ * left allocated; see free_aa().
+ */
+static void close_aa(struct active_array *aa)
+{
+       struct mdinfo *d;
+
+       for (d = aa->info.devs; d; d = d->next)
+               close(d->state_fd);
+
+       close(aa->action_fd);
+       close(aa->info.state_fd);
+       close(aa->resync_start_fd);
+       /* metadata_fd is opened in manage_new() together with the fds
+        * above, so it must be released here too or it leaks every
+        * time an array is discarded.
+        */
+       close(aa->metadata_fd);
+}
+
+/*
+ * Release an active_array together with its list of devices.
+ * The file descriptors are only closed when ->container is unset:
+ * a clone has ->container set and is still using the same fds.
+ */
+static void free_aa(struct active_array *aa)
+{
+       struct mdinfo *dev, *following;
+
+       dprintf("%s: devnum: %d\n", __func__, aa->devnum);
+       if (aa->container == NULL)
+               close_aa(aa);
+       for (dev = aa->info.devs; dev; dev = following) {
+               following = dev->next;
+               free(dev);
+       }
+       free(aa);
+}
+
+/*
+ * Build a private copy of an active_array.
+ * The copy is detached from any list (->next, ->replaces and
+ * ->info.next are cleared) and gets its own device list holding a
+ * copy of each device whose state_fd is valid; devices with a
+ * negative state_fd are left out.  File descriptor numbers are
+ * copied, not duplicated.
+ */
+static struct active_array *duplicate_aa(struct active_array *aa)
+{
+       struct active_array *copy = malloc(sizeof(*copy));
+       struct mdinfo *src;
+       struct mdinfo **tail;
+
+       *copy = *aa;
+       copy->next = NULL;
+       copy->replaces = NULL;
+       copy->info.next = NULL;
+
+       /* 'tail' always points at the link that receives the next copy */
+       tail = &copy->info.devs;
+       for (src = aa->info.devs; src; src = src->next) {
+               struct mdinfo *dup;
+
+               if (src->state_fd >= 0) {
+                       dup = malloc(sizeof(*dup));
+                       *dup = *src;
+                       *tail = dup;
+                       tail = &dup->next;
+               }
+       }
+       *tail = NULL;
+
+       return copy;
+}
+
+/*
+ * Send SIGUSR1 to the task identified by mon_tid within this process.
+ * Equivalent to tgkill(getpid(), mon_tid, SIGUSR1), issued through
+ * syscall() directly.
+ */
+static void wakeup_monitor(void)
+{
+       syscall(SYS_tgkill, getpid(), mon_tid, SIGUSR1);
+}
+
+/*
+ * If an array has been left on 'discard_this', free it, forget it as
+ * the pending discard if it was that one, and wake the monitor.
+ * Does nothing when 'discard_this' is empty.
+ */
+static void remove_old(void)
+{
+       if (discard_this == NULL)
+               return;
+
+       discard_this->next = NULL;
+       free_aa(discard_this);
+       if (discard_this == pending_discard)
+               pending_discard = NULL;
+       discard_this = NULL;
+       wakeup_monitor();
+}
+
+/*
+ * Insert 'new' at the head of container->arrays as the replacement
+ * for 'old', then wake the monitor.
+ *
+ * 'old' may be NULL (manage_new() passes its 'victim' straight
+ * through); in that case ->replaces and pending_discard are simply
+ * set to NULL.
+ *
+ * Only one replacement is outstanding at a time: any earlier pending
+ * discard is waited for and freed before this one is recorded.
+ */
+static void replace_array(struct supertype *container,
+                         struct active_array *old,
+                         struct active_array *new)
+{
+       /* To replace an array, we add it to the top of the list
+        * marked with ->replaces to point to the original.
+        * 'monitor' will take the original out of the list
+        * and put it on 'discard_this'.  We take it from there
+        * and discard it.
+        */
+       remove_old();
+       /* Wait (polling once a second) until the previous replacement
+        * has been handed back on 'discard_this' and freed.
+        */
+       while (pending_discard) {
+               while (discard_this == NULL)
+                       sleep(1);
+               remove_old();
+       }
+       pending_discard = old;
+       new->replaces = old;
+       /* link 'new' fully before publishing it as the list head */
+       new->next = container->arrays;
+       container->arrays = new;
+       wakeup_monitor();
+}
+
+/*
+ * Metadata update queues.
+ *  update_queue_pending - updates collected by the manager; new ones
+ *                         are appended by queue_metadata_update().
+ *  update_queue         - the batch currently published;
+ *                         check_update_queue() moves the pending list
+ *                         here when this is empty and wakes the monitor.
+ *  update_queue_handled - entries waiting to be freed by
+ *                         check_update_queue().
+ * NOTE(review): the code that empties update_queue and fills
+ * update_queue_handled is not in this file - presumably the monitor.
+ */
+struct metadata_update *update_queue = NULL;
+struct metadata_update *update_queue_handled = NULL;
+struct metadata_update *update_queue_pending = NULL;
+
+/*
+ * Housekeeping for the metadata update queues:
+ *  - free every entry (and its buffers) on update_queue_handled;
+ *  - if nothing is currently published on update_queue and there are
+ *    pending updates, publish them and wake the monitor.
+ */
+void check_update_queue(struct supertype *container)
+{
+       struct metadata_update *done;
+
+       while ((done = update_queue_handled) != NULL) {
+               update_queue_handled = done->next;
+               free(done->buf);
+               free(done->space);
+               free(done);
+       }
+
+       if (update_queue != NULL)
+               return;
+       if (update_queue_pending == NULL)
+               return;
+
+       update_queue = update_queue_pending;
+       update_queue_pending = NULL;
+       wakeup_monitor();
+}
+
+/*
+ * Append 'mu' (a single update or an already linked chain; NULL is
+ * harmless) to the end of update_queue_pending.
+ */
+static void queue_metadata_update(struct metadata_update *mu)
+{
+       struct metadata_update **slot = &update_queue_pending;
+
+       /* walk to the terminating NULL link */
+       for (; *slot != NULL; slot = &(*slot)->next)
+               ;
+       *slot = mu;
+}
+
+/*
+ * Record a newly seen device in the container's metadata.
+ *
+ * st - the container
+ * sd - the device to add, identified by sd->disk.major/minor
+ *
+ * The device is opened read/write and passed to the metadata
+ * handler's add_to_super() with no slot assigned (number and
+ * raid_disk are -1).  Any metadata updates produced while
+ * st->update_tail is set are appended to the pending update queue.
+ * If the device cannot be opened nothing is done.
+ */
+static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
+{
+       int dfd;
+       /* "major:minor" - two ints need up to 11 characters each */
+       char nm[32];
+       struct metadata_update *update = NULL;
+       mdu_disk_info_t dk = {
+               .number = -1,
+               .major = sd->disk.major,
+               .minor = sd->disk.minor,
+               .raid_disk = -1,
+               .state = 0,
+       };
+
+       dprintf("%s: add %d:%d to container\n",
+               __func__, sd->disk.major, sd->disk.minor);
+
+       snprintf(nm, sizeof(nm), "%d:%d", sd->disk.major, sd->disk.minor);
+       dfd = dev_open(nm, O_RDWR);
+       if (dfd < 0)
+               return;
+
+       /* NOTE(review): dfd is not closed here - presumably
+        * add_to_super() keeps it; confirm against the handlers.
+        */
+       st->update_tail = &update;
+       st->ss->add_to_super(st, &dk, dfd, NULL);
+       st->ss->write_init_super(st);
+       queue_metadata_update(update);
+       st->update_tail = NULL;
+}
+
+/*
+ * Bring our view of the container's devices up to date.
+ *
+ * Nothing is done unless the device count reported by mdstat differs
+ * from the one we remembered.  Otherwise the current device list is
+ * read from sysfs (.../md/dev-??/block/dev) and compared, by
+ * major:minor, with container->devs:
+ *  - entries no longer present are unlinked and freed;
+ *  - devices not yet known are passed to add_disk_to_container(),
+ *    ignoring any metadata already on them.
+ * FIXME should we look for compatible metadata and take hints
+ * about spare assignment.... probably not.
+ */
+static void manage_container(struct mdstat_ent *mdstat,
+                            struct supertype *container)
+{
+       struct mdinfo **link, *known, *seen, *sysinfo;
+
+       if (mdstat->devcnt == container->devcnt)
+               return;
+
+       sysinfo = sysfs_read(-1, mdstat->devnum, GET_DEVS);
+       if (sysinfo == NULL) {
+               /* invalidate the current count so we can try again */
+               container->devcnt = -1;
+               return;
+       }
+
+       /* removals: drop every known device sysfs no longer lists */
+       link = &container->devs;
+       while (*link) {
+               for (seen = sysinfo->devs; seen; seen = seen->next)
+                       if (seen->disk.major == (*link)->disk.major &&
+                           seen->disk.minor == (*link)->disk.minor)
+                               break;
+               if (seen) {
+                       /* still present, step over it */
+                       link = &(*link)->next;
+                       continue;
+               }
+               known = *link;
+               *link = known->next;
+               free(known);
+       }
+
+       /* additions: hand over every listed device we do not know */
+       for (seen = sysinfo->devs; seen; seen = seen->next) {
+               for (known = container->devs; known; known = known->next)
+                       if (seen->disk.major == known->disk.major &&
+                           seen->disk.minor == known->disk.minor)
+                               break;
+               if (known == NULL)
+                       add_disk_to_container(container, seen);
+       }
+
+       sysfs_free(sysinfo);
+       container->devcnt = mdstat->devcnt;
+}
+
+/*
+ * Update a monitored member array from a fresh mdstat entry and, if
+ * the array has been flagged via ->check_degraded, try to bring in
+ * spare devices.
+ *
+ * mdstat - the mdstat entry for this array
+ * a      - the active array being managed
+ */
+static void manage_member(struct mdstat_ent *mdstat,
+                         struct active_array *a)
+{
+       /* Compare mdstat info with known state of member array.
+        * We do not need to look for device state changes here, that
+        * is dealt with by the monitor.
+        *
+        * We just look for changes which suggest that a reshape is
+        * being requested.
+        * Unfortunately decreases in raid_disks don't show up in
+        * mdstat until the reshape completes FIXME.
+        *
+        * Actually, we also want to handle degraded arrays here by
+        * trying to find and assign a spare.
+        * We do that whenever the monitor tells us to.
+        */
+       // FIXME
+       a->info.array.raid_disks = mdstat->raid_disks;
+       a->info.array.chunk_size = mdstat->chunk_size;
+       // MORE
+
+       if (a->check_degraded) {
+               struct metadata_update *updates = NULL;
+               struct mdinfo *newdev;
+               struct active_array *newa;
+
+               /* clear the request before acting on it */
+               a->check_degraded = 0;
+
+               /* The array may not be degraded, this is just a good time
+                * to check.
+                */
+               newdev = a->container->ss->activate_spare(a, &updates);
+               if (newdev) {
+                       struct mdinfo *d;
+                       /* Cool, we can add a device or several. */
+                       /* NOTE(review): neither this copy nor the
+                        * malloc() below is checked for failure.
+                        */
+                       newa = duplicate_aa(a);
+                       /* suspend recovery - maybe not needed */
+
+                       /* Add device to array and set offset/size/slot.
+                        * and open files for each newdev */
+                       for (d = newdev; d ; d = d->next) {
+                               struct mdinfo *newd;
+                               /* skip devices that cannot be added */
+                               if (sysfs_add_disk(&newa->info, d) < 0)
+                                       continue;
+                               /* the new array gets its own copy of the
+                                * entry, pushed on the front of its list */
+                               newd = malloc(sizeof(*newd));
+                               *newd = *d;
+                               newd->next = newa->info.devs;
+                               newa->info.devs = newd;
+
+                               newd->state_fd = sysfs_open(a->devnum,
+                                                           newd->sys_name,
+                                                           "state");
+                               newd->prev_state
+                                       = read_dev_state(newd->state_fd);
+                               newd->curr_state = newd->prev_state;
+                       }
+                       /* NOTE(review): the 'newdev' list itself is not
+                        * freed here - confirm who owns it.
+                        */
+                       queue_metadata_update(updates);
+                       /* swap in the extended copy, then start recovery */
+                       replace_array(a->container, a, newa);
+                       sysfs_set_str(&a->info, NULL, "sync_action", "recover");
+               }
+       }
+}
+
+/*
+ * Check that a freshly built active_array can be monitored.
+ * Returns 1 only if every device has a valid state_fd, the array
+ * state fd is valid, arrays with level > 0 also have valid
+ * sync_action and resync_start fds, and ->container is set.
+ * Returns 0 otherwise.
+ */
+static int aa_ready(struct active_array *aa)
+{
+       struct mdinfo *dev = aa->info.devs;
+
+       while (dev) {
+               if (dev->state_fd < 0)
+                       return 0;
+               dev = dev->next;
+       }
+
+       if (aa->info.state_fd < 0)
+               return 0;
+
+       if (aa->info.array.level > 0 &&
+           (aa->action_fd < 0 || aa->resync_start_fd < 0))
+               return 0;
+
+       return aa->container != NULL;
+}
+
+/*
+ * Start monitoring an array that has appeared in this container.
+ *
+ * mdstat    - mdstat entry of the new array
+ * container - the container it belongs to
+ * victim    - an existing entry for the same array that is to be
+ *             replaced, or NULL if there is none
+ *
+ * The array is described from sysfs, one device entry is created for
+ * each occupied slot, and the sysfs attribute files are opened.  If
+ * everything checks out and the metadata handler accepts the array
+ * (open_new), it is slipped into the container's list; otherwise it
+ * is freed again and a message is printed.
+ */
+static void manage_new(struct mdstat_ent *mdstat,
+                      struct supertype *container,
+                      struct active_array *victim)
+{
+       /* A new array has appeared in this container.
+        * Hopefully it is already recorded in the metadata.
+        * Check, then create the new array to report it to
+        * the monitor.
+        */
+
+       struct active_array *new;
+       struct mdinfo *mdi, *di;
+       char *inst;
+       int i;
+       int failed = 0;
+
+       /* check if array is ready to be monitored */
+       if (!mdstat->active)
+               return;
+
+       mdi = sysfs_read(-1, mdstat->devnum,
+                        GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
+                        GET_DEGRADED|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+
+       new = malloc(sizeof(*new));
+
+       if (!new || !mdi) {
+               if (mdi)
+                       sysfs_free(mdi);
+               free(new);
+               return;
+       }
+       memset(new, 0, sizeof(*new));
+
+       new->devnum = mdstat->devnum;
+       strcpy(new->info.sys_name, devnum2devname(new->devnum));
+
+       new->prev_state = new->curr_state = new->next_state = inactive;
+       new->prev_action= new->curr_action= new->next_action= idle;
+
+       new->container = container;
+
+       /* the instance name follows the container name in the
+        * metadata_version string
+        */
+       inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
+
+       new->info.array = mdi->array;
+       new->info.component_size = mdi->component_size;
+
+       for (i = 0; i < new->info.array.raid_disks; i++) {
+               struct mdinfo *newd;
+
+               for (di = mdi->devs; di; di = di->next)
+                       if (i == di->disk.raid_disk)
+                               break;
+
+               if (!di) {
+                       /* Slot 'i' is empty.  That is acceptable only as
+                        * long as we have not seen more empty slots than
+                        * the array reports failed disks.
+                        */
+                       if (failed + 1 > new->info.array.failed_disks) {
+                               /* we cannot properly monitor without all working disks */
+                               new->container = NULL;
+                               break;
+                       }
+                       failed++;
+                       continue;
+               }
+
+               /* Only allocate once we know the entry will be linked
+                * in, so nothing is leaked on the paths above.
+                */
+               newd = malloc(sizeof(*newd));
+               if (!newd) {
+                       /* treat like any other setup failure: aa_ready()
+                        * rejects an array without a container
+                        */
+                       new->container = NULL;
+                       break;
+               }
+               memcpy(newd, di, sizeof(*newd));
+
+               /* open 'state' under the name read from sysfs before
+                * sys_name is replaced by the slot name below
+                */
+               newd->state_fd = sysfs_open(new->devnum,
+                                           newd->sys_name,
+                                           "state");
+
+               newd->prev_state = read_dev_state(newd->state_fd);
+               newd->curr_state = newd->prev_state;
+
+               sprintf(newd->sys_name, "rd%d", i);
+               newd->next = new->info.devs;
+               new->info.devs = newd;
+       }
+
+       new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
+       new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
+       new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
+       new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version");
+       get_resync_start(new);
+       dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
+               new->action_fd, new->info.state_fd);
+
+       sysfs_free(mdi);
+
+       /* if everything checks out tell the metadata handler we want to
+        * manage this instance
+        */
+       if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) {
+               fprintf(stderr, "mdmon: failed to monitor %s\n",
+                       mdstat->metadata_version);
+               /* with no container set, free_aa() also closes the fds */
+               new->container = NULL;
+               free_aa(new);
+       } else {
+               replace_array(container, victim, new);
+               if (failed) {
+                       /* slots are missing: look for a spare right away */
+                       new->check_degraded = 1;
+                       manage_member(mdstat, new);
+               }
+       }
+}
+
+/*
+ * Compare a freshly read mdstat list with what we know about this
+ * container:
+ *  - the entry for the container itself goes to manage_container();
+ *  - entries that are not members of this container are ignored;
+ *  - a member we already monitor goes to manage_member();
+ *  - a member we do not know, or whose entry has no ->container,
+ *    goes to manage_new().
+ */
+void manage(struct mdstat_ent *mdstat, struct supertype *container)
+{
+       struct mdstat_ent *ent;
+
+       for (ent = mdstat; ent; ent = ent->next) {
+               struct active_array *aa;
+
+               if (ent->devnum == container->devnum) {
+                       manage_container(ent, container);
+                       continue;
+               }
+               if (!is_container_member(ent, container->devname))
+                       /* Not for this array */
+                       continue;
+
+               /* Looks like a member of this container: find our entry */
+               aa = container->arrays;
+               while (aa && aa->devnum != ent->devnum)
+                       aa = aa->next;
+
+               if (aa && aa->container)
+                       manage_member(ent, aa);
+               if (aa == NULL || !aa->container)
+                       manage_new(ent, container, aa);
+       }
+}
+
+/*
+ * Act on one message received from the control socket.
+ *
+ *  len == 0   ping_monitor: wait until the monitor loop counter has
+ *             advanced far enough.
+ *  len == -1  ping_manager: re-read mdstat and run manage() once.
+ *  otherwise  a metadata update: take over msg->buf and queue it,
+ *             unless 'sigterm' is set.
+ *
+ * For both kinds of ping the update queues are drained first.
+ */
+static void handle_message(struct supertype *container, struct metadata_update *msg)
+{
+       /* queue this metadata update through to the monitor */
+
+       struct metadata_update *mu;
+
+       /* a ping must not return while updates are still outstanding:
+        * poll every 15ms until both queues are empty
+        */
+       if (msg->len <= 0)
+               while (update_queue_pending || update_queue) {
+                       check_update_queue(container);
+                       usleep(15*1000);
+               }
+
+       if (msg->len == 0) { /* ping_monitor */
+               int cnt;
+               
+               /* choose the counter value to wait for, based on
+                * whether the current count is odd or even
+                */
+               cnt = monitor_loop_cnt;
+               if (cnt & 1)
+                       cnt += 2; /* wait until next pselect */
+               else
+                       cnt += 3; /* wait for 2 pselects */
+               wakeup_monitor();
+
+               /* compare via the signed difference, polling every 10ms */
+               while (monitor_loop_cnt - cnt < 0)
+                       usleep(10 * 1000);
+       } else if (msg->len == -1) { /* ping_manager */
+               struct mdstat_ent *mdstat = mdstat_read(1, 0);
+
+               manage(mdstat, container);
+               free_mdstat(mdstat);
+       } else if (!sigterm) {
+               /* NOTE(review): malloc() result is used unchecked */
+               mu = malloc(sizeof(*mu));
+               mu->len = msg->len;
+               /* the buffer now belongs to the queued update */
+               mu->buf = msg->buf;
+               msg->buf = NULL;
+               mu->space = NULL;
+               mu->next = NULL;
+               /* optional handler hook, called before the update is queued */
+               if (container->ss->prepare_update)
+                       container->ss->prepare_update(container, mu);
+               queue_metadata_update(mu);
+       }
+}
+
+/*
+ * Accept one connection on the container's control socket and serve
+ * it: each message that is received successfully is handled and
+ * acknowledged.  The connection is closed as soon as receiving a
+ * message or sending the acknowledgement fails.
+ * Returns immediately if no connection can be accepted.
+ */
+void read_sock(struct supertype *container)
+{
+       struct metadata_update msg;
+       long flags;
+       int tmo = 3; /* 3 second timeout before hanging up the socket */
+       int fd = accept(container->sock, NULL, NULL);
+
+       if (fd < 0)
+               return;
+
+       /* switch the connection to non-blocking mode */
+       flags = fcntl(fd, F_GETFL, 0);
+       fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+
+       for (;;) {
+               msg.buf = NULL;
+
+               /* read and validate the message */
+               if (receive_message(fd, &msg, tmo) != 0)
+                       break;
+               handle_message(container, &msg);
+               if (ack(fd, tmo) < 0)
+                       break;
+       }
+
+       close(fd);
+}
+
+/* exit_now: when non-zero, do_manager() calls exit(0) at the top of
+ *   its next loop iteration.
+ * manager_ready: set to 1 by do_manager() once it has completed a
+ *   pass of its loop.
+ * NOTE(review): both are presumably accessed from outside this file;
+ * confirm where exit_now is set and manager_ready is read.
+ */
+int exit_now = 0;
+int manager_ready = 0;
+/*
+ * Main loop of the manager.  Never returns; the process exits from
+ * here once 'exit_now' is set.
+ *
+ * Each pass, provided no update batch is currently published on
+ * update_queue: read mdstat and run manage(), serve one control
+ * socket connection, and (re)create the control socket and pid file
+ * if needed.  Then discarded arrays are freed, the update queues are
+ * serviced, and the loop waits for the next event.
+ */
+void do_manager(struct supertype *container)
+{
+       struct mdstat_ent *mdstat;
+       sigset_t set;
+       int proc_fd;
+
+       /* Fetch the current signal mask (a NULL new set changes
+        * nothing) and remove the signals that should be let through
+        * while waiting; 'set' is the mask handed to the wait calls
+        * below.
+        */
+       sigprocmask(SIG_UNBLOCK, NULL, &set);
+       sigdelset(&set, SIGUSR1);
+       sigdelset(&set, SIGHUP);
+       sigdelset(&set, SIGALRM);
+       sigdelset(&set, SIGTERM);
+       /* waited on instead of the socket while there is no socket */
+       proc_fd = open("/proc/mounts", O_RDONLY);
+
+       do {
+
+               if (exit_now)
+                       exit(0);
+
+               /* Can only 'manage' things if 'monitor' is not making
+                * structural changes to metadata, so need to check
+                * update_queue
+                */
+               if (update_queue == NULL) {
+                       mdstat = mdstat_read(1, 0);
+
+                       manage(mdstat, container);
+
+                       read_sock(container);
+
+                       /* (re)create the control socket and pid file
+                        * when there is no socket or a hang-up was
+                        * requested
+                        */
+                       if (container->sock < 0 || socket_hup_requested) {
+                               close(container->sock);
+                               container->sock = make_control_sock(container->devname);
+                               make_pidfile(container->devname, 0);
+                               socket_hup_requested = 0;
+                       }
+                       /* still no socket: request SIGALRM in 30 seconds */
+                       if (container->sock < 0)
+                               alarm(30);
+
+                       free_mdstat(mdstat);
+               }
+               remove_old();
+
+               check_update_queue(container);
+
+               manager_ready = 1;
+
+               if (sigterm)
+                       wakeup_monitor();
+
+               if (update_queue == NULL) {
+                       /* wait on the control socket if we have one,
+                        * otherwise on /proc/mounts
+                        */
+                       if (container->sock < 0)
+                               mdstat_wait_fd(proc_fd, &set);
+                       else
+                               mdstat_wait_fd(container->sock, &set);
+               } else
+                       /* If an update is happening, just wait for signal */
+                       pselect(0, NULL, NULL, NULL, NULL, &set);
+       } while(1);
+}
index 746073d059c508685a3743af79b995af36b05d6a..955a1f98a243c1844b22a65d50909b42c9708c30 100644 (file)
--- a/mapfile.c
+++ b/mapfile.c
@@ -33,8 +33,8 @@
  * also allows the array device name to be easily found.
  *
  * The map file is line based with space separated fields.  The fields are:
- *  Device id  -  mdX or mdpX  where is a number.
- *  metadata   -  0.90 1.0 1.1 1.2
+ *  Device id  -  mdX or mdpX  where X is a number.
+ *  metadata   -  0.90 1.0 1.1 1.2 ddf ...
  *  UUID       -  uuid of the array
  *  path       -  path where device created: /dev/md/home
  *
@@ -62,7 +62,7 @@ int map_write(struct map_ent *mel)
                        fprintf(f, "mdp%d ", -1-mel->devnum);
                else
                        fprintf(f, "md%d ", mel->devnum);
-               fprintf(f, "%d.%d ", mel->major, mel->minor);
+               fprintf(f, "%s ", mel->metadata);
                fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0],
                        mel->uuid[1], mel->uuid[2], mel->uuid[3]);
                fprintf(f, "%s\n", mel->path);
@@ -87,13 +87,12 @@ int map_write(struct map_ent *mel)
 }
 
 void map_add(struct map_ent **melp,
-           int devnum, int major, int minor, int uuid[4], char *path)
+           int devnum, char *metadata, int uuid[4], char *path)
 {
        struct map_ent *me = malloc(sizeof(*me));
 
        me->devnum = devnum;
-       me->major = major;
-       me->minor = minor;
+       strcpy(me->metadata, metadata);
        memcpy(me->uuid, uuid, 16);
        me->path = strdup(path);
        me->next = *melp;
@@ -105,7 +104,8 @@ void map_read(struct map_ent **melp)
        FILE *f;
        char buf[8192];
        char path[200];
-       int devnum, major, minor, uuid[4];
+       int devnum, uuid[4];
+       char metadata[30];
        char nam[4];
 
        *melp = NULL;
@@ -117,12 +117,14 @@ void map_read(struct map_ent **melp)
                return;
 
        while (fgets(buf, sizeof(buf), f)) {
-               if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s",
-                          nam, &devnum, &major, &minor, uuid, uuid+1,
-                          uuid+2, uuid+3, path) == 9) {
-                       if (nam[0] == 'p')
+               if (sscanf(buf, " %3[mdp]%d %s %x:%x:%x:%x %200s",
+                          nam, &devnum, metadata, uuid, uuid+1,
+                          uuid+2, uuid+3, path) == 8) {
+                       if (strncmp(nam, "md", 2) != 0)
+                               continue;
+                       if (nam[2] == 'p')
                                devnum = -1 - devnum;
-                       map_add(melp, devnum, major, minor, uuid, path);
+                       map_add(melp, devnum, metadata, uuid, path);
                }
        }
        fclose(f);
@@ -138,7 +140,7 @@ void map_free(struct map_ent *map)
        }
 }
 
-int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+int map_update(struct map_ent **mpp, int devnum, char *metadata,
               int *uuid, char *path)
 {
        struct map_ent *map, *mp;
@@ -151,15 +153,14 @@ int map_update(struct map_ent **mpp, int devnum, int major, int minor,
 
        for (mp = map ; mp ; mp=mp->next)
                if (mp->devnum == devnum) {
-                       mp->major = major;
-                       mp->minor = minor;
+                       strcpy(mp->metadata, metadata);
                        memcpy(mp->uuid, uuid, 16);
                        free(mp->path);
                        mp->path = strdup(path);
                        break;
                }
        if (!mp)
-               map_add(&map, devnum, major, minor, uuid, path);
+               map_add(&map, devnum, metadata, uuid, path);
        *mpp = NULL;
        rv = map_write(map);
        map_free(map);
diff --git a/md.4 b/md.4
index dfd287f1f156db44c03715fa26303c0b6f6c9011..ea12eaffaf1570f040801ff518c5a62b4b655d12 100644 (file)
--- a/md.4
+++ b/md.4
@@ -526,10 +526,22 @@ Finally, "idle" can be written to stop the check/repair process.
 .B md/stripe_cache_size
 This is only available on RAID5 and RAID6.  It records the size (in
 pages per device) of the  stripe cache which is used for synchronising
-all read and write operations to the array.  The default is 128.
+all write operations to the array and all read operations if the array
+is degraded.  The default is 256.  Valid values are 17 to 32768.
 Increasing this number can increase performance in some situations, at
-some cost in system memory.
+some cost in system memory.  Note, setting this value too high can
+result in an "out of memory" condition for the system.
 
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6.  This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading".  For fairness this defaults to
+1.  Valid values are 0 to stripe_cache_size.  Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.  
 
 .SS KERNEL PARAMETERS
 
diff --git a/mdadm.8 b/mdadm.8
index 054addae82d524bf1e6752d106c9b1c3502039b3..9f62b012a6f7990101c9add4796e71dcac2390d5 100644 (file)
--- a/mdadm.8
+++ b/mdadm.8
@@ -5,7 +5,7 @@
 .\"   the Free Software Foundation; either version 2 of the License, or
 .\"   (at your option) any later version.
 .\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v2.6.7
+.TH MDADM 8 "" v3.0-devel1
 .SH NAME
 mdadm \- manage MD devices
 .I aka
@@ -1025,6 +1025,19 @@ activity to finish before returning.
 will return with success if it actually waited for every device
 listed, otherwise it will return failure.
 
+.TP
+.BR \-\-wait\-clean
+For each md device given, arrange for the array to be marked clean as
+soon as possible.  Also, quiesce resync so that the monitor for external
+metadata arrays (mdmon) has an opportunity to checkpoint the resync
+position.
+.I mdadm
+will return with success if the array uses external metadata and we
+successfully waited.  For native arrays this returns immediately, as
+the kernel itself handles both dirty-clean transitions and resync
+checkpointing at shutdown.  No action is taken if safe-mode handling is
+disabled.
+
 .SH For Incremental Assembly mode:
 .TP
 .BR \-\-rebuild\-map ", " \-r
@@ -1948,6 +1961,16 @@ that no metadata updates are made and no attempt at resync or recovery
 happens.  Further devices that are found before the first write can
 still be added safely.
 
+
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this variable to 1 will prevent mdadm from automatically launching
+mdmon.  This variable is intended primarily for debugging mdadm/mdmon.
+
 .SH EXAMPLES
 
 .B "  mdadm \-\-query /dev/name-of-device"
diff --git a/mdadm.c b/mdadm.c
index af9627d545307a4c485ec344d77247c5d941a5a4..3229de4bcbf3dca20228adedc69987ab85d56d9c 100644 (file)
--- a/mdadm.c
+++ b/mdadm.c
@@ -214,6 +214,7 @@ int main(int argc, char *argv[])
                case 'o':
                case 'w':
                case 'W':
+               case Waitclean:
                case 'K': if (!mode) newmode = MISC; break;
                }
                if (mode && newmode == mode) {
@@ -643,6 +644,7 @@ int main(int argc, char *argv[])
                        continue;
 
                case O(ASSEMBLE,'c'): /* config file */
+               case O(INCREMENTAL, 'c'):
                case O(MISC, 'c'):
                case O(MONITOR,'c'):
                        if (configfile) {
@@ -761,6 +763,7 @@ int main(int argc, char *argv[])
                case O(MISC,'o'):
                case O(MISC,'w'):
                case O(MISC,'W'):
+               case O(MISC, Waitclean):
                        if (devmode && devmode != opt &&
                            (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) {
                                fprintf(stderr, Name ": --examine/-E cannot be given with -%c\n",
@@ -1263,13 +1266,16 @@ int main(int argc, char *argv[])
                                                     export, test, homehost);
                                        continue;
                                case 'K': /* Zero superblock */
-                                       rv |= Kill(dv->devname, force, quiet); continue;
+                                       rv |= Kill(dv->devname, force, quiet,0);
+                                       continue;
                                case 'Q':
                                        rv |= Query(dv->devname); continue;
                                case 'X':
                                        rv |= ExamineBitmap(dv->devname, brief, ss); continue;
                                case 'W':
                                        rv |= Wait(dv->devname); continue;
+                               case Waitclean:
+                                       rv |= WaitClean(dv->devname, verbose-quiet); continue;
                                }
                                mdfd = open_mddev(dv->devname, 1);
                                if (mdfd>=0) {
diff --git a/mdadm.h b/mdadm.h
index bc4b38e7a5150305a1fc75086a4178f83cb60554..4b006fb6884c108871d3539a25f81d0f168af6c6 100644 (file)
--- a/mdadm.h
+++ b/mdadm.h
@@ -76,6 +76,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #include       "md_u.h"
 #include       "md_p.h"
 #include       "bitmap.h"
+#include       "msg.h"
 
 #include <endian.h>
 /* Redhat don't like to #include <asm/byteorder.h>, and
@@ -106,6 +107,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) (_x)
 #define __le32_to_cpu(_x) (_x)
 #define __le64_to_cpu(_x) (_x)
+
+#define        __cpu_to_be16(_x) bswap_16(_x)
+#define __cpu_to_be32(_x) bswap_32(_x)
+#define __cpu_to_be64(_x) bswap_64(_x)
+#define        __be16_to_cpu(_x) bswap_16(_x)
+#define __be32_to_cpu(_x) bswap_32(_x)
+#define __be64_to_cpu(_x) bswap_64(_x)
 #elif BYTE_ORDER == BIG_ENDIAN
 #define        __cpu_to_le16(_x) bswap_16(_x)
 #define __cpu_to_le32(_x) bswap_32(_x)
@@ -113,6 +121,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define        __le16_to_cpu(_x) bswap_16(_x)
 #define __le32_to_cpu(_x) bswap_32(_x)
 #define __le64_to_cpu(_x) bswap_64(_x)
+
+#define        __cpu_to_be16(_x) (_x)
+#define __cpu_to_be32(_x) (_x)
+#define __cpu_to_be64(_x) (_x)
+#define        __be16_to_cpu(_x) (_x)
+#define __be32_to_cpu(_x) (_x)
+#define __be64_to_cpu(_x) (_x)
 #else
 #  error "unknown endianness."
 #endif
@@ -128,18 +143,38 @@ struct mdinfo {
        int                     uuid[4];
        char                    name[33];
        unsigned long long      data_offset;
-       unsigned long long      component_size;
+       unsigned long long      component_size; /* same as array.size, except in
+                                                * sectors and up to 64bits.
+                                                */
        int                     reshape_active;
        unsigned long long      reshape_progress;
+       unsigned long long      resync_start;
+       unsigned long           safe_mode_delay; /* ms delay to mark clean */
        int                     new_level, delta_disks, new_layout, new_chunk;
        int                     errors;
        int                     cache_size; /* size of raid456 stripe cache*/
        int                     mismatch_cnt;
        char                    text_version[50];
 
+       int container_member; /* for assembling external-metadata arrays
+                              * This is to be used internally by metadata
+                              * handler only */
+
        char            sys_name[20];
        struct mdinfo *devs;
        struct mdinfo *next;
+
+       /* Device info for mdmon: */
+       int state_fd;
+       #define DS_FAULTY       1
+       #define DS_INSYNC       2
+       #define DS_WRITE_MOSTLY 4
+       #define DS_SPARE        8
+       #define DS_BLOCKED      16
+       #define DS_REMOVE       1024
+       #define DS_UNBLOCK      2048
+       int prev_state, curr_state, next_state;
+
 };
 
 struct createinfo {
@@ -189,6 +224,7 @@ enum special_options {
        AutoHomeHost,
        Symlinks,
        AutoDetect,
+       Waitclean,
 };
 
 /* structures read from config file */
@@ -223,6 +259,12 @@ typedef struct mddev_ident_s {
        char    *bitmap_file;
        int     bitmap_fd;
 
+       char    *container;     /* /dev/whatever name of container.  You
+                                * would expect this to be the 'devname'
+                                * of some other entry.
+                                */
+       char    *member;        /* subarray within a container */
+
        struct mddev_ident_s *next;
 } *mddev_ident_t;
 
@@ -252,22 +294,27 @@ struct mdstat_ent {
        char            *pattern; /* U or up, _ for down */
        int             percent; /* -1 if no resync */
        int             resync; /* 1 if resync, 0 if recovery */
+       int             devcnt;
+       int             raid_disks;
+       int             chunk_size;
+       char *          metadata_version;
        struct mdstat_ent *next;
 };
 
 extern struct mdstat_ent *mdstat_read(int hold, int start);
 extern void free_mdstat(struct mdstat_ent *ms);
 extern void mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
 extern int mddev_busy(int devnum);
 
 struct map_ent {
        struct map_ent *next;
        int     devnum;
-       int     major,minor;
+       char    metadata[20];
        int     uuid[4];
        char    *path;
 };
-extern int map_update(struct map_ent **mpp, int devnum, int major, int minor,
+extern int map_update(struct map_ent **mpp, int devnum, char *metadata,
                      int uuid[4], char *path);
 extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]);
 extern void map_read(struct map_ent **melp);
@@ -275,7 +322,7 @@ extern int map_write(struct map_ent *mel);
 extern void map_delete(struct map_ent **mapp, int devnum);
 extern void map_free(struct map_ent *map);
 extern void map_add(struct map_ent **melp,
-                   int devnum, int major, int minor, int uuid[4], char *path);
+                   int devnum, char *metadata, int uuid[4], char *path);
 
 /* various details can be requested */
 #define        GET_LEVEL       1
@@ -285,6 +332,9 @@ extern void map_add(struct map_ent **melp,
 #define GET_CACHE      16
 #define        GET_MISMATCH    32
 #define        GET_VERSION     64
+#define        GET_DISKS       128
+#define        GET_DEGRADED    256
+#define        GET_SAFEMODE    512
 
 #define        GET_DEVS        1024 /* gets role, major, minor */
 #define        GET_OFFSET      2048
@@ -295,14 +345,24 @@ extern void map_add(struct map_ent **melp,
 /* If fd >= 0, get the array it is open on,
  * else use devnum. >=0 -> major9. <0.....
  */
+extern int sysfs_open(int devnum, char *devname, char *attr);
+extern void sysfs_init(struct mdinfo *mdi, int fd, int devnum);
 extern void sysfs_free(struct mdinfo *sra);
 extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options);
+extern int sysfs_attr_match(const char *attr, const char *str);
+extern int sysfs_match_word(const char *word, char **list);
 extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, char *val);
 extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
                         char *name, unsigned long long val);
 extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                        char *name, unsigned long long *val);
+extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
+extern int sysfs_set_array(struct mdinfo *info, int vers);
+extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd);
+extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
+extern int sysfs_unique_holder(int devnum, long rdev);
+extern int load_sys(char *path, char *buf);
 
 
 extern int save_stripes(int *source, unsigned long long *offsets,
@@ -326,28 +386,125 @@ extern mapping_t r5layout[], pers[], modes[], faultylayout[];
 
 extern char *map_dev(int major, int minor, int create);
 
+struct active_array;
+struct metadata_update;
 
+/* A superswitch provides entry points to a metadata handler.
+ *
+ * The super_switch primarily operates on some "metadata" that
+ * is accessed via the 'supertype'.
+ * This metadata has one of three possible sources.
+ * 1/ It is read from a single device.  In this case it may not completely
+ *    describe the array or arrays as some information might be on other
+ *    devices.
+ * 2/ It is read from all devices in a container.  In this case all
+ *    information is present.
+ * 3/ It is created by ->init_super / ->add_to_super.  In this case it will
+ *    be complete once enough ->add_to_super calls have completed.
+ *
+ * When creating an array inside a container, the metadata will be
+ * formed by a combination of 2 and 3.  The metadata or the array is read,
+ * then new information is added.
+ *
+ * The metadata must sometimes have a concept of a 'current' array
+ * and a 'current' device.
+ * The 'current' array is set by init_super to be the newly created array,
+ * or is set by super_by_fd when it finds it is looking at an array inside
+ * a container.
+ *
+ * The 'current' device is either the device that the metadata was read from
+ * in case 1, or the last device added by add_to_super in case 3.
+ * Case 2 does not identify a 'current' device.
+ */
 extern struct superswitch {
+
+       /* Used to report details of metadata read from a component
+        * device. ->load_super has been called.
+        */
        void (*examine_super)(struct supertype *st, char *homehost);
        void (*brief_examine_super)(struct supertype *st);
        void (*export_examine_super)(struct supertype *st);
+
+       /* Used to report details of an active array.
+        * ->load_super was possibly given a 'component' string.
+        */
        void (*detail_super)(struct supertype *st, char *homehost);
        void (*brief_detail_super)(struct supertype *st);
        void (*export_detail_super)(struct supertype *st);
+
+       /* Used:
+        *   to get uuid to store in bitmap metadata
+        *   and 'reshape' backup-data metadata
+        *   To see if a device is being re-added to an array it was part of.
+        */
        void (*uuid_from_super)(struct supertype *st, int uuid[4]);
+
+       /* Extra generic details from metadata.  This could be details about
+        * the container, or about an individual array within the container.
+        * The determination is made either by:
+        *   load_super being given a 'component' string.
+        *   validate_geometry determining what to create.
+        * The info includes both array information and device information.
+        * The particular device should be:
+        *   The last device added by add_to_super
+        *   The device the metadata was loaded from by load_super
+        */
        void (*getinfo_super)(struct supertype *st, struct mdinfo *info);
+
+       /* Check if the given metadata is flagged as belonging to "this"
+        * host.  0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
+        */
        int (*match_home)(struct supertype *st, char *homehost);
+
+       /* Make one of several generic modifications to metadata
+        * prior to assembly (or other times).
+        *   sparc2.2  - first bug in early 0.90 metadata
+        *   super-minor - change name of 0.90 metadata
+        *   summaries - 'correct' any redundant data
+        *   resync - mark array as dirty to trigger a resync.
+        *   uuid - set new uuid - only 0.90 or 1.x
+        *   name - change the name of the array (where supported)
+        *   homehost - change which host this array is tied to.
+        *   devicesize - If metadata is at start of device, change recorded
+        *               device size to match actual device size
+        *   byteorder - swap bytes for 0.90 metadata
+        *
+        *   force-one  - mark that device as uptodate, not old or failed.
+        *   force-array - mark array as clean if it would not otherwise
+        *               assemble
+        *   assemble   - not sure how this is different from force-one...
+        *   linear-grow-new - add a new device to a linear array, but don't
+        *                   change the size: so superblock still matches
+        *   linear-grow-update - now change the size of the array.
+        */
        int (*update_super)(struct supertype *st, struct mdinfo *info,
                            char *update,
                            char *devname, int verbose,
                            int uuid_set, char *homehost);
+
+       /* Create new metadata for new array as described.  This could
+        * be a new container, or an array in a pre-existing container.
+        * Also used to zero metadata prior to writing it to invalidate old
+        * metadata.
+        */
        int (*init_super)(struct supertype *st, mdu_array_info_t *info,
                          unsigned long long size, char *name,
                          char *homehost, int *uuid);
-       void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo);
+
+       /* update the metadata to include new device, either at create or
+        * when hot-adding a spare.
+        */
+       void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
+                            int fd, char *devname);
+
+       /* Write metadata to one device when fixing problems or adding
+        * a new device.
+        */
        int (*store_super)(struct supertype *st, int fd);
-       int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo,
-                               char *devname);
+
+       /*  Write all metadata for this array.
+        */
+       int (*write_init_super)(struct supertype *st);
        int (*compare_super)(struct supertype *st, struct supertype *tst);
        int (*load_super)(struct supertype *st, int fd, char *devname);
        struct supertype * (*match_metadata_desc)(char *arg);
@@ -358,15 +515,120 @@ extern struct superswitch {
        void (*locate_bitmap)(struct supertype *st, int fd);
        int (*write_bitmap)(struct supertype *st, int fd);
        void (*free_super)(struct supertype *st);
-       int major;
+
+       /* validate_geometry is called with an st returned by
+        * match_metadata_desc.
+        * It should check that the geometry described is compatible with
+        * the metadata type.  It will be called repeatedly as devices are
+        * added to validate changing size and new devices.  If there are
+        * inter-device dependencies, it should record sufficient details
+        * so these can be validated.
+        */
+       int (*validate_geometry)(struct supertype *st, int level, int layout,
+                                int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *subdev, unsigned long long *freesize,
+                                int verbose);
+
+       struct mdinfo *(*container_content)(struct supertype *st);
+
+/* for mdmon */
+       int (*open_new)(struct supertype *c, struct active_array *a,
+                       char *inst);
+
+       /* Tell the metadata handler the current state of the array.
+        * This covers whether it is known to be consistent (no pending writes)
+        * and how far along a resync is known to have progressed
+        * (in a->resync_start).
+        * resync status is really irrelevant if the array is not consistent,
+        * but some metadata (DDF!) have a place to record the distinction.
+        * If 'consistent' is '2', then the array can mark it dirty if a 
+        * resync/recovery/whatever is required, or leave it clean if not.
+        * Return value is 0 if dirty (not consistent) and 1 if clean.
+        * It is only really important if consistent is passed in as '2'.
+        */
+       int (*set_array_state)(struct active_array *a, int consistent);
+
+       /* When the state of a device might have changed, we call set_disk to
+        * tell the metadata what the current state is.
+        * Typically this happens on spare->in_sync and (spare|in_sync)->faulty
+        * transitions.
+        * set_disk might be called when the state of the particular disk has
+        * not in fact changed.
+        */
+       void (*set_disk)(struct active_array *a, int n, int state);
+       void (*sync_metadata)(struct supertype *st);
+       void (*process_update)(struct supertype *st,
+                              struct metadata_update *update);
+       void (*prepare_update)(struct supertype *st,
+                              struct metadata_update *update);
+
+       /* activate_spare will check if the array is degraded and, if it
+        * is, try to find some spare space in the container.
+        * On success, it adds appropriate updates (for process_update)
+        * to the 'updates' list and returns a list of 'mdinfo' identifying
+        * the device, or devices as there might be multiple missing
+        * devices and multiple spares available.
+        */
+       struct mdinfo *(*activate_spare)(struct active_array *a,
+                                        struct metadata_update **updates);
+
        int swapuuid; /* true if uuid is bigending rather than hostendian */
-} super0, super1, *superlist[];
+       int external;
+} super0, super1, super_ddf, *superlist[];
 
+extern struct superswitch super_imsm;
+
+struct metadata_update {
+       int     len;
+       char    *buf;
+       void    *space; /* allocated space that monitor will use */
+       struct metadata_update *next;
+};
+
+/* A supertype holds a particular collection of metadata.
+ * It identifies the metadata type by the superswitch, and the particular
+ * sub-version of that metadata type.
+ * metadata read in or created is stored in 'sb' and 'info'.
+ * There are also fields used by mdmon to track containers.
+ *
+ * A supertype may refer to:
+ *   Just an array, possibly in a container
+ *   A container, not identifying any particular array
+ *   Info read from just one device, not yet fully describing the array/container.
+ *
+ *
+ * A supertype is created by:
+ *   super_by_fd
+ *   guess_super
+ *   dup_super
+ */
 struct supertype {
        struct superswitch *ss;
        int minor_version;
        int max_devs;
+       int container_dev;    /* devnum of container */
+       char subarray[32];      /* name of array inside container */
        void *sb;
+       void *info;
+       int loaded_container;   /* Set if load_super found a container,
+                                * not just one device */
+
+       struct metadata_update *updates;
+       struct metadata_update **update_tail;
+
+       /* extra stuff used by mdmon */
+       struct active_array *arrays;
+       int sock; /* listen to external programs */
+       int devnum;
+       char *devname; /* e.g. md0.  This appears in metadata_version:
+                       *  external:/md0/12
+                       */
+       int devcnt;
+       char *device_name; /* e.g. /dev/md/whatever */
+
+       struct mdinfo *devs;
+
 };
 
 extern struct supertype *super_by_fd(int fd);
@@ -459,11 +721,14 @@ extern int Monitor(mddev_dev_t devlist,
                   int period, int daemonise, int scan, int oneshot,
                   int dosyslog, int test, char *pidfile);
 
-extern int Kill(char *dev, int force, int quiet);
+extern int Kill(char *dev, int force, int quiet, int noexcl);
 extern int Wait(char *dev);
+extern int WaitClean(char *dev, int verbose);
 
 extern int Incremental(char *devname, int verbose, int runstop,
                       struct supertype *st, char *homehost, int autof);
+extern int Incremental_container(struct supertype *st, char *devname,
+                                int verbose, int runstop, int autof);
 extern void RebuildMap(void);
 extern int IncrementalScan(int verbose);
 
@@ -485,6 +750,7 @@ extern int check_raid(int fd, char *name);
 
 extern int get_mdp_major(void);
 extern int dev_open(char *dev, int flags);
+extern int open_dev_excl(int devnum);
 extern int is_standard(char *dev, int *nump);
 
 extern int parse_auto(char *str, char *msg, int config);
@@ -502,15 +768,26 @@ extern char *conf_word(FILE *file, int allow_key);
 extern void free_line(char *line);
 extern int match_oneof(char *devices, char *devname);
 extern void uuid_from_super(int uuid[4], mdp_super_t *super);
+extern const int uuid_match_any[4];
 extern int same_uuid(int a[4], int b[4], int swapuuid);
 extern void copy_uuid(void *a, int b[4], int swapuuid);
+extern char *fname_from_uuid(struct supertype *st,
+                            struct mdinfo *info, char *buf, char sep);
 extern unsigned long calc_csum(void *super, int bytes);
 extern int enough(int level, int raid_disks, int layout, int clean,
                   char *avail, int avail_disks);
 extern int ask(char *mesg);
 extern unsigned long long get_component_size(int fd);
 extern void remove_partitions(int fd);
+extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize);
+extern int flush_metadata_updates(struct supertype *st);
+extern void append_metadata_update(struct supertype *st, void *buf, int len);
+extern struct superswitch *find_metadata_methods(char *vers);
 
+extern int add_disk(int mdfd, struct supertype *st,
+                   struct mdinfo *sra, struct mdinfo *info);
+extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
 
 extern char *human_size(long long bytes);
 extern char *human_size_brief(long long bytes);
@@ -527,12 +804,66 @@ extern char DefaultConfFile[];
 extern int open_mddev(char *dev, int autof);
 extern int open_mddev_devnum(char *devname, int devnum, char *name,
                             char *chosen_name, int parts);
-
+extern int open_container(int fd);
+
+extern int mdmon_running(int devnum);
+extern int signal_mdmon(int devnum);
+extern int env_no_mdmon(void);
+extern int start_mdmon(int devnum);
+
+extern char *devnum2devname(int num);
+extern int devname2devnum(char *name);
+extern int stat2devnum(struct stat *st);
+extern int fd2devnum(int fd);
+
+static inline int dev2major(int d)
+{
+       if (d >= 0)
+               return MD_MAJOR;
+       else
+               return get_mdp_major();
+}
+
+static inline int dev2minor(int d)
+{
+       if (d >= 0)
+               return d;
+       return (-1-d) << MdpMinorShift;
+}
+
+static inline int ROUND_UP(int a, int base)
+{
+       return ((a+base-1)/base)*base;
+}
+
+static inline int is_subarray(char *vers)
+{
+       /* The version string for a 'subarray' (an array in a container) is
+        *    /containername/componentname    for normal read-write arrays
+        *    -containername/componentname    for read-only arrays.
+        * containername is e.g. md0, md_d1
+        * componentname is dependent on the metadata. e.g. '1' 'S1' ...
+        * Only the first character of 'vers' is examined here.
+        */
+       return (*vers == '/' || *vers == '-');
+}
+
+#ifdef DEBUG
+#define dprintf(fmt, arg...) \
+       fprintf(stderr, fmt, ##arg)
+#else
+#define dprintf(fmt, arg...) \
+        ({ if (0) fprintf(stderr, fmt, ##arg); 0; })
+#endif
 
 #define        LEVEL_MULTIPATH         (-4)
 #define        LEVEL_LINEAR            (-1)
 #define        LEVEL_FAULTY            (-5)
 
+/* kernel module doesn't know about these */
+#define LEVEL_CONTAINER                (-100)
+#define        LEVEL_UNSUPPORTED       (-200)
+
 
 /* faulty stuff */
 
index 8e1a9bbc0b8c756f1a72c2ae7f9104a425063e06..ba85fbf887ed0937579b5aac79c7cac39e3251c2 100644 (file)
@@ -1,6 +1,6 @@
 Summary:     mdadm is used for controlling Linux md devices (aka RAID arrays)
 Name:        mdadm
-Version:     2.6.7
+Version:     3.0_devel1
 Release:     1
 Source:      http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tgz
 URL:         http://neil.brown.name/blog/mdadm
index d60c775d73e210bfc87ee466a02ed5df3c04a277..d8aa0f3a02dd6e836cf779d1cf15fcc410736bba 100644 (file)
@@ -1,5 +1,5 @@
 .\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v2.6.7
+.TH MDASSEMBLE 8 "" v3.0-devel1
 .SH NAME
 mdassemble \- assemble MD devices
 .I aka
diff --git a/mdmon.c b/mdmon.c
new file mode 100644 (file)
index 0000000..bfad18a
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,474 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2008 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked.  It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include       <unistd.h>
+#include       <stdlib.h>
+#include       <sys/types.h>
+#include       <sys/stat.h>
+#include       <sys/socket.h>
+#include       <sys/un.h>
+#include       <sys/mman.h>
+#include       <sys/syscall.h>
+#include       <sys/wait.h>
+#include       <stdio.h>
+#include       <errno.h>
+#include       <string.h>
+#include       <fcntl.h>
+#include       <signal.h>
+#include       <dirent.h>
+
+#include       <sched.h>
+
+#include       "mdadm.h"
+#include       "mdmon.h"
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+int sigterm;
+
+/* Thread entry point handed to clone(): 'v' is the container's
+ * struct supertype, which is passed straight to do_monitor().
+ */
+int run_child(void *v)
+{
+       do_monitor((struct supertype *)v);
+       return 0;
+}
+
+/* Start the monitor thread with clone(), sharing address space, open
+ * files, filesystem info and signal handlers with the caller.
+ * Records the caller's thread id in mgr_tid and the new thread's id in
+ * mon_tid.  Returns mon_tid, which is -1 if clone() failed.
+ */
+int clone_monitor(struct supertype *container)
+{
+       static char stack[4096];
+
+       /* Record our own tid before the monitor exists: it uses mgr_tid
+        * to signal us and may run as soon as clone() has created it.
+        */
+       mgr_tid = syscall(SYS_gettid);
+
+       mon_tid = clone(run_child, stack+4096-64,
+                  CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
+                  container);
+
+       return mon_tid;
+}
+
+
+/* Write our pid into /var/run/mdadm/<devname>.pid.
+ * 'o_excl' is OR-ed into the open() flags: pass O_EXCL to fail when the
+ * file already exists, or 0 to reuse an existing file.
+ * Returns 0 on success, -1 if SIGTERM has already been seen, or -errno
+ * if the file could not be opened or written.
+ */
+int make_pidfile(char *devname, int o_excl)
+{
+       char path[100];
+       char pid[24];
+       int fd;
+       int n;
+       int err = 0;
+
+       if (sigterm)
+               return -1;
+
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", devname);
+
+       fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
+       if (fd < 0)
+               return -errno;
+       snprintf(pid, sizeof(pid), "%d\n", getpid());
+       n = write(fd, pid, strlen(pid));
+       if (n < 0)
+               err = errno;    /* capture before close() can change it */
+       close(fd);
+       return -err;
+}
+
+/* Return 1 if 'mdstat' describes a member array of 'container', i.e. its
+ * metadata version is "external:" followed by '/' or '-', then the
+ * container name, then '/'.  Return 0 otherwise.
+ */
+int is_container_member(struct mdstat_ent *mdstat, char *container)
+{
+       char *vers = mdstat->metadata_version;
+       size_t clen;
+
+       if (vers == NULL)
+               return 0;
+       if (strncmp(vers, "external:", 9) != 0)
+               return 0;
+       if (!is_subarray(vers+9))
+               return 0;
+
+       clen = strlen(container);
+       if (strncmp(vers+10, container, clen) != 0)
+               return 0;
+       if (vers[10+clen] != '/')
+               return 0;
+
+       return 1;
+}
+
+void remove_pidfile(char *devname);
+/* Ask a previously started mdmon for 'devname' to exit.
+ * The pid is taken from /var/run/mdadm/<devname>.pid and is only
+ * signalled if it is not our own and /proc/<pid>/cmdline contains
+ * "mdmon".  After sending SIGTERM, WaitClean() is called on every
+ * member array of the container, then the old pid and socket files are
+ * removed.  All failures are silent: this is a best-effort cleanup.
+ */
+static void try_kill_monitor(char *devname)
+{
+       char buf[100];
+       int fd;
+       int n;
+       pid_t pid;
+       struct mdstat_ent *mdstat, *ent;
+
+       snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.pid", devname);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* leave room for, and add, the terminator strtoul() relies on */
+       n = read(fd, buf, sizeof(buf)-1);
+       close(fd);
+       if (n < 0)
+               return;
+       buf[n] = 0;
+       pid = strtoul(buf, NULL, 10);
+
+       /* first rule of survival... don't off yourself */
+       if (pid == getpid())
+               return;
+
+       /* kill this process if it is mdmon */
+       snprintf(buf, sizeof(buf), "/proc/%lu/cmdline", (unsigned long) pid);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* terminate what was read so strstr() cannot run off the buffer;
+        * the descriptor is closed on every path
+        */
+       n = read(fd, buf, sizeof(buf)-1);
+       close(fd);
+       if (n < 0)
+               return;
+       buf[n] = 0;
+
+       if (!strstr(buf, "mdmon"))
+               return;
+
+       kill(pid, SIGTERM);
+
+       /* walk with a separate cursor so the list head can still be freed */
+       mdstat = mdstat_read(0, 0);
+       for (ent = mdstat; ent; ent = ent->next)
+               if (is_container_member(ent, devname)) {
+                       snprintf(buf, sizeof(buf), "/dev/%s", ent->dev);
+                       WaitClean(buf, 0);
+               }
+       free_mdstat(mdstat);
+       remove_pidfile(devname);
+}
+
+/* Remove the pid file and the control socket of 'devname' from
+ * /var/run/mdadm.  Does nothing once SIGTERM has been received.
+ */
+void remove_pidfile(char *devname)
+{
+       if (!sigterm) {
+               char buf[100];
+
+               sprintf(buf, "/var/run/mdadm/%s.pid", devname);
+               unlink(buf);
+               sprintf(buf, "/var/run/mdadm/%s.sock", devname);
+               unlink(buf);
+       }
+}
+
+/* Create the non-blocking, listening Unix-domain control socket
+ * /var/run/mdadm/<devname>.sock, unlinking any existing file of that
+ * name first.
+ * Returns the socket descriptor, or -1 on failure or if SIGTERM has
+ * already been seen.
+ */
+int make_control_sock(char *devname)
+{
+       char path[100];
+       int sfd;
+       long fl;
+       struct sockaddr_un addr;
+
+       if (sigterm)
+               return -1;
+
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.sock", devname);
+       unlink(path);
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       /* all sizeof(addr) bytes are handed to bind(), so clear them */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
+       if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
+           listen(sfd, 10) < 0) {
+               close(sfd);
+               return -1;
+       }
+       /* only change the flags if they could be read */
+       fl = fcntl(sfd, F_GETFL, 0);
+       if (fl >= 0)
+               fcntl(sfd, F_SETFL, fl | O_NONBLOCK);
+       return sfd;
+}
+
+/* Set to 1 by the SIGHUP handler below; declared extern in mdmon.h. */
+int socket_hup_requested;
+/* SIGHUP handler: only records that the signal arrived. */
+static void hup(int sig)
+{
+       socket_hup_requested = 1;
+}
+
+/* SIGTERM handler: sets 'sigterm', which make_pidfile(),
+ * remove_pidfile() and make_control_sock() test before touching files.
+ */
+static void term(int sig)
+{
+       sigterm = 1;
+}
+
+/* Handler installed for SIGUSR1 and SIGALRM in main().  It does no
+ * work itself; the body is intentionally empty.
+ */
+static void wake_me(int sig)
+{
+
+}
+
+/* Tell main() whether to fork into the background.
+ * Always 1, except in a DEBUG build where env_no_mdmon() returns
+ * non-zero (debugging, with mdmon started by hand).
+ */
+static int do_fork(void)
+{
+#ifdef DEBUG
+       return env_no_mdmon() ? 0 : 1;
+#else
+       return 1;
+#endif
+}
+
+/* Print the command-line synopsis on stderr and exit with status 2. */
+void usage(void)
+{
+       fputs("Usage: mdmon [--switch-root dir] /device/name/for/container\n",
+             stderr);
+       exit(2);
+}
+
+/* mdmon entry point.
+ * Usage: mdmon [--switch-root dir] /device/name/for/container
+ *
+ * Opens the container device, forks into the background (unless
+ * do_fork() says not to), checks via sysfs that the device is a
+ * container using known external metadata, sets up signal handling,
+ * the pid file and the control socket, loads the metadata, reports
+ * readiness to the waiting parent, then starts the monitor thread and
+ * runs the manager.
+ *
+ * Exit status: 1 open/fork failure, 2 usage or monitor start failure,
+ * 3 setup failure, 4 chroot failure.
+ */
+int main(int argc, char *argv[])
+{
+       int mdfd;
+       struct mdinfo *mdi, *di;
+       struct supertype *container;
+       sigset_t set;
+       struct sigaction act;
+       int pfd[2];
+       int status;
+       int ignore;
+       char *container_name = NULL;
+       char *switchroot = NULL;
+
+       switch (argc) {
+       case 2:
+               container_name = argv[1];
+               break;
+       case 4:
+               if (strcmp(argv[1], "--switch-root") != 0) {
+                       fprintf(stderr, "mdmon: unknown argument %s\n", argv[1]);
+                       usage();
+               }
+               switchroot = argv[2];
+               container_name = argv[3];
+               break;
+       default:
+               usage();
+       }
+
+       mdfd = open(container_name, O_RDWR);
+       if (mdfd < 0) {
+               fprintf(stderr, "mdmon: %s: %s\n", container_name,
+                       strerror(errno));
+               exit(1);
+       }
+       if (md_get_version(mdfd) < 0) {
+               fprintf(stderr, "mdmon: %s: Not an md device\n",
+                       container_name);
+               exit(1);
+       }
+
+       /* Fork, and have the child tell us when they are ready */
+       if (do_fork()) {
+               if (pipe(pfd) != 0) {
+                       fprintf(stderr, "mdmon: failed to create pipe\n");
+                       exit(1);
+               }
+               switch(fork()) {
+               case -1:
+                       fprintf(stderr, "mdmon: failed to fork: %s\n",
+                               strerror(errno));
+                       exit(1);
+               case 0: /* child */
+                       close(pfd[0]);
+                       break;
+               default: /* parent */
+                       close(pfd[1]);
+                       /* no status on the pipe means the child died early:
+                        * pass on its exit status instead
+                        */
+                       if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+                               wait(&status);
+                               status = WEXITSTATUS(status);
+                       }
+                       exit(status);
+               }
+       } else
+               pfd[0] = pfd[1] = -1;
+
+       container = malloc(sizeof(*container));
+       if (!container) {
+               fprintf(stderr, "mdmon: failed to allocate container\n");
+               exit(3);
+       }
+       container->devnum = fd2devnum(mdfd);
+       container->devname = devnum2devname(container->devnum);
+       container->device_name = container_name;
+       container->arrays = NULL;
+
+       if (!container->devname) {
+               fprintf(stderr, "mdmon: failed to allocate container name string\n");
+               exit(3);
+       }
+
+       mdi = sysfs_read(mdfd, container->devnum,
+                        GET_VERSION|GET_LEVEL|GET_DEVS);
+
+       if (!mdi) {
+               fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
+                       container->devname);
+               exit(3);
+       }
+       if (mdi->array.level != UnSet) {
+               fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
+                       container_name);
+               exit(3);
+       }
+       if (mdi->array.major_version != -1 ||
+           mdi->array.minor_version != -2) {
+               fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
+                       container_name);
+               exit(3);
+       }
+
+       container->ss = find_metadata_methods(mdi->text_version);
+       if (container->ss == NULL) {
+               fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
+                       container_name, mdi->text_version);
+               exit(3);
+       }
+
+       /* keep a private copy of the device list; mdi is freed below */
+       container->devs = NULL;
+       for (di = mdi->devs; di; di = di->next) {
+               struct mdinfo *cd = malloc(sizeof(*cd));
+               if (!cd) {
+                       fprintf(stderr, "mdmon: failed to allocate device info\n");
+                       exit(3);
+               }
+               *cd = *di;
+               cd->next = container->devs;
+               container->devs = cd;
+       }
+       sysfs_free(mdi);
+
+       /* SIGUSR is sent between parent and child.  So both block it
+        * and enable it only with pselect.
+        */
+       sigemptyset(&set);
+       sigaddset(&set, SIGUSR1);
+       sigaddset(&set, SIGHUP);
+       sigaddset(&set, SIGALRM);
+       sigaddset(&set, SIGTERM);
+       sigprocmask(SIG_BLOCK, &set, NULL);
+       act.sa_handler = wake_me;
+       act.sa_flags = 0;
+       sigaction(SIGUSR1, &act, NULL);
+       sigaction(SIGALRM, &act, NULL);
+       act.sa_handler = hup;
+       sigaction(SIGHUP, &act, NULL);
+       act.sa_handler = term;
+       sigaction(SIGTERM, &act, NULL);
+       act.sa_handler = SIG_IGN;
+       sigaction(SIGPIPE, &act, NULL);
+
+       if (switchroot) {
+               /* we assume that /sys /proc /dev are available in
+                * the new root (see nash:setuproot)
+                *
+                * kill any monitors in the current namespace and change
+                * to the new one
+                */
+               try_kill_monitor(container->devname);
+               if (chroot(switchroot) != 0) {
+                       fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
+                               switchroot, strerror(errno));
+                       exit(4);
+               }
+       }
+
+       /* If this fails, we hope it already exists 
+        * pid file lives in /var/run/mdadm/mdXX.pid
+        * NOTE(review): mode 0600 gives the directories no search bit -
+        * confirm this is intended.
+        */
+       mkdir("/var", 0600);
+       mkdir("/var/run", 0600);
+       mkdir("/var/run/mdadm", 0600);
+       ignore = chdir("/");
+       if (make_pidfile(container->devname, O_EXCL) < 0) {
+               if (ping_monitor(container->devname) == 0) {
+                       fprintf(stderr, "mdmon: %s already managed\n",
+                               container->devname);
+                       exit(3);
+               } else {
+                       int err;
+
+                       /* cleanup the old monitor, this one is taking over */
+                       try_kill_monitor(container->devname);
+                       err = make_pidfile(container->devname, 0);
+                       if (err < 0) {
+                               fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
+                                       container->devname);
+                               if (err == -EROFS) {
+                                       /* FIXME implement a mechanism to
+                                        * prevent duplicate monitor instances
+                                        */
+                                       fprintf(stderr,
+                                               "mdmon: continuing on read-only file system\n");
+                               } else
+                                       exit(3);
+                       }
+               }
+       }
+       container->sock = make_control_sock(container->devname);
+
+       if (container->ss->load_super(container, mdfd, container_name)) {
+               fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
+                       container_name);
+               exit(3);
+       }
+
+       /* Ok, this is close enough.  We can say goodbye to our parent now.
+        * (There is no pipe, and so nobody to tell, if we did not fork.)
+        */
+       status = 0;
+       if (pfd[1] >= 0 &&
+           write(pfd[1], &status, sizeof(status)) < 0)
+               fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
+                       getppid());
+       close(pfd[1]);
+
+       /* detach: new session, stdin/stdout (and stderr unless DEBUG)
+        * redirected to /dev/null
+        */
+       setsid();
+       close(0);
+       open("/dev/null", O_RDWR);
+       close(1);
+       ignore = dup(0);
+#ifndef DEBUG
+       close(2);
+       ignore = dup(0);
+#endif
+
+       mlockall(MCL_FUTURE);
+
+       if (clone_monitor(container) < 0) {
+               fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
+                       strerror(errno));
+               exit(2);
+       }
+
+       do_manager(container);
+
+       exit(0);
+}
diff --git a/mdmon.h b/mdmon.h
new file mode 100644 (file)
index 0000000..e4904ba
--- /dev/null
+++ b/mdmon.h
@@ -0,0 +1,94 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2008 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Array states.  The order must match the array_states[] string table in
+ * monitor.c, whose match index is cast to this type by read_state();
+ * bad_word is what read_state() returns when nothing could be read.
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto,
+                  clean, active, write_pending, active_idle, bad_word};
+
+/* Sync actions.  The order must match the sync_actions[] string table in
+ * monitor.c; bad_action is what read_action() returns when nothing could
+ * be read.
+ */
+enum sync_action { idle, reshape, resync, recover, check, repair, bad_action };
+
+
+/* Per-array state kept by mdmon for each array it watches. */
+struct active_array {
+       /* array description; the monitor reads info.state_fd and walks
+        * info.devs
+        */
+       struct mdinfo info;
+       /* owning container; its ss methods update the metadata */
+       struct supertype *container;
+       struct active_array *next, *replaces;
+
+       int action_fd;          /* read by the monitor for the sync action */
+       int resync_start_fd;    /* read by get_resync_start() */
+       int metadata_fd; /* for monitoring rw/ro status */
+
+       /* previous / current / requested-next values */
+       enum array_state prev_state, curr_state, next_state;
+       enum sync_action prev_action, curr_action, next_action;
+
+       int check_degraded; /* flag set by mon, read by manage */
+
+       int devnum;
+
+       /* last value parsed by get_resync_start() */
+       unsigned long long resync_start;
+};
+
+/*
+ * Metadata updates are handled by the monitor thread,
+ * as it has exclusive access to the metadata.
+ * When the manager wants to update metadata, either
+ * for its own reasons (e.g. committing a spare) or
+ * on behalf of mdadm, it creates a metadata_update
+ * structure and queues it to the monitor.
+ * Updates are created and processed by code under the
+ * superswitch.  All common code sees them as opaque
+ * blobs.
+ */
+extern struct metadata_update *update_queue, *update_queue_handled;
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+extern struct md_generic_cmd *active_cmd;
+
+
+void remove_pidfile(char *devname);
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+int make_control_sock(char *devname);
+int make_pidfile(char *devname, int o_excl);
+extern int socket_hup_requested;
+extern int sigterm;
+
+int read_dev_state(int fd);
+int get_resync_start(struct active_array *a);
+int is_container_member(struct mdstat_ent *mdstat, char *container);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern int exit_now, manager_ready;
+extern int mon_tid, mgr_tid;
+extern int monitor_loop_cnt;
+
+/* Helper to decide whether a resync has completed.  MaxSector is a
+ * moving target, so compare against the component size instead:
+ * returns 1 once resync_start has reached it, 0 otherwise.
+ */
+static inline int is_resync_complete(struct active_array *a)
+{
+       return a->resync_start >= a->info.component_size;
+}
+
index 9250e4bacf3062b308157c83480de7561603ed1c..eee1eea15f049c734e47cb09c0fd524da5f94373 100644 (file)
--- a/mdopen.c
+++ b/mdopen.c
@@ -282,7 +282,7 @@ int open_mddev_devnum(char *devname, int devnum, char *name,
 
        if (devname)
                strcpy(chosen_name, devname);
-       else if (name && *name && strchr(name,'/') == NULL) {
+       else if (name && *name && name[0] && strchr(name,'/') == NULL) {
                char *n = strchr(name, ':');
                if (n) n++; else n = name;
                if (isdigit(*n) && devnum < 0)
index a8f7ce7576eb1d017acfe962a1dea48084cde5d2..ebdfc67bf6d46705e1ebcb74212299b2aa47b66c 100644 (file)
--- a/mdstat.c
+++ b/mdstat.c
@@ -86,6 +86,7 @@
 #include       "mdadm.h"
 #include       "dlink.h"
 #include       <sys/select.h>
+#include       <ctype.h>
 
 void free_mdstat(struct mdstat_ent *ms)
 {
@@ -94,6 +95,7 @@ void free_mdstat(struct mdstat_ent *ms)
                if (ms->dev) free(ms->dev);
                if (ms->level) free(ms->level);
                if (ms->pattern) free(ms->pattern);
+               if (ms->metadata_version) free(ms->metadata_version);
                t = ms;
                ms = ms->next;
                free(t);
@@ -158,6 +160,10 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                ent->percent = -1;
                ent->active = -1;
                ent->resync = 0;
+               ent->metadata_version = NULL;
+               ent->raid_disks = 0;
+               ent->chunk_size = 0;
+               ent->devcnt = 0;
 
                ent->dev = strdup(line);
                ent->devnum = devnum;
@@ -176,22 +182,28 @@ struct mdstat_ent *mdstat_read(int hold, int start)
                                in_devs = 1;
                        } else if (in_devs && strcmp(w, "blocks")==0)
                                in_devs = 0;
-                       else if (in_devs && strncmp(w, "md", 2)==0) {
-                               /* This has an md device as a component.
-                                * If that device is already in the list,
-                                * make sure we insert before there.
-                                */
-                               struct mdstat_ent **ih;
-                               int dn2;
-                               if (strncmp(w, "md_d", 4)==0)
-                                       dn2 = -1-strtoul(w+4, &ep, 10);
-                               else
-                                       dn2 = strtoul(w+2, &ep, 10);
-                               ih = &all;
-                               while (ih != insert_here && *ih &&
-                                      (*ih)->devnum != dn2)
-                                       ih = & (*ih)->next;
-                               insert_here = ih;
+                       else if (in_devs) {
+                               ent->devcnt++;
+                               if (strncmp(w, "md", 2)==0) {
+                                       /* This has an md device as a component.
+                                        * If that device is already in the
+                                        * list, make sure we insert before
+                                        * there.
+                                        */
+                                       struct mdstat_ent **ih;
+                                       int dn2 = devname2devnum(w);
+                                       ih = &all;
+                                       while (ih != insert_here && *ih &&
+                                              (*ih)->devnum != dn2)
+                                               ih = & (*ih)->next;
+                                       insert_here = ih;
+                               }
+                       } else if (strcmp(w, "super") == 0 &&
+                                  dl_next(w) != line) {
+                               w = dl_next(w);
+                               ent->metadata_version = strdup(w);
+                       } else if (w[0] == '[' && isdigit(w[1])) {
+                               ent->raid_disks = atoi(w+1);
                        } else if (!ent->pattern &&
                                 w[0] == '[' &&
                                 (w[1] == 'U' || w[1] == '_')) {
@@ -248,12 +260,33 @@ void mdstat_wait(int seconds)
 {
        fd_set fds;
        struct timeval tm;
+       int maxfd = 0;
        FD_ZERO(&fds);
-       if (mdstat_fd >= 0)
+       if (mdstat_fd >= 0) {
                FD_SET(mdstat_fd, &fds);
+               maxfd = mdstat_fd;
+       }
        tm.tv_sec = seconds;
        tm.tv_usec = 0;
-       select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm);
+       select(maxfd + 1, NULL, NULL, &fds, &tm);
+}
+
+/* Block in pselect() until 'fd' becomes readable, an exceptional
+ * condition is raised on mdstat_fd, or a signal allowed by 'sigmask'
+ * arrives.  Either descriptor is left out if negative; there is no
+ * timeout and the pselect() result is not reported.
+ */
+void mdstat_wait_fd(int fd, const sigset_t *sigmask)
+{
+       fd_set readable, except;
+       int nfds;
+
+       FD_ZERO(&readable);
+       FD_ZERO(&except);
+       if (fd >= 0)
+               FD_SET(fd, &readable);
+       if (mdstat_fd >= 0)
+               FD_SET(mdstat_fd, &except);
+
+       nfds = (mdstat_fd > fd ? mdstat_fd : fd) + 1;
+       pselect(nfds, &readable, NULL, &except, NULL, sigmask);
 }
 
 int mddev_busy(int devnum)
diff --git a/monitor.c b/monitor.c
new file mode 100644 (file)
index 0000000..15791f4
--- /dev/null
+++ b/monitor.c
@@ -0,0 +1,578 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2008 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/select.h>
+#include <signal.h>
+
+/* Words matched against the attribute text by read_state() and
+ * read_action().  The order mirrors enum array_state / enum sync_action
+ * in mdmon.h, since the matched index is cast to the enum.  Both tables
+ * are NULL terminated.
+ */
+static char *array_states[] = {
+       "clear", "inactive", "suspended", "readonly", "read-auto",
+       "clean", "active", "write-pending", "active-idle", NULL };
+static char *sync_actions[] = {
+       "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+/* Write the string 'attr' (without its terminator) to 'fd'.
+ * Returns whatever write() returns.
+ */
+static int write_attr(char *attr, int fd)
+{
+       size_t len = strlen(attr);
+
+       return write(fd, attr, len);
+}
+
+/* Add 'fd' to the set 'fds' and raise '*maxfd' to it if it is larger.
+ * Negative descriptors are ignored.
+ */
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+       if (fd >= 0) {
+               if (*maxfd < fd)
+                       *maxfd = fd;
+               FD_SET(fd, fds);
+       }
+}
+
+/* Read an attribute from the start of 'fd' into 'buf' (capacity 'len').
+ * The result is always NUL terminated and one trailing newline is
+ * stripped.  Returns the number of bytes read (still counting a stripped
+ * newline), or 0 with an empty string if 'fd' is negative, the read
+ * fails, or nothing was read.
+ */
+static int read_attr(char *buf, int len, int fd)
+{
+       int n = 0;
+
+       if (fd >= 0) {
+               lseek(fd, 0, 0);
+               n = read(fd, buf, len - 1);
+       }
+       if (n <= 0) {
+               buf[0] = 0;
+               return 0;
+       }
+
+       buf[n] = 0;
+       if (buf[n-1] == '\n')
+               buf[n-1] = 0;
+       return n;
+}
+
+/* Refresh a->resync_start from a->resync_start_fd.
+ * Returns 1 if a value was read and parsed; otherwise returns
+ * read_attr()'s result and leaves a->resync_start untouched.
+ */
+int get_resync_start(struct active_array *a)
+{
+       char buf[30];
+       int n = read_attr(buf, 30, a->resync_start_fd);
+
+       if (n > 0) {
+               a->resync_start = strtoull(buf, NULL, 10);
+               return 1;
+       }
+       return n;
+}
+
+
+/* Read the attribute on 'fd' and translate it through array_states[].
+ * Returns bad_word if nothing could be read.
+ */
+static enum array_state read_state(int fd)
+{
+       char buf[20];
+
+       if (read_attr(buf, 20, fd) <= 0)
+               return bad_word;
+       return (enum array_state) sysfs_match_word(buf, array_states);
+}
+
+/* Read the attribute on 'fd' and translate it through sync_actions[].
+ * Returns bad_action if nothing could be read.
+ */
+static enum sync_action read_action( int fd)
+{
+       char buf[20];
+
+       if (read_attr(buf, 20, fd) <= 0)
+               return bad_action;
+       return (enum sync_action) sysfs_match_word(buf, sync_actions);
+}
+
+/* Read a device state attribute - a comma separated list of words - and
+ * return the matching DS_* bits OR-ed together.  Returns 0 if nothing
+ * could be read.
+ */
+int read_dev_state(int fd)
+{
+       static const struct {
+               char *name;
+               int bit;
+       } flags[] = {
+               { "faulty",       DS_FAULTY },
+               { "in_sync",      DS_INSYNC },
+               { "write_mostly", DS_WRITE_MOSTLY },
+               { "spare",        DS_SPARE },
+               { "blocked",      DS_BLOCKED },
+       };
+       char buf[60];
+       char *cp;
+       int rv = 0;
+       unsigned int i;
+
+       if (read_attr(buf, 60, fd) <= 0)
+               return 0;
+
+       /* test every known word at the start of each list entry */
+       for (cp = buf; cp != NULL; ) {
+               for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++)
+                       if (sysfs_attr_match(cp, flags[i].name))
+                               rv |= flags[i].bit;
+               cp = strchr(cp, ',');
+               if (cp != NULL)
+                       cp++;
+       }
+       return rv;
+}
+
+/* Send SIGUSR1 to thread mgr_tid of this process, i.e.
+ * tgkill(getpid(), mgr_tid, SIGUSR1) issued through syscall().
+ */
+static void signal_manager(void)
+{
+       syscall(SYS_tgkill, getpid(), mgr_tid, SIGUSR1);
+}
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ *  Array stops.
+ *    This is detected by array_state going to 'clear' or 'inactive'.
+ *    while we thought it was active.
+ *    Response is to mark metadata as clean and 'clear' the array(??)
+ *  write-pending
+ *    array_state is 'write-pending'
+ *    We mark metadata as 'dirty' then set array to 'active'.
+ *  active_idle
+ *    Either ignore, or mark clean, then mark metadata as clean.
+ *
+ *  device fails
+ *    detected by rd-N/state reporting "faulty"
+ *    mark device as 'failed' in metadata, let the kernel release the
+ *    device by writing '-blocked' to rd/state, and finally write 'remove' to
+ *    rd/state.  Before a disk can be replaced it must be failed and removed
+ *    from all container members, this will be preemptive for the other
+ *    arrays... safe?
+ *
+ *  sync completes
+ *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ *    MaxSector
+ *    Notify metadata that sync is complete.
+ *
+ *  recovery completes
+ *    sync_action changes from 'recover' to 'idle'
+ *    Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ *  deal with resync
+ *    This only happens on finding a new array... mdadm will have set
+ *    'resync_start' to the correct value.  If 'resync_start' indicates that a
+ *    resync needs to occur, set the array to the 'active' state rather than the
+ *    initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything.  So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ *   - update the array_state
+ *   - set the role of some devices.
+ *   - request a sync_action
+ *
+ */
+
+static int read_and_act(struct active_array *a)
+{
+       int check_degraded = 0;
+       int deactivate = 0;
+       struct mdinfo *mdi;
+
+       a->next_state = bad_word;
+       a->next_action = bad_action;
+
+       a->curr_state = read_state(a->info.state_fd);
+       a->curr_action = read_action(a->action_fd);
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->next_state = 0;
+               if (mdi->state_fd >= 0)
+                       mdi->curr_state = read_dev_state(mdi->state_fd);
+       }
+
+       if (a->curr_state <= inactive &&
+           a->prev_state > inactive) {
+               /* array has been stopped */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 1);
+               a->next_state = clear;
+               deactivate = 1;
+       }
+       if (a->curr_state == write_pending) {
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 0);
+               a->next_state = active;
+       }
+       if (a->curr_state == active_idle) {
+               /* Set array to 'clean' FIRST, then mark clean
+                * in the metadata
+                */
+               a->next_state = clean;
+       }
+       if (a->curr_state == clean) {
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, 1);
+       }
+
+       if (a->curr_state == readonly) {
+               /* Well, I'm ready to handle things.  If readonly
+                * wasn't requested, transition to read-auto.
+                */
+               char buf[64];
+               read_attr(buf, sizeof(buf), a->metadata_fd);
+               if (strncmp(buf, "external:-", 10) == 0) {
+                       /* explicit request for readonly array.  Leave it alone */
+                       ;
+               } else {
+                       get_resync_start(a);
+                       if (a->container->ss->set_array_state(a, 2))
+                               a->next_state = read_auto; /* array is clean */
+                       else
+                               a->next_state = active; /* Now active for recovery etc */
+               }
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == resync) {
+               /* A resync has finished.  The endpoint is recorded in
+                * 'sync_start'.  We don't update the metadata
+                * until the array goes inactive or readonly though.
+                * Just check if we need to fiddle spares.
+                */
+               get_resync_start(a);
+               a->container->ss->set_array_state(a, a->curr_state <= clean);
+               check_degraded = 1;
+       }
+
+       if (!deactivate &&
+           a->curr_action == idle &&
+           a->prev_action == recover) {
+               /* A recovery has finished.  Some disks may be in sync now,
+                * and the array may no longer be degraded
+                */
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       if (! (mdi->curr_state & DS_INSYNC))
+                               check_degraded = 1;
+               }
+       }
+
+       /* Check for failures and if found:
+        * 1/ Record the failure in the metadata and unblock the device.
+        *    FIXME update the kernel to stop notifying on failed drives when
+        *    the array is readonly and we have cleared 'blocked'
+        * 2/ Try to remove the device if the array is writable, or can be
+        *    made writable.
+        */
+       for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+               if (mdi->curr_state & DS_FAULTY) {
+                       a->container->ss->set_disk(a, mdi->disk.raid_disk,
+                                                  mdi->curr_state);
+                       check_degraded = 1;
+                       mdi->next_state |= DS_UNBLOCK;
+                       if (a->curr_state == read_auto) {
+                               a->container->ss->set_array_state(a, 0);
+                               a->next_state = active;
+                       }
+                       if (a->curr_state > readonly)
+                               mdi->next_state |= DS_REMOVE;
+               }
+       }
+
+       a->container->ss->sync_metadata(a->container);
+       dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member,
+               array_states[a->curr_state], sync_actions[a->curr_action]);
+
+       /* Effect state changes in the array */
+       if (a->next_state != bad_word) {
+               dprintf(" state:%s", array_states[a->next_state]);
+               write_attr(array_states[a->next_state], a->info.state_fd);
+       }
+       if (a->next_action != bad_action) {
+               write_attr(sync_actions[a->next_action], a->action_fd);
+               dprintf(" action:%s", sync_actions[a->next_action]);
+       }
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               if (mdi->next_state & DS_UNBLOCK) {
+                       dprintf(" %d:-blocked", mdi->disk.raid_disk);
+                       write_attr("-blocked", mdi->state_fd);
+               }
+
+               if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
+                       int remove_result;
+
+                       /* the kernel may not be able to immediately remove the
+                        * disk, we can simply wait until the next event to try
+                        * again.
+                        */
+                       remove_result = write_attr("remove", mdi->state_fd);
+                       if (remove_result > 0) {
+                               dprintf(" %d:removed", mdi->disk.raid_disk);
+                               close(mdi->state_fd);
+                               mdi->state_fd = -1;
+                       }
+               }
+               if (mdi->next_state & DS_INSYNC) {
+                       write_attr("+in_sync", mdi->state_fd);
+                       dprintf(" %d:+in_sync", mdi->disk.raid_disk);
+               }
+       }
+       dprintf(" )\n");
+
+       /* move curr_ to prev_ */
+       a->prev_state = a->curr_state;
+
+       a->prev_action = a->curr_action;
+
+       for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+               mdi->prev_state = mdi->curr_state;
+               mdi->next_state = 0;
+       }
+
+       if (check_degraded) {
+               /* manager will do the actual check */
+               a->check_degraded = 1;
+               signal_manager();
+       }
+
+       if (deactivate)
+               a->container = NULL;
+
+       return 1;
+}
+
+/* Look through the member devices of array 'a' for the one whose
+ * device number is major:minor.  Returns the matching entry of
+ * a->info.devs, or NULL when there is none.
+ */
+static struct mdinfo *
+find_device(struct active_array *a, int major, int minor)
+{
+       struct mdinfo *dev = a->info.devs;
+
+       while (dev) {
+               if (dev->disk.major == major && dev->disk.minor == minor)
+                       break;
+               dev = dev->next;
+       }
+
+       return dev;
+}
+
+/* 'failed' has been seen faulty in one array.  Walk the list starting
+ * at 'aa' and, for every array that still has a container, find the
+ * device with the same major:minor; unless it already carries
+ * DS_FAULTY, write "faulty" to its state_fd.
+ */
+static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
+{
+       struct active_array *arr;
+
+       for (arr = aa; arr; arr = arr->next) {
+               struct mdinfo *twin;
+
+               if (!arr->container)
+                       continue;
+
+               twin = find_device(arr, failed->disk.major, failed->disk.minor);
+               if (twin && !(twin->curr_state & DS_FAULTY))
+                       write_attr("faulty", twin->state_fd);
+       }
+}
+
+#ifdef DEBUG
+/* Debug aid: print to stderr every descriptor that is set in 'fds',
+ * followed by the last path component of the /proc/<pid>/fd/<n>
+ * symlink target, or "unknown" when that link cannot be read.
+ */
+static void dprint_wake_reasons(fd_set *fds)
+{
+       int fd;
+
+       fprintf(stderr, "monitor: wake ( ");
+       for (fd = 0; fd < FD_SETSIZE; fd++) {
+               char proc_path[256];
+               char target[256];
+               char *slash;
+               int n;
+
+               if (!FD_ISSET(fd, fds))
+                       continue;
+
+               sprintf(proc_path, "/proc/%d/fd/%d", (int) getpid(), fd);
+               n = readlink(proc_path, target, sizeof(target) - 1);
+               if (n < 0) {
+                       fprintf(stderr, "%d:unknown ", fd);
+                       continue;
+               }
+               /* readlink() does not terminate the string itself */
+               target[n] = '\0';
+
+               slash = strrchr(target, '/');
+               fprintf(stderr, "%d:%s ", fd, slash ? slash + 1 : target);
+       }
+       fprintf(stderr, ")\n");
+}
+#endif
+
+/* Progress counter maintained by wait_and_act(): bit 0 is set just
+ * before it blocks in pselect() and the counter is incremented right
+ * after pselect() returns, so the value is odd while the monitor is
+ * blocked and even otherwise.
+ */
+int monitor_loop_cnt;
+
+/* One pass of the monitor loop for 'container'.
+ *
+ * - Unlinks a deactivated array (a->container == NULL) from the list
+ *   and hands it to the manager through 'discard_this'.
+ * - Exits the process when the manager is ready and either no arrays
+ *   are left or SIGTERM was seen with no dirty arrays, unless an
+ *   O_EXCL open of the container device fails with EBUSY.
+ * - Unless 'nowait' is set, blocks in pselect() on the state and
+ *   action fds of the remaining arrays and the state fds of their
+ *   devices.
+ * - Applies any queued metadata updates, runs read_and_act() on every
+ *   array that still has a container, then propagates device failures
+ *   to the other arrays.
+ *
+ * Returns the sum of the read_and_act() results.
+ */
+static int wait_and_act(struct supertype *container, int nowait)
+{
+       fd_set rfds;
+       int maxfd = 0;
+       struct active_array **aap = &container->arrays;
+       struct active_array *a, **ap;
+       int rv;
+       struct mdinfo *mdi;
+       static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+
+       FD_ZERO(&rfds);
+
+       for (ap = aap ; *ap ;) {
+               a = *ap;
+               /* once an array has been deactivated we want to
+                * ask the manager to discard it.
+                */
+               if (!a->container) {
+                       /* 'discard_this' holds a single array; if it is
+                        * still occupied leave this one on the list for
+                        * a later pass
+                        */
+                       if (discard_this) {
+                               ap = &(*ap)->next;
+                               continue;
+                       }
+                       *ap = a->next;
+                       a->next = NULL;
+                       discard_this = a;
+                       signal_manager();
+                       continue;
+               }
+
+               add_fd(&rfds, &maxfd, a->info.state_fd);
+               add_fd(&rfds, &maxfd, a->action_fd);
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       add_fd(&rfds, &maxfd, mdi->state_fd);
+
+               ap = &(*ap)->next;
+       }
+
+       if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) {
+               /* No interesting arrays, or we have been told to
+                * terminate and everything is clean.  Lets see about
+                * exiting.  Note that blocking at this point is not a
+                * problem as there are no active arrays, there is
+                * nothing that we need to be ready to do.
+                */
+               int fd = open(container->device_name, O_RDONLY|O_EXCL);
+               if (fd >= 0 || errno != EBUSY) {
+                       /* OK, we are safe to leave */
+                       if (sigterm && !dirty_arrays)
+                               dprintf("caught sigterm, all clean... exiting\n");
+                       else
+                               dprintf("no arrays to monitor... exiting\n");
+                       remove_pidfile(container->devname);
+                       exit_now = 1;
+                       signal_manager();
+                       exit(0);
+               }
+       }
+
+       if (!nowait) {
+               sigset_t set;
+               /* fetch the current signal mask and drop SIGUSR1 from
+                * it, so that SIGUSR1 is deliverable during pselect()
+                */
+               sigprocmask(SIG_UNBLOCK, NULL, &set);
+               sigdelset(&set, SIGUSR1);
+               /* low bit set while blocked; the increment after
+                * pselect() clears it again
+                */
+               monitor_loop_cnt |= 1;
+               rv = pselect(maxfd+1, &rfds, NULL, NULL, NULL, &set);
+               monitor_loop_cnt += 1;
+               if (rv == -1 && errno == EINTR)
+                       rv = 0;
+               #ifdef DEBUG
+               dprint_wake_reasons(&rfds);
+               #endif
+
+       }
+
+       if (update_queue) {
+               struct metadata_update *this;
+
+               for (this = update_queue; this ; this = this->next)
+                       container->ss->process_update(container, this);
+
+               /* pass the processed list back through
+                * 'update_queue_handled' and wake the manager
+                */
+               update_queue_handled = update_queue;
+               update_queue = NULL;
+               signal_manager();
+               container->ss->sync_metadata(container);
+       }
+
+       rv = 0;
+       dirty_arrays = 0;
+       for (a = *aap; a ; a = a->next) {
+               int is_dirty;
+
+               if (a->replaces && !discard_this) {
+                       /* 'a' supersedes a->replaces: unlink the old
+                        * array from the remainder of the list and hand
+                        * it to the manager via 'discard_this'
+                        */
+                       struct active_array **ap;
+                       for (ap = &a->next; *ap && *ap != a->replaces;
+                            ap = & (*ap)->next)
+                               ;
+                       if (*ap)
+                               *ap = (*ap)->next;
+                       discard_this = a->replaces;
+                       a->replaces = NULL;
+                       /* FIXME check if device->state_fd need to be cleared?*/
+                       signal_manager();
+               }
+               if (a->container)
+                       rv += read_and_act(a);
+               else
+                       continue;
+
+               /* when terminating stop manipulating the array after it is
+                * clean, but make sure read_and_act() is given a chance to
+                * handle 'active_idle'
+                */
+               switch (read_state(a->info.state_fd)) {
+                       case active:
+                       case active_idle:
+                       case suspended:
+                       case bad_word:
+                               is_dirty = 1;
+                               break;
+                       default:
+                               if (a->curr_state == active_idle)
+                                       is_dirty = 1;
+                               else
+                                       is_dirty = 0;
+                               break;
+               }
+               dirty_arrays += is_dirty;
+               if (sigterm && !is_dirty)
+                       a->container = NULL; /* stop touching this array */
+       }
+
+       /* propagate failures across container members */
+       for (a = *aap; a ; a = a->next) {
+               if (!a->container)
+                       continue;
+               for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+                       if (mdi->curr_state & DS_FAULTY)
+                               reconcile_failed(*aap, mdi);
+       }
+
+       return rv;
+}
+
+/* Monitor entry point: call wait_and_act() over and over until it
+ * returns a negative value.  Only the very first pass is made with
+ * 'nowait' set.
+ */
+void do_monitor(struct supertype *container)
+{
+       int nowait = 1;
+
+       for (;;) {
+               if (wait_and_act(container, nowait) < 0)
+                       break;
+               nowait = 0;
+       }
+}
diff --git a/msg.c b/msg.c
new file mode 100644 (file)
index 0000000..5a4839f
--- /dev/null
+++ b/msg.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "mdadm.h"
+#include "mdmon.h"
+
+/* Framing words for the socket protocol: send_message() writes
+ * start_magic ahead of the length word and end_magic after the payload;
+ * receive_message() fails unless it reads both in those positions.
+ */
+static const __u32 start_magic = 0x5a5aa5a5;
+static const __u32 end_magic = 0xa5a55a5a;
+
+/* Write exactly 'len' bytes from 'buf' to 'fd', calling select() to
+ * wait for writability before every write().
+ * 'tmo' is the select() timeout in seconds; 0 means no timeout.
+ * Returns 0 once all bytes have been written, or -1 when select() or
+ * write() returns a value <= 0.
+ */
+static int send_buf(int fd, const void* buf, int len, int tmo)
+{
+       const char *cursor = buf;
+       struct timeval timeout = {tmo, 0};
+       struct timeval *ptmo = NULL;
+       fd_set wfds;
+       int n;
+
+       if (tmo)
+               ptmo = &timeout;
+
+       for (; len; len -= n, cursor += n) {
+               FD_ZERO(&wfds);
+               FD_SET(fd, &wfds);
+               if (select(fd+1, NULL, &wfds, NULL, ptmo) <= 0)
+                       return -1;
+               n = write(fd, cursor, len);
+               if (n <= 0)
+                       return -1;
+       }
+       return 0;
+}
+
+/* Read exactly 'len' bytes from 'fd' into 'buf', calling select() to
+ * wait for readability before every read().
+ * 'tmo' is the select() timeout in seconds; 0 means no timeout.
+ * Returns 0 once all bytes have arrived, or -1 when select() or read()
+ * returns a value <= 0.
+ */
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+       char *cursor = buf;
+       struct timeval timeout = {tmo, 0};
+       struct timeval *ptmo = NULL;
+       fd_set rfds;
+       int n;
+
+       if (tmo)
+               ptmo = &timeout;
+
+       for (; len; len -= n, cursor += n) {
+               FD_ZERO(&rfds);
+               FD_SET(fd, &rfds);
+               if (select(fd+1, &rfds, NULL, NULL, ptmo) <= 0)
+                       return -1;
+               n = read(fd, cursor, len);
+               if (n <= 0)
+                       return -1;
+       }
+       return 0;
+}
+
+
+/* Send one framed message on 'fd': start_magic, the 32-bit length
+ * taken from msg->len, the payload msg->buf (only when the length is
+ * positive) and finally end_magic.  Every piece goes through
+ * send_buf() with timeout 'tmo'.
+ *
+ * Returns 0 when the whole frame was sent, -1 as soon as one piece
+ * fails; nothing further is written after a failure.
+ */
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __s32 len = msg->len;
+       int rv;
+
+       rv = send_buf(fd, &start_magic, 4, tmo);
+       rv = rv ?: send_buf(fd, &len, 4, tmo);
+       if (len > 0)
+               rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+       /* chain the trailer like the other pieces: a plain assignment
+        * here would replace an earlier failure with the result of
+        * this last send
+        */
+       rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+
+       return rv;
+}
+
+/* Read one framed message from 'fd': start_magic, a 32-bit length, the
+ * payload and end_magic, each through recv_buf() with timeout 'tmo'.
+ *
+ * A length above MSG_MAX_LEN is rejected.  For a positive length the
+ * payload is read into a malloc()ed buffer stored in msg->buf;
+ * otherwise msg->buf is set to NULL.  On success msg->len is set and 0
+ * is returned; the buffer is not freed here.  On any failure -1 is
+ * returned and a buffer allocated by this call has been freed.
+ */
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+       __u32 marker;
+       __s32 len;
+
+       if (recv_buf(fd, &marker, 4, tmo) < 0 || marker != start_magic)
+               return -1;
+       if (recv_buf(fd, &len, 4, tmo) < 0 || len > MSG_MAX_LEN)
+               return -1;
+
+       msg->buf = NULL;
+       if (len > 0) {
+               msg->buf = malloc(len);
+               if (msg->buf == NULL)
+                       return -1;
+               if (recv_buf(fd, msg->buf, len, tmo) < 0)
+                       goto fail;
+       }
+
+       if (recv_buf(fd, &marker, 4, tmo) < 0 || marker != end_magic)
+               goto fail;
+
+       msg->len = len;
+       return 0;
+
+fail:
+       free(msg->buf);
+       return -1;
+}
+
+/* Send a message with length 0 (frame markers and length word only)
+ * on 'fd'.  Returns whatever send_message() returns.
+ */
+int ack(int fd, int tmo)
+{
+       struct metadata_update empty = { .len = 0 };
+       int rv = send_message(fd, &empty, tmo);
+
+       return rv;
+}
+
+/* Wait for one well-formed message on 'fd' and discard its content.
+ * Returns 0 when a complete message arrived, -1 otherwise (see
+ * receive_message()).
+ */
+int wait_reply(int fd, int tmo)
+{
+       struct metadata_update msg;
+       int rv = receive_message(fd, &msg, tmo);
+
+       /* only the arrival of the reply matters.  On success
+        * receive_message() leaves a malloc()ed payload (or NULL) in
+        * msg.buf which nobody else will release; on failure it has
+        * already cleaned up, so msg.buf must not be touched.
+        */
+       if (rv == 0)
+               free(msg.buf);
+
+       return rv;
+}
+
+/* Connect to the local stream socket /var/run/mdadm/<name>.sock.
+ *
+ * When is_subarray(devname) is true, the leading character of
+ * 'devname' is skipped and <name> is the text up to the next '/';
+ * otherwise <name> is 'devname' itself.
+ *
+ * Returns the connected descriptor, switched to O_NONBLOCK when the
+ * flags could be read, or -1 when the name is malformed, the resulting
+ * path does not fit in sun_path, or socket()/connect() fails.
+ */
+int connect_monitor(char *devname)
+{
+       struct sockaddr_un addr;
+       int name_len;
+       int n;
+       int sfd;
+       long fl;
+       char *c;
+
+       if (is_subarray(devname)) {
+               devname++;
+               c = strchr(devname, '/');
+               if (!c)
+                       return -1;
+               name_len = c - devname;
+       } else
+               name_len = (int)strlen(devname);
+
+       /* build the path straight into the address and refuse names
+        * that would not fit, instead of overflowing a fixed buffer
+        */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = PF_LOCAL;
+       n = snprintf(addr.sun_path, sizeof(addr.sun_path),
+                    "/var/run/mdadm/%.*s.sock", name_len, devname);
+       if (n < 0 || n >= (int)sizeof(addr.sun_path))
+               return -1;
+
+       sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+       if (sfd < 0)
+               return -1;
+
+       if (connect(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               close(sfd);
+               return -1;
+       }
+
+       /* best effort: only modify the flags if they could be read */
+       fl = fcntl(sfd, F_GETFL, 0);
+       if (fl >= 0)
+               fcntl(sfd, F_SETFL, fl | O_NONBLOCK);
+
+       return sfd;
+}
+
+/* Give the monitor a chance to update the metadata: connect to its
+ * socket, send an empty message and wait for a reply, each with a
+ * 20 second timeout.
+ * Returns the negative connect_monitor() result if the connection
+ * fails, -1 if the exchange fails, 0 otherwise.
+ */
+int ping_monitor(char *devname)
+{
+       int sfd = connect_monitor(devname);
+       int err;
+
+       if (sfd < 0)
+               return sfd;
+
+       /* the reply is only awaited when the ping itself went out */
+       if (ack(sfd, 20) == 0 && wait_reply(sfd, 20) == 0)
+               err = 0;
+       else
+               err = -1;
+
+       close(sfd);
+       return err;
+}
+
+/* Give the manager a chance to view the updated container state.
+ * The manager would notice a change in /proc/mdstat anyway; pinging it
+ * encourages that detection to happen while an exclusive open() on the
+ * container is active.
+ *
+ * The ping is a message with length -1, followed by a wait for the
+ * reply; both use a 20 second timeout.  Returns the negative
+ * connect_monitor() result if the connection fails, -1 if the exchange
+ * fails, 0 otherwise.
+ */
+int ping_manager(char *devname)
+{
+       struct metadata_update probe = { .len = -1 };
+       int sfd = connect_monitor(devname);
+       int err;
+
+       if (sfd < 0)
+               return sfd;
+
+       err = send_message(sfd, &probe, 20);
+       if (!err)
+               err = wait_reply(sfd, 20) ? -1 : 0;
+
+       close(sfd);
+       return err;
+}
diff --git a/msg.h b/msg.h
new file mode 100644 (file)
index 0000000..b9bd205
--- /dev/null
+++ b/msg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ *     mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+struct mdinfo;
+struct metadata_update;
+
+/* All functions return 0 on success and a negative value on failure,
+ * except connect_monitor() which returns a socket descriptor.
+ * 'tmo' is a per-select() timeout in seconds; 0 means wait forever.
+ */
+
+/* read one framed message; a positive-length payload is malloc()ed
+ * into msg->buf */
+extern int receive_message(int fd, struct metadata_update *msg, int tmo);
+/* write one framed message built from msg->len and msg->buf */
+extern int send_message(int fd, struct metadata_update *msg, int tmo);
+/* send a message of length 0 */
+extern int ack(int fd, int tmo);
+/* wait for one well-formed message */
+extern int wait_reply(int fd, int tmo);
+/* connect to /var/run/mdadm/<name>.sock for the given device name */
+extern int connect_monitor(char *devname);
+/* connect, send an empty message and wait for the reply */
+extern int ping_monitor(char *devname);
+/* connect, send a message of length -1 and wait for the reply */
+extern int ping_manager(char *devname);
+
+/* largest length word that receive_message() accepts */
+#define MSG_MAX_LEN (4*1024*1024)
index afde8363529c5bc48cebc1a108bf2505d074e8d0..509b45087bed453b6f76aa23e4a2d4639a6d1405 100644 (file)
@@ -152,7 +152,8 @@ int save_stripes(int *source, unsigned long long *offsets,
                 int nwrites, int *dest,
                 unsigned long long start, unsigned long long length)
 {
-       char buf[8192];
+       char abuf[8192+512];
+       char *buf = (char*)(((unsigned long)abuf+511)&~511UL);
        int cpos = start % chunk_size; /* where in chunk we are up to */
        int len;
        int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
@@ -162,7 +163,7 @@ int save_stripes(int *source, unsigned long long *offsets,
                unsigned long long offset;
                int i;
                len = chunk_size - cpos;
-               if (len > sizeof(buf)) len = sizeof(buf);
+               if (len > 8192) len = 8192;
                if (len > length) len = length;
                /* len bytes to be moved from one device */
 
diff --git a/sg_io.c b/sg_io.c
new file mode 100644 (file)
index 0000000..f9682be
--- /dev/null
+++ b/sg_io.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ *     Retrieve drive serial numbers for scsi disks
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <string.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <sys/ioctl.h>
+
+/* Issue a SCSI INQUIRY through the SG_IO ioctl on 'fd', with the EVPD
+ * bit set and page code 0x80, and let the device place its answer in
+ * 'buf'.
+ *
+ * At most 255 bytes are requested even if 'buf_len' is larger, because
+ * the allocation length in this 6-byte command is a single byte.
+ * Returns the result of ioctl(); the timeout field is set to 5000.
+ */
+int scsi_get_serial(int fd, void *buf, size_t buf_len)
+{
+       /* cap the length instead of letting it wrap: a buf_len of 256
+        * would otherwise be sent as an allocation length of 0
+        */
+       unsigned char alloc_len = buf_len > 255 ? 255 : buf_len;
+       unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, alloc_len, 0};
+       unsigned char sense[32];
+       struct sg_io_hdr io_hdr;
+
+       memset(&io_hdr, 0, sizeof(io_hdr));
+       io_hdr.interface_id = 'S';
+       io_hdr.cmdp = inq_cmd;
+       io_hdr.cmd_len = sizeof(inq_cmd);
+       io_hdr.dxferp = buf;
+       io_hdr.dxfer_len = alloc_len;
+       io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+       io_hdr.sbp = sense;
+       io_hdr.mx_sb_len = sizeof(sense);
+       io_hdr.timeout = 5000;
+
+       return ioctl(fd, SG_IO, &io_hdr);
+}
diff --git a/super-ddf.c b/super-ddf.c
new file mode 100644 (file)
index 0000000..4264bdf
--- /dev/null
@@ -0,0 +1,3400 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2007 Neil Brown <neilb@suse.de>
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email: <neil@brown.name>
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006).  Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add the number of
+ * seconds in the decade of the seventies to convert to linux timestamps:
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+unsigned long crc32(
+       unsigned long crc,
+       const unsigned char *buf,
+       unsigned len);
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ *  - all multibyte numeric fields are bigendian.
+ *  - all strings are space padded.
+ *
+ */
+
+/* Primary Raid Level (PRL) */
+#define        DDF_RAID0       0x00
+#define        DDF_RAID1       0x01
+#define        DDF_RAID3       0x03
+#define        DDF_RAID4       0x04
+#define        DDF_RAID5       0x05
+#define        DDF_RAID1E      0x11
+#define        DDF_JBOD        0x0f
+#define        DDF_CONCAT      0x1f
+#define        DDF_RAID5E      0x15
+#define        DDF_RAID5EE     0x25
+#define        DDF_RAID6       0x06
+
+/* Raid Level Qualifier (RLQ) */
+#define        DDF_RAID0_SIMPLE        0x00
+#define        DDF_RAID1_SIMPLE        0x00 /* just 2 devices in this plex */
+#define        DDF_RAID1_MULTI         0x01 /* exactly 3 devices in this plex */
+#define        DDF_RAID3_0             0x00 /* parity in first extent */
+#define        DDF_RAID3_N             0x01 /* parity in last extent */
+#define        DDF_RAID4_0             0x00 /* parity in first extent */
+#define        DDF_RAID4_N             0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define        DDF_RAID5_0_RESTART     0x00 /* same as 'right asymmetric' - layout 1 */
+#define        DDF_RAID6_0_RESTART     0x01 /* raid6 different from raid5 here!!! */
+#define        DDF_RAID5_N_RESTART     0x02 /* same as 'left asymmetric' - layout 0 */
+#define        DDF_RAID5_N_CONTINUE    0x03 /* same as 'left symmetric' - layout 2 */
+
+#define        DDF_RAID1E_ADJACENT     0x00 /* raid10 nearcopies==2 */
+#define        DDF_RAID1E_OFFSET       0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) */
+#define        DDF_2STRIPED    0x00    /* This is weirder than RAID0 !! */
+#define        DDF_2MIRRORED   0x01
+#define        DDF_2CONCAT     0x02
+#define        DDF_2SPANNED    0x03    /* This is also weird - be careful */
+
+/* Magic numbers */
+#define        DDF_HEADER_MAGIC        __cpu_to_be32(0xDE11DE11)
+#define        DDF_CONTROLLER_MAGIC    __cpu_to_be32(0xAD111111)
+#define        DDF_PHYS_RECORDS_MAGIC  __cpu_to_be32(0x22222222)
+#define        DDF_PHYS_DATA_MAGIC     __cpu_to_be32(0x33333333)
+#define        DDF_VIRT_RECORDS_MAGIC  __cpu_to_be32(0xDDDDDDDD)
+#define        DDF_VD_CONF_MAGIC       __cpu_to_be32(0xEEEEEEEE)
+#define        DDF_SPARE_ASSIGN_MAGIC  __cpu_to_be32(0x55555555)
+#define        DDF_VU_CONF_MAGIC       __cpu_to_be32(0x88888888)
+#define        DDF_VENDOR_LOG_MAGIC    __cpu_to_be32(0x01dBEEF0)
+#define        DDF_BBM_LOG_MAGIC       __cpu_to_be32(0xABADB10C)
+
+#define        DDF_GUID_LEN    24
+#define DDF_REVISION_0 "01.00.00"
+#define DDF_REVISION_2 "01.02.00"
+
+/* DDF header block.  The 'type' field tells whether a given copy is
+ * the anchor, the primary or the secondary header.  The running byte
+ * counts in the comments below add up to 512 bytes once the final
+ * 256-byte pad is included.
+ */
+struct ddf_header {
+       __u32   magic;          /* DDF_HEADER_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       char    revision[8];    /* 01.02.00 */
+       __u32   seq;            /* starts at '1' */
+       __u32   timestamp;
+       __u8    openflag;
+       __u8    foreignflag;
+       __u8    enforcegroups;
+       __u8    pad0;           /* 0xff */
+       __u8    pad1[12];       /* 12 * 0xff */
+       /* 64 bytes so far */
+       __u8    header_ext[32]; /* reserved: fill with 0xff */
+       __u64   primary_lba;
+       __u64   secondary_lba;
+       __u8    type;
+       __u8    pad2[3];        /* 0xff */
+       __u32   workspace_len;  /* sectors for vendor space -
+                                * at least 32768(sectors) */
+       __u64   workspace_lba;
+       __u16   max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */
+       __u16   max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+       __u16   max_partitions; /* i.e. max num of configuration
+                                  record entries per disk */
+       __u16   config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+                                                *12/512) */
+       __u16   max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+       __u8    pad3[54];       /* 0xff */
+       /* 192 bytes so far */
+       /* one offset/length pair per metadata section */
+       __u32   controller_section_offset;
+       __u32   controller_section_length;
+       __u32   phys_section_offset;
+       __u32   phys_section_length;
+       __u32   virt_section_offset;
+       __u32   virt_section_length;
+       __u32   config_section_offset;
+       __u32   config_section_length;
+       __u32   data_section_offset;
+       __u32   data_section_length;
+       __u32   bbm_section_offset;
+       __u32   bbm_section_length;
+       __u32   diag_space_offset;
+       __u32   diag_space_length;
+       __u32   vendor_offset;
+       __u32   vendor_length;
+       /* 256 bytes so far */
+       __u8    pad4[256];      /* 0xff */
+};
+
+/* type field */
+#define        DDF_HEADER_ANCHOR       0x00
+#define        DDF_HEADER_PRIMARY      0x01
+#define        DDF_HEADER_SECONDARY    0x02
+
+/* The content of the 'controller section' - global scope.
+ * The fields add up to 512 bytes (8 + 24 + 8 + 16 + 8 + 448).
+ */
+struct ddf_controller_data {
+       __u32   magic;                  /* DDF_CONTROLLER_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       struct controller_type {
+               __u16 vendor_id;
+               __u16 device_id;
+               __u16 sub_vendor_id;
+               __u16 sub_device_id;
+       } type;
+       char    product_id[16];
+       __u8    pad[8]; /* 0xff */
+       __u8    vendor_data[448];
+};
+
+/* The content of phys_section - global scope.
+ * A 64-byte header followed by a variable number of 64-byte
+ * phys_disk_entry records; 'entries' is a zero-length trailing array,
+ * so sizeof(struct phys_disk) covers the header only.
+ */
+struct phys_disk {
+       __u32   magic;          /* DDF_PHYS_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   used_pdes;
+       __u16   max_pdes;
+       __u8    pad[52];
+       struct phys_disk_entry {
+               char    guid[DDF_GUID_LEN];
+               __u32   refnum;
+               __u16   type;
+               __u16   state;
+               __u64   config_size; /* DDF structures must be after here */
+               char    path[18];       /* another horrible structure really */
+               __u8    pad[6];
+       } entries[0];
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define        DDF_Forced_PD_GUID              1
+#define        DDF_Active_in_VD                2
+#define        DDF_Global_Spare                4 /* VD_CONF records are ignored */
+#define        DDF_Spare                       8 /* overrides Global_spare */
+#define        DDF_Foreign                     16
+#define        DDF_Legacy                      32 /* no DDF on this device */
+
+#define        DDF_Interface_mask              0xf00
+#define        DDF_Interface_SCSI              0x100
+#define        DDF_Interface_SAS               0x200
+#define        DDF_Interface_SATA              0x300
+#define        DDF_Interface_FC                0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define        DDF_Online                      1
+#define        DDF_Failed                      2 /* overrides  1,4,8 */
+#define        DDF_Rebuilding                  4
+#define        DDF_Transition                  8
+#define        DDF_SMART                       16
+#define        DDF_ReadErrors                  32
+#define        DDF_Missing                     64
+
+/* The content of the virt_section global scope.
+ * A 64-byte header followed by a variable number of 64-byte
+ * virtual_entry records; 'entries' is a zero-length trailing array,
+ * so sizeof(struct virtual_disk) covers the header only.
+ */
+struct virtual_disk {
+       __u32   magic;          /* DDF_VIRT_RECORDS_MAGIC */
+       __u32   crc;
+       __u16   populated_vdes;
+       __u16   max_vdes;
+       __u8    pad[52];
+       struct virtual_entry {
+               char    guid[DDF_GUID_LEN];
+               __u16   unit;
+               __u16   pad0;   /* 0xffff */
+               __u16   guid_crc;
+               __u16   type;
+               __u8    state;
+               __u8    init_state;
+               __u8    pad1[14];
+               char    name[16];
+       } entries[0];
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define        DDF_Shared              1
+#define        DDF_Enforce_Groups      2
+#define        DDF_Unicode             4
+#define        DDF_Owner_Valid         8
+
+/* virtual_entry.state is a bigendian bitmap */
+#define        DDF_state_mask          0x7
+#define        DDF_state_optimal       0x0
+#define        DDF_state_degraded      0x1
+#define        DDF_state_deleted       0x2
+#define        DDF_state_missing       0x3
+#define        DDF_state_failed        0x4
+#define        DDF_state_part_optimal  0x5
+
+#define        DDF_state_morphing      0x8
+#define        DDF_state_inconsistent  0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define        DDF_initstate_mask      0x03
+#define        DDF_init_not            0x00
+#define        DDF_init_quick          0x01 /* initialisation in progress.
+                                     * i.e. 'state_inconsistent' */
+#define        DDF_init_full           0x02
+
+#define        DDF_access_mask         0xc0
+#define        DDF_access_rw           0x00
+#define        DDF_access_ro           0x80
+#define        DDF_access_blocked      0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+struct vd_config {
+       __u32   magic;          /* DDF_VD_CONF_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       __u32   timestamp;
+       __u32   seqnum;
+       __u8    pad0[24];
+       __u16   prim_elmnt_count;
+       __u8    chunk_shift;    /* 0 == 512, 1==1024 etc */
+       __u8    prl;
+       __u8    rlq;
+       __u8    sec_elmnt_count;
+       __u8    sec_elmnt_seq;
+       __u8    srl;
+       __u64   blocks;         /* blocks per component could be different
+                                * on different component devices...(only
+                                * for concat I hope) */
+       __u64   array_blocks;   /* blocks in array */
+       __u8    pad1[8];
+       __u32   spare_refs[8];
+       __u8    cache_pol[8];
+       __u8    bg_rate;
+       __u8    pad2[3];
+       __u8    pad3[52];
+       __u8    pad4[192];
+       __u8    v0[32]; /* reserved- 0xff */
+       __u8    v1[32]; /* reserved- 0xff */
+       __u8    v2[16]; /* reserved- 0xff */
+       __u8    v3[16]; /* reserved- 0xff */
+       __u8    vendor[32];
+       __u32   phys_refnum[0]; /* refnum of each disk in sequence */
+      /*__u64  lba_offset[0];  LBA offset in each phys.  Note extents in a
+                               bvd are always the same size */
+};
+
+/* vd_config.cache_pol[7] is a bitmap */
+#define        DDF_cache_writeback     1       /* else writethrough */
+#define        DDF_cache_wadaptive     2       /* only applies if writeback */
+#define        DDF_cache_readahead     4
+#define        DDF_cache_radaptive     8       /* only if doing read-ahead */
+#define        DDF_cache_ifnobatt      16      /* even to write cache if battery is poor */
+#define        DDF_cache_wallowed      32      /* enable write caching */
+#define        DDF_cache_rallowed      64      /* enable read caching */
+
+struct spare_assign {
+       __u32   magic;          /* DDF_SPARE_ASSIGN_MAGIC */
+       __u32   crc;
+       __u32   timestamp;
+       __u8    reserved[7];
+       __u8    type;
+       __u16   populated;      /* SAEs used */
+       __u16   max;            /* max SAEs */
+       __u8    pad[8];
+       struct spare_assign_entry {
+               char    guid[DDF_GUID_LEN];
+               __u16   secondary_element;
+               __u8    pad[6];
+       } spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define        DDF_spare_dedicated     0x1     /* else global */
+#define        DDF_spare_revertible    0x2     /* else committable */
+#define        DDF_spare_active        0x4     /* else not active */
+#define        DDF_spare_affinity      0x8     /* enclosure affinity */
+
+/* The data_section contents - local scope */
+struct disk_data {
+       __u32   magic;          /* DDF_PHYS_DATA_MAGIC */
+       __u32   crc;
+       char    guid[DDF_GUID_LEN];
+       __u32   refnum;         /* crc of some magic drive data ... */
+       __u8    forced_ref;     /* set when above was not result of magic */
+       __u8    forced_guid;    /* set if guid was forced rather than magic */
+       __u8    vendor[32];
+       __u8    pad[442];
+};
+
+/* bbm_section content */
+/* Header of the bad block log section, followed by a variable number
+ * of mapped_block entries.
+ * NOTE(review): entry_count is a __u16 immediately followed by a __u32.
+ * Unless this struct is packed the compiler will most likely insert two
+ * bytes of padding there, shifting every later field relative to a
+ * byte-exact on-disk layout - confirm before using this for I/O.
+ */
+struct bad_block_log {
+       __u32   magic;
+       __u32   crc;
+       __u16   entry_count;
+       __u32   spare_count;
+       __u8    pad[10];
+       __u64   first_spare;
+       struct mapped_block {
+               __u64   defective_start;
+               __u32   replacement_start;
+               __u16   remap_count;
+               __u8    pad[2];
+       } entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.
+ * The global data is:
+ *   - ddf header
+ *   - controller_data
+ *   - Physical disk records
+ *   - Virtual disk records
+ * The local data is:
+ *   - Configuration records
+ *   - Physical Disk data section
+ *  (  and Bad block and vendor which I don't care about yet).
+ *
+ * The local data is parsed into separate lists as it is read
+ * and reconstructed for writing.  This means that we only need
+ * to make config changes once and they are automatically
+ * propagated to all devices.
+ * Note that the ddf_super has space for the conf and disk data
+ * for this disk and also for a list of all such data.
+ * The list is only used for the superblock that is being
+ * built in Create or Assemble to describe the whole array.
+ */
+struct ddf_super {
+       struct ddf_header anchor, primary, secondary;
+       struct ddf_controller_data controller;
+       struct ddf_header *active;
+       struct phys_disk        *phys;
+       struct virtual_disk     *virt;
+       int pdsize, vdsize;
+       int max_part, mppe, conf_rec_len;
+       int currentdev;
+       int updates_pending;
+       struct vcl {
+               union {
+                       char space[512];
+                       struct {
+                               struct vcl      *next;
+                               __u64           *lba_offset; /* location in 'conf' of
+                                                             * the lba table */
+                               int     vcnum; /* index into ->virt */
+                               __u64           *block_sizes; /* NULL if all the same */
+                       };
+               };
+               struct vd_config conf;
+       } *conflist, *currentconf;
+       struct dl {
+               union {
+                       char space[512];
+                       struct {
+                               struct dl       *next;
+                               int major, minor;
+                               char *devname;
+                               int fd;
+                               unsigned long long size; /* sectors */
+                               int pdnum;      /* index in ->phys */
+                               struct spare_assign *spare;
+                       };
+               };
+               struct disk_data disk;
+               void *mdupdate; /* hold metadata update */
+               struct vcl *vlist[0]; /* max_part in size */
+       } *dlist, *add_list;
+};
+
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#endif
+
+
+static int calc_crc(void *buf, int len)
+{
+       /* Compute the checksum over 'len' bytes starting at 'buf'.
+        * crcs are always at the same place as in the ddf_header, so the
+        * buffer is viewed through that type.  The crc field is set to
+        * 0xffffffff while the sum is taken and the caller's value is
+        * put back afterwards.
+        */
+       struct ddf_header *hdr = buf;
+       __u32 saved = hdr->crc;
+       __u32 sum;
+
+       hdr->crc = 0xffffffff;
+       sum = crc32(0, buf, len);
+       hdr->crc = saved;
+
+       /* The crc is stored (like everything) bigendian, so convert
+        * here; callers compare the result directly with the stored field.
+        */
+       return __cpu_to_be32(sum);
+}
+
+static int load_ddf_header(int fd, unsigned long long lba,
+                          unsigned long long size,
+                          int type,
+                          struct ddf_header *hdr, struct ddf_header *anchor)
+{
+       /* Read one 512 byte ddf header (primary or secondary) from
+        * sector 'lba' of 'fd' into 'hdr' and check it against 'anchor'.
+        * Checked: magic, crc, guid, revision, both header LBAs, the
+        * header type, and everything from pad2 to the end of the block.
+        * Returns 1 if the header is acceptable, 0 otherwise.
+        */
+       size_t tail = 512 - offsetof(struct ddf_header, pad2);
+
+       if (lba >= size-1)
+               return 0;
+       if (lseek64(fd, lba<<9, 0) < 0)
+               return 0;
+       if (read(fd, hdr, 512) != 512)
+               return 0;
+
+       if (hdr->magic != DDF_HEADER_MAGIC)
+               return 0;
+       if (calc_crc(hdr, 512) != hdr->crc)
+               return 0;
+
+       if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0)
+               return 0;
+       if (memcmp(anchor->revision, hdr->revision, 8) != 0)
+               return 0;
+       if (anchor->primary_lba != hdr->primary_lba)
+               return 0;
+       if (anchor->secondary_lba != hdr->secondary_lba)
+               return 0;
+       if (hdr->type != type)
+               return 0;
+
+       /* Looks good enough if the trailing part matches too */
+       return memcmp(anchor->pad2, hdr->pad2, tail) == 0;
+}
+
+/* Read one section of the active DDF header from 'fd'.
+ * 'offset_be' and 'len_be' are big-endian values counted in 512 byte
+ * units; the offset is relative to the LBA of the active header
+ * (primary when active->type is 1, otherwise secondary).
+ * If 'buf' is NULL a 512-byte-aligned buffer is allocated and returned;
+ * it is freed here on failure, otherwise the caller owns it.
+ * If 'buf' is given it is filled in place and must be one block long.
+ * Returns the buffer, or NULL on any validation, seek or read failure.
+ */
+static void *load_section(int fd, struct ddf_super *super, void *buf,
+                         __u32 offset_be, __u32 len_be, int check)
+{
+       unsigned long long offset = __be32_to_cpu(offset_be);
+       unsigned long long len = __be32_to_cpu(len_be);
+       /* only buffers allocated here may be freed here */
+       int dofree = (buf == NULL);
+
+       /* with 'check' set only these section lengths are accepted */
+       if (check)
+               if (len != 2 && len != 8 && len != 32
+                   && len != 128 && len != 512)
+                       return NULL;
+
+       /* refuse anything larger than 1024 blocks */
+       if (len > 1024)
+               return NULL;
+       if (buf) {
+               /* All pre-allocated sections are a single block */
+               if (len != 1)
+                       return NULL;
+       } else if (posix_memalign(&buf, 512, len<<9) != 0)
+               buf = NULL;
+
+       if (!buf)
+               return NULL;
+
+       if (super->active->type == 1)
+               offset += __be64_to_cpu(super->active->primary_lba);
+       else
+               offset += __be64_to_cpu(super->active->secondary_lba);
+
+       if (lseek64(fd, offset<<9, 0) != (offset<<9)) {
+               if (dofree)
+                       free(buf);
+               return NULL;
+       }
+       /* a short read counts as failure */
+       if (read(fd, buf, len<<9) != (len<<9)) {
+               if (dofree)
+                       free(buf);
+               return NULL;
+       }
+       return buf;
+}
+
+/* Locate and validate the DDF anchor in the last 512 bytes of 'fd',
+ * then load the primary and (if usable) secondary headers and pick
+ * the one to use as super->active.
+ * Returns 0 on success, 1 if the device could not be sized, seeked or
+ * read, and 2 if no acceptable DDF metadata was found.
+ * Messages are only printed when 'devname' is not NULL.
+ */
+static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
+{
+       unsigned long long dsize;
+
+       /* Other callers in this file treat a zero return from
+        * get_dev_size() as failure; dsize would be unset in that case,
+        * and anything below 512 would make 'dsize-512' wrap around.
+        */
+       if (get_dev_size(fd, NULL, &dsize) == 0 || dsize < 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot find anchor block on %s: "
+                               "device size unavailable or too small\n",
+                               devname);
+               return 1;
+       }
+
+       if (lseek64(fd, dsize-512, 0) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (read(fd, &super->anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+       if (super->anchor.magic != DDF_HEADER_MAGIC) {
+               if (devname)
+                       fprintf(stderr, Name ": no DDF anchor found on %s\n",
+                               devname);
+               return 2;
+       }
+       if (calc_crc(&super->anchor, 512) != super->anchor.crc) {
+               if (devname)
+                       fprintf(stderr, Name ": bad CRC on anchor on %s\n",
+                               devname);
+               return 2;
+       }
+       if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 &&
+           memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) {
+               if (devname)
+                       fprintf(stderr, Name ": can only support super revision"
+                               " %.8s and earlier, not %.8s on %s\n",
+                               DDF_REVISION_2, super->anchor.revision,devname);
+               return 2;
+       }
+       /* the primary header is mandatory */
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba),
+                           dsize >> 9,  1,
+                           &super->primary, &super->anchor) == 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load primary DDF header "
+                               "on %s\n", devname);
+               return 2;
+       }
+       super->active = &super->primary;
+       /* Prefer the secondary only if it is not marked open and is
+        * either newer than the primary, or equally new while the
+        * primary is marked open.
+        */
+       if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba),
+                           dsize >> 9,  2,
+                           &super->secondary, &super->anchor)) {
+               if ((__be32_to_cpu(super->primary.seq)
+                    < __be32_to_cpu(super->secondary.seq) &&
+                    !super->secondary.openflag)
+                   || (__be32_to_cpu(super->primary.seq)
+                       == __be32_to_cpu(super->secondary.seq) &&
+                       super->primary.openflag && !super->secondary.openflag)
+                       )
+                       super->active = &super->secondary;
+       }
+       return 0;
+}
+
+static int load_ddf_global(int fd, struct ddf_super *super, char *devname)
+{
+       /* Load the sections that are global to the container: the
+        * controller data (into super->controller) plus freshly
+        * allocated copies of the physical and virtual disk records.
+        * pdsize/vdsize are set from the header lengths in any case.
+        * Returns 0 on success, 2 if any of the three could not be loaded
+        * (phys and virt are then freed and left NULL).
+        * 'devname' is not used here.
+        */
+       struct ddf_header *hdr = super->active;
+       void *ctrl;
+
+       ctrl = load_section(fd, super, &super->controller,
+                           hdr->controller_section_offset,
+                           hdr->controller_section_length,
+                           0);
+
+       super->phys = load_section(fd, super, NULL,
+                                  hdr->phys_section_offset,
+                                  hdr->phys_section_length,
+                                  1);
+       super->pdsize = __be32_to_cpu(hdr->phys_section_length) * 512;
+
+       super->virt = load_section(fd, super, NULL,
+                                  hdr->virt_section_offset,
+                                  hdr->virt_section_length,
+                                  1);
+       super->vdsize = __be32_to_cpu(hdr->virt_section_length) * 512;
+
+       if (ctrl && super->phys && super->virt) {
+               super->conflist = NULL;
+               super->dlist = NULL;
+
+               super->max_part = __be16_to_cpu(hdr->max_partitions);
+               super->mppe = __be16_to_cpu(hdr->max_primary_element_entries);
+               super->conf_rec_len = __be16_to_cpu(hdr->config_record_len);
+               return 0;
+       }
+
+       free(super->phys);
+       free(super->virt);
+       super->phys = NULL;
+       super->virt = NULL;
+       return 2;
+}
+
+/* Load the per-device ("local") DDF data from 'fd' into 'super'.
+ * The disk data section becomes a new 'struct dl' at the head of
+ * super->dlist, and every valid record of the config section is merged
+ * into super->conflist, keeping the copy with the highest seqnum.
+ * The first spare assignment record found is copied to dl->spare.
+ * If 'keep' is set the fd is remembered in the dl entry, else -1.
+ * Returns 0 on success, 1 on allocation failure or if the config
+ * section cannot be loaded.  On failure the dl entry may already be
+ * linked into super->dlist.
+ */
+static int load_ddf_local(int fd, struct ddf_super *super,
+                         char *devname, int keep)
+{
+       struct dl *dl;
+       struct stat stb;
+       char *conf;
+       int i;
+       int k;
+       int vnum;
+       int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries);
+       int rec_len = super->conf_rec_len;
+       unsigned long long conf_len =
+               __be32_to_cpu(super->active->config_section_length);
+       unsigned long long dsize;
+
+       /* First the local disk info */
+       if (posix_memalign((void**)&dl, 512,
+                      sizeof(*dl) +
+                      (super->max_part) * sizeof(dl->vlist[0])) != 0) {
+               fprintf(stderr, Name ": %s could not allocate disk info buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       /* NOTE(review): the result of this load is not checked, so a
+        * failed read leaves dl->disk with whatever the allocator returned.
+        */
+       load_section(fd, super, &dl->disk,
+                    super->active->data_section_offset,
+                    super->active->data_section_length,
+                    0);
+       dl->devname = devname ? strdup(devname) : NULL;
+
+       fstat(fd, &stb);
+       dl->major = major(stb.st_rdev);
+       dl->minor = minor(stb.st_rdev);
+       dl->next = super->dlist;
+       dl->fd = keep ? fd : -1;
+
+       dl->size = 0;
+       if (get_dev_size(fd, devname, &dsize))
+               dl->size = dsize >> 9;
+       dl->spare = NULL;
+       for (i=0 ; i < super->max_part ; i++)
+               dl->vlist[i] = NULL;
+       super->dlist = dl;
+       dl->pdnum = -1;
+       for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++)
+               if (memcmp(super->phys->entries[i].guid,
+                          dl->disk.guid, DDF_GUID_LEN) == 0)
+                       dl->pdnum = i;
+
+       /* Now the config list. */
+       /* 'conf' is an array of config entries, some of which are
+        * probably invalid.  Those which are good need to be copied into
+        * the conflist
+        */
+
+       conf = load_section(fd, super, NULL,
+                           super->active->config_section_offset,
+                           super->active->config_section_length,
+                           0);
+
+       if (conf_len == 0) {
+               /* nothing to parse */
+               free(conf);
+               return 0;
+       }
+       if (conf == NULL || rec_len <= 0) {
+               /* Without the section there is nothing to walk, and a zero
+                * record length would never advance the loop below.
+                */
+               fprintf(stderr, Name
+                       ": %s could not load config section\n",
+                       __func__);
+               free(conf);
+               return 1;
+       }
+
+       /* load_section() refuses more than 1024 blocks, so conf_len fits
+        * an int here.  Only whole records inside the section are used.
+        */
+       vnum = 0;
+       for (i = 0; i + rec_len <= (int)conf_len; i += rec_len) {
+               struct vd_config *vd =
+                       (struct vd_config *)((char*)conf + i*512);
+               struct vcl *vcl;
+
+               if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) {
+                       if (dl->spare)
+                               continue;
+                       if (posix_memalign((void**)&dl->spare, 512,
+                                      rec_len*512) != 0) {
+                               fprintf(stderr, Name
+                                       ": %s could not allocate spare info buf\n",
+                                       __func__);
+                               dl->spare = NULL;
+                               free(conf);
+                               return 1;
+                       }
+
+                       memcpy(dl->spare, vd, rec_len*512);
+                       continue;
+               }
+               if (vd->magic != DDF_VD_CONF_MAGIC)
+                       continue;
+               for (vcl = super->conflist; vcl; vcl = vcl->next) {
+                       if (memcmp(vcl->conf.guid,
+                                  vd->guid, DDF_GUID_LEN) == 0)
+                               break;
+               }
+
+               if (vcl) {
+                       /* vlist only has room for max_part entries */
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+                       /* keep the copy we already have unless this
+                        * one is strictly newer */
+                       if (__be32_to_cpu(vd->seqnum) <=
+                           __be32_to_cpu(vcl->conf.seqnum))
+                               continue;
+               } else {
+                       if (posix_memalign((void**)&vcl, 512,
+                                      (rec_len*512 +
+                                       offsetof(struct vcl, conf))) != 0) {
+                               fprintf(stderr, Name
+                                       ": %s could not allocate vcl buf\n",
+                                       __func__);
+                               free(conf);
+                               return 1;
+                       }
+                       vcl->next = super->conflist;
+                       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+                       vcl->vcnum = -1; /* until matched against ->virt */
+                       super->conflist = vcl;
+                       if (vnum < super->max_part)
+                               dl->vlist[vnum++] = vcl;
+               }
+               memcpy(&vcl->conf, vd, rec_len*512);
+               vcl->lba_offset = (__u64*)
+                       &vcl->conf.phys_refnum[super->mppe];
+
+               /* Separate index here: 'i' is the record iterator of the
+                * enclosing loop and must not be disturbed.
+                */
+               for (k = 0; k < max_virt_disks; k++)
+                       if (memcmp(super->virt->entries[k].guid,
+                                  vcl->conf.guid, DDF_GUID_LEN)==0)
+                               break;
+               if (k < max_virt_disks)
+                       vcl->vcnum = k;
+       }
+       free(conf);
+
+       return 0;
+}
+
+#ifndef MDASSEMBLE
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd);
+#endif
+/* Load DDF metadata from 'fd' into st->sb.
+ * Unless built for MDASSEMBLE, 'fd' is first tried as a container via
+ * load_super_ddf_all(); otherwise it is read as a single member device.
+ * Returns 0 on success and non-zero on failure (1 for size/allocation
+ * problems, otherwise the code of the failing load step).
+ * Messages are only printed when 'devname' is not NULL, except for the
+ * allocation failure message.
+ */
+static int load_super_ddf(struct supertype *st, int fd,
+                         char *devname)
+{
+       unsigned long long dsize;
+       struct ddf_super *super;
+       int rv;
+
+#ifndef MDASSEMBLE
+       /* if 'fd' is a container, load metadata from all the devices */
+       if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0)
+               return 0;
+#endif
+       if (st->subarray[0])
+               return 1; /* FIXME Is this correct */
+
+       if (get_dev_size(fd, devname, &dsize) == 0)
+               return 1;
+
+       /* 32M is a lower bound.  Reject the device whether or not there
+        * is a name to report it under.
+        */
+       if (dsize <= 32*1024*1024) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is too small for ddf: "
+                               "size is %llu sectors.\n",
+                               devname, dsize>>9);
+               return 1;
+       }
+       if (dsize & 511) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": %s is an odd size for ddf: "
+                               "size is %llu bytes.\n",
+                               devname, dsize);
+               return 1;
+       }
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) {
+               fprintf(stderr, Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               return 1;
+       }
+       memset(super, 0, sizeof(*super));
+
+       rv = load_ddf_headers(fd, super, devname);
+       if (rv) {
+               free(super);
+               return rv;
+       }
+
+       /* Have valid headers and have chosen the best. Let's read in the rest*/
+
+       rv = load_ddf_global(fd, super, devname);
+
+       if (rv) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               free(super);
+               return rv;
+       }
+
+       rv = load_ddf_local(fd, super, devname, 0);
+
+       if (rv) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               /* load_ddf_local() may already have linked entries into
+                * the lists, and the global sections are allocated too.
+                * It was called with keep==0, so no fd is held in dlist.
+                */
+               while (super->conflist) {
+                       struct vcl *v = super->conflist;
+                       super->conflist = v->next;
+                       free(v);
+               }
+               while (super->dlist) {
+                       struct dl *d = super->dlist;
+                       super->dlist = d->next;
+                       free(d->spare);
+                       free(d->devname); /* strdup'd by load_ddf_local */
+                       free(d);
+               }
+               free(super->phys);
+               free(super->virt);
+               free(super);
+               return rv;
+       }
+
+       /* Should possibly check the sections .... */
+
+       st->sb = super;
+       if (st->ss == NULL) {
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+       }
+       st->loaded_container = 0;
+       return 0;
+
+}
+
+static void free_super_ddf(struct supertype *st)
+{
+       /* Release everything hanging off st->sb: the physical and virtual
+        * disk records, every entry of conflist and dlist (closing any
+        * fd held in a dlist entry), and finally the ddf_super itself.
+        * st->sb is left NULL.  Does nothing if st->sb is already NULL.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *v, *vnext;
+       struct dl *d, *dnext;
+
+       if (!ddf)
+               return;
+
+       free(ddf->phys);
+       free(ddf->virt);
+
+       for (v = ddf->conflist; v; v = vnext) {
+               vnext = v->next;
+               free(v->block_sizes);
+               free(v);
+       }
+
+       for (d = ddf->dlist; d; d = dnext) {
+               dnext = d->next;
+               if (d->fd >= 0)
+                       close(d->fd);
+               free(d->spare);
+               free(d);
+       }
+
+       free(ddf);
+       st->sb = NULL;
+}
+
+/* Return a newly allocated, zeroed supertype bound to super_ddf if
+ * 'arg' names this metadata format ("ddf" or "default").
+ * Returns NULL if the name does not match or the allocation fails.
+ */
+static struct supertype *match_metadata_desc_ddf(char *arg)
+{
+       /* 'ddf' only support containers */
+       struct supertype *st;
+       if (strcmp(arg, "ddf") != 0 &&
+           strcmp(arg, "default") != 0
+               )
+               return NULL;
+
+       /* calloc gives the zeroed object; a failed allocation must not
+        * be written through.
+        */
+       st = calloc(1, sizeof(*st));
+       if (st == NULL)
+               return NULL;
+       st->ss = &super_ddf;
+       st->max_devs = 512;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+
+#ifndef MDASSEMBLE
+
+static mapping_t ddf_state[] = {
+       { "Optimal", 0},
+       { "Degraded", 1},
+       { "Deleted", 2},
+       { "Missing", 3},
+       { "Failed", 4},
+       { "Partially Optimal", 5},
+       { "-reserved-", 6},
+       { "-reserved-", 7},
+       { NULL, 0}
+};
+
+static mapping_t ddf_init_state[] = {
+       { "Not Initialised", 0},
+       { "QuickInit in Progress", 1},
+       { "Fully Initialised", 2},
+       { "*UNKNOWN*", 3},
+       { NULL, 0}
+};
+static mapping_t ddf_access[] = {
+       { "Read/Write", 0},
+       { "Reserved", 1},
+       { "Read Only", 2},
+       { "Blocked (no access)", 3},
+       { NULL ,0}
+};
+
+static mapping_t ddf_level[] = {
+       { "RAID0", DDF_RAID0},
+       { "RAID1", DDF_RAID1},
+       { "RAID3", DDF_RAID3},
+       { "RAID4", DDF_RAID4},
+       { "RAID5", DDF_RAID5},
+       { "RAID1E",DDF_RAID1E},
+       { "JBOD",  DDF_JBOD},
+       { "CONCAT",DDF_CONCAT},
+       { "RAID5E",DDF_RAID5E},
+       { "RAID5EE",DDF_RAID5EE},
+       { "RAID6", DDF_RAID6},
+       { NULL, 0}
+};
+static mapping_t ddf_sec_level[] = {
+       { "Striped", DDF_2STRIPED},
+       { "Mirrored", DDF_2MIRRORED},
+       { "Concat", DDF_2CONCAT},
+       { "Spanned", DDF_2SPANNED},
+       { NULL, 0}
+};
+#endif
+
+struct num_mapping {
+       int num1, num2;
+};
+static struct num_mapping ddf_level_num[] = {
+       { DDF_RAID0, 0 },
+       { DDF_RAID1, 1 },
+       { DDF_RAID3, LEVEL_UNSUPPORTED },
+       { DDF_RAID4, 4 },
+       { DDF_RAID5, 5 },
+       { DDF_RAID1E, LEVEL_UNSUPPORTED },
+       { DDF_JBOD, LEVEL_UNSUPPORTED },
+       { DDF_CONCAT, LEVEL_LINEAR },
+       { DDF_RAID5E, LEVEL_UNSUPPORTED },
+       { DDF_RAID5EE, LEVEL_UNSUPPORTED },
+       { DDF_RAID6, 6},
+       { MAXINT, MAXINT }
+};
+
+static int map_num1(struct num_mapping *map, int num)
+{
+       /* Look 'num' up in the num1 column of 'map' and return the
+        * matching num2.  The table ends with an entry whose num1 is
+        * MAXINT; that entry's num2 is returned when nothing matches.
+        */
+       struct num_mapping *m = map;
+
+       while (m->num1 != MAXINT && m->num1 != num)
+               m++;
+       return m->num2;
+}
+
+#ifndef MDASSEMBLE
+static void print_guid(char *guid, int tstamp)
+{
+       /* GUIDs are part (or all) ASCII and part binary.
+        * They tend to be space padded.
+        * We print the GUID in HEX, then in parentheses add
+        * any initial ASCII sequence, and (if 'tstamp' is set) a
+        * time stamp taken from bytes 16-19.
+        */
+       int l = DDF_GUID_LEN;
+       int i;
+
+       for (i=0 ; i<DDF_GUID_LEN ; i++) {
+               if ((i&3)==0 && i != 0) printf(":");
+               printf("%02X", guid[i]&255);
+       }
+
+       printf(" (");
+       /* ignore trailing space padding, then print the leading run
+        * of printable characters */
+       while (l && guid[l-1] == ' ')
+               l--;
+       for (i=0 ; i<l ; i++) {
+               if (guid[i] >= 0x20 && guid[i] < 0x7f)
+                       fputc(guid[i], stdout);
+               else
+                       break;
+       }
+       if (tstamp) {
+               __u32 stamp_be;
+               time_t then;
+               char tbuf[100];
+               struct tm *tm;
+
+               /* Copy the bytes out instead of casting: 'guid' is a
+                * plain char pointer, so guid+16 is not guaranteed to be
+                * suitably aligned for a __u32 access.
+                */
+               memcpy(&stamp_be, guid+16, sizeof(stamp_be));
+               then = __be32_to_cpu(stamp_be) + DECADE;
+               tm = localtime(&then);
+               /* skip the time stamp if it cannot be converted or formatted */
+               if (tm && strftime(tbuf, 100, " %D %T",tm))
+                       fputs(tbuf, stdout);
+       }
+       printf(")");
+}
+
+/* Print the configuration details of virtual disk number 'n'.
+ * Every entry of sb->conflist whose GUID equals 'guid' and whose
+ * stored crc matches a fresh calc_crc() over the whole record is
+ * reported; entries failing either test are silently skipped.
+ */
+static void examine_vd(int n, struct ddf_super *sb, char *guid)
+{
+       int crl = sb->conf_rec_len;
+       struct vcl *vcl;
+
+       for (vcl = sb->conflist ; vcl ; vcl = vcl->next) {
+               struct vd_config *vc = &vcl->conf;
+
+               /* the crc covers the full record: crl blocks of 512 bytes */
+               if (calc_crc(vc, crl*512) != vc->crc)
+                       continue;
+               if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0)
+                       continue;
+
+               /* Ok, we know about this VD, let's give more details */
+               printf(" Raid Devices[%d] : %d\n", n,
+                      __be16_to_cpu(vc->prim_elmnt_count));
+               printf("   Chunk Size[%d] : %d sectors\n", n,
+                      1 << vc->chunk_shift);
+               printf("   Raid Level[%d] : %s\n", n,
+                      map_num(ddf_level, vc->prl)?:"-unknown-");
+               /* secondary details are only shown when sec_elmnt_count
+                * is not 1 */
+               if (vc->sec_elmnt_count != 1) {
+                       printf("  Secondary Position[%d] : %d of %d\n", n,
+                              vc->sec_elmnt_seq, vc->sec_elmnt_count);
+                       printf("  Secondary Level[%d] : %s\n", n,
+                              map_num(ddf_sec_level, vc->srl) ?: "-unknown-");
+               }
+               /* both sizes are printed as half the stored block count -
+                * presumably 512 byte blocks shown in K; TODO confirm */
+               printf("  Device Size[%d] : %llu\n", n,
+                      __be64_to_cpu(vc->blocks)/2);
+               printf("   Array Size[%d] : %llu\n", n,
+                      __be64_to_cpu(vc->array_blocks)/2);
+       }
+}
+
+/* Print the virtual disk table: one group of lines for each of the
+ * first 'populated_vdes' entries of sb->virt, each followed by the
+ * matching configuration details from examine_vd().
+ */
+static void examine_vds(struct ddf_super *sb)
+{
+       int cnt = __be16_to_cpu(sb->virt->populated_vdes);
+       int i;
+       printf("  Virtual Disks : %d\n", cnt);
+
+       for (i=0; i<cnt; i++) {
+               struct virtual_entry *ve = &sb->virt->entries[i];
+               printf("      VD GUID[%d] : ", i); print_guid(ve->guid, 1);
+               printf("\n");
+               printf("         unit[%d] : %d\n", i, __be16_to_cpu(ve->unit));
+               /* low three bits select the state name (DDF_state_mask);
+                * 8 is DDF_state_morphing, 16 is DDF_state_inconsistent */
+               printf("        state[%d] : %s, %s%s\n", i,
+                      map_num(ddf_state, ve->state & 7),
+                      (ve->state & 8) ? "Morphing, ": "",
+                      (ve->state & 16)? "Not Consistent" : "Consistent");
+               /* init_state: low two bits are the initialisation state,
+                * top two bits (DDF_access_mask) the access mode */
+               printf("   init state[%d] : %s\n", i,
+                      map_num(ddf_init_state, ve->init_state&3));
+               printf("       access[%d] : %s\n", i,
+                      map_num(ddf_access, (ve->init_state>>6) & 3));
+               printf("         Name[%d] : %.16s\n", i, ve->name);
+               examine_vd(i, sb, ve->guid);
+       }
+       if (cnt) printf("\n");
+}
+
+/* Print the physical disk table: one group of lines for each of the
+ * first 'used_pdes' entries of sb->phys, plus the device name of any
+ * dlist entry whose refnum matches.
+ */
+static void examine_pds(struct ddf_super *sb)
+{
+       int cnt = __be16_to_cpu(sb->phys->used_pdes);
+       int i;
+       struct dl *dl;
+       printf(" Physical Disks : %d\n", cnt);
+
+       for (i=0 ; i<cnt ; i++) {
+               struct phys_disk_entry *pd = &sb->phys->entries[i];
+               int type = __be16_to_cpu(pd->type);
+               int state = __be16_to_cpu(pd->state);
+
+               printf("      PD GUID[%d] : ", i); print_guid(pd->guid, 0);
+               printf("\n");
+               printf("          ref[%d] : %08x\n", i,
+                      __be32_to_cpu(pd->refnum));
+               /* NOTE(review): most of these strings carry no separator,
+                * so a disk with several type bits set prints them run
+                * together.  Bit 32 is named DDF_Legacy in the defines
+                * but is reported as "pass-through".
+                */
+               printf("         mode[%d] : %s%s%s%s%s\n", i,
+                      (type&2) ? "active":"",
+                      (type&4) ? "Global Spare":"",
+                      (type&8) ? "spare" : "",
+                      (type&16)? ", foreign" : "",
+                      (type&32)? "pass-through" : "");
+               /* bit values follow the DDF_Online ... DDF_Missing defines */
+               printf("        state[%d] : %s%s%s%s%s%s%s\n", i,
+                      (state&1)? "Online": "Offline",
+                      (state&2)? ", Failed": "",
+                      (state&4)? ", Rebuilding": "",
+                      (state&8)? ", in-transition": "",
+                      (state&16)? ", SMART errors": "",
+                      (state&32)? ", Unrecovered Read Errors": "",
+                      (state&64)? ", Missing" : "");
+               /* config_size is halved for display in K */
+               printf("   Avail Size[%d] : %llu K\n", i,
+                      __be64_to_cpu(pd->config_size)>>1);
+               /* refnums are compared as stored, without byte swapping */
+               for (dl = sb->dlist; dl ; dl = dl->next) {
+                       if (dl->disk.refnum == pd->refnum) {
+                               char *dv = map_dev(dl->major, dl->minor, 0);
+                               if (dv)
+                                       printf("       Device[%d] : %s\n",
+                                              i, dv);
+                       }
+               }
+               printf("\n");
+       }
+}
+
+/* Print a full description of the loaded DDF metadata in st->sb:
+ * anchor magic and revision, controller and container GUIDs, the
+ * sequence number of the active header, whether a secondary header
+ * was loaded, then the virtual and physical disk tables.
+ * 'homehost' is not used here.
+ */
+static void examine_super_ddf(struct supertype *st, char *homehost)
+{
+       struct ddf_super *sb = st->sb;
+
+       printf("          Magic : %08x\n", __be32_to_cpu(sb->anchor.magic));
+       printf("        Version : %.8s\n", sb->anchor.revision);
+       printf("Controller GUID : "); print_guid(sb->controller.guid, 0);
+       printf("\n");
+       printf(" Container GUID : "); print_guid(sb->anchor.guid, 1);
+       printf("\n");
+       printf("            Seq : %08x\n", __be32_to_cpu(sb->active->seq));
+       /* "yes" when the secondary header slot holds the DDF header magic */
+       printf("  Redundant hdr : %s\n", sb->secondary.magic == DDF_HEADER_MAGIC
+              ?"yes" : "no");
+       examine_vds(sb);
+       examine_pds(sb);
+}
+
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info);
+
+
+/* Print a single ARRAY line for the container, identified by the UUID
+ * text that fname_from_uuid() builds from getinfo_super_ddf()'s result.
+ */
+static void brief_examine_super_ddf(struct supertype *st)
+{
+       /* We just write a generic DDF ARRAY entry
+        */
+       struct mdinfo info;
+       char nbuf[64];
+       getinfo_super_ddf(st, &info);
+       fname_from_uuid(st, &info, nbuf, ':');
+       /* the first five characters of nbuf are skipped - presumably a
+        * fixed prefix written by fname_from_uuid; TODO confirm */
+       printf("ARRAY /dev/ddf metadata=ddf UUID=%s\n", nbuf + 5);
+}
+
+/* "Detail" output for a DDF array.  Currently an empty stub: nothing is
+ * printed and neither argument is used.
+ */
+static void detail_super_ddf(struct supertype *st, char *homehost)
+{
+       /* FIXME later
+        * Could print DDF GUID
+        * Need to find which array
+        *  If whole, briefly list all arrays
+        *  If one, give name
+        */
+}
+
+/* Append " UUID=..." (no trailing newline) to a brief detail line.
+ *
+ * FIXME: this really needs to know which array is being detailed.
+ * Can that be stored in ddf_super?
+ */
+static void brief_detail_super_ddf(struct supertype *st)
+{
+       struct mdinfo inf;
+       char uuid_text[64];
+
+       getinfo_super_ddf(st, &inf);
+       fname_from_uuid(st, &inf, uuid_text, ':');
+
+       /* The first 5 characters of the formatted text are not printed */
+       printf(" UUID=%s", &uuid_text[5]);
+}
+#endif
+
+static int match_home_ddf(struct supertype *st, char *homehost)
+{
+       /* It matches 'this' host if the controller is a
+        * Linux-MD controller with vendor_data matching
+        * the hostname.
+        *
+        * Returns non-zero on a match and 0 otherwise.  A NULL
+        * homehost never matches.
+        */
+       struct ddf_super *ddf = st->sb;
+       size_t len;
+
+       /* strlen(NULL) is undefined, so reject a missing homehost first */
+       if (!homehost)
+               return 0;
+       len = strlen(homehost);
+
+       /* The name must fit in vendor_data with room for the terminating
+        * NUL that is checked last.
+        */
+       return (memcmp(ddf->controller.guid, T10, 8) == 0 &&
+               len < sizeof(ddf->controller.vendor_data) &&
+               memcmp(ddf->controller.vendor_data, homehost, len) == 0 &&
+               ddf->controller.vendor_data[len] == 0);
+}
+
+#ifndef MDASSEMBLE
+/* Return the vd_config of the conflist entry whose vcnum equals 'inst',
+ * or NULL when no entry has that number.
+ */
+static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst)
+{
+       struct vcl *cur = ddf->conflist;
+
+       while (cur != NULL) {
+               if (cur->vcnum == inst)
+                       return &cur->conf;
+               cur = cur->next;
+       }
+       return NULL;
+}
+#endif
+
+static int find_phys(struct ddf_super *ddf, __u32 phys_refnum)
+{
+       /* Search the phys_disk table for the entry carrying the given
+        * refnum.  Returns its index, or -1 if no entry matches.
+        */
+       int limit = __be16_to_cpu(ddf->phys->max_pdes);
+       int idx = 0;
+
+       while (idx < limit) {
+               if (ddf->phys->entries[idx].refnum == phys_refnum)
+                       return idx;
+               idx++;
+       }
+       return -1;
+}
+
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
+{
+       /* The uuid produced here is used for:
+        *  - the uuid stored in a bitmap file (Create, Grow)
+        *  - the uuid in the backup header written while saving a
+        *    critical section (Grow)
+        *  - comparing uuids when re-adding a device into an array
+        *      For these the uuid wanted is that of the data-array,
+        *      not of the device-set.
+        *  - recognising the same set when adding a missing device
+        *    back to an array.  This one is a uuid for the device-set.
+        *
+        * A truncated or hashed uuid is good enough for all of these,
+        * as long as everyone agrees on it.
+        * For an SVD we assume the BVD is the one of interest; that
+        * assumption may need revisiting if a bitmap were made for a
+        * mirrored SVD - worry about that later.
+        * So: take the GUID of the current VD configuration record (or
+        * the anchor GUID when no configuration is current), add the
+        * Secondary_Element_Seq when there is more than one secondary
+        * element, and use the first 16 bytes of the sha1 of that.
+        */
+       struct ddf_super *ddf = st->sb;
+       struct vcl *cur = ddf->currentconf;
+       char *guid = cur ? cur->conf.guid : ddf->anchor.guid;
+       struct sha1_ctx sha;
+       char digest[20];
+
+       sha1_init_ctx(&sha);
+       sha1_process_bytes(guid, DDF_GUID_LEN, &sha);
+       if (cur != NULL && cur->conf.sec_elmnt_count > 1)
+               sha1_process_bytes(&cur->conf.sec_elmnt_seq, 1, &sha);
+       sha1_finish_ctx(&sha, digest);
+
+       /* Only the first 16 of the 20 digest bytes are kept */
+       memcpy(uuid, digest, 4*4);
+}
+
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info);
+
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info)
+{
+       /* Fill 'info' from the loaded DDF metadata.
+        * When a virtual disk configuration is current the work is
+        * delegated to getinfo_super_ddf_bvd(); otherwise 'info'
+        * describes the container as a whole, with per-disk details
+        * taken from the first entry of ddf->dlist when there is one.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 stamp;
+
+       if (ddf->currentconf) {
+               getinfo_super_ddf_bvd(st, info);
+               return;
+       }
+
+       info->array.raid_disks    = __be16_to_cpu(ddf->phys->used_pdes);
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       /* Bytes 16-19 of the GUID hold a big-endian timestamp relative to
+        * DECADE.  Copy them out rather than casting the char pointer,
+        * which could be a misaligned access.
+        */
+       memcpy(&stamp, ddf->anchor.guid + 16, sizeof(stamp));
+       info->array.ctime         = DECADE + __be32_to_cpu(stamp);
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       if (ddf->dlist) {
+               info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum);
+               info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum);
+
+               if (info->disk.raid_disk >= 0) {
+                       info->data_offset = __be64_to_cpu(ddf->phys->
+                                         entries[info->disk.raid_disk].
+                                         config_size);
+                       info->component_size = ddf->dlist->size -
+                               info->data_offset;
+               } else {
+                       /* refnum is not in the phys table: do not index
+                        * entries[] with -1, report nothing usable.
+                        */
+                       info->data_offset = 0;
+                       info->component_size = 0;
+               }
+       } else {
+               info->disk.number = -1;
+               /* No device loaded, so there is no table index either */
+               info->disk.raid_disk = -1;
+       }
+       info->disk.state = (1 << MD_DISK_SYNC);
+
+
+       info->reshape_active = 0;
+       info->name[0] = 0;
+
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       strcpy(info->text_version, "ddf");
+       info->safe_mode_delay = 0;
+
+       uuid_from_super_ddf(st, info->uuid);
+
+}
+
+static int rlq_to_layout(int rlq, int prl, int raiddisks);
+
+/* Fill 'info' for the virtual disk configuration that is current in the
+ * ddf (ddf->currentconf), using ddf->currentdev as the member index for
+ * the per-device offset and size.
+ */
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info)
+{
+       struct ddf_super *ddf = st->sb;
+       struct vcl *vc = ddf->currentconf;
+       int cd = ddf->currentdev;
+
+       /* FIXME this returns BVD info - what if we want SVD ?? */
+
+       info->array.raid_disks    = __be16_to_cpu(vc->conf.prim_elmnt_count);
+       info->array.level         = map_num1(ddf_level_num, vc->conf.prl);
+       info->array.layout        = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+                                                 info->array.raid_disks);
+       info->array.md_minor      = -1;
+       /* bytes 16-19 of the GUID hold a timestamp relative to DECADE */
+       info->array.ctime         = DECADE +
+               __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+       info->array.utime         = DECADE + __be32_to_cpu(vc->conf.timestamp);
+       /* chunk_shift counts in units of 512 bytes */
+       info->array.chunk_size    = 512 << vc->conf.chunk_shift;
+
+       /* data_offset and component_size are only set when currentdev is
+        * a valid member index; otherwise they are left untouched.
+        */
+       if (cd >= 0 && cd < ddf->mppe) {
+               info->data_offset         = __be64_to_cpu(vc->lba_offset[cd]);
+               if (vc->block_sizes)
+                       info->component_size = vc->block_sizes[cd];
+               else
+                       info->component_size = __be64_to_cpu(vc->conf.blocks);
+       }
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+//     info->disk.number = __be32_to_cpu(ddf->disk.refnum);
+//     info->disk.raid_disk = find refnum in the table and use index;
+//     info->disk.state = ???;
+
+       info->container_member = ddf->currentconf->vcnum;
+
+       /* Report "fully in sync" (~0) only when the virtual entry is not
+        * flagged inconsistent and its init state is 'full'; otherwise a
+        * resync from the start (0) is indicated.
+        */
+       info->resync_start = 0;
+       if (!(ddf->virt->entries[info->container_member].state
+             & DDF_state_inconsistent)  &&
+           (ddf->virt->entries[info->container_member].init_state
+            & DDF_initstate_mask)
+           == DDF_init_full)
+               info->resync_start = ~0ULL;
+
+       uuid_from_super_ddf(st, info->uuid);
+
+       /* NOTE(review): container_member was already set from vcnum above
+        * and is overwritten here from st->subarray - confirm that the
+        * two always agree.
+        */
+       info->container_member = atoi(st->subarray);
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       /* NOTE(review): the result of devnum2devname() is neither checked
+        * nor released here; if it returns allocated memory this leaks,
+        * and the formatted length is not bounded - confirm against the
+        * helper's definition and the size of text_version.
+        */
+       sprintf(info->text_version, "/%s/%s",
+               devnum2devname(st->container_dev),
+               st->subarray);
+       info->safe_mode_delay = 200;
+
+       info->name[0] = 0;
+}
+
+
+/* Apply the named 'update' to the DDF metadata.
+ * In its current form every branch is a stub, nothing is modified and
+ * the function always returns 0.
+ */
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+                           char *update,
+                           char *devname, int verbose,
+                           int uuid_set, char *homehost)
+{
+       /* For 'assemble' and 'force' we need to return non-zero if any
+        * change was made.  For others, the return value is ignored.
+        * Update options are:
+        *  force-one : This device looks a bit old but needs to be included,
+        *        update age info appropriately.
+        *  assemble: clear any 'faulty' flag to allow this device to
+        *              be assembled.
+        *  force-array: Array is degraded but being forced, mark it clean
+        *         if that will be needed to assemble it.
+        *
+        *  newdev:  not used ????
+        *  grow:  Array has gained a new device - this is currently for
+        *              linear only
+        *  resync: mark as dirty so a resync will happen.
+        *  uuid:  Change the uuid of the array to match what is given
+        *  homehost:  update the recorded homehost
+        *  name:  update the name - preserving the homehost
+        *  _reshape_progress: record new reshape_progress position.
+        *
+        * Following are not relevant for this version:
+        *  sparc2.2 : update from old dodgy metadata
+        *  super-minor: change the preferred_minor number
+        *  summaries:  update redundant counters.
+        */
+       int rv = 0;
+//     struct ddf_super *ddf = st->sb;
+//     struct vd_config *vd = find_vdcr(ddf, info->container_member);
+//     struct virtual_entry *ve = find_ve(ddf);
+
+       /* We don't need to handle "force-*" or "assemble" as
+        * there is no need to 'trick' the kernel.  When the metadata is
+        * first updated to activate the array, all the implied modifications
+        * will just happen.
+        */
+
+       if (strcmp(update, "grow") == 0) {
+               /* FIXME */
+       }
+       if (strcmp(update, "resync") == 0) {
+//             info->resync_checkpoint = 0;
+       }
+       /* We ignore UUID updates as they make even less sense
+        * with DDF
+        */
+       if (strcmp(update, "homehost") == 0) {
+               /* homehost is stored in controller->vendor_data,
+                * or it is when we are the vendor
+                */
+//             if (info->vendor_is_local)
+//                     strcpy(ddf->controller.vendor_data, homehost);
+       }
+       if (strcmp(update, "name") == 0) {
+               /* name is stored in virtual_entry->name */
+//             memset(ve->name, ' ', 16);
+//             strncpy(ve->name, info->name, 16);
+       }
+       if (strcmp(update, "_reshape_progress") == 0) {
+               /* We don't support reshape yet */
+       }
+
+//     update_all_csum(ddf);
+
+       return rv;
+}
+
+/* Build a 24-byte GUID for a DDF header or virtual disk in 'guid':
+ *   bytes  0-7   vendor id taken from T10
+ *   bytes  8-15  controller type: 0xDEADBEEF followed by zero
+ *   bytes 16-19  current time relative to DECADE, big-endian
+ *   bytes 20-23  four bytes from /dev/urandom, or random() if those
+ *                cannot be read
+ */
+static void make_header_guid(char *guid)
+{
+       __u32 word;
+       int fd;
+       int have_random = 0;
+
+       memcpy(guid, T10, sizeof(T10));
+
+       word = __cpu_to_be32(0xdeadbeef);
+       memcpy(guid+8, &word, 4);
+       word = __cpu_to_be32(0);
+       memcpy(guid+12, &word, 4);
+
+       word = __cpu_to_be32(time(0) - DECADE);
+       memcpy(guid+16, &word, 4);
+
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd >= 0) {
+               if (read(fd, &word, 4) == 4)
+                       have_random = 1;
+               close(fd);
+       }
+       if (!have_random)
+               word = random();
+       memcpy(guid+20, &word, 4);
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid);
+
+static int init_super_ddf(struct supertype *st,
+                         mdu_array_info_t *info,
+                         unsigned long long size, char *name, char *homehost,
+                         int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For DDF, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        *
+        * We need to create the entire 'ddf' structure which includes:
+        *  DDF headers - these are easy.
+        *  Controller data - a Sector describing this controller .. not that
+        *                  this is a controller exactly.
+        *  Physical Disk Record - one entry per device, so
+        *                      leave plenty of space.
+        *  Virtual Disk Records - again, just leave plenty of space.
+        *                   This just lists VDs, doesn't give details
+        *  Config records - describes the VDs that use this disk
+        *  DiskData  - describes 'this' device.
+        *  BadBlockManagement - empty
+        *  Diag Space - empty
+        *  Vendor Logs - Could we put bitmaps here?
+        *
+        * Returns 1 when a new superblock has been built and stored in
+        * st->sb.  Returns 0 when 'info' is NULL (st->sb is cleared) or
+        * when an allocation fails (st->sb is left unchanged and nothing
+        * allocated here is kept).  When st->sb is already set, the
+        * result of init_super_ddf_bvd() is returned instead.
+        */
+       struct ddf_super *ddf;
+       char hostname[17];
+       int hostlen;
+       int max_phys_disks, max_virt_disks;
+       unsigned long long sector;
+       int clen;
+       int i;
+       int pdsize, vdsize;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_ddf_bvd(st, info, size, name, homehost,
+                                         uuid);
+
+       if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
+       memset(ddf, 0, sizeof(*ddf));
+       ddf->dlist = NULL; /* no physical disks yet */
+       ddf->conflist = NULL; /* No virtual disks yet */
+
+       /* At least 32MB *must* be reserved for the ddf.  So let's just
+        * start 32MB from the end, and put the primary header there.
+        * Don't do secondary for now.
+        * We don't know exactly where that will be yet as it could be
+        * different on each device.  So just set up the lengths.
+        *
+        */
+
+       ddf->anchor.magic = DDF_HEADER_MAGIC;
+       make_header_guid(ddf->anchor.guid);
+
+       memcpy(ddf->anchor.revision, DDF_REVISION_2, 8);
+       ddf->anchor.seq = __cpu_to_be32(1);
+       ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE);
+       ddf->anchor.openflag = 0xFF;
+       ddf->anchor.foreignflag = 0;
+       ddf->anchor.enforcegroups = 0; /* Is this best?? */
+       ddf->anchor.pad0 = 0xff;
+       memset(ddf->anchor.pad1, 0xff, 12);
+       memset(ddf->anchor.header_ext, 0xff, 32);
+       ddf->anchor.primary_lba = ~(__u64)0;
+       ddf->anchor.secondary_lba = ~(__u64)0;
+       ddf->anchor.type = DDF_HEADER_ANCHOR;
+       memset(ddf->anchor.pad2, 0xff, 3);
+       ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */
+       ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom
+                                                 of 32M reserved.. */
+       max_phys_disks = 1023;   /* Should be enough */
+       ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks);
+       max_virt_disks = 255;
+       ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */
+       ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? */
+       ddf->max_part = 64;
+       ddf->mppe = 256;
+       /* one sector of fixed fields plus 4+8 bytes per element,
+        * rounded up to whole sectors */
+       ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
+       ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len);
+       ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe);
+       memset(ddf->anchor.pad3, 0xff, 54);
+       /* controller sections is one sector long immediately
+        * after the ddf header */
+       sector = 1;
+       ddf->anchor.controller_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.controller_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       /* phys is 8 sectors after that */
+       pdsize = ROUND_UP(sizeof(struct phys_disk) +
+                         sizeof(struct phys_disk_entry)*max_phys_disks,
+                         512);
+       /* only these section lengths are accepted; anything else means
+        * the structure sizes are not what this code expects */
+       switch(pdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.phys_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.phys_section_length =
+               __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */
+       sector += pdsize/512;
+
+       /* virt is another 32 sectors */
+       vdsize = ROUND_UP(sizeof(struct virtual_disk) +
+                         sizeof(struct virtual_entry) * max_virt_disks,
+                         512);
+       switch(vdsize/512) {
+       case 2: case 8: case 32: case 128: case 512: break;
+       default: abort();
+       }
+       ddf->anchor.virt_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.virt_section_length =
+               __cpu_to_be32(vdsize/512); /* max_vd_entries/8 */
+       sector += vdsize/512;
+
+       clen = ddf->conf_rec_len * (ddf->max_part+1);
+       ddf->anchor.config_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.config_section_length = __cpu_to_be32(clen);
+       sector += clen;
+
+       ddf->anchor.data_section_offset = __cpu_to_be32(sector);
+       ddf->anchor.data_section_length = __cpu_to_be32(1);
+       sector += 1;
+
+       ddf->anchor.bbm_section_length = __cpu_to_be32(0);
+       ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.diag_space_length = __cpu_to_be32(0);
+       ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF);
+       ddf->anchor.vendor_length = __cpu_to_be32(0);
+       ddf->anchor.vendor_offset = __cpu_to_be32(0xFFFFFFFF);
+
+       memset(ddf->anchor.pad4, 0xff, 256);
+
+       /* primary and secondary start out as copies of the anchor and
+        * differ only in openflag and type */
+       memcpy(&ddf->primary, &ddf->anchor, 512);
+       memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+       ddf->primary.openflag = 1; /* I guess.. */
+       ddf->primary.type = DDF_HEADER_PRIMARY;
+
+       ddf->secondary.openflag = 1; /* I guess.. */
+       ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+       ddf->active = &ddf->primary;
+
+       ddf->controller.magic = DDF_CONTROLLER_MAGIC;
+
+       /* 24 more bytes of fiction required.
+        * first 8 are a 'vendor-id'  - "Linux-MD"
+        * Remaining 16 are serial number.... maybe a hostname would do?
+        */
+       memcpy(ddf->controller.guid, T10, sizeof(T10));
+       /* If the hostname cannot be obtained the buffer contents are
+        * unspecified, so fall back to an empty name rather than
+        * running strlen() over garbage.
+        */
+       if (gethostname(hostname, sizeof(hostname)) != 0)
+               hostname[0] = 0;
+       hostname[sizeof(hostname) - 1] = 0;
+       hostlen = strlen(hostname);
+       /* hostname is right-aligned in the GUID, space padded on the left */
+       memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen);
+       for (i = strlen(T10) ; i+hostlen < 24; i++)
+               ddf->controller.guid[i] = ' ';
+
+       ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD);
+       ddf->controller.type.device_id = __cpu_to_be16(0xBEEF);
+       ddf->controller.type.sub_vendor_id = 0;
+       ddf->controller.type.sub_device_id = 0;
+       memcpy(ddf->controller.product_id, "What Is My PID??", 16);
+       memset(ddf->controller.pad, 0xff, 8);
+       memset(ddf->controller.vendor_data, 0xff, 448);
+
+       if (posix_memalign((void**)&pd, 512, pdsize) != 0) {
+               fprintf(stderr, Name ": %s could not allocate pd\n", __func__);
+               /* st->sb has not been set yet, so nothing else refers
+                * to ddf: release it instead of leaking it */
+               free(ddf);
+               return 0;
+       }
+       ddf->phys = pd;
+       ddf->pdsize = pdsize;
+
+       /* entries are all 0xff (unused); the table header is zeroed */
+       memset(pd, 0xff, pdsize);
+       memset(pd, 0, sizeof(*pd));
+       pd->magic = DDF_PHYS_DATA_MAGIC;
+       pd->used_pdes = __cpu_to_be16(0);
+       pd->max_pdes = __cpu_to_be16(max_phys_disks);
+       memset(pd->pad, 0xff, 52);
+
+       if (posix_memalign((void**)&vd, 512, vdsize) != 0) {
+               fprintf(stderr, Name ": %s could not allocate vd\n", __func__);
+               free(pd);
+               free(ddf);
+               return 0;
+       }
+       ddf->virt = vd;
+       ddf->vdsize = vdsize;
+       memset(vd, 0, vdsize);
+       vd->magic = DDF_VIRT_RECORDS_MAGIC;
+       vd->populated_vdes = __cpu_to_be16(0);
+       vd->max_vdes = __cpu_to_be16(max_virt_disks);
+       memset(vd->pad, 0xff, 52);
+
+       /* an all-0xff entry marks an unused virtual disk slot */
+       for (i=0; i<max_virt_disks; i++)
+               memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
+
+       st->sb = ddf;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+/* Return 1 if every one of the DDF_GUID_LEN bytes at 'guid' is 0xff,
+ * and 0 as soon as any other byte is seen.
+ */
+static int all_ff(char *guid)
+{
+       char *end = guid + DDF_GUID_LEN;
+
+       while (guid < end) {
+               if (*guid != (char)0xff)
+                       return 0;
+               guid++;
+       }
+       return 1;
+}
+/* Convert a chunk size in bytes to a shift count relative to 512-byte
+ * sectors: the position of the lowest set bit of chunksize/512.
+ * Yields -1 when chunksize/512 is zero.
+ */
+static int chunk_to_shift(int chunksize)
+{
+       int sectors = chunksize / 512;
+
+       return ffs(sectors) - 1;
+}
+
+/* Map an md RAID level to the DDF primary RAID level constant.
+ * Returns -1 for levels that have no mapping here.
+ */
+static int level_to_prl(int level)
+{
+       if (level == LEVEL_LINEAR)
+               return DDF_CONCAT;
+       if (level == 0)
+               return DDF_RAID0;
+       if (level == 1)
+               return DDF_RAID1;
+       if (level == 4)
+               return DDF_RAID4;
+       if (level == 5)
+               return DDF_RAID5;
+       if (level == 6)
+               return DDF_RAID6;
+       return -1;
+}
+/* Map an md (level, layout, raiddisks) combination to the DDF RAID
+ * level qualifier.  Returns -1 for any combination not handled here.
+ */
+static int layout_to_rlq(int level, int layout, int raiddisks)
+{
+       if (level == 0)
+               return DDF_RAID0_SIMPLE;
+
+       if (level == 1) {
+               /* only 2-way and 3-way mirrors have a qualifier */
+               if (raiddisks == 2)
+                       return DDF_RAID1_SIMPLE;
+               if (raiddisks == 3)
+                       return DDF_RAID1_MULTI;
+               return -1;
+       }
+
+       if (level == 4)
+               return (layout == 0) ? DDF_RAID4_N : -1;
+
+       if (level == 5 || level == 6) {
+               if (layout == ALGORITHM_LEFT_ASYMMETRIC)
+                       return DDF_RAID5_N_RESTART;
+               if (layout == ALGORITHM_RIGHT_ASYMMETRIC)
+                       return (level == 5) ? DDF_RAID5_0_RESTART
+                                           : DDF_RAID6_0_RESTART;
+               if (layout == ALGORITHM_LEFT_SYMMETRIC)
+                       return DDF_RAID5_N_CONTINUE;
+               /* ALGORITHM_RIGHT_SYMMETRIC is not mentioned in the
+                * standard; it and anything else is rejected */
+               return -1;
+       }
+
+       return -1;
+}
+
+/* Map a DDF (rlq, prl) pair back to an md layout number.
+ * Returns -1 for unsupported combinations (FIXME: callers do not check
+ * for that).  'raiddisks' is not consulted.
+ */
+static int rlq_to_layout(int rlq, int prl, int raiddisks)
+{
+       if (prl == DDF_RAID0) {
+               /* hopefully rlq == DDF_RAID0_SIMPLE */
+               return 0;
+       }
+       if (prl == DDF_RAID1) {
+               /* hopefully rlq == SIMPLE or MULTI depending on raiddisks */
+               return 0;
+       }
+       if (prl == DDF_RAID4)
+               return (rlq == DDF_RAID4_N) ? 0 : -1;
+
+       if (prl == DDF_RAID5 || prl == DDF_RAID6) {
+               /* RAID5 and RAID6 differ only in the "0 restart" code */
+               int zero_restart = (prl == DDF_RAID5) ? DDF_RAID5_0_RESTART
+                                                     : DDF_RAID6_0_RESTART;
+
+               if (rlq == DDF_RAID5_N_RESTART)
+                       return ALGORITHM_LEFT_ASYMMETRIC;
+               if (rlq == zero_restart)
+                       return ALGORITHM_RIGHT_ASYMMETRIC;
+               if (rlq == DDF_RAID5_N_CONTINUE)
+                       return ALGORITHM_LEFT_SYMMETRIC;
+               return -1;
+       }
+       return -1;
+}
+
+#ifndef MDASSEMBLE
+/* A used region on a device: first block and length in blocks.
+ * get_extents() ends its array with an entry whose size is 0.
+ */
+struct extent {
+       unsigned long long start, size;
+};
+/* qsort() comparator ordering 'struct extent' by ascending start. */
+static int cmp_extent(const void *av, const void *bv)
+{
+       const struct extent *lhs = av;
+       const struct extent *rhs = bv;
+
+       /* yields -1, 0 or 1 without risking overflow from subtraction */
+       return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
+{
+       /* Find the extents already used on the given physical device
+        * (dl) of the given ddf.
+        * Returns a malloced array of 'struct extent', sorted by start
+        * and terminated by an entry with size 0 whose start is the
+        * device's config_size.  The caller frees it.
+        * Returns NULL if the array cannot be allocated.
+        *
+        * FIXME ignore DDF_Legacy devices?
+        */
+       struct extent *rv;
+       int n = 0;
+       int i, j;
+
+       /* at most one extent per vlist slot, plus the terminator */
+       rv = malloc(sizeof(struct extent) * (ddf->max_part + 2));
+       if (!rv)
+               return NULL;
+
+       for (i = 0; i < ddf->max_part; i++) {
+               struct vcl *v = dl->vlist[i];
+               int members;
+
+               if (v == NULL)
+                       continue;
+               /* prim_elmnt_count is stored big-endian; using it raw
+                * as a loop bound walks far past phys_refnum[] on
+                * little-endian hosts.
+                */
+               members = __be16_to_cpu(v->conf.prim_elmnt_count);
+               for (j = 0; j < members; j++)
+                       if (v->conf.phys_refnum[j] == dl->disk.refnum) {
+                               /* This device plays role 'j' in  'v'. */
+                               rv[n].start = __be64_to_cpu(v->lba_offset[j]);
+                               rv[n].size = __be64_to_cpu(v->conf.blocks);
+                               n++;
+                               break;
+                       }
+       }
+       qsort(rv, n, sizeof(*rv), cmp_extent);
+
+       /* terminator: end of the configurable space, zero length */
+       rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size);
+       rv[n].size = 0;
+       return rv;
+}
+#endif
+
+static int init_super_ddf_bvd(struct supertype *st,
+                             mdu_array_info_t *info,
+                             unsigned long long size,
+                             char *name, char *homehost,
+                             int *uuid)
+{
+       /* We are creating a BVD inside a pre-existing container.
+        * so st->sb is already set.
+        * We need to create a new vd_config and a new virtual_entry
+        *
+        * Returns 1 on success, with the new configuration linked into
+        * ddf->conflist and made current.  Returns 0 if the virtual
+        * disk table is full, no free slot can be found, or memory
+        * cannot be allocated; in those cases the ddf is not modified.
+        */
+       struct ddf_super *ddf = st->sb;
+       int venum;
+       struct virtual_entry *ve;
+       struct vcl *vcl;
+       struct vd_config *vc;
+
+       if (__be16_to_cpu(ddf->virt->populated_vdes)
+           >= __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name": This ddf already has the "
+                       "maximum of %d virtual devices\n",
+                       __be16_to_cpu(ddf->virt->max_vdes));
+               return 0;
+       }
+
+       /* an unused slot has a GUID of all 0xff */
+       for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++)
+               if (all_ff(ddf->virt->entries[venum].guid))
+                       break;
+       if (venum == __be16_to_cpu(ddf->virt->max_vdes)) {
+               fprintf(stderr, Name ": Cannot find spare slot for "
+                       "virtual disk - DDF is corrupt\n");
+               return 0;
+       }
+
+       /* Allocate the vd_config before touching the virtual disk table,
+        * so that a failure here does not leave a populated entry with
+        * no configuration record behind it.
+        */
+       if (posix_memalign((void**)&vcl, 512,
+                          (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate vd_config\n", __func__);
+               return 0;
+       }
+
+       ve = &ddf->virt->entries[venum];
+
+       /* A Virtual Disk GUID contains the T10 Vendor ID, controller type,
+        * timestamp, random number
+        */
+       make_header_guid(ve->guid);
+       ve->unit = __cpu_to_be16(info->md_minor);
+       ve->pad0 = 0xFFFF;
+       ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN);
+       ve->type = 0;
+       ve->state = DDF_state_degraded; /* Will be modified as devices are added */
+       if (info->state & 1) /* clean */
+               ve->init_state = DDF_init_full;
+       else
+               ve->init_state = DDF_init_not;
+
+       memset(ve->pad1, 0xff, 14);
+       memset(ve->name, ' ', 16);
+       if (name)
+               strncpy(ve->name, name, 16);
+       ddf->virt->populated_vdes =
+               __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1);
+
+       /* Now fill in the new vd_config.  The lba_offset table follows
+        * the mppe phys_refnum entries.
+        */
+       vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe];
+       vcl->vcnum = venum;
+       sprintf(st->subarray, "%d", venum);
+       vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+
+       vc = &vcl->conf;
+
+       vc->magic = DDF_VD_CONF_MAGIC;
+       memcpy(vc->guid, ve->guid, DDF_GUID_LEN);
+       vc->timestamp = __cpu_to_be32(time(0)-DECADE);
+       vc->seqnum = __cpu_to_be32(1);
+       memset(vc->pad0, 0xff, 24);
+       vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks);
+       vc->chunk_shift = chunk_to_shift(info->chunk_size);
+       vc->prl = level_to_prl(info->level);
+       vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks);
+       vc->sec_elmnt_count = 1;
+       vc->sec_elmnt_seq = 0;
+       vc->srl = 0;
+       vc->blocks = __cpu_to_be64(info->size * 2);
+       vc->array_blocks = __cpu_to_be64(
+               calc_array_size(info->level, info->raid_disks, info->layout,
+                               info->chunk_size, info->size*2));
+       memset(vc->pad1, 0xff, 8);
+       vc->spare_refs[0] = 0xffffffff;
+       vc->spare_refs[1] = 0xffffffff;
+       vc->spare_refs[2] = 0xffffffff;
+       vc->spare_refs[3] = 0xffffffff;
+       vc->spare_refs[4] = 0xffffffff;
+       vc->spare_refs[5] = 0xffffffff;
+       vc->spare_refs[6] = 0xffffffff;
+       vc->spare_refs[7] = 0xffffffff;
+       memset(vc->cache_pol, 0, 8);
+       vc->bg_rate = 0x80;
+       memset(vc->pad2, 0xff, 3);
+       memset(vc->pad3, 0xff, 52);
+       memset(vc->pad4, 0xff, 192);
+       memset(vc->v0, 0xff, 32);
+       memset(vc->v1, 0xff, 32);
+       memset(vc->v2, 0xff, 16);
+       memset(vc->v3, 0xff, 16);
+       memset(vc->vendor, 0xff, 32);
+
+       /* No members yet: every phys_refnum is 0xffffffff and every
+        * lba_offset is 0.  The offsets are cleared through lba_offset,
+        * because adding a byte count to the 4-byte-element phys_refnum
+        * pointer lands beyond the end of this allocation.
+        */
+       memset(vc->phys_refnum, 0xff, 4*ddf->mppe);
+       memset(vcl->lba_offset, 0x00, 8*ddf->mppe);
+
+       vcl->next = ddf->conflist;
+       ddf->conflist = vcl;
+       ddf->currentconf = vcl;
+       ddf->updates_pending = 1;
+       return 1;
+}
+
+#ifndef MDASSEMBLE
+static void add_to_super_ddf_bvd(struct supertype *st,
+                                mdu_disk_info_t *dk, int fd, char *devname)
+{
+       /* fd and devname identify a device with-in the ddf container (st).
+        * dk identifies a location in the new BVD.
+        * We need to find suitable free space in that device and update
+        * the phys_refnum and lba_offset for the newly created vd_config.
+        * We might also want to update the type in the phys_disk
+        * section.
+        *
+        * Nothing is reported to the caller: the function returns
+        * silently if the device is not in the container's dlist, if dk
+        * is not marked MD_DISK_SYNC, if the extent list cannot be
+        * allocated, or if no free region is large enough.
+        */
+       struct dl *dl;
+       struct ddf_super *ddf = st->sb;
+       struct vd_config *vc;
+       __u64 *lba_offset;
+       int working;
+       int i;
+       unsigned long long blocks, pos, esize;
+       struct extent *ex;
+
+       /* locate the container member with matching major/minor */
+       for (dl = ddf->dlist; dl ; dl = dl->next)
+               if (dl->major == dk->major &&
+                   dl->minor == dk->minor)
+                       break;
+       if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+               return;
+
+       vc = &ddf->currentconf->conf;
+       lba_offset = ddf->currentconf->lba_offset;
+
+       ex = get_extents(ddf, dl);
+       if (!ex)
+               return;
+
+       /* space needed on this device: per-device size when block_sizes
+        * is set, otherwise the common 'blocks' value */
+       i = 0; pos = 0;
+       blocks = __be64_to_cpu(vc->blocks);
+       if (ddf->currentconf->block_sizes)
+               blocks = ddf->currentconf->block_sizes[dk->raid_disk];
+
+       /* First fit: 'pos' is the end of the previous used extent and
+        * 'esize' the gap before the next one.  The list ends with a
+        * zero-size entry, which stops the loop.
+        */
+       do {
+               esize = ex[i].start - pos;
+               if (esize >= blocks)
+                       break;
+               pos = ex[i].start + ex[i].size;
+               i++;
+       } while (ex[i-1].size);
+
+       free(ex);
+       if (esize < blocks)
+               return;
+
+       ddf->currentdev = dk->raid_disk;
+       vc->phys_refnum[dk->raid_disk] = dl->disk.refnum;
+       lba_offset[dk->raid_disk] = __cpu_to_be64(pos);
+
+       /* record the configuration in the first free vlist slot.
+        * NOTE(review): if no slot is free we return after phys_refnum,
+        * lba_offset and currentdev were already updated and without
+        * setting updates_pending - confirm this is intended.
+        */
+       for (i=0; i < ddf->max_part ; i++)
+               if (dl->vlist[i] == NULL)
+                       break;
+       if (i == ddf->max_part)
+               return;
+       dl->vlist[i] = ddf->currentconf;
+
+       dl->fd = fd;
+       dl->devname = devname;
+
+       /* Check how many working raid_disks, and if we can mark
+        * array as optimal yet
+        */
+       working = 0;
+
+       /* a slot still holding 0xffffffff has no device assigned */
+       for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++)
+               if (vc->phys_refnum[i] != 0xffffffff)
+                       working++;
+
+       /* Find which virtual_entry */
+       i = ddf->currentconf->vcnum;
+       if (working == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_optimal;
+
+       /* RAID6 with exactly one member missing is "partially optimal" */
+       if (vc->prl == DDF_RAID6 &&
+           working+1 == __be16_to_cpu(vc->prim_elmnt_count))
+               ddf->virt->entries[i].state =
+                       (ddf->virt->entries[i].state & ~DDF_state_mask)
+                       | DDF_state_part_optimal;
+
+       /* the device is now an active VD member, no longer a global spare */
+       ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare);
+       ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD);
+       ddf->updates_pending = 1;
+}
+
+/* Add a device to a container, either while creating the container or
+ * while expanding a pre-existing one.
+ *
+ * If a BVD is currently being created (ddf->currentconf is set) the
+ * request is handed to add_to_super_ddf_bvd() instead.  Otherwise a
+ * 'struct dl' is allocated for the device and a phys_disk entry is
+ * filled in for it:
+ *  - with st->update_tail set, the entry lives in a freshly allocated
+ *    one-entry phys_disk record (dd->mdupdate) and the device is put
+ *    on ddf->add_list;
+ *  - otherwise the entry is written directly into ddf->phys and the
+ *    device is put on ddf->dlist with updates_pending set.
+ *
+ * Allocation failure is fatal (abort()).
+ * NOTE(review): the results of fstat() and get_dev_size() are not
+ * checked here; confirm callers only pass an open, sized block device.
+ */
+static void add_to_super_ddf(struct supertype *st,
+                            mdu_disk_info_t *dk, int fd, char *devname)
+{
+       struct ddf_super *ddf = st->sb;
+       struct dl *dd;
+       time_t now;
+       struct tm *tm;
+       unsigned long long size;
+       struct phys_disk_entry *pde;
+       int n, i;
+       struct stat stb;
+
+       if (ddf->currentconf) {
+               add_to_super_ddf_bvd(st, dk, fd, devname);
+               return;
+       }
+
+       /* This is device numbered dk->number.  We need to create
+        * a phys_disk entry and a more detailed disk_data entry.
+        */
+       fstat(fd, &stb);
+       if (posix_memalign((void**)&dd, 512,
+                          sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate buffer for new disk, aborting\n",
+                       __func__);
+               abort();
+       }
+       dd->major = major(stb.st_rdev);
+       dd->minor = minor(stb.st_rdev);
+       dd->devname = devname;
+       dd->fd = fd;
+       dd->spare = NULL;
+
+       /* GUID: 8-char T10 prefix, creation date, then 8 random bytes */
+       dd->disk.magic = DDF_PHYS_DATA_MAGIC;
+       now = time(0);
+       tm = localtime(&now);
+       sprintf(dd->disk.guid, "%8s%04d%02d%02d",
+               T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday);
+       *(__u32*)(dd->disk.guid + 16) = random();
+       *(__u32*)(dd->disk.guid + 20) = random();
+
+       /* Pick a random reference number, retrying until it does not
+        * clash with any existing phys_disk entry.
+        */
+       do {
+               /* Cannot be bothered finding a CRC of some irrelevant details*/
+               dd->disk.refnum = random();
+               for (i = __be16_to_cpu(ddf->active->max_pd_entries) - 1;
+                    i >= 0; i--)
+                       if (ddf->phys->entries[i].refnum == dd->disk.refnum)
+                               break;
+       } while (i >= 0);
+
+       dd->disk.forced_ref = 1;
+       dd->disk.forced_guid = 1;
+       memset(dd->disk.vendor, ' ', 32);
+       memcpy(dd->disk.vendor, "Linux", 5);
+       memset(dd->disk.pad, 0xff, 442);
+       for (i = 0; i < ddf->max_part ; i++)
+               dd->vlist[i] = NULL;
+
+       n = __be16_to_cpu(ddf->phys->used_pdes);
+       pde = &ddf->phys->entries[n];
+       dd->pdnum = n;
+
+       if (st->update_tail) {
+               int len = (sizeof(struct phys_disk) +
+                          sizeof(struct phys_disk_entry));
+               struct phys_disk *pd;
+
+               pd = malloc(len);
+               if (pd == NULL) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate buffer for metadata update, aborting\n",
+                               __func__);
+                       abort();
+               }
+               pd->magic = DDF_PHYS_RECORDS_MAGIC;
+               /* used_pdes carries the target slot number for the update */
+               pd->used_pdes = __cpu_to_be16(n);
+               pde = &pd->entries[0];
+               dd->mdupdate = pd;
+       } else {
+               n++;
+               ddf->phys->used_pdes = __cpu_to_be16(n);
+       }
+
+       memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN);
+       pde->refnum = dd->disk.refnum;
+       pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare);
+       pde->state = __cpu_to_be16(DDF_Online);
+       get_dev_size(fd, NULL, &size);
+       /* We are required to reserve 32Meg, and record the size in sectors */
+       pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512);
+       sprintf(pde->path, "%17.17s","Information: nil") ;
+       memset(pde->pad, 0xff, 6);
+
+       dd->size = size >> 9;
+       if (st->update_tail) {
+               dd->next = ddf->add_list;
+               ddf->add_list = dd;
+       } else {
+               dd->next = ddf->dlist;
+               ddf->dlist = dd;
+               ddf->updates_pending = 1;
+       }
+}
+
+/*
+ * This is the write_init_super method for a ddf container.  It is
+ * called when creating a container or adding another device to a
+ * container.
+ */
+
+static unsigned char null_conf[4096+512];
+
+/* Write the ddf metadata to every device on ddf->dlist that has an
+ * open file descriptor.
+ *
+ * For each device the primary header, controller data, phys and virt
+ * records, all config records (max_part of them plus the spare slot),
+ * the disk data and finally the anchor are written.  A write failure
+ * on one device abandons that device and moves on to the next.
+ *
+ * If do_close is non-zero every open descriptor on dlist is closed
+ * afterwards and d->fd is set to -1.
+ *
+ * Returns 0 if every attempted device was written successfully,
+ * non-zero otherwise.
+ */
+static int __write_init_super_ddf(struct supertype *st, int do_close)
+{
+
+       struct ddf_super *ddf = st->sb;
+       int i;
+       struct dl *d;
+       int n_config;
+       int conf_size;
+       int attempts = 0;
+       int successes = 0;
+       unsigned long long size, sector;
+
+       /* try to write updated metadata,
+        * if we catch a failure move on to the next disk
+        */
+       for (d = ddf->dlist; d; d=d->next) {
+               int fd = d->fd;
+
+               if (fd < 0)
+                       continue;
+
+               attempts++;
+               /* We need to fill in the primary, (secondary) and workspace
+                * lba's in the headers, set their checksums,
+                * Also checksum phys, virt....
+                *
+                * Then write everything out, finally the anchor is written.
+                */
+               get_dev_size(fd, NULL, &size);
+               size /= 512;
+               ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2);
+               ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2);
+               ddf->anchor.seq = __cpu_to_be32(1);
+               memcpy(&ddf->primary, &ddf->anchor, 512);
+               memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+               ddf->anchor.openflag = 0xFF; /* 'open' means nothing */
+               ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */
+               ddf->anchor.crc = calc_crc(&ddf->anchor, 512);
+
+               ddf->primary.openflag = 0;
+               ddf->primary.type = DDF_HEADER_PRIMARY;
+
+               ddf->secondary.openflag = 0;
+               ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+               ddf->primary.crc = calc_crc(&ddf->primary, 512);
+               ddf->secondary.crc = calc_crc(&ddf->secondary, 512);
+
+               sector = size - 16*1024*2;
+               lseek64(fd, sector<<9, 0);
+               if (write(fd, &ddf->primary, 512) < 0)
+                       continue;
+
+               ddf->controller.crc = calc_crc(&ddf->controller, 512);
+               if (write(fd, &ddf->controller, 512) < 0)
+                       continue;
+
+               ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize);
+
+               if (write(fd, ddf->phys, ddf->pdsize) < 0)
+                       continue;
+
+               ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize);
+               if (write(fd, ddf->virt, ddf->vdsize) < 0)
+                       continue;
+
+               /* Now write lots of config records. */
+               n_config = ddf->max_part;
+               conf_size = ddf->conf_rec_len * 512;
+               for (i = 0 ; i <= n_config ; i++) {
+                       struct vcl *c;
+
+                       /* The last slot is the spare record.  Do not touch
+                        * vlist[n_config]: vlist only has max_part entries.
+                        */
+                       if (i == n_config)
+                               c = (struct vcl*)d->spare;
+                       else
+                               c = d->vlist[i];
+
+                       if (c) {
+                               c->conf.crc = calc_crc(&c->conf, conf_size);
+                               if (write(fd, &c->conf, conf_size) < 0)
+                                       break;
+                       } else {
+                               /* Unused slot: pad with 0xff from a
+                                * 512-byte aligned static buffer, one
+                                * buffer-full at a time.
+                                */
+                               char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL);
+                               int chunk = sizeof(null_conf)-512;
+                               int togo = conf_size;
+                               int failed = 0;
+
+                               if (null_conf[0] != 0xff)
+                                       memset(null_conf, 0xff, sizeof(null_conf));
+                               while (togo > chunk) {
+                                       if (write(fd, null_aligned, chunk) < 0) {
+                                               failed = 1;
+                                               break;
+                                       }
+                                       togo -= chunk;
+                               }
+                               /* A failed chunk must abandon this device;
+                                * writing the remainder would use a length
+                                * larger than the buffer.
+                                */
+                               if (failed || write(fd, null_aligned, togo) < 0)
+                                       break;
+                       }
+               }
+               if (i <= n_config)
+                       continue;
+               d->disk.crc = calc_crc(&d->disk, 512);
+               if (write(fd, &d->disk, 512) < 0)
+                       continue;
+
+               /* Maybe do the same for secondary */
+
+               lseek64(fd, (size-1)*512, SEEK_SET);
+               if (write(fd, &ddf->anchor, 512) < 0)
+                       continue;
+               successes++;
+       }
+
+       if (do_close)
+               for (d = ddf->dlist; d; d=d->next) {
+                       if (d->fd >= 0)
+                               close(d->fd);
+                       d->fd = -1;
+               }
+
+       return attempts != successes;
+}
+
+/* write_init_super method.
+ *
+ * Without st->update_tail the metadata is written directly to the
+ * devices via __write_init_super_ddf(st, 1).
+ *
+ * With st->update_tail the change is queued with
+ * append_metadata_update() instead:
+ *  - when a disk was added to the container (no currentconf), the
+ *    phys_disk record prepared on ddf->add_list is queued;
+ *  - when a VD was created, a one-entry virtual_disk record and a copy
+ *    of the vd_config are queued.
+ *
+ * Returns 0 on success, non-zero on failure (including failure to
+ * allocate the update buffers).
+ */
+static int write_init_super_ddf(struct supertype *st)
+{
+       struct ddf_super *ddf = st->sb;
+       struct virtual_disk *vd;
+       struct vd_config *vc;
+       int vd_len, vc_len;
+
+       if (!st->update_tail)
+               return __write_init_super_ddf(st, 1);
+
+       /* queue the virtual_disk and vd_config as metadata updates */
+
+       if (!ddf->currentconf) {
+               int len = (sizeof(struct phys_disk) +
+                          sizeof(struct phys_disk_entry));
+
+               /* adding a disk to the container. */
+               if (!ddf->add_list)
+                       return 0;
+
+               append_metadata_update(st, ddf->add_list->mdupdate, len);
+               ddf->add_list->mdupdate = NULL;
+               return 0;
+       }
+
+       /* Newly created VD */
+
+       /* Allocate both buffers up front so that nothing has been
+        * queued yet if either allocation fails.
+        */
+       vd_len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry);
+       vc_len = ddf->conf_rec_len * 512;
+       vd = malloc(vd_len);
+       vc = malloc(vc_len);
+       if (vd == NULL || vc == NULL) {
+               free(vd);
+               free(vc);
+               return 1;
+       }
+
+       /* First the virtual disk.  We have a slightly fake header:
+        * populated_vdes carries the entry number being updated.
+        */
+       *vd = *ddf->virt;
+       vd->entries[0] = ddf->virt->entries[ddf->currentconf->vcnum];
+       vd->populated_vdes = __cpu_to_be16(ddf->currentconf->vcnum);
+       append_metadata_update(st, vd, vd_len);
+
+       /* Then the vd_config */
+       memcpy(vc, &ddf->currentconf->conf, vc_len);
+       append_metadata_update(st, vc, vc_len);
+
+       /* FIXME I need to close the fds! */
+       return 0;
+}
+
+#endif
+
+/* Usable size of a device of 'devsize' sectors once the trailing
+ * 32Meg reserved for ddf metadata has been taken off.  Devices that
+ * are no bigger than the reservation have no usable space.
+ */
+static __u64 avail_size_ddf(struct supertype *st, __u64 devsize)
+{
+       /* 32Meg expressed in 512-byte sectors */
+       const __u64 reserved = 32*1024*2;
+
+       return devsize > reserved ? devsize - reserved : 0;
+}
+
+#ifndef MDASSEMBLE
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose);
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose);
+
+/* Decide whether an array with the given geometry can be created on
+ * 'dev' using ddf metadata.
+ *
+ * - level == LEVEL_CONTAINER: handled by validate_geometry_ddf_container().
+ * - st->sb already loaded: handled by validate_geometry_ddf_bvd().
+ * - dev == NULL: only check that 'level' appears in ddf_level_num.
+ * - otherwise 'dev' must be a member of a running ddf container; the
+ *   container is loaded into st->sb and the request is passed on to
+ *   validate_geometry_ddf_bvd().
+ *
+ * Returns 1 if the geometry is acceptable, 0 if not.
+ * NOTE(review): the mdinfo returned by sysfs_read() is not released
+ * here; confirm who owns it.
+ */
+static int validate_geometry_ddf(struct supertype *st,
+                                int level, int layout, int raiddisks,
+                                int chunk, unsigned long long size,
+                                char *dev, unsigned long long *freesize,
+                                int verbose)
+{
+       int fd;
+       struct mdinfo *sra;
+       int cfd;
+
+       /* ddf potentially supports lots of things, but it depends on
+        * what devices are offered (and maybe kernel version?)
+        * If given unused devices, we will make a container.
+        * If given devices in a container, we will make a BVD.
+        * If given BVDs, we make an SVD, changing all the GUIDs in the process.
+        */
+
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_ddf_container(st, level, layout,
+                                                      raiddisks, chunk,
+                                                      size, dev, freesize,
+                                                      verbose);
+       }
+
+       if (st->sb) {
+               /* A container has already been opened, so we are
+                * creating in there.  Maybe a BVD, maybe an SVD.
+                * Should make a distinction one day.
+                */
+               return validate_geometry_ddf_bvd(st, level, layout, raiddisks,
+                                                chunk, size, dev, freesize,
+                                                verbose);
+       }
+       if (!dev) {
+               /* Initial sanity check.  Exclude illegal levels. */
+               int i;
+               for (i=0; ddf_level_num[i].num1 != MAXINT; i++)
+                       if (ddf_level_num[i].num2 == level)
+                               break;
+               if (ddf_level_num[i].num1 == MAXINT)
+                       return 0;
+               /* Should check layout? etc */
+               return 1;
+       }
+
+       /* This is the first device for the array.
+        * If it is a container, we read it in and do automagic allocations,
+        * no other devices should be given.
+        * Otherwise it must be a member device of a container, and we
+        * do manual allocation.
+        * Later we should check for a BVD and make an SVD.
+        */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               sra = sysfs_read(fd, 0, GET_VERSION);
+               close(fd);
+               if (sra && sra->array.major_version == -1 &&
+                   strcmp(sra->text_version, "ddf") == 0) {
+
+                       /* load super */
+                       /* find space for 'n' devices. */
+                       /* remember the devices */
+                       /* Somehow return the fact that we have enough */
+               }
+
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": ddf: Cannot create this array "
+                               "on device %s\n",
+                               dev);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe a 'ddf' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot use %s: %s\n",
+                               dev, strerror(EBUSY));
+               return 0;
+       }
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+       if (sra && sra->array.major_version == -1 &&
+           strcmp(sra->text_version, "ddf") == 0) {
+               /* This is a member of a ddf container.  Load the container
+                * and try to create a bvd
+                */
+               struct ddf_super *ddf;
+               if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) {
+                       st->sb = ddf;
+                       st->container_dev = fd2devnum(cfd);
+                       close(cfd);
+                       return validate_geometry_ddf_bvd(st, level, layout,
+                                                        raiddisks, chunk, size,
+                                                        dev, freesize,
+                                                        verbose);
+               }
+               close(cfd);
+       } else {
+               /* device may belong to a different container;
+                * do not leak the container descriptor on the way out
+                */
+               close(cfd);
+               return 0;
+       }
+
+       return 1;
+}
+
+/* Check that 'dev' can become a member of a new ddf container.
+ * Only LEVEL_CONTAINER is accepted.  With no device given the answer
+ * is simply "yes"; otherwise the device must open exclusively and
+ * report a size, and *freesize is set to the space left once the
+ * metadata reservation is removed.
+ * Returns 1 if acceptable, 0 if not.
+ */
+static int
+validate_geometry_ddf_container(struct supertype *st,
+                               int level, int layout, int raiddisks,
+                               int chunk, unsigned long long size,
+                               char *dev, unsigned long long *freesize,
+                               int verbose)
+{
+       unsigned long long ldsize;
+       int have_size;
+       int fd;
+
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       if (dev == NULL)
+               return 1;
+
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* the descriptor is only needed for the size query */
+       have_size = get_dev_size(fd, dev, &ldsize) != 0;
+       close(fd);
+       if (!have_size)
+               return 0;
+
+       /* ldsize is in bytes; avail_size_ddf works in sectors */
+       *freesize = avail_size_ddf(st, ldsize >> 9);
+
+       return 1;
+}
+
+/* Check whether a BVD with the given geometry can be created inside
+ * the ddf container already loaded in st->sb.
+ *
+ * With dev == NULL this is a general test: at least 'raiddisks'
+ * devices in the container must each have a free gap of 'size'
+ * sectors (8 if size is 0).
+ * With a device given, it must be a block device that is a member of
+ * the container, and *freesize is set to the largest free gap found
+ * on it (0 if its extents cannot be determined).
+ *
+ * Returns 1 if acceptable, 0 if not.
+ */
+static int validate_geometry_ddf_bvd(struct supertype *st,
+                                    int level, int layout, int raiddisks,
+                                    int chunk, unsigned long long size,
+                                    char *dev, unsigned long long *freesize,
+                                    int verbose)
+{
+       struct stat stb;
+       struct ddf_super *ddf = st->sb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+       /* ddf/bvd supports lots of things, but not containers */
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       /* We must have the container info already read in. */
+       if (!ddf)
+               return 0;
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size'.
+                */
+               unsigned long long minsize = size;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = 8;
+               for (dl = ddf->dlist; dl ; dl = dl->next)
+               {
+                       int found = 0;
+                       pos = 0;
+
+                       i = 0;
+                       e = get_extents(ddf, dl);
+                       if (!e) continue;
+                       /* walk the used extents; the list ends with an
+                        * entry of size 0, and the gap before each entry
+                        * is free space
+                        */
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr,
+                                       Name ": ddf: Not enough devices with "
+                                       "space for this array (%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = ddf->dlist ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": ddf: %s is not in the "
+                               "same DDF set\n",
+                               dev);
+               return 0;
+       }
+       e = get_extents(ddf, dl);
+       maxsize = 0;
+       i = 0;
+       if (e) do {
+               unsigned long long esize;
+               esize = e[i].start - pos;
+               if (esize >= maxsize)
+                       maxsize = esize;
+               pos = e[i].start + e[i].size;
+               i++;
+       } while (e[i-1].size);
+       /* the extent list is ours to release, as in the general test above */
+       free(e);
+       *freesize = maxsize;
+       // FIXME here I am
+
+       return 1;
+}
+
+/* Load the ddf metadata for the whole container open on 'fd'.
+ *
+ * The member devices are found through sysfs.  The device whose
+ * active header has the highest sequence number (one less if the
+ * header is marked open) supplies the headers and global sections;
+ * then the device-local sections are loaded from every member.  With
+ * keep_fd set the members are opened O_RDWR and the descriptors are
+ * handed to load_ddf_local() rather than closed here.
+ *
+ * If st->subarray names a subarray, super->currentconf is pointed at
+ * the matching configuration.
+ *
+ * On success returns 0, stores the new ddf_super in *sbp and marks
+ * st as having a loaded container.  Returns 1 if the array is not a
+ * ddf container or loading failed, 2 if a member device could not be
+ * opened.
+ * NOTE(review): failures after the global sections have been loaded
+ * still leave 'super' (and anything hanging off it) allocated, and the
+ * mdinfo from sysfs_read() is never released - confirm and clean up
+ * with the proper release helpers.
+ */
+static int load_super_ddf_all(struct supertype *st, int fd,
+                             void **sbp, char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct ddf_super *super;
+       struct mdinfo *sd, *best = NULL;
+       int bestseq = 0;
+       int seq;
+       char nm[20];
+       int dfd;
+
+       sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+       if (!sra)
+               return 1;
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "ddf") != 0)
+               return 1;
+
+       if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0)
+               return 1;
+       memset(super, 0, sizeof(*super));
+
+       /* first, try each device, and choose the best ddf */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               int rv;
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, O_RDONLY);
+               if (dfd < 0) {
+                       /* only headers have been read so far */
+                       free(super);
+                       return 2;
+               }
+               rv = load_ddf_headers(dfd, super, NULL);
+               close(dfd);
+               if (rv == 0) {
+                       seq = __be32_to_cpu(super->active->seq);
+                       if (super->active->openflag)
+                               seq--;
+                       if (!best || seq > bestseq) {
+                               bestseq = seq;
+                               best = sd;
+                       }
+               }
+       }
+       if (!best) {
+               free(super);
+               return 1;
+       }
+       /* OK, load this ddf */
+       sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0) {
+               free(super);
+               return 1;
+       }
+       load_ddf_headers(dfd, super, NULL);
+       load_ddf_global(dfd, super, NULL);
+       close(dfd);
+       /* Now we need the device-local bits */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               int rv;
+
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0)
+                       return 2;
+               rv = load_ddf_headers(dfd, super, NULL);
+               if (rv == 0)
+                       rv = load_ddf_local(dfd, super, NULL, keep_fd);
+               if (!keep_fd) close(dfd);
+               if (rv)
+                       return 1;
+       }
+       if (st->subarray[0]) {
+               struct vcl *v;
+               /* the wanted subarray number does not change while we scan */
+               int wanted = atoi(st->subarray);
+
+               for (v = super->conflist; v; v = v->next)
+                       if (v->vcnum == wanted)
+                               super->currentconf = v;
+               if (!super->currentconf)
+                       return 1;
+       }
+       *sbp = super;
+       if (st->ss == NULL) {
+               st->ss = &super_ddf;
+               st->minor_version = 0;
+               st->max_devs = 512;
+               st->container_dev = fd2devnum(fd);
+       }
+       st->loaded_container = 1;
+       return 0;
+}
+#endif /* MDASSEMBLE */
+
+/* Given a container loaded by load_super_ddf_all, extract information
+ * about all the arrays into an mdinfo list.
+ *
+ * For each vcl in conflist an mdinfo is created and filled in from
+ * the vd_config and the matching virtual_entry; then for every
+ * populated phys_refnum slot the matching device in dlist (if any) is
+ * described by a device mdinfo linked on ->devs.
+ *
+ * Returns the head of the list (most recently processed array first),
+ * or NULL if the container holds no configurations.
+ * NOTE(review): malloc() results are not checked here.
+ */
+static struct mdinfo *container_content_ddf(struct supertype *st)
+{
+       struct ddf_super *ddf = st->sb;
+       struct mdinfo *rest = NULL;
+       struct vcl *vc;
+
+       for (vc = ddf->conflist ; vc ; vc=vc->next)
+       {
+               int i;
+               struct mdinfo *this;
+               this = malloc(sizeof(*this));
+               memset(this, 0, sizeof(*this));
+               this->next = rest;
+               rest = this;
+
+               this->array.level = map_num1(ddf_level_num, vc->conf.prl);
+               this->array.raid_disks =
+                       __be16_to_cpu(vc->conf.prim_elmnt_count);
+               this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl,
+                                                  this->array.raid_disks);
+               this->array.md_minor      = -1;
+               this->array.major_version = -1;
+               this->array.minor_version = -2;
+               this->array.ctime         = DECADE +
+                       __be32_to_cpu(*(__u32*)(vc->conf.guid+16));
+               this->array.utime         = DECADE +
+                       __be32_to_cpu(vc->conf.timestamp);
+               this->array.chunk_size    = 512 << vc->conf.chunk_shift;
+
+               /* An array that is flagged inconsistent, or not fully
+                * initialised, is reported dirty and needing a resync
+                * from the start.
+                */
+               i = vc->vcnum;
+               if ((ddf->virt->entries[i].state & DDF_state_inconsistent) ||
+                   (ddf->virt->entries[i].init_state & DDF_initstate_mask) !=
+                   DDF_init_full) {
+                       this->array.state = 0;
+                       this->resync_start = 0;
+               } else {
+                       this->array.state = 1;
+                       this->resync_start = ~0ULL;
+               }
+               memcpy(this->name, ddf->virt->entries[i].name, 32);
+               this->name[32]=0;
+
+               memset(this->uuid, 0, sizeof(this->uuid));
+               this->component_size = __be64_to_cpu(vc->conf.blocks);
+               this->array.size = this->component_size / 2;
+               this->container_member = i;
+
+               /* uuid_from_super_ddf works on the current configuration */
+               ddf->currentconf = vc;
+               uuid_from_super_ddf(st, this->uuid);
+               ddf->currentconf = NULL;
+
+               sprintf(this->text_version, "/%s/%d",
+                       devnum2devname(st->container_dev),
+                       this->container_member);
+
+               for (i=0 ; i < ddf->mppe ; i++) {
+                       struct mdinfo *dev;
+                       struct dl *d;
+
+                       if (vc->conf.phys_refnum[i] == 0xFFFFFFFF)
+                               continue;
+
+                       this->array.working_disks++;
+
+                       for (d = ddf->dlist; d ; d=d->next)
+                               if (d->disk.refnum == vc->conf.phys_refnum[i])
+                                       break;
+                       if (d == NULL)
+                               /* This member is not among the devices we
+                                * have; the remaining members may still be.
+                                */
+                               continue;
+
+                       dev = malloc(sizeof(*dev));
+                       memset(dev, 0, sizeof(*dev));
+                       dev->next = this->devs;
+                       this->devs = dev;
+
+                       dev->disk.number = __be32_to_cpu(d->disk.refnum);
+                       dev->disk.major = d->major;
+                       dev->disk.minor = d->minor;
+                       dev->disk.raid_disk = i;
+                       dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+
+                       dev->events = __be32_to_cpu(ddf->primary.seq);
+                       dev->data_offset = __be64_to_cpu(vc->lba_offset[i]);
+                       dev->component_size = __be64_to_cpu(vc->conf.blocks);
+                       /* devname is caller-supplied and may be longer
+                        * than the name field: copy with truncation
+                        */
+                       if (d->devname)
+                               snprintf(dev->name, sizeof(dev->name),
+                                        "%s", d->devname);
+               }
+       }
+       return rest;
+}
+
+/* Overwrite the last 512 bytes of the device open on 'fd' with zeros.
+ * Returns 0 on success, 1 if the device size cannot be determined,
+ * the buffer cannot be allocated, or the write fails.
+ */
+static int store_zero_ddf(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *zeros;
+       int rc;
+
+       if (get_dev_size(fd, NULL, &dsize) == 0)
+               return 1;
+
+       /* one zero-filled sector, 512-byte aligned */
+       if (posix_memalign(&zeros, 512, 512) != 0)
+               return 1;
+       memset(zeros, 0, 512);
+
+       lseek64(fd, dsize-512, SEEK_SET);
+       rc = write(fd, zeros, 512);
+       free(zeros);
+
+       return (rc < 0) ? 1 : 0;
+}
+
+/* Compare the superblock in 'tst' against the one in 'st'.
+ *
+ * return:
+ *  0 same, or first was empty, and second was moved into it
+ *  2 wrong uuid (the anchor GUIDs differ)
+ * (1 "wrong number" and 3 "wrong other info" are part of the
+ *  convention but are never produced here.)
+ */
+static int compare_super_ddf(struct supertype *st, struct supertype *tst)
+{
+       struct ddf_super *first = st->sb;
+       struct ddf_super *second = tst->sb;
+
+       if (first == NULL) {
+               /* nothing to compare against yet: adopt tst's superblock */
+               st->sb = tst->sb;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       /* FIXME should I look at anything else? */
+       return memcmp(first->anchor.guid, second->anchor.guid,
+                     DDF_GUID_LEN) ? 2 : 0;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * A new array 'a' has been started which claims to be instance 'inst'
+ * within container 'c'.
+ * We need to confirm that the array matches the metadata in 'c' so
+ * that we don't corrupt any metadata.
+ */
+/* Record which container member array 'a' claims to be: 'inst' is the
+ * member number as a decimal string.  No check against the metadata
+ * in 'c' is made yet.  Always returns 0.
+ */
+static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
+{
+       int member;
+
+       dprintf("ddf: open_new %s\n", inst);
+       member = atoi(inst);
+       a->info.container_member = member;
+       return 0;
+}
+
+/*
+ * The array 'a' is to be marked clean in the metadata.
+ * If '->resync_start' is not ~(unsigned long long)0, then the array is only
+ * clean up to the point (in sectors).  If that cannot be recorded in the
+ * metadata, then leave it as dirty.
+ *
+ * For DDF, we need to clear the DDF_state_inconsistent bit in the
+ * !global! virtual_disk.virtual_entry structure.
+ */
+/* Update the virtual_entry for array 'a' to reflect 'consistent':
+ * 0 = dirty, 1 = clean, 2 = clean only if the resync has completed.
+ * The init_state field is refreshed from the resync position as well.
+ * ddf->updates_pending is set if either field actually changed.
+ * Returns the resulting consistent value (0 or 1).
+ */
+static int ddf_set_array_state(struct active_array *a, int consistent)
+{
+       struct ddf_super *ddf = a->container->sb;
+       int inst = a->info.container_member;
+       struct virtual_entry *ve = &ddf->virt->entries[inst];
+       int prev;
+
+       if (consistent == 2)
+               /* Should check if a recovery should be started FIXME */
+               consistent = is_resync_complete(a) ? 1 : 0;
+
+       /* the inconsistent flag */
+       prev = ve->state;
+       if (!consistent)
+               ve->state |= DDF_state_inconsistent;
+       else
+               ve->state &= ~DDF_state_inconsistent;
+       if (ve->state != prev)
+               ddf->updates_pending = 1;
+
+       /* the initialisation state: full, not started, or part way */
+       prev = ve->init_state;
+       ve->init_state &= ~DDF_initstate_mask;
+       if (is_resync_complete(a))
+               ve->init_state |= DDF_init_full;
+       else if (a->resync_start == 0)
+               ve->init_state |= DDF_init_not;
+       else
+               ve->init_state |= DDF_init_quick;
+       if (ve->init_state != prev)
+               ddf->updates_pending = 1;
+
+       dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty",
+               a->resync_start);
+       return consistent;
+}
+
+/*
+ * The state of each disk is stored in the global phys_disk structure
+ * in phys_disk.entries[n].state.
+ * This makes various combinations awkward.
+ * - When a device fails in any array, it must be failed in all arrays
+ *   that include a part of this device.
+ * - When a component is rebuilding, we cannot include it officially in the
+ *   array unless this is the only array that uses the device.
+ *
+ * So: when transitioning:
+ *   Online -> failed,  just set failed flag.  monitor will propagate
+ *   spare -> online,   the device might need to be added to the array.
+ *   spare -> failed,   just set failed.  Don't worry if in array or not.
+ *
+ * ddf_set_disk() records the new state of one member of array 'a':
+ *   a     - the array whose member changed
+ *   n     - index into the phys_refnum[] table of the array's vd_config
+ *   state - DS_FAULTY / DS_INSYNC bits describing the member
+ * It then recomputes the optimal/degraded/failed state of the virtual
+ * disk and sets ddf->updates_pending whenever the metadata was changed.
+ * Nothing is done if no vd_config can be found for the array.
+ */
+static void ddf_set_disk(struct active_array *a, int n, int state)
+{
+       struct ddf_super *ddf = a->container->sb;
+       int inst = a->info.container_member;
+       struct vd_config *vc = find_vdcr(ddf, inst);
+       int pd;
+       int i, st, working;
+
+       if (vc == NULL) {
+               dprintf("ddf: cannot find instance %d!!\n", inst);
+               return;
+       }
+       /* vc must be known to be valid before phys_refnum[] is read */
+       pd = find_phys(ddf, vc->phys_refnum[n]);
+       if (pd < 0) {
+               /* disk doesn't currently exist. If it is now in_sync,
+                * insert it. */
+               if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) {
+                       /* Find dev 'n' in a->info->devs, determine the
+                        * ddf refnum, and set vc->phys_refnum and update
+                        * phys->entries[]
+                        */
+                       /* FIXME */
+               }
+       } else {
+               int old = ddf->phys->entries[pd].state;
+               if (state & DS_FAULTY)
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Failed);
+               if (state & DS_INSYNC) {
+                       ddf->phys->entries[pd].state  |= __cpu_to_be16(DDF_Online);
+                       ddf->phys->entries[pd].state  &= __cpu_to_be16(~DDF_Rebuilding);
+               }
+               if (old != ddf->phys->entries[pd].state)
+                       ddf->updates_pending = 1;
+       }
+
+       dprintf("ddf: set_disk %d to %x\n", n, state);
+
+       /* Now we need to check the state of the array and update
+        * virtual_disk.entries[inst].state.
+        * It needs to be one of "optimal", "degraded", "failed".
+        * I don't understand 'deleted' or 'missing'.
+        */
+       /* A member counts as working only if it is Online and neither
+        * Failed nor Rebuilding. */
+       working = 0;
+       for (i=0; i < a->info.array.raid_disks; i++) {
+               pd = find_phys(ddf, vc->phys_refnum[i]);
+               if (pd < 0)
+                       continue;
+               st = __be16_to_cpu(ddf->phys->entries[pd].state);
+               if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding))
+                   == DDF_Online)
+                       working++;
+       }
+       /* 'state' is reused from here on for the new virtual-disk state */
+       state = DDF_state_degraded;
+       if (working == a->info.array.raid_disks)
+               state = DDF_state_optimal;
+       else switch(vc->prl) {
+       case DDF_RAID0:
+       case DDF_CONCAT:
+       case DDF_JBOD:
+               /* no redundancy: any missing member is fatal */
+               state = DDF_state_failed;
+               break;
+       case DDF_RAID1:
+               if (working == 0)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID4:
+       case DDF_RAID5:
+               if (working < a->info.array.raid_disks-1)
+                       state = DDF_state_failed;
+               break;
+       case DDF_RAID6:
+               if (working < a->info.array.raid_disks-2)
+                       state = DDF_state_failed;
+               else if (working == a->info.array.raid_disks-1)
+                       state = DDF_state_part_optimal;
+               break;
+       }
+
+       if (ddf->virt->entries[inst].state !=
+           ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+            | state)) {
+
+               ddf->virt->entries[inst].state =
+                       (ddf->virt->entries[inst].state & ~DDF_state_mask)
+                       | state;
+               ddf->updates_pending = 1;
+       }
+
+}
+
+static void ddf_sync_metadata(struct supertype *st)
+{
+       struct ddf_super *ddf = st->sb;
+
+       /*
+        * Everything is written to every device whenever something is
+        * pending.  Tracking local versus global changes might allow
+        * less to be written one day, but ddf is sufficiently weird
+        * that it probably always changes global data ....
+        */
+       if (ddf->updates_pending) {
+               /* clear the flag first, then write */
+               ddf->updates_pending = 0;
+               __write_init_super_ddf(st, 0);
+               dprintf("ddf: sync_metadata\n");
+       }
+}
+
+static void ddf_process_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* Apply this update to the metadata.
+        * The first 4 bytes are a DDF_*_MAGIC which guides
+        * our actions.
+        * Possible update are:
+        *  DDF_PHYS_RECORDS_MAGIC
+        *    Add a new physical device.  Changes to this record
+        *    only happen implicitly.
+        *    used_pdes is the device number.
+        *  DDF_VIRT_RECORDS_MAGIC
+        *    Add a new VD.  Possibly also change the 'access' bits.
+        *    populated_vdes is the entry number.
+        *  DDF_VD_CONF_MAGIC
+        *    New or updated VD.  the VIRT_RECORD must already
+        *    exist.  For an update, phys_refnum and lba_offset
+        *    (at least) are updated, and the VD_CONF must
+        *    be written to precisely those devices listed with
+        *    a phys_refnum.
+        *  DDF_SPARE_ASSIGN_MAGIC
+        *    replacement Spare Assignment Record... but for which device?
+        *
+        * So, e.g.:
+        *  - to create a new array, we send a VIRT_RECORD and
+        *    a VD_CONF.  Then assemble and start the array.
+        *  - to activate a spare we send a VD_CONF to add the phys_refnum
+        *    and offset.  This will also mark the spare as active with
+        *    a spare-assignment record.
+        *
+        * An update is silently ignored when update->len does not match
+        * the expected record size, when the slot number is not below
+        * max_pdes / max_vdes, or when the target slot's guid fails
+        * all_ff() (presumably meaning the slot is already in use).
+        * DDF_SPARE_ASSIGN_MAGIC and unknown magics currently do nothing.
+        * Every update that is applied sets ddf->updates_pending.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 *magic = (__u32*)update->buf;
+       struct phys_disk *pd;
+       struct virtual_disk *vd;
+       struct vd_config *vc;
+       struct vcl *vcl;
+       struct dl *dl;
+       int mppe;
+       int ent;
+
+       dprintf("Process update %x\n", *magic);
+
+       switch (*magic) {
+       case DDF_PHYS_RECORDS_MAGIC:
+
+               if (update->len != (sizeof(struct phys_disk) +
+                                   sizeof(struct phys_disk_entry)))
+                       return;
+               pd = (struct phys_disk*)update->buf;
+
+               ent = __be16_to_cpu(pd->used_pdes);
+               if (ent >= __be16_to_cpu(ddf->phys->max_pdes))
+                       return;
+               if (!all_ff(ddf->phys->entries[ent].guid))
+                       return;
+               ddf->phys->entries[ent] = pd->entries[0];
+               ddf->phys->used_pdes = __cpu_to_be16(1 +
+                                          __be16_to_cpu(ddf->phys->used_pdes));
+               ddf->updates_pending = 1;
+               if (ddf->add_list) {
+                       /* move the first disk on add_list to the front
+                        * of dlist */
+                       struct active_array *a;
+                       struct dl *al = ddf->add_list;
+                       ddf->add_list = al->next;
+
+                       al->next = ddf->dlist;
+                       ddf->dlist = al;
+
+                       /* As a device has been added, we should check
+                        * for any degraded devices that might make
+                        * use of this spare */
+                       for (a = st->arrays ; a; a=a->next)
+                               a->check_degraded = 1;
+               }
+               break;
+
+       case DDF_VIRT_RECORDS_MAGIC:
+
+               if (update->len != (sizeof(struct virtual_disk) +
+                                   sizeof(struct virtual_entry)))
+                       return;
+               vd = (struct virtual_disk*)update->buf;
+
+               ent = __be16_to_cpu(vd->populated_vdes);
+               if (ent >= __be16_to_cpu(ddf->virt->max_vdes))
+                       return;
+               if (!all_ff(ddf->virt->entries[ent].guid))
+                       return;
+               ddf->virt->entries[ent] = vd->entries[0];
+               ddf->virt->populated_vdes = __cpu_to_be16(1 +
+                             __be16_to_cpu(ddf->virt->populated_vdes));
+               ddf->updates_pending = 1;
+               break;
+
+       case DDF_VD_CONF_MAGIC:
+               dprintf("len %d %d\n", update->len, ddf->conf_rec_len);
+
+               mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries);
+               if (update->len != ddf->conf_rec_len * 512)
+                       return;
+               vc = (struct vd_config*)update->buf;
+               for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+                       if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+                               break;
+               dprintf("vcl = %p\n", vcl);
+               if (vcl) {
+                       /* An update, just copy the phys_refnum and lba_offset
+                        * fields
+                        * (mppe __u32 refnums followed by mppe __u64 offsets,
+                        * copied as one contiguous region)
+                        */
+                       memcpy(vcl->conf.phys_refnum, vc->phys_refnum,
+                              mppe * (sizeof(__u32) + sizeof(__u64)));
+               } else {
+                       /* A new VD_CONF */
+                       if (!update->space)
+                               return;
+                       vcl = update->space; /* buffer set up by ddf_prepare_update() */
+                       update->space = NULL;
+                       vcl->next = ddf->conflist;
+                       memcpy(&vcl->conf, vc, update->len);
+                       vcl->lba_offset = (__u64*)
+                               &vcl->conf.phys_refnum[mppe];
+                       ddf->conflist = vcl;
+               }
+               /* Now make sure vlist is correct for each dl. */
+               for (dl = ddf->dlist; dl; dl = dl->next) {
+                       int dn;
+                       int vn = 0;
+                       for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+                               for (dn=0; dn < ddf->mppe ; dn++) /* NOTE(review): uses
+                                        * ddf->mppe while the code above uses the local
+                                        * 'mppe' read from the anchor; presumably equal */
+                                       if (vcl->conf.phys_refnum[dn] ==
+                                           dl->disk.refnum) {
+                                               dprintf("dev %d has %p at %d\n",
+                                                       dl->pdnum, vcl, vn);
+                                               dl->vlist[vn++] = vcl; /* NOTE(review): vn
+                                                * is not checked against ddf->max_part
+                                                * here; confirm vlist[] cannot overflow */
+                                               break;
+                                       }
+                       while (vn < ddf->max_part)
+                               dl->vlist[vn++] = NULL;
+                       /* Re-derive the disk's type bits: Active_in_VD if
+                        * any config references it, Spare if it has a spare
+                        * record, Global_Spare if neither. */
+                       if (dl->vlist[0]) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Active_in_VD);
+                       }
+                       if (dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Spare);
+                       }
+                       if (!dl->vlist[0] && !dl->spare) {
+                               ddf->phys->entries[dl->pdnum].type |=
+                                       __cpu_to_be16(DDF_Global_Spare);
+                               ddf->phys->entries[dl->pdnum].type &=
+                                       ~__cpu_to_be16(DDF_Spare |
+                                                      DDF_Active_in_VD);
+                       }
+               }
+               ddf->updates_pending = 1;
+               break;
+       case DDF_SPARE_ASSIGN_MAGIC:
+       default: break;
+       }
+}
+
+static void ddf_prepare_update(struct supertype *st,
+                              struct metadata_update *update)
+{
+       /* Called in managemon for an update that is about to be handed
+        * to monitor, so that any memory the update will need is
+        * allocated here.  Only DDF_VD_CONF_MAGIC updates need any: a
+        * 512-aligned buffer big enough for a struct vcl header plus one
+        * configuration record.  update->space is set to NULL if the
+        * allocation fails.
+        */
+       struct ddf_super *ddf = st->sb;
+       __u32 *magic = (__u32*)update->buf;
+
+       if (*magic != DDF_VD_CONF_MAGIC)
+               return;
+
+       if (posix_memalign(&update->space, 512,
+                          offsetof(struct vcl, conf)
+                          + ddf->conf_rec_len * 512) != 0)
+               update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Only worry about BVDs at the moment.
+ *
+ * Returns a malloc()ed list of mdinfo linked through ->next, or NULL
+ * when the array is not degraded, has failed, is waiting for a faulty
+ * device to be removed, no suitable spare exists, or memory runs out.
+ * On success the new update is pushed onto the front of '*updates'.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+                                        struct metadata_update **updates)
+{
+       int working = 0;
+       struct mdinfo *d;
+       struct ddf_super *ddf = a->container->sb;
+       int global_ok = 0;
+       struct mdinfo *rv = NULL;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       int i;
+       struct vd_config *vc;
+       __u64 *lba;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       working ++;
+       }
+
+       dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+               a->info.array.level);
+       if (working == a->info.array.raid_disks)
+               return NULL; /* array not degraded */
+       switch (a->info.array.level) {
+       case 1:
+               if (working == 0)
+                       return NULL; /* failed */
+               break;
+       case 4:
+       case 5:
+               if (working < a->info.array.raid_disks - 1)
+                       return NULL; /* failed */
+               break;
+       case 6:
+               if (working < a->info.array.raid_disks - 2)
+                       return NULL; /* failed */
+               break;
+       default: /* concat or stripe */
+               return NULL; /* failed */
+       }
+
+       /* For each slot, if it is not working, find a spare */
+       dl = ddf->dlist;
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /* OK, this device needs recovery.  Find a spare */
+       again:
+               for ( ; dl ; dl = dl->next) {
+                       unsigned long long esize;
+                       unsigned long long pos;
+                       struct mdinfo *d2;
+                       int is_global = 0;
+                       int is_dedicated = 0;
+                       struct extent *ex;
+                       int j;
+                       /* If in this array, skip */
+                       for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor) {
+                                       dprintf("%x:%x already in array\n", dl->major, dl->minor);
+                                       break;
+                               }
+                       if (d2)
+                               continue;
+                       /* If already chosen for an earlier slot in this
+                        * call, skip: the scan resumes at the device that
+                        * was picked last, and it is not in a->info.devs
+                        * yet. */
+                       for (d2 = rv ; d2 ; d2 = d2->next)
+                               if (d2->disk.major == dl->major &&
+                                   d2->disk.minor == dl->minor)
+                                       break;
+                       if (d2)
+                               continue;
+                       if (ddf->phys->entries[dl->pdnum].type &
+                           __cpu_to_be16(DDF_Spare)) {
+                               /* Check spare assign record */
+                               if (dl->spare) {
+                                       if (dl->spare->type & DDF_spare_dedicated) {
+                                               /* check spare_ents for guid */
+                                               for (j = 0 ;
+                                                    j < __be16_to_cpu(dl->spare->populated);
+                                                    j++) {
+                                                       if (memcmp(dl->spare->spare_ents[j].guid,
+                                                                  ddf->virt->entries[a->info.container_member].guid,
+                                                                  DDF_GUID_LEN) == 0)
+                                                               is_dedicated = 1;
+                                               }
+                                       } else
+                                               is_global = 1;
+                               }
+                       } else if (ddf->phys->entries[dl->pdnum].type &
+                                  __cpu_to_be16(DDF_Global_Spare)) {
+                               is_global = 1;
+                       }
+                       if ( ! (is_dedicated ||
+                               (is_global && global_ok))) {
+                               dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+                                      is_dedicated, is_global);
+                               continue;
+                       }
+
+                       /* We are allowed to use this device - is there space?
+                        * We need a->info.component_size sectors */
+                       ex = get_extents(ddf, dl);
+                       if (!ex) {
+                               dprintf("cannot get extents\n");
+                               continue;
+                       }
+                       j = 0; pos = 0;
+                       esize = 0;
+
+                       /* Walk the extent list with its own index 'j',
+                        * measuring the gap in front of each extent; the
+                        * slot index 'i' must not be touched here.  The
+                        * walk stops after an entry whose size is 0. */
+                       do {
+                               esize = ex[j].start - pos;
+                               if (esize >= a->info.component_size)
+                                       break;
+                               pos = ex[j].start + ex[j].size;
+                               j++;
+                       } while (ex[j-1].size);
+
+                       free(ex);
+                       if (esize < a->info.component_size) {
+                               dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+                                       esize, a->info.component_size);
+                               /* No room */
+                               continue;
+                       }
+
+                       /* Cool, we have a device with some space at pos */
+                       di = malloc(sizeof(*di));
+                       if (!di)
+                               continue;
+                       memset(di, 0, sizeof(*di));
+                       di->disk.number = i;
+                       di->disk.raid_disk = i;
+                       di->disk.major = dl->major;
+                       di->disk.minor = dl->minor;
+                       di->disk.state = 0;
+                       di->data_offset = pos;
+                       di->component_size = a->info.component_size;
+                       /* remembers which phys entry this spare is */
+                       di->container_member = dl->pdnum;
+                       di->next = rv;
+                       rv = di;
+                       dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                               i, pos);
+
+                       break;
+               }
+               if (!dl && ! global_ok) {
+                       /* not enough dedicated spares, try global */
+                       global_ok = 1;
+                       dl = ddf->dlist;
+                       goto again;
+               }
+       }
+
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * phys_refnum and lba_offset values
+        */
+       vc = find_vdcr(ddf, a->info.container_member);
+       mu = vc ? malloc(sizeof(*mu)) : NULL;
+       if (mu && posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) {
+               free(mu);
+               mu = NULL;
+       }
+       if (mu) {
+               mu->buf = malloc(ddf->conf_rec_len * 512);
+               if (!mu->buf) {
+                       free(mu->space);
+                       free(mu);
+                       mu = NULL;
+               }
+       }
+       if (!mu) {
+               /* Nothing can be reported without an update: drop the
+                * spares that were selected. */
+               while (rv) {
+                       struct mdinfo *n = rv->next;
+
+                       free(rv);
+                       rv = n;
+               }
+               return NULL;
+       }
+
+       /* ddf_process_update() only accepts a VD_CONF update whose
+        * length is the record size in bytes. */
+       mu->len = ddf->conf_rec_len * 512;
+       mu->next = *updates;
+       memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+       vc = (struct vd_config*)mu->buf;
+       lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+       for (di = rv ; di ; di = di->next) {
+               /* Each spare has its own phys entry, saved when it was
+                * chosen; 'dl' no longer identifies it (and may be NULL). */
+               vc->phys_refnum[di->disk.raid_disk] =
+                       ddf->phys->entries[di->container_member].refnum;
+               /* NOTE(review): data_offset is stored in host byte order;
+                * confirm whether lba_offset should be big-endian like
+                * the other on-disk fields. */
+               lba[di->disk.raid_disk] = di->data_offset;
+       }
+       *updates = mu;
+       return rv;
+}
+#endif /* MDASSEMBLE */
+
+struct superswitch super_ddf = { /* handler table binding the *_ddf functions */
+#ifndef        MDASSEMBLE
+       .examine_super  = examine_super_ddf,
+       .brief_examine_super = brief_examine_super_ddf,
+       .detail_super   = detail_super_ddf,
+       .brief_detail_super = brief_detail_super_ddf,
+       .validate_geometry = validate_geometry_ddf,
+       .write_init_super = write_init_super_ddf,
+       .add_to_super   = add_to_super_ddf,
+#endif
+       .match_home     = match_home_ddf,
+       .uuid_from_super= uuid_from_super_ddf,
+       .getinfo_super  = getinfo_super_ddf,
+       .update_super   = update_super_ddf,
+
+       .avail_size     = avail_size_ddf,
+
+       .compare_super  = compare_super_ddf,
+
+       .load_super     = load_super_ddf,
+       .init_super     = init_super_ddf,
+       .store_super    = store_zero_ddf, /* NOTE(review): the only hook not named after its slot - confirm intended */
+       .free_super     = free_super_ddf,
+       .match_metadata_desc = match_metadata_desc_ddf,
+       .container_content = container_content_ddf,
+
+       .external       = 1, /* presumably: metadata is managed from userspace - confirm */
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+       .open_new       = ddf_open_new,
+       .set_array_state= ddf_set_array_state,
+       .set_disk       = ddf_set_disk,
+       .sync_metadata  = ddf_sync_metadata,
+       .process_update = ddf_process_update,
+       .prepare_update = ddf_prepare_update,
+       .activate_spare = ddf_activate_spare,
+#endif
+};
diff --git a/super-intel.c b/super-intel.c
new file mode 100644 (file)
index 0000000..29085a5
--- /dev/null
@@ -0,0 +1,3409 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+
+/* MPB == Metadata Parameter Block */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE)) /* signature length without the NUL; a strlen() call at each use */
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MAX_SIGNATURE_LENGTH  32 /* size of imsm_super.sig[] */
+#define MAX_RAID_SERIAL_LEN   16 /* size of the serial[] and volume[] arrays */
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+
+/* Disk configuration info.
+ * The offsets quoted in the field comments are those of the first entry,
+ * which struct imsm_super places at 0xD8.
+ */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+       __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+       __u32 total_blocks;              /* 0xE8 - 0xEB total blocks */
+       __u32 scsi_id;                   /* 0xEC - 0xEF scsi ID */
+       __u32 status;                    /* 0xF0 - 0xF3 flag bits, see below */
+#define SPARE_DISK      0x01  /* Spare */
+#define CONFIGURED_DISK 0x02  /* Member of some RaidDev */
+#define FAILED_DISK     0x04  /* Permanent failure */
+#define USABLE_DISK     0x08  /* Fully usable unless FAILED_DISK is set */
+
+#define        IMSM_DISK_FILLERS       5
+       __u32 filler[IMSM_DISK_FILLERS]; /* 0xF4 - 0x107 IMSM_DISK_FILLERS words for future expansion */
+};
+
+/* RAID map configuration info.
+ * The structure is variable length: disk_ord_tbl is declared with one
+ * element but holds num_members entries, which is what sizeof_imsm_map()
+ * accounts for.
+ */
+struct imsm_map {
+       __u32 pba_of_lba0;      /* start address of partition */
+       __u32 blocks_per_member;/* blocks per member */
+       __u32 num_data_stripes; /* number of data stripes */
+       __u16 blocks_per_strip;
+       __u8  map_state;        /* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2 /* FIXME: is this correct? */
+#define IMSM_T_STATE_FAILED 3 /* FIXME: is this correct? */
+       __u8  raid_level;
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5         /* since metadata version 1.2.02 ? */
+       __u8  num_members;      /* number of member disks */
+       __u8  reserved[3];
+       __u32 filler[7];        /* expansion area */
+#define IMSM_ORD_REBUILD (1 << 24)
+       __u32 disk_ord_tbl[1];  /* disk_ord_tbl[num_members],
+                                * top byte contains some flags
+                                * (IMSM_ORD_REBUILD is bit 24)
+                                */
+} __attribute__ ((packed));
+
+struct imsm_vol { /* volume state plus its map(s); embedded at the end of struct imsm_dev */
+       __u32 curr_migr_unit;
+       __u32 reserved;
+       __u8  migr_state;       /* Normal or Migrating */
+       __u8  migr_type;        /* Initializing, Rebuilding, ... */
+       __u8  dirty;
+       __u8  fill[1];
+       __u32 filler[5];
+       struct imsm_map map[1]; /* variable length, see sizeof_imsm_map() */
+       /* here comes another one if migr_state
+        * (get_imsm_map() finds it directly behind map[0])
+        */
+} __attribute__ ((packed));
+
+struct imsm_dev { /* one raid device (volume); variable length, see sizeof_imsm_dev() */
+       __u8    volume[MAX_RAID_SERIAL_LEN];
+       __u32 size_low;
+       __u32 size_high;
+       __u32 status;   /* Persistent RaidDev status */
+       __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+#define IMSM_DEV_FILLERS 12
+       __u32 filler[IMSM_DEV_FILLERS];
+       struct imsm_vol vol; /* must stay last: it ends in the variable-length map(s) */
+} __attribute__ ((packed));
+
+struct imsm_super { /* the anchor: fixed header followed by variable-length tables */
+       __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+       __u32 check_sum;                /* 0x20 - 0x23 MPB Checksum */
+       __u32 mpb_size;                 /* 0x24 - 0x27 Size of MPB */
+       __u32 family_num;               /* 0x28 - 0x2B Checksum from first time this config was written */
+       __u32 generation_num;           /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+       __u32 error_log_size;           /* 0x30 - 0x33 in bytes */
+       __u32 attributes;               /* 0x34 - 0x37 */
+       __u8 num_disks;                 /* 0x38 Number of configured disks */
+       __u8 num_raid_devs;             /* 0x39 Number of configured volumes */
+       __u8 error_log_pos;             /* 0x3A  */
+       __u8 fill[1];                   /* 0x3B */
+       __u32 cache_size;               /* 0x3C - 0x3F in mb */
+       __u32 orig_family_num;          /* 0x40 - 0x43 original family num */
+       __u32 pwr_cycle_count;          /* 0x44 - 0x47 simulated power cycle count for array */
+       __u32 bbm_log_size;             /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
+#define IMSM_FILLERS 35
+       __u32 filler[IMSM_FILLERS];     /* 0x4C - 0xD7 RAID_MPB_FILLERS */
+       struct imsm_disk disk[1];       /* 0xD8 diskTbl[numDisks] */
+       /* here comes imsm_dev[num_raid_devs] */
+       /* here comes BBM logs */
+} __attribute__ ((packed));
+
+#define BBM_LOG_MAX_ENTRIES 254 /* size of bbm_log.mapped_block_entries[] */
+
+struct bbm_log_entry { /* one element of bbm_log.mapped_block_entries[] */
+       __u64 defective_block_start;
+#define UNREADABLE 0xFFFFFFFF
+       __u32 spare_block_offset;
+       __u16 remapped_marked_count;
+       __u16 disk_ordinal;
+} __attribute__ ((__packed__));
+
+struct bbm_log { /* presumably the Bad Block Mgmt log sized by imsm_super.bbm_log_size - confirm */
+       __u32 signature; /* 0xABADB10C */
+       __u32 entry_count;
+       __u32 reserved_spare_block_count; /* 0 */
+       __u32 reserved; /* 0xFFFF */
+       __u64 first_spare_lba;
+       struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
+
+#ifndef MDASSEMBLE
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; /* same order as IMSM_T_STATE_* (0..3) */
+#endif
+
+static unsigned int sector_count(__u32 bytes)
+{
+       /* number of 512-byte sectors needed to hold 'bytes', rounded up */
+       const __u32 sector_mask = 512 - 1;
+       __u32 padded = (bytes + sector_mask) & ~sector_mask;
+
+       return padded / 512;
+}
+
+static unsigned int mpb_sectors(struct imsm_super *mpb)
+{
+       /* mpb_size is stored little-endian; convert before rounding up */
+       __u32 mpb_bytes = __le32_to_cpu(mpb->mpb_size);
+
+       return sector_count(mpb_bytes);
+}
+
+/* internal representation of IMSM metadata
+ * 'buf' and 'anchor' share storage through the anonymous union, so they
+ * are two views of the same pointer.
+ */
+struct intel_super {
+       union {
+               void *buf; /* O_DIRECT buffer for reading/writing metadata */
+               struct imsm_super *anchor; /* immovable parameters */
+       };
+       size_t len; /* size of the 'buf' allocation */
+       void *next_buf; /* for realloc'ing buf from the manager */
+       size_t next_len;
+       int updates_pending; /* count of pending updates for mdmon */
+       int creating_imsm; /* flag to indicate container creation */
+       int current_vol; /* index of raid device undergoing creation */
+       #define IMSM_MAX_RAID_DEVS 2
+       struct imsm_dev *dev_tbl[IMSM_MAX_RAID_DEVS]; /* at most IMSM_MAX_RAID_DEVS raid devices */
+       struct dl { /* singly linked list node, one per disk */
+               struct dl *next;
+               int index; /* the key get_imsm_disk() searches for */
+               __u8 serial[MAX_RAID_SERIAL_LEN];
+               int major, minor;
+               char *devname;
+               struct imsm_disk disk; /* what get_imsm_disk() returns a pointer to */
+               int fd;
+       } *disks;
+       struct dl *add; /* list of disks to add while mdmon active */
+       struct dl *missing; /* disks removed while we weren't looking */
+       struct bbm_log *bbm_log;
+};
+
+struct extent { /* a range given by start and size; units presumably sectors - confirm */
+       unsigned long long start, size;
+};
+
+/* definition of messages passed to imsm_process_update
+ * Each imsm_update_* structure below starts with one of these values
+ * in its 'type' member.
+ */
+enum imsm_update_type {
+       update_activate_spare,
+       update_create_array,
+       update_add_disk,
+};
+
+struct imsm_update_activate_spare { /* presumably sent with type == update_activate_spare */
+       enum imsm_update_type type;
+       struct dl *dl;
+       int slot;
+       int array;
+       struct imsm_update_activate_spare *next; /* allows several of these to be chained */
+};
+
+struct imsm_update_create_array { /* presumably sent with type == update_create_array */
+       enum imsm_update_type type;
+       int dev_idx;
+       struct imsm_dev dev; /* variable length (see sizeof_imsm_dev()), so it must stay last */
+};
+
+struct imsm_update_add_disk { /* carries no payload beyond the type */
+       enum imsm_update_type type;
+};
+
+static int imsm_env_devname_as_serial(void)
+{
+       /* 1 only when IMSM_DEVNAME_AS_SERIAL is set to something that
+        * atoi() reads as 1; 0 when unset or any other value */
+       const char *setting = getenv("IMSM_DEVNAME_AS_SERIAL");
+
+       return (setting != NULL && atoi(setting) == 1) ? 1 : 0;
+}
+
+
+/* Return a freshly allocated, zeroed supertype bound to super_imsm when
+ * 'arg' is "imsm" or "default".  Returns NULL for any other name, and
+ * also when the allocation fails.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+       struct supertype *st;
+
+       if (strcmp(arg, "imsm") != 0 &&
+           strcmp(arg, "default") != 0
+               )
+               return NULL;
+
+       st = malloc(sizeof(*st));
+       if (!st)
+               /* out of memory: report "no match" rather than crash
+                * in memset() */
+               return NULL;
+       memset(st, 0, sizeof(*st));
+       st->ss = &super_imsm;
+       st->max_devs = IMSM_MAX_DEVICES;
+       st->minor_version = 0;
+       st->sb = NULL;
+       return st;
+}
+
+#ifndef MDASSEMBLE
+static __u8 *get_imsm_version(struct imsm_super *mpb)
+{
+       /* points into sig[] just past the MPB_SIGNATURE text */
+       return mpb->sig + MPB_SIG_LEN;
+}
+#endif 
+
+/* Look up disk 'index' straight in the anchor's disk table.  Only valid
+ * while the anchor is known to be up-to-date, currently only at load
+ * time.  Returns NULL when index is not below num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+       return index < mpb->num_disks ? &mpb->disk[index] : NULL;
+}
+
+#ifndef MDASSEMBLE
+/* Find the disk with the given index in the parsed disk list.
+ * Returns NULL when no list entry carries that index.
+ */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+       struct dl *cur = super->disks;
+
+       while (cur && cur->index != index)
+               cur = cur->next;
+
+       return cur ? &cur->disk : NULL;
+}
+#endif
+
+/* generate a checksum directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load or write_super after coalescing
+ *
+ * The result is the sum of every little-endian 32-bit word in the first
+ * mpb_size bytes of the mpb, minus the stored check_sum field.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+       /* mpb_size is an on-disk little-endian field (parse_raid_devices
+        * converts it the same way), so convert before using it as a count
+        */
+       __u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+       __u32 *p = (__u32 *) mpb;
+       __u32 sum = 0;
+
+       while (end--)
+               sum += __le32_to_cpu(*p++);
+
+       /* the stored checksum is part of the summed range; back it out */
+       return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* byte size of a map: the base structure plus one extra __u32
+ * disk_ord_tbl entry for every member beyond the first
+ */
+static size_t sizeof_imsm_map(struct imsm_map *map)
+{
+       size_t base = sizeof(struct imsm_map);
+
+       return base + sizeof(__u32) * (map->num_members - 1);
+}
+
+/* return map[0] of 'dev', or -- when second_map is non-zero -- the map that
+ * is laid out immediately after it.  The second map only exists while
+ * migr_state is set; NULL is returned for it otherwise.
+ */
+struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
+{
+       struct imsm_map *first = &dev->vol.map[0];
+
+       if (!second_map)
+               return first;
+
+       if (!dev->vol.migr_state)
+               return NULL;
+
+       return (struct imsm_map *) ((char *) first + sizeof_imsm_map(first));
+}
+
+/* return the byte size of a raid device record.
+ * A device that is already migrating is counted with both of its maps;
+ * otherwise a non-zero 'migr_state' argument adds room for a duplicate
+ * of map[0].
+ */
+static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state)
+{
+       struct imsm_map *map0 = get_imsm_map(dev, 0);
+       size_t total = sizeof(*dev) - sizeof(struct imsm_map);
+
+       total += sizeof_imsm_map(map0);
+
+       if (dev->vol.migr_state)
+               total += sizeof_imsm_map(get_imsm_map(dev, 1));
+       else if (migr_state)
+               total += sizeof_imsm_map(map0);
+
+       return total;
+}
+
+/* locate raid device 'index' inside the anchor by stepping over the
+ * variable-sized device records that precede it; NULL when 'index' is not
+ * below num_raid_devs
+ */
+static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
+{
+       void *_mpb = mpb;
+       int offset;
+       int i;
+
+       if (index >= mpb->num_raid_devs)
+               return NULL;
+
+       /* devices start after all disks */
+       offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb;
+
+       /* skip the records in front of the one requested */
+       for (i = 0; i < index; i++)
+               offset += sizeof_imsm_dev(_mpb + offset, 0);
+
+       return _mpb + offset;
+}
+
+/* fetch raid device 'index' from the parsed device table, or NULL when the
+ * index is not below the anchor's num_raid_devs
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+       return index < super->anchor->num_raid_devs ? super->dev_tbl[index] : NULL;
+}
+
+/* read the raw disk_ord_tbl entry for 'slot' (cpu byte order), taken from
+ * the second map while the device is migrating and from map[0] otherwise
+ */
+static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, int slot)
+{
+       struct imsm_map *map = get_imsm_map(dev, dev->vol.migr_state ? 1 : 0);
+
+       /* top byte identifies disk under rebuild */
+       return __le32_to_cpu(map->disk_ord_tbl[slot]);
+}
+
+/* strip the top byte of an ord table entry, leaving the low 24 bits */
+#define ord_to_idx(ord) (((ord) << 8) >> 8)
+/* disk index stored in 'slot' of the device's ord table */
+static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot)
+{
+       __u32 raw_entry = get_imsm_ord_tbl_ent(dev, slot);
+       __u32 idx = ord_to_idx(raw_entry);
+
+       return idx;
+}
+
+/* store 'ord' into 'slot' of the map's disk_ord_tbl in little-endian form */
+static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord)
+{
+       __u32 le_ord = __cpu_to_le32(ord);
+
+       map->disk_ord_tbl[slot] = le_ord;
+}
+
+/* translate the map's raid_level for the caller: level 1 is reported as 1
+ * with exactly two members and as 10 otherwise; other levels pass through
+ */
+static int get_imsm_raid_level(struct imsm_map *map)
+{
+       if (map->raid_level != 1)
+               return map->raid_level;
+
+       return map->num_members == 2 ? 1 : 10;
+}
+
+/* qsort() comparator ordering extents by ascending start */
+static int cmp_extent(const void *av, const void *bv)
+{
+       const struct extent *lhs = av;
+       const struct extent *rhs = bv;
+
+       return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+/* Build a malloc()ed array describing the regions of 'dl' that are used by
+ * raid devices, sorted by start, followed by one terminating entry with
+ * size == 0 whose start marks where the metadata reservation begins.
+ * Returns NULL if the allocation fails; imsm_reserved_sectors() shows the
+ * expected usage (walk to the size == 0 entry, then free the array).
+ */
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+       /* find a list of used extents on the given physical device */
+       struct extent *rv, *e;
+       int i, j;
+       int memberships = 0;
+       __u32 reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       /* first pass: count the slots that reference this disk */
+       for (i = 0; i < super->anchor->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+
+               for (j = 0; j < map->num_members; j++) {
+                       __u32 index = get_imsm_disk_idx(dev, j);
+
+                       if (index == dl->index)
+                               memberships++;
+               }
+       }
+       /* one extra entry for the size == 0 terminator */
+       rv = malloc(sizeof(struct extent) * (memberships + 1));
+       if (!rv)
+               return NULL;
+       e = rv;
+
+       /* second pass: record one extent per membership */
+       for (i = 0; i < super->anchor->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+
+               for (j = 0; j < map->num_members; j++) {
+                       __u32 index = get_imsm_disk_idx(dev, j);
+
+                       if (index == dl->index) {
+                               e->start = __le32_to_cpu(map->pba_of_lba0);
+                               e->size = __le32_to_cpu(map->blocks_per_member);
+                               e++;
+                       }
+               }
+       }
+       qsort(rv, memberships, sizeof(*rv), cmp_extent);
+
+       /* determine the start of the metadata 
+        * when no raid devices are defined use the default
+        * ...otherwise allow the metadata to truncate the value
+        * as is the case with older versions of imsm
+        */
+       if (memberships) {
+               struct extent *last = &rv[memberships - 1];
+               __u32 remainder;
+
+               /* space left between the end of the last extent and the
+                * end of the disk
+                */
+               remainder = __le32_to_cpu(dl->disk.total_blocks) - 
+                           (last->start + last->size);
+               if (reservation > remainder)
+                       reservation = remainder;
+       }
+       /* 'e' still points one past the last recorded extent: terminator */
+       e->start = __le32_to_cpu(dl->disk.total_blocks) - reservation;
+       e->size = 0;
+       return rv;
+}
+
+/* work out how many sectors at the end of 'dl' are reserved for metadata:
+ * the distance from the terminating get_extents() entry to the end of the
+ * disk.  Falls back to the default reservation when get_extents() fails.
+ */
+static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
+{
+       struct extent *extents;
+       struct extent *last;
+       __u32 reserved;
+
+       /* for spares just return a minimal reservation which will grow
+        * once the spare is picked up by an array
+        */
+       if (dl->index == -1)
+               return MPB_SECTOR_CNT;
+
+       extents = get_extents(super, dl);
+       if (!extents)
+               return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+       /* the terminating entry is the first one with size == 0 */
+       last = extents;
+       while (last->size)
+               last++;
+
+       reserved = __le32_to_cpu(dl->disk.total_blocks) - last->start;
+
+       free(extents);
+
+       return reserved;
+}
+
+#ifndef MDASSEMBLE
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+
+/* print a human-readable summary of one raid device to stdout.
+ * 'index' is a disk index; the slot whose ord table entry maps to it is
+ * reported as "This Slot" ("?" when no slot matches).
+ */
+static void print_imsm_dev(struct imsm_dev *dev, int index)
+{
+       __u64 sz;
+       int slot;
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       __u32 ord;
+
+       printf("\n");
+       printf("[%.16s]:\n", dev->volume);
+       printf("     RAID Level : %d\n", get_imsm_raid_level(map));
+       printf("        Members : %d\n", map->num_members);
+       /* find the slot occupied by disk 'index' */
+       for (slot = 0; slot < map->num_members; slot++)
+               if (index == get_imsm_disk_idx(dev, slot))
+                       break;
+       if (slot < map->num_members) {
+               ord = get_imsm_ord_tbl_ent(dev, slot);
+               printf("      This Slot : %d%s\n", slot,
+                      ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
+       } else
+               printf("      This Slot : ?\n");
+       /* array size is stored as two 32-bit halves */
+       sz = __le32_to_cpu(dev->size_high);
+       sz <<= 32;
+       sz += __le32_to_cpu(dev->size_low);
+       printf("     Array Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       sz = __le32_to_cpu(map->blocks_per_member);
+       printf("   Per Dev Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+       printf("  Sector Offset : %u\n",
+               __le32_to_cpu(map->pba_of_lba0));
+       printf("    Num Stripes : %u\n",
+               __le32_to_cpu(map->num_data_stripes));
+       printf("     Chunk Size : %u KiB\n",
+               __le16_to_cpu(map->blocks_per_strip) / 2);
+       printf("       Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
+       printf("  Migrate State : %s", dev->vol.migr_state ? "migrating" : "idle");
+       if (dev->vol.migr_state)
+               printf(": %s", dev->vol.migr_type ? "rebuilding" : "initializing");
+       printf("\n");
+       /* NOTE(review): map_state indexes map_state_str[] without a range
+        * check -- confirm the table covers every value found on disk
+        */
+       printf("      Map State : %s", map_state_str[map->map_state]);
+       if (dev->vol.migr_state) {
+               /* shadows the outer 'map': this is the second map */
+               struct imsm_map *map = get_imsm_map(dev, 1);
+               printf(" <-- %s", map_state_str[map->map_state]);
+       }
+       printf("\n");
+       printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+}
+
+/* print serial, state flags, scsi id and usable size of disk 'index' from
+ * the anchor.  'reserved' is the number of sectors subtracted from
+ * total_blocks to obtain the usable size.  Nothing is printed when
+ * 'index' is negative or does not name a disk in the anchor.
+ */
+static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved)
+{
+       struct imsm_disk *disk;
+       char str[MAX_RAID_SERIAL_LEN + 1];
+       __u32 s;
+       __u64 sz;
+
+       /* validate before the index is narrowed to __u8 by the lookup, so
+        * that e.g. -1 (a spare) cannot alias a real disk
+        */
+       if (index < 0 || index >= mpb->num_disks)
+               return;
+
+       disk = __get_imsm_disk(mpb, index);
+       if (!disk)
+               return;
+
+       printf("\n");
+       snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial);
+       printf("  Disk%02d Serial : %s\n", index, str);
+       s = __le32_to_cpu(disk->status);
+       printf("          State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "",
+                                             s&CONFIGURED_DISK ? " active" : "",
+                                             s&FAILED_DISK ? " failed" : "",
+                                             s&USABLE_DISK ? " usable" : "");
+       printf("             Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+       sz = __le32_to_cpu(disk->total_blocks) - reserved;
+       printf("    Usable Size : %llu%s\n", (unsigned long long)sz,
+              human_size(sz * 512));
+}
+
+/* print the full --examine report for the loaded metadata: anchor header
+ * fields, this disk, the bad block log (if present), every raid device and
+ * finally every other disk listed in the anchor.
+ */
+static void examine_super_imsm(struct supertype *st, char *homehost)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       char str[MAX_SIGNATURE_LENGTH];
+       int i;
+       struct mdinfo info;
+       char nbuf[64];
+       __u32 sum;
+       /* NOTE(review): super->disks is dereferenced here and below without
+        * a NULL check, while getinfo_super_imsm() does check it -- confirm
+        * callers always load a disk first
+        */
+       __u32 reserved = imsm_reserved_sectors(super, super->disks);
+
+
+       snprintf(str, MPB_SIG_LEN, "%s", mpb->sig);
+       printf("          Magic : %s\n", str);
+       /* the version text is the part of sig[] that follows the magic; bound
+        * the print to that region instead of trusting a NUL terminator
+        */
+       printf("        Version : %.*s\n",
+              (int) (MAX_SIGNATURE_LENGTH - MPB_SIG_LEN),
+              get_imsm_version(mpb));
+       printf("         Family : %08x\n", __le32_to_cpu(mpb->family_num));
+       printf("     Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+       getinfo_super_imsm(st, &info);
+       fname_from_uuid(st, &info, nbuf,'-');
+       printf("           UUID : %s\n", nbuf + 5);
+       sum = __le32_to_cpu(mpb->check_sum);
+       printf("       Checksum : %08x %s\n", sum,
+               __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+       printf("    MPB Sectors : %d\n", mpb_sectors(mpb));
+       printf("          Disks : %d\n", mpb->num_disks);
+       printf("   RAID Devices : %d\n", mpb->num_raid_devs);
+       print_imsm_disk(mpb, super->disks->index, reserved);
+       if (super->bbm_log) {
+               struct bbm_log *log = super->bbm_log;
+
+               printf("\n");
+               printf("Bad Block Management Log:\n");
+               printf("       Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
+               printf("      Signature : %x\n", __le32_to_cpu(log->signature));
+               printf("    Entry Count : %d\n", __le32_to_cpu(log->entry_count));
+               printf("   Spare Blocks : %d\n",  __le32_to_cpu(log->reserved_spare_block_count));
+               /* %llx requires unsigned long long regardless of how the
+                * 64-bit type is defined on this platform
+                */
+               printf("    First Spare : %llx\n",
+                      (unsigned long long) __le64_to_cpu(log->first_spare_lba));
+       }
+       for (i = 0; i < mpb->num_raid_devs; i++)
+               print_imsm_dev(__get_imsm_dev(mpb, i), super->disks->index);
+       /* remaining disks; this disk was already printed above */
+       for (i = 0; i < mpb->num_disks; i++) {
+               if (i == super->disks->index)
+                       continue;
+               print_imsm_disk(mpb, i, reserved);
+       }
+}
+
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info);
+
+/* emit ARRAY lines for the container and then for each raid device in it;
+ * prints nothing when the anchor has no raid devices.  super->current_vol
+ * is overwritten with each volume index as the loop runs.
+ */
+static void brief_examine_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct mdinfo info;
+       char nbuf[64];
+       int vol;
+
+       if (super->anchor->num_raid_devs == 0)
+               return;
+
+       /* We just write a generic IMSM ARRAY entry */
+       getinfo_super_imsm(st, &info);
+       fname_from_uuid(st, &info, nbuf,'-');
+       printf("ARRAY /dev/imsm metadata=imsm auto=md UUID=%s\n", nbuf + 5);
+
+       vol = 0;
+       while (vol < super->anchor->num_raid_devs) {
+               struct imsm_dev *dev = get_imsm_dev(super, vol);
+
+               /* getinfo_super_imsm() reports on current_vol */
+               super->current_vol = vol;
+               getinfo_super_imsm(st, &info);
+               fname_from_uuid(st, &info, nbuf,'-');
+               printf("ARRAY /dev/md/%.16s container=/dev/imsm member=%d auto=mdp UUID=%s\n",
+                      dev->volume, vol, nbuf + 5);
+               vol++;
+       }
+}
+
+/* placeholder: only prints this function's own name followed by a newline */
+static void detail_super_imsm(struct supertype *st, char *homehost)
+{
+       puts(__FUNCTION__);
+}
+
+/* print " UUID=<uuid>" (no newline) for the current container or volume */
+static void brief_detail_super_imsm(struct supertype *st)
+{
+       char uuid_text[64];
+       struct mdinfo info;
+
+       getinfo_super_imsm(st, &info);
+       fname_from_uuid(st, &info, uuid_text,'-');
+       /* the first five characters of the generated name are skipped */
+       printf(" UUID=%s", uuid_text + 5);
+}
+#endif
+
+/* placeholder: prints this function's own name and always returns -1 */
+static int match_home_imsm(struct supertype *st, char *homehost)
+{
+       puts(__FUNCTION__);
+
+       return -1;
+}
+
+/* Synthesise a 16-byte uuid for the container or the current volume.
+ *
+ * imsm does not track uuids, so one is derived from a sha1 over:
+ *  - the anchor signature (MAX_SIGNATURE_LENGTH bytes)
+ *  - the anchor family_num
+ *  and, when current_vol selects an existing raid device,
+ *  - the volume index
+ *  - the volume name field (MAX_RAID_SERIAL_LEN bytes)
+ * The first 16 bytes of the digest are copied into 'uuid'.
+ *
+ * A truncated/hashed uuid is sufficient for its users (bitmap files, backup
+ * headers, re-add comparisons) as long as everyone agrees on it.
+ */
+static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev;
+       struct sha1_ctx ctx;
+       char digest[20];
+
+       sha1_init_ctx(&ctx);
+       sha1_process_bytes(super->anchor->sig, MAX_SIGNATURE_LENGTH, &ctx);
+       sha1_process_bytes(&super->anchor->family_num, sizeof(__u32), &ctx);
+
+       /* a negative current_vol means "the container itself" */
+       dev = super->current_vol >= 0 ?
+               get_imsm_dev(super, super->current_vol) : NULL;
+       if (dev) {
+               __u32 vol = super->current_vol;
+
+               sha1_process_bytes(&vol, sizeof(vol), &ctx);
+               sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx);
+       }
+
+       sha1_finish_ctx(&ctx, digest);
+       memcpy(uuid, digest, 4*4);
+}
+
+#if 0
+/* compiled out.  Splits the dotted version text that follows the signature
+ * into up to three fields and returns the second and third as numbers.
+ * NOTE(review): if this is ever re-enabled, '*v' is read before 'v < end' is
+ * tested and 'i' is not bounded to the three ver_parse[] entries.
+ */
+static void
+get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+{
+       __u8 *v = get_imsm_version(mpb);
+       __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+       char major[] = { 0, 0, 0 };
+       char minor[] = { 0 ,0, 0 };
+       char patch[] = { 0, 0, 0 };
+       char *ver_parse[] = { major, minor, patch };
+       int i, j;
+
+       i = j = 0;
+       while (*v != '\0' && v < end) {
+               if (*v != '.' && j < 2)
+                       ver_parse[i][j++] = *v;
+               else {
+                       i++;
+                       j = 0;
+               }
+               v++;
+       }
+
+       *m = strtol(minor, NULL, 0);
+       *p = strtol(patch, NULL, 0);
+}
+#endif
+
+/* map a raid level to the md layout value used for it:
+ * 0 and 1 -> 0, 5 and 6 -> ALGORITHM_LEFT_ASYMMETRIC, 10 -> 0x102,
+ * anything else -> -1
+ */
+static int imsm_level_to_layout(int level)
+{
+       if (level == 0 || level == 1)
+               return 0;
+       if (level == 5 || level == 6)
+               return ALGORITHM_LEFT_ASYMMETRIC;
+       if (level == 10)
+               return 0x102;
+
+       return -1;
+}
+
+/* fill 'info' with the description of the raid device selected by
+ * super->current_vol (geometry, offsets, resync position, name, version
+ * text and synthesised uuid), all taken from map[0] of that device
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+
+       info->container_member    = super->current_vol;
+       info->array.raid_disks    = map->num_members;
+       info->array.level         = get_imsm_raid_level(map);
+       info->array.layout        = imsm_level_to_layout(info->array.level);
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0;
+       info->array.utime         = 0;
+       /* blocks_per_strip is in 512-byte sectors; << 9 converts to bytes */
+       info->array.chunk_size    = __le16_to_cpu(map->blocks_per_strip) << 9;
+       info->array.state         = !dev->vol.dirty;
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+
+       info->data_offset         = __le32_to_cpu(map->pba_of_lba0);
+       info->component_size      = __le32_to_cpu(map->blocks_per_member);
+       memset(info->uuid, 0, sizeof(info->uuid));
+
+       /* uninitialized or dirty: resync from the start; migrating: resume
+        * at the recorded unit; otherwise no resync is needed
+        */
+       if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty)
+               info->resync_start = 0;
+       else if (dev->vol.migr_state)
+               info->resync_start = __le32_to_cpu(dev->vol.curr_migr_unit);
+       else
+               info->resync_start = ~0ULL;
+
+       strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
+       info->name[MAX_RAID_SERIAL_LEN] = 0;
+
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       /* NOTE(review): if devnum2devname() returns allocated memory it is
+        * never freed here, and sprintf is not bounded by the size of
+        * text_version -- confirm against their definitions
+        */
+       sprintf(info->text_version, "/%s/%d",
+               devnum2devname(st->container_dev),
+               info->container_member);
+       info->safe_mode_delay = 4000;  /* 4 secs like the Matrix driver */
+       uuid_from_super_imsm(st, info->uuid);
+}
+
+
+/* fill 'info' for the current selection: when super->current_vol >= 0 the
+ * work is delegated to getinfo_super_imsm_volume(); otherwise 'info'
+ * describes the container itself, using the first entry of super->disks
+ * (if any) for the disk state and metadata location
+ */
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_disk *disk;
+       __u32 s;
+
+       if (super->current_vol >= 0) {
+               getinfo_super_imsm_volume(st, info);
+               return;
+       }
+
+       /* Set raid_disks to zero so that Assemble will always pull in valid
+        * spares
+        */
+       info->array.raid_disks    = 0;
+       info->array.level         = LEVEL_CONTAINER;
+       info->array.layout        = 0;
+       info->array.md_minor      = -1;
+       info->array.ctime         = 0; /* N/A for imsm */ 
+       info->array.utime         = 0;
+       info->array.chunk_size    = 0;
+
+       info->disk.major = 0;
+       info->disk.minor = 0;
+       info->disk.raid_disk = -1;
+       info->reshape_active = 0;
+       info->array.major_version = -1;
+       info->array.minor_version = -2;
+       strcpy(info->text_version, "imsm");
+       info->safe_mode_delay = 0;
+       info->disk.number = -1;
+       info->disk.state = 0;
+       info->name[0] = 0;
+
+       if (super->disks) {
+               __u32 reserved = imsm_reserved_sectors(super, super->disks);
+
+               disk = &super->disks->disk;
+               /* the reserved metadata region sits at the end of the disk */
+               info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved;
+               info->component_size = reserved;
+               s = __le32_to_cpu(disk->status);
+               info->disk.state  = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0;
+               info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0;
+               /* every disk that is not a spare is reported as in-sync */
+               info->disk.state |= s & SPARE_DISK ? 0 : (1 << MD_DISK_SYNC);
+       }
+
+       /* only call uuid_from_super_imsm when this disk is part of a populated container,
+        * ->compare_super may have updated the 'num_raid_devs' field for spares
+        */
+       if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs)
+               uuid_from_super_imsm(st, info->uuid);
+       else
+               memcpy(info->uuid, uuid_match_any, sizeof(int[4]));
+}
+
+/* metadata update hook.  Currently a stub: it recognises the "grow" and
+ * "resync" update names but performs no change for either, and always
+ * returns 0.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+                            char *update, char *devname, int verbose,
+                            int uuid_set, char *homehost)
+{
+       /* FIXME */
+
+       /* For 'assemble' and 'force' we need to return non-zero if any
+        * change was made.  For others, the return value is ignored.
+        * Update options are:
+        *  force-one : This device looks a bit old but needs to be included,
+        *        update age info appropriately.
+        *  assemble: clear any 'faulty' flag to allow this device to
+        *              be assembled.
+        *  force-array: Array is degraded but being forced, mark it clean
+        *         if that will be needed to assemble it.
+        *
+        *  newdev:  not used ????
+        *  grow:  Array has gained a new device - this is currently for
+        *              linear only
+        *  resync: mark as dirty so a resync will happen.
+        *  name:  update the name - preserving the homehost
+        *
+        * Following are not relevant for this imsm:
+        *  sparc2.2 : update from old dodgy metadata
+        *  super-minor: change the preferred_minor number
+        *  summaries:  update redundant counters.
+        *  uuid:  Change the uuid of the array to match what is given
+        *  homehost:  update the recorded homehost
+        *  _reshape_progress: record new reshape_progress position.
+        */
+       int rv = 0;
+       //struct intel_super *super = st->sb;
+       //struct imsm_super *mpb = super->mpb;
+
+       /* not implemented yet: no metadata change is made for "grow" */
+       if (strcmp(update, "grow") == 0) {
+       }
+       /* not implemented yet: the dirty flag is not actually set */
+       if (strcmp(update, "resync") == 0) {
+               /* dev->vol.dirty = 1; */
+       }
+
+       /* IMSM has no concept of UUID or homehost */
+
+       return rv;
+}
+
+/* number of bytes reserved for an mpb describing 'disks' disks: the anchor
+ * with its disk table, two raid devices with up to two maps each, and four
+ * disk_ord_tbl arrays
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+       size_t total = sizeof(struct imsm_super);
+
+       /* the anchor already embeds the first disk entry */
+       total += (disks - 1) * sizeof(struct imsm_disk);
+       total += 2 * sizeof(struct imsm_dev);
+       /* up to 2 maps per raid device (-2 for the imsm_maps already in
+        * imsm_dev)
+        */
+       total += (4 - 2) * sizeof(struct imsm_map);
+       /* 4 possible disk_ord_tbl's */
+       total += 4 * (disks - 1) * sizeof(__u32);
+
+       return total;
+}
+
+/* sectors left on a device of 'devsize' sectors once the metadata
+ * reservation is subtracted; 0 when the device is smaller than that
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize)
+{
+       return devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS) ?
+               0 : devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+}
+
+/* Decide whether the metadata in 'tst' belongs to the same container as
+ * the metadata in 'st'.
+ *
+ * return:
+ *  0 same, or first was empty, and second was copied
+ *  1 second had wrong number
+ *  2 wrong uuid
+ *  3 wrong other info
+ *
+ * When 'st' holds nothing yet, ownership of tst->sb moves to st.  When
+ * 'st' is a spare (no raid devices) and 'tst' is populated, the raid device
+ * records, num_raid_devs and family_num are copied into 'st'.
+ */
+static int compare_super_imsm(struct supertype *st, struct supertype *tst)
+{
+       struct intel_super *first = st->sb;
+       struct intel_super *sec = tst->sb;
+
+       if (!first) {
+               st->sb = tst->sb;
+               tst->sb = NULL;
+               return 0;
+       }
+
+       if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0)
+               return 3;
+
+       /* if an anchor does not have num_raid_devs set then it is a free
+        * floating spare
+        */
+       if (first->anchor->num_raid_devs > 0 &&
+           sec->anchor->num_raid_devs > 0) {
+               if (first->anchor->family_num != sec->anchor->family_num)
+                       return 3;
+       }
+
+       /* if 'first' is a spare promote it to a populated mpb with sec's
+        * family number
+        */
+       if (first->anchor->num_raid_devs == 0 &&
+           sec->anchor->num_raid_devs > 0) {
+               int i;
+
+               /* we need to copy raid device info from sec if an allocation
+                * fails here we don't associate the spare
+                */
+               for (i = 0; i < sec->anchor->num_raid_devs; i++) {
+                       first->dev_tbl[i] = malloc(sizeof(struct imsm_dev));
+                       /* test the entry just allocated, not the table */
+                       if (!first->dev_tbl[i]) {
+                               /* undo the entries allocated so far */
+                               while (--i >= 0) {
+                                       free(first->dev_tbl[i]);
+                                       first->dev_tbl[i] = NULL;
+                               }
+                               fprintf(stderr, "imsm: failed to associate spare\n");
+                               return 3;
+                       }
+                       *first->dev_tbl[i] = *sec->dev_tbl[i];
+               }
+
+               first->anchor->num_raid_devs = sec->anchor->num_raid_devs;
+               first->anchor->family_num = sec->anchor->family_num;
+       }
+
+       return 0;
+}
+
+/* Write "/dev/<name>" for the block device open on 'fd' into 'name', where
+ * <name> is the last path component of the /sys/dev/block/<maj>:<min>
+ * symlink target.  At most MAX_RAID_SERIAL_LEN bytes are written.
+ * 'name' is left as the empty string on any failure.
+ */
+static void fd2devname(int fd, char *name)
+{
+       struct stat st;
+       char path[256];
+       char dname[100];
+       char *nm;
+       int rv;
+
+       name[0] = '\0';
+       if (fstat(fd, &st) != 0)
+               return;
+       sprintf(path, "/sys/dev/block/%d:%d",
+               major(st.st_rdev), minor(st.st_rdev));
+
+       /* readlink() does not NUL-terminate and may fill the whole buffer,
+        * so leave one byte for the terminator written below
+        */
+       rv = readlink(path, dname, sizeof(dname) - 1);
+       if (rv <= 0)
+               return;
+
+       dname[rv] = '\0';
+       nm = strrchr(dname, '/');
+       /* a target without any '/' has no component to extract */
+       if (!nm)
+               return;
+       nm++;
+       snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
+}
+
+
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* Read the serial number of the device open on 'fd' into 'serial'
+ * (MAX_RAID_SERIAL_LEN bytes, zero padded, not necessarily terminated),
+ * with leading and trailing whitespace removed.
+ *
+ * When scsi_get_serial() fails and IMSM_DEVNAME_AS_SERIAL is enabled the
+ * device name is used instead.  Returns 0 on success, otherwise the
+ * scsi_get_serial() error; 'devname', when given, is only used for the
+ * error message.  A response containing only whitespace yields an
+ * all-zero serial and a return of 0.
+ */
+static int imsm_read_serial(int fd, char *devname,
+                           __u8 serial[MAX_RAID_SERIAL_LEN])
+{
+       unsigned char scsi_serial[255];
+       int rv;
+       int rsp_len;
+       int len;
+       char *c, *rsp_buf;
+
+       memset(scsi_serial, 0, sizeof(scsi_serial));
+
+       rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+
+       if (rv && imsm_env_devname_as_serial()) {
+               memset(serial, 0, MAX_RAID_SERIAL_LEN);
+               fd2devname(fd, (char *) serial);
+               return 0;
+       }
+
+       if (rv != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to retrieve serial for %s\n",
+                               devname);
+               return rv;
+       }
+
+       /* byte 3 holds the payload length, the payload starts at byte 4;
+        * never trust a length that would run past our buffer
+        */
+       rsp_len = scsi_serial[3];
+       if (rsp_len > (int) sizeof(scsi_serial) - 4)
+               rsp_len = sizeof(scsi_serial) - 4;
+       rsp_buf = (char *) &scsi_serial[4];
+
+       /* trim leading whitespace without scanning past the payload */
+       c = rsp_buf;
+       while (c < rsp_buf + rsp_len && isspace((unsigned char) *c))
+               c++;
+
+       /* truncate len to the end of rsp_buf if necessary */
+       len = rsp_len - (c - rsp_buf);
+       if (len > MAX_RAID_SERIAL_LEN)
+               len = MAX_RAID_SERIAL_LEN;
+
+       /* initialize the buffer and copy rsp_buf characters */
+       memset(serial, 0, MAX_RAID_SERIAL_LEN);
+       memcpy(serial, c, len);
+
+       /* trim trailing whitespace starting with the last character copied,
+        * stopping at the start of the buffer
+        */
+       while (len > 0 &&
+              (isspace(serial[len - 1]) || serial[len - 1] == '\0'))
+               serial[--len] = '\0';
+
+       return 0;
+}
+
+/* strncmp()-style comparison of two serials over MAX_RAID_SERIAL_LEN bytes */
+static int serialcmp(__u8 *s1, __u8 *s2)
+{
+       const char *lhs = (char *) s1;
+       const char *rhs = (char *) s2;
+
+       return strncmp(lhs, rhs, MAX_RAID_SERIAL_LEN);
+}
+
+/* strncpy() a serial into 'dest'; exactly MAX_RAID_SERIAL_LEN bytes are
+ * written, so the result is not terminated when 'src' fills the field
+ */
+static void serialcpy(__u8 *dest, __u8 *src)
+{
+       char *to = (char *) dest;
+       const char *from = (char *) src;
+
+       strncpy(to, from, MAX_RAID_SERIAL_LEN);
+}
+
+/* Record the disk open on 'fd' in super->disks and work out its index in
+ * the current anchor.
+ *
+ * The disk is identified by serial number.  A disk already on the list is
+ * reused (its fd is replaced when keep_fd is set); otherwise a new entry is
+ * allocated and pushed on the front of the list.  dl->index ends up as the
+ * anchor index for a member, -1 for a spare, or -2 for a failed disk or a
+ * new disk that the anchor does not list.
+ *
+ * Returns 0 on success, 2 when the serial cannot be read or the allocation
+ * fails.
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+       struct dl *dl;
+       struct stat stb;
+       int rv;
+       int i;
+       int alloc = 1;
+       __u8 serial[MAX_RAID_SERIAL_LEN];
+
+       rv = imsm_read_serial(fd, devname, serial);
+
+       if (rv != 0)
+               return 2;
+
+       /* check if this is a disk we have seen before.  it may be a spare in
+        * super->disks while the current anchor believes it is a raid member,
+        * check if we need to update dl->index
+        */
+       for (dl = super->disks; dl; dl = dl->next)
+               if (serialcmp(dl->serial, serial) == 0)
+                       break;
+
+       if (!dl)
+               dl = malloc(sizeof(*dl));
+       else
+               alloc = 0;
+
+       if (!dl) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": failed to allocate disk buffer for %s\n",
+                               devname);
+               return 2;
+       }
+
+       if (alloc) {
+               /* NOTE(review): the fstat() result is not checked; on
+                * failure stb would be used uninitialised
+                */
+               fstat(fd, &stb);
+               dl->major = major(stb.st_rdev);
+               dl->minor = minor(stb.st_rdev);
+               dl->next = super->disks;
+               dl->fd = keep_fd ? fd : -1;
+               dl->devname = devname ? strdup(devname) : NULL;
+               serialcpy(dl->serial, serial);
+               dl->index = -2;
+       } else if (keep_fd) {
+               /* replace the descriptor held for a disk seen earlier */
+               close(dl->fd);
+               dl->fd = fd;
+       }
+
+       /* look up this disk's index in the current anchor */
+       for (i = 0; i < super->anchor->num_disks; i++) {
+               struct imsm_disk *disk_iter;
+
+               disk_iter = __get_imsm_disk(super->anchor, i);
+
+               if (serialcmp(disk_iter->serial, dl->serial) == 0) {
+                       __u32 status;
+
+                       dl->disk = *disk_iter;
+                       status = __le32_to_cpu(dl->disk.status);
+                       /* only set index on disks that are a member of a
+                        * populated container, i.e. one with raid_devs
+                        */
+                       if (status & FAILED_DISK)
+                               dl->index = -2;
+                       else if (status & SPARE_DISK)
+                               dl->index = -1;
+                       else
+                               dl->index = i;
+
+                       break;
+               }
+       }
+
+       /* no match, maybe a stale failed drive */
+       if (i == super->anchor->num_disks && dl->index >= 0) {
+               /* NOTE(review): __get_imsm_disk() returns NULL when
+                * dl->index is not below the anchor's num_disks; that
+                * result is dereferenced here without a check
+                */
+               dl->disk = *__get_imsm_disk(super->anchor, dl->index);
+               if (__le32_to_cpu(dl->disk.status) & FAILED_DISK)
+                       dl->index = -2;
+       }
+
+       /* publish a newly allocated entry as the new list head */
+       if (alloc)
+               super->disks = dl;
+
+       return 0;
+}
+
+/* copy a raid device record, including its variable-length map(s);
+ * 'dest' must have room for sizeof_imsm_dev(src, 0) bytes
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+       size_t nbytes = sizeof_imsm_dev(src, 0);
+
+       memcpy(dest, src, nbytes);
+}
+
+#ifndef MDASSEMBLE
+/* Start a migration on 'dev'.
+ *
+ * While migrating, map0 holds the 'destination' state and map1 holds the
+ * current state; when idle, map0 holds the current state.  The caller must
+ * have set map[0].map_state to the current array state beforehand: map0 is
+ * duplicated into map1 and map0's state then becomes 'to_state'.
+ *
+ * Migration is indicated by one of the following states
+ * 1/ Idle (migr_state=0 map0state=normal||uninitialized||degraded||failed)
+ * 2/ Initialize (migr_state=1 migr_type=0 map0state=normal
+ *    map1state=uninitialized)
+ * 3/ Verify (Resync) (migr_state=1 migr_type=1 map0state=normal
+ *    map1state=normal)
+ * 4/ Rebuild (migr_state=1 migr_type=1 map0state=normal
+ *    map1state=degraded)
+ */
+static void migrate(struct imsm_dev *dev, __u8 to_state, int rebuild_resync)
+{
+       struct imsm_map *current_map = get_imsm_map(dev, 0);
+       struct imsm_map *saved_map;
+
+       dev->vol.migr_state = 1;
+       dev->vol.migr_type = rebuild_resync;
+       dev->vol.curr_migr_unit = 0;
+
+       /* the second map is only reachable once migr_state is set */
+       saved_map = get_imsm_map(dev, 1);
+       memcpy(saved_map, current_map, sizeof_imsm_map(current_map));
+
+       current_map->map_state = to_state;
+}
+
+/* leave the migrating state: clear migr_state and the migration cursor and
+ * record 'map_state' as the state of map0
+ */
+static void end_migration(struct imsm_dev *dev, __u8 map_state)
+{
+       struct imsm_map *map0 = get_imsm_map(dev, 0);
+
+       dev->vol.curr_migr_unit = 0;
+       dev->vol.migr_state = 0;
+       map0->map_state = map_state;
+}
+#endif
+
+/* parse_raid_devices - copy every raid device found in the anchor into a
+ * private allocation stored in super->dev_tbl[].
+ *
+ * Each copy is allocated with the size the device would need while
+ * migrating (sizeof_imsm_dev(dev, 1)).  The extra space this implies is
+ * summed up and, if the current super->buf could not hold the mpb once
+ * all devices are migrating, super->buf is replaced by a larger,
+ * 512-byte aligned buffer.
+ *
+ * Returns 0 on success and 1 when an allocation fails.  Entries already
+ * placed in super->dev_tbl[] are left for the caller to release.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+       int i;
+       struct imsm_dev *dev_new;
+       size_t len, len_migr;
+       size_t space_needed = 0;
+       struct imsm_super *mpb = super->anchor;
+
+       for (i = 0; i < super->anchor->num_raid_devs; i++) {
+               struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
+
+               len = sizeof_imsm_dev(dev_iter, 0);
+               len_migr = sizeof_imsm_dev(dev_iter, 1);
+               if (len_migr > len)
+                       space_needed += len_migr - len;
+
+               dev_new = malloc(len_migr);
+               if (!dev_new)
+                       return 1;
+               imsm_copy_dev(dev_new, dev_iter);
+               super->dev_tbl[i] = dev_new;
+       }
+
+       /* ensure that super->buf is large enough when all raid devices
+        * are migrating
+        */
+       if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) {
+               void *buf;
+
+               len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512);
+               if (posix_memalign(&buf, 512, len) != 0)
+                       return 1;
+
+               /* the old buffer only holds super->len bytes: copy exactly
+                * that much and clear the newly gained tail
+                */
+               memcpy(buf, super->buf, super->len);
+               memset((char *) buf + super->len, 0, len - super->len);
+               free(super->buf);
+               super->buf = buf;
+               super->len = len;
+       }
+
+       return 0;
+}
+
+/* retrieve a pointer to the bbm log which starts after all raid devices
+ *
+ * The log occupies the last bbm_log_size bytes of the mpb.  Both sizes
+ * are stored little-endian in the metadata, so both are converted
+ * before being used as byte offsets.  Returns NULL when the mpb
+ * carries no log (bbm_log_size == 0).
+ */
+struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
+{
+       __u32 log_size = __le32_to_cpu(mpb->bbm_log_size);
+       char *base = (char *) mpb;
+
+       if (!log_size)
+               return NULL;
+
+       return (struct bbm_log *) (base + __le32_to_cpu(mpb->mpb_size) - log_size);
+}
+
+static void __free_imsm(struct intel_super *super, int free_disks);
+
+/* load_imsm_mpb - read matrix metadata
+ * allocates super->buf, which is released again by __free_imsm()
+ *
+ * The 512-byte anchor is read from the second to last sector of the
+ * device.  If the mpb is larger than one sector the remainder is read
+ * from the sectors preceding the anchor and the checksum is verified.
+ * Afterwards the disk entry and the raid devices are parsed.
+ *
+ * Anything previously hanging off 'super' (except the disk lists) is
+ * freed before the new buffer is allocated.
+ *
+ * 'devname' may be NULL, in which case no diagnostics are printed.
+ *
+ * Returns 0 on success.  1 is returned when the device size, the seek,
+ * the anchor allocation or the anchor read fails; 2 is returned when
+ * there is no IMSM signature, the mpb buffer cannot be allocated, the
+ * extended mpb cannot be read or the checksum does not match.  Errors
+ * from load_imsm_disk() / parse_raid_devices() are passed through.
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+       unsigned long long dsize;
+       unsigned long long sectors;
+       struct imsm_super *anchor;
+       __u32 check_sum;
+       int rc;
+
+       /* without the device size the anchor cannot be located */
+       if (!get_dev_size(fd, NULL, &dsize)) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot determine size of %s\n",
+                               devname);
+               return 1;
+       }
+
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (posix_memalign((void**)&anchor, 512, 512) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Failed to allocate imsm anchor buffer"
+                               " on %s\n", devname);
+               return 1;
+       }
+       if (read(fd, anchor, 512) != 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read anchor block on %s: %s\n",
+                               devname, strerror(errno));
+               free(anchor);
+               return 1;
+       }
+
+       if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": no IMSM anchor on %s\n", devname);
+               free(anchor);
+               return 2;
+       }
+
+       __free_imsm(super, 0);
+       /* mpb_size is stored little-endian in the metadata */
+       super->len = ROUND_UP(__le32_to_cpu(anchor->mpb_size), 512);
+       if (posix_memalign(&super->buf, 512, super->len) != 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": unable to allocate %zu byte mpb buffer\n",
+                               super->len);
+               free(anchor);
+               return 2;
+       }
+       memcpy(super->buf, anchor, 512);
+
+       /* number of sectors the mpb occupies beyond the anchor sector */
+       sectors = mpb_sectors(anchor) - 1;
+       free(anchor);
+       if (!sectors) {
+               rc = load_imsm_disk(fd, super, devname, 0);
+               if (rc == 0)
+                       rc = parse_raid_devices(super);
+               return rc;
+       }
+
+       /* read the extended mpb */
+       if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot seek to extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 1;
+       }
+
+       if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": Cannot read extended mpb on %s: %s\n",
+                               devname, strerror(errno));
+               return 2;
+       }
+
+       check_sum = __gen_imsm_checksum(super->anchor);
+       if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+               if (devname)
+                       fprintf(stderr,
+                               Name ": IMSM checksum %x != %x on %s\n",
+                               check_sum, __le32_to_cpu(super->anchor->check_sum),
+                               devname);
+               return 2;
+       }
+
+       /* FIXME the BBM log is disk specific so we cannot use this global
+        * buffer for all disks.  Ok for now since we only look at the global
+        * bbm_log_size parameter to gate assembly
+        */
+       super->bbm_log = __get_imsm_bbm_log(super->anchor);
+
+       rc = load_imsm_disk(fd, super, devname, 0);
+       if (rc == 0)
+               rc = parse_raid_devices(super);
+
+       return rc;
+}
+
+/* Release a single disk list entry: close its descriptor when one is
+ * open (fd >= 0), then free the device name and the entry itself.
+ */
+static void __free_imsm_disk(struct dl *d)
+{
+       if (d->fd >= 0)
+               close(d->fd);
+
+       /* free(NULL) is a no-op, so a missing name needs no special case */
+       free(d->devname);
+       free(d);
+}
+/* Empty both disk lists of 'super' (disks and missing), releasing every
+ * entry and leaving both list heads NULL.
+ */
+static void free_imsm_disks(struct intel_super *super)
+{
+       struct dl *entry;
+
+       while ((entry = super->disks) != NULL) {
+               super->disks = entry->next;
+               __free_imsm_disk(entry);
+       }
+
+       while ((entry = super->missing) != NULL) {
+               super->missing = entry->next;
+               __free_imsm_disk(entry);
+       }
+}
+
+/* free all the pieces hanging off of a super pointer
+ *
+ * The metadata buffer and every dev_tbl[] entry are released and their
+ * pointers reset to NULL.  The disk lists are only released when
+ * 'free_disks' is non-zero.  'super' itself is not freed.
+ */
+static void __free_imsm(struct intel_super *super, int free_disks)
+{
+       int slot;
+
+       /* free(NULL) is harmless, so no guards are needed */
+       free(super->buf);
+       super->buf = NULL;
+
+       if (free_disks)
+               free_imsm_disks(super);
+
+       for (slot = 0; slot < IMSM_MAX_RAID_DEVS; slot++) {
+               free(super->dev_tbl[slot]);
+               super->dev_tbl[slot] = NULL;
+       }
+}
+
+/* Release 'super' completely: everything hanging off it, including the
+ * disk lists, and then the structure itself.
+ */
+static void free_imsm(struct intel_super *super)
+{
+       const int with_disks = 1;
+
+       __free_imsm(super, with_disks);
+       free(super);
+}
+
+/* Free the imsm superblock attached to 'st', if any, and clear st->sb. */
+static void free_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+
+       if (super != NULL) {
+               free_imsm(super);
+               st->sb = NULL;
+       }
+}
+
+/* Allocate a zero-filled intel_super.  'creating_imsm' is recorded in
+ * the new structure and current_vol starts out as -1.  Returns NULL
+ * when the allocation fails.
+ */
+static struct intel_super *alloc_super(int creating_imsm)
+{
+       struct intel_super *super;
+
+       super = malloc(sizeof(*super));
+       if (super == NULL)
+               return NULL;
+
+       memset(super, 0, sizeof(*super));
+       super->current_vol = -1;
+       super->creating_imsm = creating_imsm;
+
+       return super;
+}
+
+#ifndef MDASSEMBLE
+/* find_missing - helper routine for load_super_imsm_all that identifies
+ * disks that have disappeared from the system.  This routine relies on
+ * the mpb being uptodate, which it is at load time.
+ *
+ * Every disk recorded in the mpb whose serial number matches no entry
+ * of super->disks, and which is neither failed nor unusable, gets a
+ * placeholder entry (devname "missing", fd -1) on super->missing.
+ *
+ * Returns 0 on success and 1 when an entry cannot be allocated.
+ */
+static int find_missing(struct intel_super *super)
+{
+       int i;
+       struct imsm_super *mpb = super->anchor;
+       struct dl *dl;
+       struct imsm_disk *disk;
+       __u32 status;
+
+       for (i = 0; i < mpb->num_disks; i++) {
+               disk = __get_imsm_disk(mpb, i);
+               for (dl = super->disks; dl; dl = dl->next)
+                       if (serialcmp(dl->disk.serial, disk->serial) == 0)
+                               break;
+               if (dl)
+                       continue;
+               /* ok we have a 'disk' without a live entry in
+                * super->disks
+                */
+               status = __le32_to_cpu(disk->status);
+               if (status & FAILED_DISK || !(status & USABLE_DISK))
+                       continue; /* never mind, already marked */
+
+               dl = malloc(sizeof(*dl));
+               if (!dl)
+                       return 1;
+               /* start from a clean entry, as add_to_super_imsm() does, so
+                * that fields not assigned below are not heap garbage
+                */
+               memset(dl, 0, sizeof(*dl));
+               dl->major = 0;
+               dl->minor = 0;
+               dl->fd = -1;
+               dl->devname = strdup("missing");
+               dl->index = i;
+               serialcpy(dl->serial, disk->serial);
+               dl->disk = *disk;
+               dl->next = super->missing;
+               super->missing = dl;
+       }
+
+       return 0;
+}
+
+/* load_super_imsm_all - load the imsm metadata of a whole container
+ *
+ * 'fd' refers to the container.  Its member devices are enumerated via
+ * sysfs, the mpb of every member is read to find the one with the
+ * highest generation number, that mpb is loaded, and finally the disk
+ * list is rebuilt against it.  Disks that the mpb knows about but which
+ * are not present are recorded by find_missing().
+ *
+ * On success the new super is stored through 'sbp', 'st' is filled in
+ * and 0 is returned.  A non-zero value (1 or 2) is returned on failure.
+ * When 'keep_fd' is set the member devices are opened read-write and
+ * the descriptors are not closed here.
+ *
+ * NOTE(review): 'sra' is never released on any path; the matching
+ * release helper is not visible from this part of the file.
+ */
+static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
+                              char *devname, int keep_fd)
+{
+       struct mdinfo *sra;
+       struct intel_super *super;
+       struct mdinfo *sd, *best = NULL;
+       __u32 bestgen = 0;
+       __u32 gen;
+       char nm[20];
+       int dfd;
+       int rv;
+
+       /* check if this disk is a member of an active array */
+       sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+       if (!sra)
+               return 1;
+
+       if (sra->array.major_version != -1 ||
+           sra->array.minor_version != -2 ||
+           strcmp(sra->text_version, "imsm") != 0)
+               return 1;
+
+       super = alloc_super(0);
+       if (!super)
+               return 1;
+
+       /* find the most up to date disk in this array, skipping spares */
+       for (sd = sra->devs; sd; sd = sd->next) {
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY);
+               /* a failed open is reported as a negative descriptor;
+                * 0 is a valid descriptor
+                */
+               if (dfd < 0) {
+                       free_imsm(super);
+                       return 2;
+               }
+               rv = load_imsm_mpb(dfd, super, NULL);
+               if (!keep_fd)
+                       close(dfd);
+               if (rv == 0) {
+                       if (super->anchor->num_raid_devs == 0)
+                               gen = 0;
+                       else
+                               gen = __le32_to_cpu(super->anchor->generation_num);
+                       if (!best || gen > bestgen) {
+                               bestgen = gen;
+                               best = sd;
+                       }
+               } else {
+                       free_imsm(super);
+                       return 2;
+               }
+       }
+
+       if (!best) {
+               free_imsm(super);
+               return 1;
+       }
+
+       /* load the most up to date anchor */
+       sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+       dfd = dev_open(nm, O_RDONLY);
+       if (dfd < 0) {
+               free_imsm(super);
+               return 1;
+       }
+       rv = load_imsm_mpb(dfd, super, NULL);
+       close(dfd);
+       if (rv != 0) {
+               free_imsm(super);
+               return 2;
+       }
+
+       /* re-parse the disk list with the current anchor */
+       for (sd = sra->devs ; sd ; sd = sd->next) {
+               sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+               dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY);
+               if (dfd < 0) {
+                       free_imsm(super);
+                       return 2;
+               }
+               load_imsm_disk(dfd, super, NULL, keep_fd);
+               if (!keep_fd)
+                       close(dfd);
+       }
+
+
+       if (find_missing(super) != 0) {
+               free_imsm(super);
+               return 2;
+       }
+
+       if (st->subarray[0]) {
+               int vol = atoi(st->subarray);
+
+               /* NOTE(review): '<=' also accepts vol == num_raid_devs,
+                * one past the last existing volume - confirm intended
+                */
+               if (vol <= super->anchor->num_raid_devs)
+                       super->current_vol = vol;
+               else {
+                       /* do not leak the freshly loaded super */
+                       free_imsm(super);
+                       return 1;
+               }
+       }
+
+       *sbp = super;
+       st->container_dev = fd2devnum(fd);
+       if (st->ss == NULL) {
+               st->ss = &super_imsm;
+               st->minor_version = 0;
+               st->max_devs = IMSM_MAX_DEVICES;
+       }
+       st->loaded_container = 1;
+
+       return 0;
+}
+#endif
+
+/* load_super_imsm - load imsm metadata for 'st' from 'fd'
+ *
+ * Unless built with MDASSEMBLE, a container-wide load is attempted
+ * first.  If that does not succeed and no subarray was requested, the
+ * metadata is read from 'fd' alone.  Returns 0 on success; otherwise 1
+ * or the error reported by load_imsm_mpb().
+ */
+static int load_super_imsm(struct supertype *st, int fd, char *devname)
+{
+       struct intel_super *super;
+       int err;
+
+#ifndef MDASSEMBLE
+       if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0)
+               return 0;
+#endif
+       if (st->subarray[0])
+               return 1; /* FIXME */
+
+       super = alloc_super(0);
+       if (super == NULL) {
+               fprintf(stderr,
+                       Name ": malloc of %zu failed.\n",
+                       sizeof(*super));
+               return 1;
+       }
+
+       err = load_imsm_mpb(fd, super, devname);
+       if (err != 0) {
+               if (devname != NULL)
+                       fprintf(stderr,
+                               Name ": Failed to load all information "
+                               "sections on %s\n", devname);
+               free_imsm(super);
+               return err;
+       }
+
+       /* a single device was loaded, not a whole container */
+       st->sb = super;
+       st->loaded_container = 0;
+       if (st->ss == NULL) {
+               st->ss = &super_imsm;
+               st->minor_version = 0;
+               st->max_devs = IMSM_MAX_DEVICES;
+       }
+
+       return 0;
+}
+
+/* Blocks per strip for the requested array: a fixed 128 for level 1,
+ * otherwise chunk_size shifted right by 9.
+ */
+static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
+{
+       return info->level == 1 ? 128 : info->chunk_size >> 9;
+}
+
+/* Number of data stripes: twice info->size divided by the strip size,
+ * halved again for level 1.
+ */
+static __u32 info_to_num_data_stripes(mdu_array_info_t *info)
+{
+       __u32 stripes = (info->size * 2) / info_to_blocks_per_strip(info);
+
+       return info->level == 1 ? stripes / 2 : stripes;
+}
+
+/* Blocks per member: twice info->size with the bits of
+ * (blocks_per_strip - 1) masked off.
+ */
+static __u32 info_to_blocks_per_member(mdu_array_info_t *info)
+{
+       int strip_mask = info_to_blocks_per_strip(info) - 1;
+
+       return (info->size * 2) & ~strip_mask;
+}
+
+/* init_super_imsm_volume - describe a new volume inside an existing
+ * container
+ *
+ * A new raid device is built from 'info' and 'name' and stored in
+ * super->dev_tbl[] at the next free index, which also becomes
+ * super->current_vol and st->subarray.  The mpb is enlarged first when
+ * disks_to_mpb_size(info->nr_disks) exceeds its current size.
+ *
+ * The request is validated and the raid device allocated before any
+ * container state is modified, so a failure leaves 'super' untouched.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
+                                 unsigned long long size, char *name,
+                                 char *homehost, int *uuid)
+{
+       /* We are creating a volume inside a pre-existing container.
+        * so st->sb is already set.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct imsm_dev *dev;
+       struct imsm_vol *vol;
+       struct imsm_map *map;
+       int idx = mpb->num_raid_devs;
+       int i;
+       unsigned long long array_blocks;
+       __u32 offset = 0;
+       size_t size_old, size_new;
+       size_t dev_len;
+
+       if (mpb->num_raid_devs >= 2) {
+               fprintf(stderr, Name": This imsm-container already has the "
+                       "maximum of 2 volumes\n");
+               return 0;
+       }
+
+       /* reject unsupported geometry before anything is allocated or
+        * changed
+        */
+       if (info->level == 1 && info->raid_disks > 2) {
+               fprintf(stderr, Name": imsm does not support more than 2 disks"
+                               " in a raid1 volume\n");
+               return 0;
+       }
+
+       /* one ord table slot is already part of the structure; start from
+        * zeroed memory so that no heap garbage reaches the metadata
+        */
+       dev_len = sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1);
+       dev = malloc(dev_len);
+       if (!dev) {
+               fprintf(stderr, Name": could not allocate raid device\n");
+               return 0;
+       }
+       memset(dev, 0, dev_len);
+
+       /* ensure the mpb is large enough for the new data */
+       size_old = __le32_to_cpu(mpb->mpb_size);
+       size_new = disks_to_mpb_size(info->nr_disks);
+       if (size_new > size_old) {
+               void *mpb_new;
+               size_t size_round = ROUND_UP(size_new, 512);
+
+               if (posix_memalign(&mpb_new, 512, size_round) != 0) {
+                       fprintf(stderr, Name": could not allocate new mpb\n");
+                       free(dev);
+                       return 0;
+               }
+               memcpy(mpb_new, mpb, size_old);
+               free(mpb);
+               mpb = mpb_new;
+               super->anchor = mpb_new;
+               mpb->mpb_size = __cpu_to_le32(size_new);
+               memset((char *) mpb_new + size_old, 0, size_round - size_old);
+       }
+       super->current_vol = idx;
+       /* when creating the first raid device in this container set num_disks
+        * to zero, i.e. delete this spare and add raid member devices in
+        * add_to_super_imsm_volume()
+        */
+       if (super->current_vol == 0)
+               mpb->num_disks = 0;
+       sprintf(st->subarray, "%d", idx);
+
+       strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
+       array_blocks = calc_array_size(info->level, info->raid_disks,
+                                      info->layout, info->chunk_size,
+                                      info->size*2);
+       dev->size_low = __cpu_to_le32((__u32) array_blocks);
+       dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32));
+       dev->status = __cpu_to_le32(0);
+       dev->reserved_blocks = __cpu_to_le32(0);
+       vol = &dev->vol;
+       vol->migr_state = 0;
+       vol->migr_type = 0;
+       vol->dirty = 0;
+       vol->curr_migr_unit = 0;
+       /* the new volume starts after all existing volumes, each followed
+        * by IMSM_RESERVED_SECTORS
+        */
+       for (i = 0; i < idx; i++) {
+               struct imsm_dev *prev = get_imsm_dev(super, i);
+               struct imsm_map *pmap = get_imsm_map(prev, 0);
+
+               offset += __le32_to_cpu(pmap->blocks_per_member);
+               offset += IMSM_RESERVED_SECTORS;
+       }
+       map = get_imsm_map(dev, 0);
+       map->pba_of_lba0 = __cpu_to_le32(offset);
+       map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info));
+       map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
+       map->num_data_stripes = __cpu_to_le32(info_to_num_data_stripes(info));
+       map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED :
+                                      IMSM_T_STATE_NORMAL;
+
+       /* level 10 is recorded as raid level 1 in the map */
+       if (info->level == 10)
+               map->raid_level = 1;
+       else
+               map->raid_level = info->level;
+
+       map->num_members = info->raid_disks;
+       for (i = 0; i < map->num_members; i++) {
+               /* initialized in add_to_super */
+               set_imsm_ord_tbl_ent(map, i, 0);
+       }
+       mpb->num_raid_devs++;
+       super->dev_tbl[super->current_vol] = dev;
+
+       return 1;
+}
+
+/* init_super_imsm - start building imsm metadata for Create
+ *
+ * With a NULL 'info' st->sb is simply cleared and 0 is returned.  When
+ * st->sb is already set a volume is created inside that container via
+ * init_super_imsm_volume().  Otherwise a fresh, zeroed mpb sized for
+ * info->nr_disks is allocated and stamped with the signature and
+ * version string.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
+                          unsigned long long size, char *name,
+                          char *homehost, int *uuid)
+{
+       /* This is primarily called by Create when creating a new array.
+        * We will then get add_to_super called for each component, and then
+        * write_init_super called to write it out to each device.
+        * For IMSM, Create can create on fresh devices or on a pre-existing
+        * array.
+        * To create on a pre-existing array a different method will be called.
+        * This one is just for fresh drives.
+        */
+       struct intel_super *super;
+       struct imsm_super *mpb;
+       size_t mpb_size;
+
+       if (!info) {
+               st->sb = NULL;
+               return 0;
+       }
+       if (st->sb)
+               return init_super_imsm_volume(st, info, size, name, homehost,
+                                             uuid);
+
+       super = alloc_super(1);
+       if (!super)
+               return 0;
+       mpb_size = disks_to_mpb_size(info->nr_disks);
+       if (posix_memalign(&super->buf, 512, mpb_size) != 0) {
+               free(super);
+               return 0;
+       }
+       mpb = super->buf;
+       memset(mpb, 0, mpb_size);
+
+       memcpy(mpb->sig, MPB_SIGNATURE, strlen(MPB_SIGNATURE));
+       memcpy(mpb->sig + strlen(MPB_SIGNATURE), MPB_VERSION_RAID5,
+              strlen(MPB_VERSION_RAID5));
+       /* mpb_size is a little-endian field, as in init_super_imsm_volume() */
+       mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+       st->sb = super;
+       return 1;
+}
+
+#ifndef MDASSEMBLE
+/* add_to_super_imsm_volume - assign a container disk to a slot of the
+ * volume currently being created (super->current_vol)
+ *
+ * The disk is looked up in super->disks by dk->major/dk->minor.  Nothing
+ * happens when it is not found or when dk->state lacks MD_DISK_SYNC.
+ * 'fd' and 'devname' are not used here.
+ */
+static void add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
+                                    int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct dl *dl;
+       struct imsm_dev *dev;
+       struct imsm_map *map;
+       __u32 status;
+
+       dev = get_imsm_dev(super, super->current_vol);
+       map = get_imsm_map(dev, 0);
+
+       for (dl = super->disks; dl ; dl = dl->next)
+               if (dl->major == dk->major &&
+                   dl->minor == dk->minor)
+                       break;
+
+       if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+               return;
+
+       /* add a pristine spare to the metadata */
+       if (dl->index < 0) {
+               dl->index = super->anchor->num_disks;
+               super->anchor->num_disks++;
+       }
+       /* slot dk->number of the volume now refers to this disk */
+       set_imsm_ord_tbl_ent(map, dk->number, dl->index);
+       status = CONFIGURED_DISK | USABLE_DISK;
+       dl->disk.status = __cpu_to_le32(status);
+
+       /* if we are creating the first raid device update the family number */
+       if (super->current_vol == 0) {
+               __u32 sum;
+               struct imsm_dev *_dev = __get_imsm_dev(mpb, 0);
+               struct imsm_disk *_disk = __get_imsm_disk(mpb, dl->index);
+
+               /* the checksum is taken over the mpb, so the device and disk
+                * records are copied into it first
+                */
+               *_dev = *dev;
+               *_disk = dl->disk;
+               sum = __gen_imsm_checksum(mpb);
+               mpb->family_num = __cpu_to_le32(sum);
+       }
+}
+
+/* add_to_super_imsm - add a device to the container
+ *
+ * When a volume is being created (current_vol >= 0) the work is handed
+ * to add_to_super_imsm_volume().  Otherwise a new disk entry marked as a
+ * usable spare is built for 'fd' and queued on super->add when
+ * st->update_tail is set, or on super->disks when it is not.
+ *
+ * The process is aborted when the entry cannot be allocated or the
+ * serial number cannot be read.
+ */
+static void add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
+                             int fd, char *devname)
+{
+       struct intel_super *super = st->sb;
+       struct dl *entry;
+       unsigned long long dev_size;
+       __u32 scsi_id;
+       struct stat stb;
+
+       if (super->current_vol >= 0) {
+               add_to_super_imsm_volume(st, dk, fd, devname);
+               return;
+       }
+
+       fstat(fd, &stb);
+       entry = malloc(sizeof(*entry));
+       if (entry == NULL) {
+               fprintf(stderr,
+                       Name ": malloc failed %s:%d.\n", __func__, __LINE__);
+               abort();
+       }
+       memset(entry, 0, sizeof(*entry));
+       entry->major = major(stb.st_rdev);
+       entry->minor = minor(stb.st_rdev);
+       entry->index = -1;
+       entry->devname = devname ? strdup(devname) : NULL;
+       entry->fd = fd;
+       if (imsm_read_serial(fd, devname, entry->serial) != 0) {
+               fprintf(stderr,
+                       Name ": failed to retrieve scsi serial, aborting\n");
+               free(entry);
+               abort();
+       }
+
+       /* device size in bytes, recorded in 512-byte blocks */
+       get_dev_size(fd, NULL, &dev_size);
+       dev_size /= 512;
+
+       serialcpy(entry->disk.serial, entry->serial);
+       entry->disk.total_blocks = __cpu_to_le32(dev_size);
+       entry->disk.status = __cpu_to_le32(USABLE_DISK | SPARE_DISK);
+
+       /* fall back to a scsi id of 0 when sysfs cannot provide one */
+       if (sysfs_disk_to_scsi_id(fd, &scsi_id) != 0)
+               scsi_id = 0;
+       entry->disk.scsi_id = __cpu_to_le32(scsi_id);
+
+       if (st->update_tail) {
+               entry->next = super->add;
+               super->add = entry;
+       } else {
+               entry->next = super->disks;
+               super->disks = entry;
+       }
+}
+
+static int store_imsm_mpb(int fd, struct intel_super *super);
+
+/* spare records have their own family number and do not have any defined raid
+ * devices
+ *
+ * The mpb header is temporarily rewritten to describe a single disk with
+ * no raid devices, then written once per spare (index == -1) in
+ * super->disks, with that spare's disk record, family number and
+ * checksum.  The original header is restored before returning.
+ *
+ * Returns 0 on success and 1 as soon as one write fails.  Descriptors
+ * of spares that were written are closed when 'doclose' is set.
+ */
+static int write_super_imsm_spares(struct intel_super *super, int doclose)
+{
+       struct imsm_super mpb_save;
+       struct imsm_super *mpb = super->anchor;
+       __u32 sum;
+       struct dl *d;
+
+       mpb_save = *mpb;
+       mpb->num_raid_devs = 0;
+       mpb->num_disks = 1;
+       /* mpb_size is a little-endian field like generation_num below */
+       mpb->mpb_size = __cpu_to_le32(sizeof(struct imsm_super));
+       mpb->generation_num = __cpu_to_le32(1UL);
+
+       for (d = super->disks; d; d = d->next) {
+               if (d->index != -1)
+                       continue;
+
+               mpb->disk[0] = d->disk;
+               /* the family number is a checksum of the mpb, and the final
+                * checksum in turn covers the new family number
+                */
+               sum = __gen_imsm_checksum(mpb);
+               mpb->family_num = __cpu_to_le32(sum);
+               sum = __gen_imsm_checksum(mpb);
+               mpb->check_sum = __cpu_to_le32(sum);
+
+               if (store_imsm_mpb(d->fd, super)) {
+                       fprintf(stderr, "%s: failed for device %d:%d %s\n",
+                               __func__, d->major, d->minor, strerror(errno));
+                       *mpb = mpb_save;
+                       return 1;
+               }
+               if (doclose) {
+                       close(d->fd);
+                       d->fd = -1;
+               }
+       }
+
+       *mpb = mpb_save;
+       return 0;
+}
+
+/* write_super_imsm - write the current metadata to every member disk
+ *
+ * The generation number is bumped, the disk and raid device records are
+ * copied from super->disks, super->missing and super->dev_tbl[] back
+ * into the mpb, mpb_size and the checksum are recomputed and the mpb is
+ * stored on every disk with a non-negative index.  Spares (index == -1)
+ * are written afterwards by write_super_imsm_spares().
+ *
+ * When 'doclose' is set the descriptor of each written disk is closed.
+ *
+ * Returns the result of write_super_imsm_spares() when there are
+ * spares, otherwise 0.  A failed store on a member disk is only
+ * reported on stderr and does not change the return value.
+ */
+static int write_super_imsm(struct intel_super *super, int doclose)
+{
+       struct imsm_super *mpb = super->anchor;
+       struct dl *d;
+       __u32 generation;
+       __u32 sum;
+       int spares = 0;
+       int i;
+       /* header size without the one disk record embedded in the struct */
+       __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+
+       /* 'generation' is incremented everytime the metadata is written */
+       generation = __le32_to_cpu(mpb->generation_num);
+       generation++;
+       mpb->generation_num = __cpu_to_le32(generation);
+
+       mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
+       for (d = super->disks; d; d = d->next) {
+               if (d->index == -1)
+                       spares++;
+               else
+                       mpb->disk[d->index] = d->disk;
+       }
+       for (d = super->missing; d; d = d->next)
+               mpb->disk[d->index] = d->disk;
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+
+               imsm_copy_dev(dev, super->dev_tbl[i]);
+               mpb_size += sizeof_imsm_dev(dev, 0);
+       }
+       mpb_size += __le32_to_cpu(mpb->bbm_log_size);
+       mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+       /* recalculate checksum */
+       sum = __gen_imsm_checksum(mpb);
+       mpb->check_sum = __cpu_to_le32(sum);
+
+       /* write the mpb for disks that compose raid devices */
+       for (d = super->disks; d ; d = d->next) {
+               if (d->index < 0)
+                       continue;
+               if (store_imsm_mpb(d->fd, super))
+                       fprintf(stderr, "%s: failed for device %d:%d %s\n",
+                               __func__, d->major, d->minor, strerror(errno));
+               if (doclose) {
+                       close(d->fd);
+                       d->fd = -1;
+               }
+       }
+
+       if (spares)
+               return write_super_imsm_spares(super, doclose);
+
+       return 0;
+}
+
+
+/* Queue an update_create_array metadata update carrying a copy of the
+ * raid device selected by super->current_vol.  Returns 0 on success and
+ * 1 when the update buffer cannot be allocated.
+ */
+static int create_array(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+       struct imsm_update_create_array *update;
+       size_t update_len;
+
+       /* the update embeds one imsm_dev; swap its size for the real one */
+       update_len = sizeof(*update) - sizeof(*dev) + sizeof_imsm_dev(dev, 0);
+       update = malloc(update_len);
+       if (update == NULL) {
+               fprintf(stderr, "%s: failed to allocate update buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       update->type = update_create_array;
+       update->dev_idx = super->current_vol;
+       imsm_copy_dev(&update->dev, dev);
+       append_metadata_update(st, update, update_len);
+
+       return 0;
+}
+
+/* Queue an update_add_disk metadata update when disks are pending on
+ * super->add.  Returns 0 when nothing is pending or the update was
+ * queued, 1 when the update buffer cannot be allocated.
+ */
+static int _add_disk(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct imsm_update_add_disk *update;
+
+       if (super->add == NULL)
+               return 0;
+
+       update = malloc(sizeof(*update));
+       if (update == NULL) {
+               fprintf(stderr, "%s: failed to allocate update buffer\n",
+                       __func__);
+               return 1;
+       }
+
+       update->type = update_add_disk;
+       append_metadata_update(st, update, sizeof(*update));
+
+       return 0;
+}
+
+/* write_init_super_imsm - commit freshly initialised metadata
+ *
+ * Without st->update_tail the metadata is written directly and the
+ * descriptors are closed.  With it, the change is queued as a metadata
+ * update instead: an add-disk update when no volume is selected
+ * (current_vol < 0), otherwise a create-array update followed by
+ * closing the descriptors of all disks.
+ */
+static int write_init_super_imsm(struct supertype *st)
+{
+       struct intel_super *super = st->sb;
+       struct dl *disk;
+       int status;
+
+       if (!st->update_tail)
+               return write_super_imsm(st->sb, 1);
+
+       /* queue the recently created array / added disk
+        * as a metadata update
+        */
+       if (super->current_vol < 0) {
+               /* in the add disk case we are running in mdmon
+                * context, so don't close fd's
+                */
+               return _add_disk(st);
+       }
+
+       status = create_array(st);
+       for (disk = super->disks; disk != NULL; disk = disk->next) {
+               close(disk->fd);
+               disk->fd = -1;
+       }
+
+       return status;
+}
+#endif
+
+/* store_zero_imsm - wipe the imsm anchor on 'fd'
+ *
+ * Overwrites the second to last 512-byte sector of the device with
+ * zeros.  'st' is not used.  Returns 0 on success and 1 when the device
+ * size, the seek, the buffer allocation or the write fails.
+ */
+static int store_zero_imsm(struct supertype *st, int fd)
+{
+       unsigned long long dsize;
+       void *buf;
+       int rv;
+
+       /* without the device size the anchor cannot be located */
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (posix_memalign(&buf, 512, 512) != 0)
+               return 1;
+
+       memset(buf, 0, 512);
+       rv = (write(fd, buf, 512) != 512) ? 1 : 0;
+       /* the scratch sector is only needed for the write above */
+       free(buf);
+
+       return rv;
+}
+
+/* Size of the bbm log recorded in 'mpb', converted from little-endian. */
+static int imsm_bbm_log_size(struct imsm_super *mpb)
+{
+       int log_bytes = __le32_to_cpu(mpb->bbm_log_size);
+
+       return log_bytes;
+}
+
+#ifndef MDASSEMBLE
+/* validate_geometry_imsm_container - check whether 'dev' can take part
+ * in an imsm container
+ *
+ * Returns 0 when 'level' is not LEVEL_CONTAINER, when 'dev' cannot be
+ * opened exclusively or when its size cannot be determined.  Returns 1
+ * when no device is given, or after storing the usable size of 'dev'
+ * (as reported by avail_size_imsm()) in '*freesize'.
+ */
+static int validate_geometry_imsm_container(struct supertype *st, int level,
+                                           int layout, int raiddisks, int chunk,
+                                           unsigned long long size, char *dev,
+                                           unsigned long long *freesize,
+                                           int verbose)
+{
+       unsigned long long dev_bytes;
+       int dev_fd;
+       int have_size;
+
+       if (level != LEVEL_CONTAINER)
+               return 0;
+       if (dev == NULL)
+               return 1;
+
+       dev_fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (dev_fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm: Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+
+       /* the descriptor is only needed to query the size */
+       have_size = get_dev_size(dev_fd, dev, &dev_bytes);
+       close(dev_fd);
+       if (!have_size)
+               return 0;
+
+       *freesize = avail_size_imsm(st, dev_bytes >> 9);
+
+       return 1;
+}
+
+/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd
+ * FIX ME add ahci details
+ *
+ * Check whether a member volume can be created in the container whose
+ * metadata is already loaded in st->sb.  Returns 1 if acceptable, 0 if not.
+ *
+ * dev == NULL: general test.  Count the container disks that offer a free
+ *   gap of at least size*2 blocks at a common start offset (the offset is
+ *   taken from the first disk on which such a gap is found) and require at
+ *   least 'raiddisks' of them.  *freesize is not written on this path.
+ * dev != NULL: 'dev' must be a block device that is already listed in
+ *   super->disks; *freesize is set to the largest free gap found on it.
+ *
+ * 'layout' and 'chunk' are not examined here.
+ */
+static int validate_geometry_imsm_volume(struct supertype *st, int level,
+                                        int layout, int raiddisks, int chunk,
+                                        unsigned long long size, char *dev,
+                                        unsigned long long *freesize,
+                                        int verbose)
+{
+       struct stat stb;
+       struct intel_super *super = st->sb;
+       struct dl *dl;
+       unsigned long long pos = 0;
+       unsigned long long maxsize;
+       struct extent *e;
+       int i;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+
+       if (level == 1 && raiddisks > 2) {
+               if (verbose)
+                       fprintf(stderr, Name ": imsm does not support more "
+                               "than 2 in a raid1 configuration\n");
+               return 0;
+       }
+
+       /* We must have the container info already read in. */
+       if (!super)
+               return 0;
+
+       if (!dev) {
+               /* General test:  make sure there is space for
+                * 'raiddisks' device extents of size 'size' at a given
+                * offset
+                */
+               unsigned long long minsize = size*2 /* convert to blocks */;
+               unsigned long long start_offset = ~0ULL;
+               int dcnt = 0;
+               if (minsize == 0)
+                       minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+               for (dl = super->disks; dl ; dl = dl->next) {
+                       int found = 0;
+
+                       pos = 0;
+                       i = 0;
+                       e = get_extents(super, dl);
+                       if (!e) continue;
+                       /* The extent list ends with an entry whose size is 0;
+                        * the free gap in front of extent i is
+                        * [pos, e[i].start).
+                        *
+                        * NOTE(review): when a big-enough gap is found at
+                        * exactly start_offset the loop is not left, so the
+                        * following extent resets 'found' unless the gap was
+                        * the one in front of the terminating entry - confirm
+                        * that this is intended.
+                        */
+                       do {
+                               unsigned long long esize;
+                               esize = e[i].start - pos;
+                               if (esize >= minsize)
+                                       found = 1;
+                               if (found && start_offset == ~0ULL) {
+                                       start_offset = pos;
+                                       break;
+                               } else if (found && pos != start_offset) {
+                                       found = 0;
+                                       break;
+                               }
+                               pos = e[i].start + e[i].size;
+                               i++;
+                       } while (e[i-1].size);
+                       if (found)
+                               dcnt++;
+                       free(e);
+               }
+               if (dcnt < raiddisks) {
+                       if (verbose)
+                               fprintf(stderr, Name ": imsm: Not enough "
+                                       "devices with space for this array "
+                                       "(%d < %d)\n",
+                                       dcnt, raiddisks);
+                       return 0;
+               }
+               return 1;
+       }
+       /* This device must be a member of the set */
+       if (stat(dev, &stb) < 0)
+               return 0;
+       if ((S_IFMT & stb.st_mode) != S_IFBLK)
+               return 0;
+       for (dl = super->disks ; dl ; dl = dl->next) {
+               if (dl->major == major(stb.st_rdev) &&
+                   dl->minor == minor(stb.st_rdev))
+                       break;
+       }
+       if (!dl) {
+               if (verbose)
+                       fprintf(stderr, Name ": %s is not in the "
+                               "same imsm set\n", dev);
+               return 0;
+       }
+       /* Report the largest free gap on this disk.  A failed extent lookup
+        * leaves maxsize at 0.
+        */
+       e = get_extents(super, dl);
+       maxsize = 0;
+       i = 0;
+       if (e) do {
+               unsigned long long esize;
+               esize = e[i].start - pos;
+               if (esize >= maxsize)
+                       maxsize = esize;
+               pos = e[i].start + e[i].size;
+               i++;
+       } while (e[i-1].size);
+       /* the extent array is heap allocated (the general test above frees
+        * it as well); free(NULL) is a no-op
+        */
+       free(e);
+       *freesize = maxsize;
+
+       return 1;
+}
+
+/* validate_geometry_imsm - entry point for geometry validation.
+ *
+ * Dispatches on what the caller supplied:
+ *   - level == LEVEL_CONTAINER: validate 'dev' as a new container member.
+ *   - st->sb already loaded: validate a volume inside that container.
+ *   - otherwise 'dev' is expected to be a device that is busy because it
+ *     belongs to an imsm container; the container metadata is loaded into
+ *     st->sb and the volume check is run against it.
+ *
+ * Returns 1 if the geometry is acceptable, 0 if not.
+ */
+static int validate_geometry_imsm(struct supertype *st, int level, int layout,
+                                 int raiddisks, int chunk, unsigned long long size,
+                                 char *dev, unsigned long long *freesize,
+                                 int verbose)
+{
+       int fd, cfd;
+       struct mdinfo *sra;
+
+       /* if given unused devices create a container
+        * if given devices in a container create a member volume
+        */
+       if (level == LEVEL_CONTAINER) {
+               /* Must be a fresh device to add to a container */
+               return validate_geometry_imsm_container(st, level, layout,
+                                                       raiddisks, chunk, size,
+                                                       dev, freesize,
+                                                       verbose);
+       }
+
+       if (st->sb) {
+               /* creating in a given container */
+               return validate_geometry_imsm_volume(st, level, layout,
+                                                    raiddisks, chunk, size,
+                                                    dev, freesize, verbose);
+       }
+
+       /* limit creation to the following levels */
+       /* NOTE(review): with dev == NULL the listed levels fall through to
+        * open(dev) below with a NULL path while every other level is
+        * accepted with 1.  That looks inverted - confirm the intended
+        * result for the "no device" sanity check before changing it.
+        */
+       if (!dev)
+               switch (level) {
+               case 0:
+               case 1:
+               case 10:
+               case 5:
+                       break;
+               default:
+                       return 1;
+               }
+
+       /* This device needs to be a device in an 'imsm' container */
+       fd = open(dev, O_RDONLY|O_EXCL, 0);
+       if (fd >= 0) {
+               /* an exclusive open succeeded, so nobody is using the device
+                * and it cannot be a container member
+                */
+               if (verbose)
+                       fprintf(stderr,
+                               Name ": Cannot create this array on device %s\n",
+                               dev);
+               close(fd);
+               return 0;
+       }
+       if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot open %s: %s\n",
+                               dev, strerror(errno));
+               return 0;
+       }
+       /* Well, it is in use by someone, maybe an 'imsm' container. */
+       cfd = open_container(fd);
+       if (cfd < 0) {
+               close(fd);
+               if (verbose)
+                       fprintf(stderr, Name ": Cannot use %s: It is busy\n",
+                               dev);
+               return 0;
+       }
+       /* NOTE(review): 'sra' is never released in this function; presumably
+        * it needs the matching sysfs release helper - verify.
+        */
+       sra = sysfs_read(cfd, 0, GET_VERSION);
+       close(fd);
+       if (sra && sra->array.major_version == -1 &&
+           strcmp(sra->text_version, "imsm") == 0) {
+               /* This is a member of a imsm container.  Load the container
+                * and try to create a volume
+                */
+               struct intel_super *super;
+
+               if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) {
+                       st->sb = super;
+                       st->container_dev = fd2devnum(cfd);
+                       close(cfd);
+                       return validate_geometry_imsm_volume(st, level, layout,
+                                                            raiddisks, chunk,
+                                                            size, dev,
+                                                            freesize, verbose);
+               }
+               close(cfd);
+       } else {
+               /* may belong to another container; do not leak the
+                * container descriptor on this path
+                */
+               close(cfd);
+               return 0;
+       }
+
+       return 1;
+}
+#endif /* MDASSEMBLE */
+
+static struct mdinfo *container_content_imsm(struct supertype *st)
+{
+       /* Given a container loaded by load_super_imsm_all,
+        * extract information about all the arrays into
+        * an mdinfo tree.
+        *
+        * For each imsm_dev create an mdinfo, fill it in,
+        *  then look for matching devices in super->disks
+        *  and create appropriate device mdinfo.
+        *
+        * Returns the head of a list linked through ->next (each new
+        * volume is pushed at the front), or NULL when the metadata
+        * carries a BBM log or no volume could be described.  A volume
+        * whose allocations fail is reported on stderr and left out.
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       struct mdinfo *rest = NULL;
+       int i;
+
+       /* do not assemble arrays that might have bad blocks */
+       if (imsm_bbm_log_size(super->anchor)) {
+               fprintf(stderr, Name ": BBM log found in metadata. "
+                               "Cannot activate array(s).\n");
+               return NULL;
+       }
+
+       for (i = 0; i < mpb->num_raid_devs; i++) {
+               struct imsm_dev *dev = get_imsm_dev(super, i);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct mdinfo *this;
+               int slot;
+
+               this = calloc(1, sizeof(*this));
+               if (!this) {
+                       /* previously the NULL result was handed to memset */
+                       fprintf(stderr, Name ": failed to allocate array info"
+                               " for volume %s\n", (char *) dev->volume);
+                       continue;
+               }
+               this->next = rest;
+
+               super->current_vol = i;
+               getinfo_super_imsm_volume(st, this);
+               for (slot = 0 ; slot <  map->num_members; slot++) {
+                       struct mdinfo *info_d;
+                       struct dl *d;
+                       int idx;
+                       int skip;
+                       __u32 s;
+                       __u32 ord;
+
+                       skip = 0;
+                       idx = get_imsm_disk_idx(dev, slot);
+                       ord = get_imsm_ord_tbl_ent(dev, slot);
+                       for (d = super->disks; d ; d = d->next)
+                               if (d->index == idx)
+                                       break;
+
+                       /* a slot is left out when its disk is absent,
+                        * failed, not usable or still being rebuilt
+                        */
+                       if (d == NULL)
+                               skip = 1;
+
+                       s = d ? __le32_to_cpu(d->disk.status) : 0;
+                       if (s & FAILED_DISK)
+                               skip = 1;
+                       if (!(s & USABLE_DISK))
+                               skip = 1;
+                       if (ord & IMSM_ORD_REBUILD)
+                               skip = 1;
+
+                       /*
+                        * if we skip some disks the array will be assembled degraded;
+                        * reset resync start to avoid a dirty-degraded situation
+                        *
+                        * FIXME handle dirty degraded
+                        */
+                       if (skip && !dev->vol.dirty)
+                               this->resync_start = ~0ULL;
+                       if (skip)
+                               continue;
+
+                       info_d = calloc(1, sizeof(*info_d));
+                       if (!info_d) {
+                               fprintf(stderr, Name ": failed to allocate disk"
+                                       " for volume %s\n", (char *) dev->volume);
+                               /* drop the whole volume, including the
+                                * per-disk entries linked in so far
+                                */
+                               while (this->devs) {
+                                       struct mdinfo *next_d = this->devs->next;
+
+                                       free(this->devs);
+                                       this->devs = next_d;
+                               }
+                               free(this);
+                               this = rest;
+                               break;
+                       }
+                       info_d->next = this->devs;
+                       this->devs = info_d;
+
+                       info_d->disk.number = d->index;
+                       info_d->disk.major = d->major;
+                       info_d->disk.minor = d->minor;
+                       info_d->disk.raid_disk = slot;
+
+                       this->array.working_disks++;
+
+                       info_d->events = __le32_to_cpu(mpb->generation_num);
+                       info_d->data_offset = __le32_to_cpu(map->pba_of_lba0);
+                       info_d->component_size = __le32_to_cpu(map->blocks_per_member);
+                       /* NOTE(review): unbounded copy - assumes devname
+                        * always fits in info_d->name; confirm the sizes
+                        */
+                       if (d->devname)
+                               strcpy(info_d->name, d->devname);
+               }
+               /* 'this' equals the old 'rest' when the volume was dropped */
+               rest = this;
+       }
+
+       return rest;
+}
+
+
+#ifndef MDASSEMBLE
+/* imsm_open_new - bind an active array to its volume inside the container.
+ *
+ * 'inst' is the decimal volume index as text.  It is parsed once and must
+ * lie in 0 .. num_raid_devs-1; a negative value is rejected as well so that
+ * it can never be stored as a volume index.
+ *
+ * Returns 0 after storing the index in a->info.container_member, or
+ * -ENODEV when the index is out of range.
+ */
+static int imsm_open_new(struct supertype *c, struct active_array *a,
+                        char *inst)
+{
+       struct intel_super *super = c->sb;
+       struct imsm_super *mpb = super->anchor;
+       int vol = atoi(inst);
+
+       if (vol < 0 || vol >= mpb->num_raid_devs) {
+               fprintf(stderr, "%s: subarry index %d, out of range\n",
+                       __func__, vol);
+               return -ENODEV;
+       }
+
+       dprintf("imsm: open_new %s\n", inst);
+       a->info.container_member = vol;
+       return 0;
+}
+
+/* imsm_check_degraded - derive the map state of 'dev' from the number of
+ * failed members.
+ *
+ * With no failures the result is NORMAL, except that an UNINITIALIZED map
+ * stays UNINITIALIZED.  Otherwise the answer depends on the raid level of
+ * map 0; for levels not handled below the current map state is returned
+ * unchanged.
+ */
+static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, int failed)
+{
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       int insync = 2;
+       int slot;
+
+       if (failed == 0) {
+               if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+                       return IMSM_T_STATE_UNINITIALIZED;
+               return IMSM_T_STATE_NORMAL;
+       }
+
+       switch (get_imsm_raid_level(map)) {
+       case 0:
+               /* no redundancy: any failure is fatal */
+               return IMSM_T_STATE_FAILED;
+       case 1:
+               if (failed >= map->num_members)
+                       return IMSM_T_STATE_FAILED;
+               return IMSM_T_STATE_DEGRADED;
+       case 5:
+               if (failed >= 2)
+                       return IMSM_T_STATE_FAILED;
+               return IMSM_T_STATE_DEGRADED;
+       case 10:
+               /* Slots are mirrored in pairs (even slot, even slot + 1).
+                * The array has failed as soon as one pair has lost both
+                * members, otherwise it is merely degraded.
+                */
+               for (slot = 0; slot < map->num_members; slot++) {
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+                       int idx = ord_to_idx(ord);
+                       struct imsm_disk *disk;
+
+                       /* a new pair starts on every even slot;
+                        * num_copies is always 2 for imsm raid10
+                        */
+                       if (slot % 2 == 0)
+                               insync = 2;
+
+                       disk = get_imsm_disk(super, idx);
+                       if (!disk ||
+                           (__le32_to_cpu(disk->status) & FAILED_DISK) ||
+                           (ord & IMSM_ORD_REBUILD))
+                               insync--;
+
+                       if (insync == 0)
+                               return IMSM_T_STATE_FAILED;
+               }
+               return IMSM_T_STATE_DEGRADED;
+       default:
+               return map->map_state;
+       }
+}
+
+/* imsm_count_failed - number of slots in map 0 of 'dev' that are not
+ * backed by a healthy disk: the disk is missing, flagged FAILED_DISK, or
+ * the slot's ord entry is marked for rebuild.
+ */
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev)
+{
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       int nfailed = 0;
+       int slot;
+
+       for (slot = 0; slot < map->num_members; slot++) {
+               __u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+               struct imsm_disk *disk = get_imsm_disk(super, ord_to_idx(ord));
+
+               /* healthy: present, not failed and not rebuilding */
+               if (disk &&
+                   !(__le32_to_cpu(disk->status) & FAILED_DISK) &&
+                   !(ord & IMSM_ORD_REBUILD))
+                       continue;
+
+               nfailed++;
+       }
+
+       return nfailed;
+}
+
+/* is_resyncing - 1 when a migration is in progress on 'dev' and either its
+ * migr_type is 0 or the second map (index 1) is in the NORMAL state;
+ * 0 otherwise.
+ */
+static int is_resyncing(struct imsm_dev *dev)
+{
+       if (!dev->vol.migr_state)
+               return 0;
+
+       if (dev->vol.migr_type == 0)
+               return 1;
+
+       /* the second map is only consulted while a migration is active */
+       return get_imsm_map(dev, 1)->map_state == IMSM_T_STATE_NORMAL ? 1 : 0;
+}
+
+/* is_rebuilding - 1 when a migration with a non-zero migr_type is in
+ * progress on 'dev' and the second map (index 1) is in the DEGRADED state;
+ * 0 otherwise.
+ */
+static int is_rebuilding(struct imsm_dev *dev)
+{
+       if (!dev->vol.migr_state || dev->vol.migr_type == 0)
+               return 0;
+
+       /* the second map is only consulted while a migration is active */
+       return get_imsm_map(dev, 1)->map_state == IMSM_T_STATE_DEGRADED ? 1 : 0;
+}
+
+/* mark_failure - flag 'disk' as failed in the metadata.
+ *
+ * Does nothing when FAILED_DISK is already set.  Otherwise the flag is
+ * set, scsi_id is overwritten with all ones and the serial number is
+ * shifted down by one byte (the first byte is dropped, the last byte is
+ * left as it was).
+ */
+static void mark_failure(struct imsm_disk *disk)
+{
+       __u32 flags = __le32_to_cpu(disk->status);
+
+       if (!(flags & FAILED_DISK)) {
+               disk->status = __cpu_to_le32(flags | FAILED_DISK);
+               disk->scsi_id = __cpu_to_le32(~(__u32)0);
+               memmove(&disk->serial[0], &disk->serial[1],
+                       MAX_RAID_SERIAL_LEN - 1);
+       }
+}
+
+/* Handle dirty -> clean transitions and resync.  Degraded and rebuild
+ * states are handled in imsm_set_disk() with one exception, when a
+ * resync is stopped due to a new failure this routine will set the
+ * 'degraded' state for the array.
+ *
+ * 'consistent' is the state requested by the caller; the value 2 is the
+ * one seen when the array is about to be activated (see the first comment
+ * in the body) - presumably 0 means dirty and 1 clean, confirm against the
+ * caller.  The return value is the state that was recorded: a request of 2
+ * is downgraded to 0 when the array is not fully in sync, not in the NORMAL
+ * map state, or has a migration in progress.
+ *
+ * Every change to the in-memory metadata bumps super->updates_pending.
+ */
+static int imsm_set_array_state(struct active_array *a, int consistent)
+{
+       int inst = a->info.container_member;
+       struct intel_super *super = a->container->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       int failed = imsm_count_failed(super, dev);
+       /* computed once, before any of the state changes made below */
+       __u8 map_state = imsm_check_degraded(super, dev, failed);
+
+       /* before we activate this array handle any missing disks */
+       if (consistent == 2 && super->missing) {
+               struct dl *dl;
+
+               dprintf("imsm: mark missing\n");
+               end_migration(dev, map_state);
+               for (dl = super->missing; dl; dl = dl->next)
+                       mark_failure(&dl->disk);
+               super->updates_pending++;
+       }
+               
+       /* refuse the request of 2 unless the array is fully healthy */
+       if (consistent == 2 &&
+           (!is_resync_complete(a) ||
+            map_state != IMSM_T_STATE_NORMAL ||
+            dev->vol.migr_state))
+               consistent = 0;
+
+       if (is_resync_complete(a)) {
+               /* complete initialization / resync,
+                * recovery is completed in ->set_disk
+                */
+               if (is_resyncing(dev)) {
+                       dprintf("imsm: mark resync done\n");
+                       end_migration(dev, map_state);
+                       super->updates_pending++;
+               }
+       } else if (!is_resyncing(dev) && !failed) {
+               /* mark the start of the init process if nothing is failed */
+               dprintf("imsm: mark resync start (%llu)\n", a->resync_start);
+               map->map_state = map_state;
+               migrate(dev, IMSM_T_STATE_NORMAL,
+                       map->map_state == IMSM_T_STATE_NORMAL);
+               super->updates_pending++;
+       }
+
+       /* check if we can update the migration checkpoint */
+       if (dev->vol.migr_state &&
+           __le32_to_cpu(dev->vol.curr_migr_unit) != a->resync_start) {
+               dprintf("imsm: checkpoint migration (%llu)\n", a->resync_start);
+               dev->vol.curr_migr_unit = __cpu_to_le32(a->resync_start);
+               super->updates_pending++;
+       }
+
+       /* mark dirty / clean */
+       /* vol.dirty is the logical inverse of 'consistent'; only touch the
+        * metadata when the flag actually changes
+        */
+       if (dev->vol.dirty != !consistent) {
+               dprintf("imsm: mark '%s' (%llu)\n",
+                       consistent ? "clean" : "dirty", a->resync_start);
+               if (consistent)
+                       dev->vol.dirty = 0;
+               else
+                       dev->vol.dirty = 1;
+               super->updates_pending++;
+       }
+       return consistent;
+}
+
+/* imsm_set_disk - record a state change of the disk in slot 'n' of the
+ * array 'a' in the in-memory metadata.
+ *
+ * 'state' is a mask of DS_* flags; DS_FAULTY marks a new failure and
+ * DS_INSYNC completes the rebuild of a slot whose ord entry carries
+ * IMSM_ORD_REBUILD.  Afterwards the map state is re-evaluated and the
+ * volume is moved to normal (recovery finished), degraded or failed as
+ * appropriate.  Every metadata change bumps super->updates_pending.
+ *
+ * A negative 'n' is ignored silently.  A slot number at or beyond
+ * map->num_members is reported and ignored, so it is never used to index
+ * the ord table.
+ */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+       int inst = a->info.container_member;
+       struct intel_super *super = a->container->sb;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       struct imsm_disk *disk;
+       int failed;
+       __u32 status;
+       __u32 ord;
+       __u8 map_state;
+
+       if (n < 0)
+               return;
+
+       /* valid slots are 0..num_members-1 */
+       if (n >= map->num_members) {
+               fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n",
+                       n, map->num_members - 1);
+               return;
+       }
+
+       dprintf("imsm: set_disk %d:%x\n", n, state);
+
+       ord = get_imsm_ord_tbl_ent(dev, n);
+       disk = get_imsm_disk(super, ord_to_idx(ord));
+
+       /* check for new failures */
+       /* the disk lookup can come back empty (the counting helpers test
+        * for that too); such a slot already counts as failed and there is
+        * nothing to mark
+        */
+       if (disk && (state & DS_FAULTY)) {
+               status = __le32_to_cpu(disk->status);
+               if (!(status & FAILED_DISK)) {
+                       mark_failure(disk);
+                       super->updates_pending++;
+               }
+       }
+
+       /* check if in_sync */
+       if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD) {
+               struct imsm_map *migr_map = get_imsm_map(dev, 1);
+
+               /* storing the bare index drops the rebuild flag */
+               set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+               super->updates_pending++;
+       }
+
+       failed = imsm_count_failed(super, dev);
+       map_state = imsm_check_degraded(super, dev, failed);
+
+       /* check if recovery complete, newly degraded, or failed */
+       if (map_state == IMSM_T_STATE_NORMAL && is_rebuilding(dev)) {
+               end_migration(dev, map_state);
+               super->updates_pending++;
+       } else if (map_state == IMSM_T_STATE_DEGRADED &&
+                  map->map_state != map_state &&
+                  !dev->vol.migr_state) {
+               dprintf("imsm: mark degraded\n");
+               map->map_state = map_state;
+               super->updates_pending++;
+       } else if (map_state == IMSM_T_STATE_FAILED &&
+                  map->map_state != map_state) {
+               dprintf("imsm: mark failed\n");
+               end_migration(dev, map_state);
+               super->updates_pending++;
+       }
+}
+
+/* store_imsm_mpb - write the metadata buffer of 'super' to the device
+ * open on 'fd'.
+ *
+ * The first 512 bytes of super->buf (the anchor) go to the second to last
+ * sector of the device.  When the mpb is larger than one sector the rest
+ * of the buffer is written to the sectors directly in front of the anchor.
+ *
+ * Returns 0 on success and 1 when the device size cannot be determined or
+ * a seek or write fails.
+ */
+static int store_imsm_mpb(int fd, struct intel_super *super)
+{
+       struct imsm_super *mpb = super->anchor;
+       __u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+       unsigned long long dsize;
+       unsigned long long sectors;
+
+       /* without a valid size every offset computed below would be garbage */
+       if (!get_dev_size(fd, NULL, &dsize))
+               return 1;
+
+       if (mpb_size > 512) {
+               /* -1 to account for anchor */
+               sectors = mpb_sectors(mpb) - 1;
+
+               /* write the extended mpb to the sectors preceding the anchor */
+               if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+                       return 1;
+
+               if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors)
+                       return 1;
+       }
+
+       /* first block is stored on second to last sector of the disk */
+       if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+               return 1;
+
+       if (write(fd, super->buf, 512) != 512)
+               return 1;
+
+       return 0;
+}
+
+/* imsm_sync_metadata - write out the container metadata if any update is
+ * pending, then clear the pending counter.  The result of the write is
+ * not examined.
+ */
+static void imsm_sync_metadata(struct supertype *container)
+{
+       struct intel_super *super = container->sb;
+
+       if (super->updates_pending) {
+               write_super_imsm(super, 0);
+               super->updates_pending = 0;
+       }
+}
+
+/* imsm_readd - find the disk that the metadata records for slot 'idx' of
+ * the array 'a' so that it can be added back.
+ *
+ * Returns the matching entry of super->disks, or NULL when no such disk
+ * is present or the disk is flagged FAILED_DISK.
+ */
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+       struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+       int wanted = get_imsm_disk_idx(dev, idx);
+       struct dl *cand = super->disks;
+
+       while (cand && cand->index != wanted)
+               cand = cand->next;
+
+       if (!cand || (__le32_to_cpu(cand->disk.status) & FAILED_DISK))
+               return NULL;
+
+       dprintf("%s: found %x:%x\n", __func__, cand->major, cand->minor);
+
+       return cand;
+}
+
+/* imsm_add_spare - pick a container disk that can take over slot 'slot'
+ * of the array 'a'.
+ *
+ * A disk is passed over when it is already an open member of the array,
+ * is flagged FAILED_DISK, is the disk currently recorded for the slot, or
+ * has no free gap that covers a->info.component_size sectors starting at
+ * the array's pba_of_lba0.
+ *
+ * Returns the first suitable entry of super->disks, or NULL.
+ */
+static struct dl *imsm_add_spare(struct intel_super *super, int slot, struct active_array *a)
+{
+       struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+       int idx = get_imsm_disk_idx(dev, slot);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       __u32 array_start = __le32_to_cpu(map->pba_of_lba0);
+       struct dl *cand;
+
+       for (cand = super->disks; cand; cand = cand->next) {
+               unsigned long long pos = 0;
+               struct mdinfo *member;
+               struct extent *ex;
+               __u32 status;
+               int fits = 0;
+               int j = 0;
+
+               /* If in this array, skip */
+               member = a->info.devs;
+               while (member) {
+                       if (member->state_fd >= 0 &&
+                           member->disk.major == cand->major &&
+                           member->disk.minor == cand->minor) {
+                               dprintf("%x:%x already in array\n", cand->major, cand->minor);
+                               break;
+                       }
+                       member = member->next;
+               }
+               if (member)
+                       continue;
+
+               /* skip in use or failed drives */
+               status = __le32_to_cpu(cand->disk.status);
+               if (status & FAILED_DISK || idx == cand->index) {
+                       dprintf("%x:%x status ( %s%s)\n",
+                       cand->major, cand->minor,
+                       status & FAILED_DISK ? "failed " : "",
+                       idx == cand->index ? "in use " : "");
+                       continue;
+               }
+
+               /* Does this unused device have the requisite free space?
+                * We need a->info.component_size sectors
+                */
+               ex = get_extents(super, cand);
+               if (!ex) {
+                       dprintf("cannot get extents\n");
+                       continue;
+               }
+
+               /* Walk the gaps between extents; the list ends with an
+                * entry of size 0.  The gap in front of extent j is
+                * [pos, ex[j].start) and must contain the array's region.
+                */
+               for (;;) {
+                       if (array_start >= pos &&
+                           array_start + a->info.component_size < ex[j].start) {
+                               fits = 1;
+                               break;
+                       }
+                       pos = ex[j].start + ex[j].size;
+                       j++;
+                       if (!ex[j-1].size)
+                               break;
+               }
+               free(ex);
+
+               if (fits)
+                       break;
+
+               /* No room */
+               dprintf("%x:%x does not have %llu at %d\n",
+                       cand->major, cand->minor,
+                       a->info.component_size,
+                       __le32_to_cpu(map->pba_of_lba0));
+       }
+
+       return cand;
+}
+
+/* imsm_activate_spare - choose a replacement disk for a degraded array.
+ *
+ * Returns a list of mdinfo describing the chosen device(s) and pushes a
+ * matching metadata_update (type update_activate_spare) onto *updates.
+ * Returns NULL, leaving *updates untouched, when a faulty member still has
+ * its state_fd open, the array is not in the degraded state, no usable
+ * disk is found, or an allocation fails.
+ */
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+                                         struct metadata_update **updates)
+{
+       /**
+        * Find a device with unused free space and use it to replace a
+        * failed/vacant region in an array.  We replace failed regions one
+        * array at a time.  The result is that a new spare disk will be added
+        * to the first failed array and after the monitor has finished
+        * propagating failures the remainder will be consumed.
+        *
+        * FIXME add a capability for mdmon to request spares from another
+        * container.
+        */
+
+       struct intel_super *super = a->container->sb;
+       int inst = a->info.container_member;
+       struct imsm_dev *dev = get_imsm_dev(super, inst);
+       struct imsm_map *map = get_imsm_map(dev, 0);
+       /* start from "everything failed" and subtract each open member */
+       int failed = a->info.array.raid_disks;
+       struct mdinfo *rv = NULL;
+       struct mdinfo *d;
+       struct mdinfo *di;
+       struct metadata_update *mu;
+       struct dl *dl;
+       struct imsm_update_activate_spare *u;
+       int num_spares = 0;
+       int i;
+
+       for (d = a->info.devs ; d ; d = d->next) {
+               if ((d->curr_state & DS_FAULTY) &&
+                       d->state_fd >= 0)
+                       /* wait for Removal to happen */
+                       return NULL;
+               if (d->state_fd >= 0)
+                       failed--;
+       }
+
+       dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+               inst, failed, a->info.array.raid_disks, a->info.array.level);
+       /* only a degraded array can make use of a spare */
+       if (imsm_check_degraded(super, dev, failed) != IMSM_T_STATE_DEGRADED)
+               return NULL;
+
+       /* For each slot, if it is not working, find a spare */
+       for (i = 0; i < a->info.array.raid_disks; i++) {
+               for (d = a->info.devs ; d ; d = d->next)
+                       if (d->disk.raid_disk == i)
+                               break;
+               dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+               /* a slot with an open state_fd is working */
+               if (d && (d->state_fd >= 0))
+                       continue;
+
+               /*
+                * OK, this device needs recovery.  Try to re-add the previous
+                * occupant of this slot, if this fails add a new spare
+                */
+               dl = imsm_readd(super, i, a);
+               if (!dl)
+                       dl = imsm_add_spare(super, i, a);
+               if (!dl)
+                       continue;
+               /* found a usable disk with enough space */
+               di = malloc(sizeof(*di));
+               if (!di)
+                       continue;
+               memset(di, 0, sizeof(*di));
+
+               /* dl->index will be -1 in the case we are activating a
+                * pristine spare.  imsm_process_update() will create a
+                * new index in this case.  Once a disk is found to be
+                * failed in all member arrays it is kicked from the
+                * metadata
+                */
+               di->disk.number = dl->index;
+
+               /* (ab)use di->devs to store a pointer to the device
+                * we chose
+                */
+               di->devs = (struct mdinfo *) dl;
+
+               di->disk.raid_disk = i;
+               di->disk.major = dl->major;
+               di->disk.minor = dl->minor;
+               di->disk.state = 0;
+               di->data_offset = __le32_to_cpu(map->pba_of_lba0);
+               di->component_size = a->info.component_size;
+               di->container_member = inst;
+               di->next = rv;
+               rv = di;
+               num_spares++;
+               dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+                       i, di->data_offset);
+
+               /* stop after the first replacement, so num_spares never
+                * exceeds 1 in this function
+                */
+               break;
+       }
+
+       if (!rv)
+               /* No spares found */
+               return rv;
+       /* Now 'rv' has a list of devices to return.
+        * Create a metadata_update record to update the
+        * disk_ord_tbl for the array
+        */
+       mu = malloc(sizeof(*mu));
+       if (mu) {
+               /* one update record per chosen device, stored back to back */
+               mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares);
+               if (mu->buf == NULL) {
+                       free(mu);
+                       mu = NULL;
+               }
+       }
+       if (!mu) {
+               /* out of memory: give up and release the device list */
+               while (rv) {
+                       struct mdinfo *n = rv->next;
+
+                       free(rv);
+                       rv = n;
+               }
+               return NULL;
+       }
+                       
+       mu->space = NULL;
+       mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+       mu->next = *updates;
+       u = (struct imsm_update_activate_spare *) mu->buf;
+
+       /* fill one record per device and chain the records through ->next;
+        * the disk pointer smuggled in di->devs is moved into the record
+        * and di->devs is cleared again
+        */
+       for (di = rv ; di ; di = di->next) {
+               u->type = update_activate_spare;
+               u->dl = (struct dl *) di->devs;
+               di->devs = NULL;
+               u->slot = di->disk.raid_disk;
+               u->array = inst;
+               u->next = u + 1;
+               u++;
+       }
+       /* terminate the chain at the last record that was filled in */
+       (u-1)->next = NULL;
+       *updates = mu;
+
+       return rv;
+}
+
+/* disks_overlap - 1 when the two volumes have at least one disk index in
+ * common in their first maps, 0 otherwise.
+ */
+static int disks_overlap(struct imsm_dev *d1, struct imsm_dev *d2)
+{
+       struct imsm_map *m1 = get_imsm_map(d1, 0);
+       struct imsm_map *m2 = get_imsm_map(d2, 0);
+       int s1;
+       int s2;
+
+       for (s1 = 0; s1 < m1->num_members; s1++) {
+               int idx1 = get_imsm_disk_idx(d1, s1);
+
+               for (s2 = 0; s2 < m2->num_members; s2++) {
+                       if (get_imsm_disk_idx(d2, s2) == idx1)
+                               return 1;
+               }
+       }
+
+       return 0;
+}
+
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index);
+
+static void imsm_process_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * crack open the metadata_update envelope and apply the update record:
+        *      update_activate_spare - a spare has replaced a failed device:
+        *      fix the disk_ord_tbl, mark the rebuild, delete an unused victim
+        *      update_create_array - install a new raid device unless it lost
+        *      a race against another create or an activate_spare
+        *      update_add_disk - move the disks queued on ->add onto ->disks
+        */
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb;
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+       /* update requires a larger buf but the allocation failed */
+       if (super->next_len && !super->next_buf) {
+               super->next_len = 0;
+               return;
+       }
+
+       if (super->next_buf) {
+               memcpy(super->next_buf, super->buf, super->len);
+               free(super->buf);
+               super->len = super->next_len;
+               super->buf = super->next_buf;
+
+               super->next_len = 0;
+               super->next_buf = NULL;
+       }
+
+       mpb = super->anchor;
+
+       switch (type) {
+       case update_activate_spare: {
+               struct imsm_update_activate_spare *u = (void *) update->buf;
+               struct imsm_dev *dev = get_imsm_dev(super, u->array);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               struct imsm_map *migr_map;
+               struct active_array *a;
+               struct imsm_disk *disk;
+               __u32 status;
+               __u8 to_state;
+               struct dl *dl;
+               unsigned int found;
+               int failed;
+               int victim = get_imsm_disk_idx(dev, u->slot);
+               int i;
+
+               for (dl = super->disks; dl; dl = dl->next)
+                       if (dl == u->dl)
+                               break;
+
+               if (!dl) {
+                       fprintf(stderr, "error: imsm_activate_spare passed "
+                               "an unknown disk (index: %d)\n",
+                               u->dl->index);
+                       return;
+               }
+
+               super->updates_pending++;
+
+               /* count failures (excluding rebuilds and the victim)
+                * to determine map[0] state
+                */
+               failed = 0;
+               for (i = 0; i < map->num_members; i++) {
+                       if (i == u->slot)
+                               continue;
+                       disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+                       if (!disk ||
+                           __le32_to_cpu(disk->status) & FAILED_DISK)
+                               failed++;
+               }
+
+               /* adding a pristine spare, assign a new index */
+               if (dl->index < 0) {
+                       dl->index = super->anchor->num_disks;
+                       super->anchor->num_disks++;
+               }
+               disk = &dl->disk;
+               status = __le32_to_cpu(disk->status);
+               status |= CONFIGURED_DISK;
+               status &= ~SPARE_DISK;
+               disk->status = __cpu_to_le32(status);
+
+               /* mark rebuild */
+               to_state = imsm_check_degraded(super, dev, failed);
+               map->map_state = IMSM_T_STATE_DEGRADED;
+               migrate(dev, to_state, 1);
+               migr_map = get_imsm_map(dev, 1);
+               set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+               set_imsm_ord_tbl_ent(migr_map, u->slot, dl->index | IMSM_ORD_REBUILD);
+
+               /* count arrays still using the victim, per each array's own map */
+               found = 0;
+               for (a = st->arrays; a ; a = a->next) {
+                       dev = get_imsm_dev(super, a->info.container_member);
+                       map = get_imsm_map(dev, 0);
+                       for (i = 0; i < map->num_members; i++)
+                               if (victim == get_imsm_disk_idx(dev, i))
+                                       found++;
+               }
+
+               /* delete the victim if it is no longer being
+                * utilized anywhere
+                */
+               if (!found) {
+                       struct dl **dlp;
+
+                       /* We know that 'manager' isn't touching anything,
+                        * so it is safe to delete
+                        */
+                       for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+                               if ((*dlp)->index == victim)
+                                       break;
+
+                       /* victim may be on the missing list */
+                       if (!*dlp)
+                               for (dlp = &super->missing; *dlp; dlp = &(*dlp)->next)
+                                       if ((*dlp)->index == victim)
+                                               break;
+                       imsm_delete(super, dlp, victim);
+               }
+               break;
+       }
+       case update_create_array: {
+               /* someone wants to create a new array, we need to be aware of
+                * a few races/collisions:
+                * 1/ 'Create' called by two separate instances of mdadm
+                * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+                *     devices that have since been assimilated via
+                *     activate_spare.
+                * In the event this update can not be carried out mdadm will
+                * (FIX ME) notice that its update did not take hold.
+                */
+               struct imsm_update_create_array *u = (void *) update->buf;
+               struct imsm_dev *dev;
+               struct imsm_map *map, *new_map;
+               unsigned long long start, end;
+               unsigned long long new_start, new_end;
+               int i;
+               int overlap = 0;
+
+               /* handle racing creates: first come first serve */
+               if (u->dev_idx < mpb->num_raid_devs) {
+                       dprintf("%s: subarray %d already defined\n",
+                               __func__, u->dev_idx);
+                       return;
+               }
+
+               /* check update is next in sequence */
+               if (u->dev_idx != mpb->num_raid_devs) {
+                       dprintf("%s: can not create array %d expected index %d\n",
+                               __func__, u->dev_idx, mpb->num_raid_devs);
+                       return;
+               }
+
+               new_map = get_imsm_map(&u->dev, 0);
+               new_start = __le32_to_cpu(new_map->pba_of_lba0);
+               new_end = new_start + __le32_to_cpu(new_map->blocks_per_member);
+
+               /* handle activate_spare versus create race:
+                * check to make sure that overlapping arrays do not include
+                * overlapping disks
+                */
+               for (i = 0; i < mpb->num_raid_devs; i++) {
+                       dev = get_imsm_dev(super, i);
+                       map = get_imsm_map(dev, 0);
+                       start = __le32_to_cpu(map->pba_of_lba0);
+                       end = start + __le32_to_cpu(map->blocks_per_member);
+                       overlap = (new_start >= start && new_start <= end) ||
+                                 (start >= new_start && start <= new_end);
+                       if (overlap && disks_overlap(dev, &u->dev)) {
+                               dprintf("%s: arrays overlap\n", __func__);
+                               return;
+                       }
+               }
+               /* check num_members sanity */
+               if (new_map->num_members > mpb->num_disks) {
+                       dprintf("%s: num_disks out of range\n", __func__);
+                       return;
+               }
+
+               /* check that prepare update was successful */
+               if (!update->space) {
+                       dprintf("%s: prepare update failed\n", __func__);
+                       return;
+               }
+
+               super->updates_pending++;
+               /* adopt the buffer that prepare_update allocated */
+               dev = update->space;
+               update->space = NULL;
+               imsm_copy_dev(dev, &u->dev);
+               map = get_imsm_map(dev, 0);
+               super->dev_tbl[u->dev_idx] = dev;
+               mpb->num_raid_devs++;
+
+               /* fix up flags */
+               for (i = 0; i < map->num_members; i++) {
+                       struct imsm_disk *disk;
+                       __u32 status;
+
+                       disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i));
+                       status = __le32_to_cpu(disk->status);
+                       status |= CONFIGURED_DISK;
+                       status &= ~SPARE_DISK;
+                       disk->status = __cpu_to_le32(status);
+               }
+               break;
+       }
+       case update_add_disk:
+
+               /* we may be able to repair some arrays if disks are
+                * being added */
+               if (super->add) {
+                       struct active_array *a;
+
+                       super->updates_pending++;
+                       for (a = st->arrays; a; a = a->next)
+                               a->check_degraded = 1;
+               }
+               /* add some spares to the metadata */
+               while (super->add) {
+                       struct dl *al;
+
+                       al = super->add;
+                       super->add = al->next;
+                       al->next = super->disks;
+                       super->disks = al;
+                       dprintf("%s: added %x:%x\n",
+                               __func__, al->major, al->minor);
+               }
+
+               break;
+       }
+}
+
+static void imsm_prepare_update(struct supertype *st,
+                               struct metadata_update *update)
+{
+       /**
+        * Allocate space to hold new disk entries, raid-device entries or a new
+        * mpb if necessary.  The manager synchronously waits for updates to
+        * complete in the monitor, so new mpb buffers allocated here can be
+        * integrated by the monitor thread without worrying about live pointers
+        * in the manager thread.
+        */
+       enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+       struct intel_super *super = st->sb;
+       struct imsm_super *mpb = super->anchor;
+       size_t buf_len;
+       size_t len = 0;
+
+       switch (type) {
+       case update_create_array: {
+               struct imsm_update_create_array *u = (void *) update->buf;
+
+               len = sizeof_imsm_dev(&u->dev, 1);
+               update->space = malloc(len);
+               break;
+       }
+       default:
+               break;
+       }
+
+       /* check if we need a larger metadata buffer */
+       if (super->next_buf)
+               buf_len = super->next_len;
+       else
+               buf_len = super->len;
+
+       if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) {
+               /* ok we need a larger buf than what is currently allocated
+                * if this allocation fails process_update will notice that
+                * ->next_len is set and ->next_buf is NULL
+                */
+               buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512);
+               free(super->next_buf);
+
+               /* posix_memalign takes (memptr, alignment, size) in that order */
+               super->next_len = buf_len;
+               if (posix_memalign(&super->next_buf, 512, buf_len) != 0)
+                       super->next_buf = NULL;
+       }
+}
+
+/* the manager must be quiesced while this runs */
+static void imsm_delete(struct intel_super *super, struct dl **dlp, int index)
+{
+       struct imsm_super *mpb = super->anchor;
+       struct dl *d, *gone;
+       int devnum, slot;
+
+       dprintf("%s: deleting device[%d] from imsm_super\n",
+               __func__, index);
+
+       /* disks numbered above 'index' each move down one place */
+       for (d = super->disks; d; d = d->next)
+               if (d->index > index)
+                       d->index--;
+       for (d = super->missing; d; d = d->next)
+               if (d->index > index)
+                       d->index--;
+
+       /* apply the same renumbering to every raid device's ord table */
+       for (devnum = 0; devnum < mpb->num_raid_devs; devnum++) {
+               struct imsm_dev *dev = get_imsm_dev(super, devnum);
+               struct imsm_map *map = get_imsm_map(dev, 0);
+               int num_members = map->num_members;
+
+               for (slot = 0; slot < num_members; slot++) {
+                       __u32 ord = get_imsm_ord_tbl_ent(dev, slot);
+
+                       if (ord_to_idx(ord) > index) {
+                               /* do not propagate ord-flags into the first
+                                * map; the second map (if any) keeps them
+                                */
+                               map = get_imsm_map(dev, 0);
+                               set_imsm_ord_tbl_ent(map, slot,
+                                                    ord_to_idx(ord - 1));
+                               map = get_imsm_map(dev, 1);
+                               if (map)
+                                       set_imsm_ord_tbl_ent(map, slot, ord - 1);
+                       }
+               }
+       }
+
+       mpb->num_disks--;
+       super->updates_pending++;
+
+       /* nothing to unlink when the caller did not find a list entry */
+       gone = *dlp;
+       if (!gone)
+               return;
+       *dlp = gone->next;
+       __free_imsm_disk(gone);
+}
+#endif /* MDASSEMBLE */
+
+struct superswitch super_imsm = {
+#ifndef        MDASSEMBLE
+       .examine_super  = examine_super_imsm,
+       .brief_examine_super = brief_examine_super_imsm,
+       .detail_super   = detail_super_imsm,
+       .brief_detail_super = brief_detail_super_imsm,
+       .write_init_super = write_init_super_imsm,
+       .validate_geometry = validate_geometry_imsm,
+       .add_to_super   = add_to_super_imsm,
+#endif
+       .match_home     = match_home_imsm,
+       .uuid_from_super= uuid_from_super_imsm,
+       .getinfo_super  = getinfo_super_imsm,
+       .update_super   = update_super_imsm,
+
+       .avail_size     = avail_size_imsm,
+
+       .compare_super  = compare_super_imsm,
+
+       .load_super     = load_super_imsm,
+       .init_super     = init_super_imsm,
+       .store_super    = store_zero_imsm,
+       .free_super     = free_super_imsm,
+       .match_metadata_desc = match_metadata_desc_imsm,
+       .container_content = container_content_imsm,
+
+       .external       = 1,
+
+#ifndef MDASSEMBLE
+/* for mdmon */
+       .open_new       = imsm_open_new,
+       /* .load_super is set once above, for every build */
+       .set_array_state= imsm_set_array_state,
+       .set_disk       = imsm_set_disk,
+       .sync_metadata  = imsm_sync_metadata,
+       .activate_spare = imsm_activate_spare,
+       .process_update = imsm_process_update,
+       .prepare_update = imsm_prepare_update,
+#endif /* MDASSEMBLE */
+};
index 90fdf23d075f46b1eceb90dfc0d6b6f6861efaba..92255c23cbec0f9f7e1776377bce2db03021a0d2 100644 (file)
--- a/super0.c
+++ b/super0.c
@@ -53,7 +53,7 @@ static unsigned long calc_sb0_csum(mdp_super_t *super)
 }
 
 
-void super0_swap_endian(struct mdp_superblock_s *sb)
+static void super0_swap_endian(struct mdp_superblock_s *sb)
 {
        /* as super0 superblocks are host-endian, it is sometimes
         * useful to be able to swap the endianness
@@ -300,18 +300,6 @@ static void brief_detail_super0(struct supertype *st)
        else
                printf("%08x", sb->set_uuid0);
 }
-
-static void export_detail_super0(struct supertype *st)
-{
-       mdp_super_t *sb = st->sb;
-       printf("MD_UUID=");
-       if (sb->minor_version >= 90)
-               printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
-                      sb->set_uuid2, sb->set_uuid3);
-       else
-               printf("%08x", sb->set_uuid0);
-       printf("\n");
-}
 #endif
 
 static int match_home0(struct supertype *st, char *homehost)
@@ -368,6 +356,9 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info)
        info->events = md_event(sb);
        info->data_offset = 0;
 
+       sprintf(info->text_version, "0.%d", sb->minor_version);
+       info->safe_mode_delay = 200;
+
        uuid_from_super0(st, info->uuid);
 
        if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) {
@@ -551,12 +542,17 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *ignored_name, char *homehost,
                       int *uuid)
 {
-       mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       mdp_super_t *sb;
        int spares;
+
+       if (posix_memalign((void**)&sb, 512, MD_SB_BYTES + sizeof(bitmap_super_t)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
        memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t));
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing the superblock */
                return 0;
        }
@@ -622,18 +618,42 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo {
+       int fd;
+       char *devname;
+       mdu_disk_info_t disk;
+       struct devinfo *next;
+};
+
+#ifndef MDASSEMBLE
 /* Add a device to the superblock being created */
-static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo)
+static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
+                         int fd, char *devname)
 {
        mdp_super_t *sb = st->sb;
        mdp_disk_t *dk = &sb->disks[dinfo->number];
+       struct devinfo *di, **dip;
 
        dk->number = dinfo->number;
        dk->major = dinfo->major;
        dk->minor = dinfo->minor;
        dk->raid_disk = dinfo->raid_disk;
        dk->state = dinfo->state;
+
+       sb->this_disk = sb->disks[dinfo->number];
+       sb->sb_csum = calc_sb0_csum(sb);
+
+       dip = (struct devinfo **)&st->info;
+       while (*dip)
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo));
+       di->fd = fd;
+       di->devname = devname;
+       di->disk = *dinfo;
+       di->next = NULL;
+       *dip = di;
 }
+#endif
 
 static int store_super0(struct supertype *st, int fd)
 {
@@ -660,7 +680,8 @@ static int store_super0(struct supertype *st, int fd)
        if (super->state & (1<<MD_SB_BITMAP_PRESENT)) {
                struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC)
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) != 
+                           ROUND_UP(sizeof(*bm),512))
                            return 5;
        }
 
@@ -668,32 +689,41 @@ static int store_super0(struct supertype *st, int fd)
        return 0;
 }
 
-static int write_init_super0(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super0(struct supertype *st)
 {
        mdp_super_t *sb = st->sb;
-       int fd = open(devname, O_RDWR|O_EXCL);
-       int rv;
+       int rv = 0;
+       struct devinfo *di;
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname);
-               return -1;
-       }
+       for (di = st->info ; di && ! rv ; di = di->next) {
 
-       sb->disks[dinfo->number].state &= ~(1<<MD_DISK_FAULTY);
+               if (di->disk.state == 1)
+                       continue;
+               if (di->fd == -1)
+                       continue;
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1);
 
-       sb->this_disk = sb->disks[dinfo->number];
-       sb->sb_csum = calc_sb0_csum(sb);
-       rv = store_super0(st, fd);
+               sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY);
 
-       if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
-               rv = st->ss->write_bitmap(st, fd);
+               sb->this_disk = sb->disks[di->disk.number];
+               sb->sb_csum = calc_sb0_csum(sb);
+               rv = store_super0(st, di->fd);
 
-       close(fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
+                       rv = st->ss->write_bitmap(st, di->fd);
+
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
+               close(di->fd);
+               di->fd = -1;
+       }
        return rv;
 }
+#endif
 
 static int compare_super0(struct supertype *st, struct supertype *tst)
 {
@@ -711,7 +741,12 @@ static int compare_super0(struct supertype *st, struct supertype *tst)
        if (second->md_magic != MD_SB_MAGIC)
                return 1;
        if (!first) {
-               first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s));
+               if (posix_memalign((void**)&first, 512, 
+                              MD_SB_BYTES + sizeof(struct bitmap_super_s)) != 0) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate superblock\n", __func__);
+                       return 1;
+               }
                memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s));
                st->sb = first;
                return 0;
@@ -753,6 +788,9 @@ static int load_super0(struct supertype *st, int fd, char *devname)
 
        free_super0(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (!get_dev_size(fd, devname, &dsize))
                return 1;
 
@@ -777,7 +815,12 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t));
+       if (posix_memalign((void**)&super, 512,
+                          MD_SB_BYTES + sizeof(bitmap_super_t)+512) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate superblock\n", __func__);
+               return 1;
+       }
 
        if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) {
                if (devname)
@@ -811,6 +854,7 @@ static int load_super0(struct supertype *st, int fd, char *devname)
                st->ss = &super0;
                st->minor_version = super->minor_version;
                st->max_devs = MD_SB_DISKS;
+               st->info = NULL;
        }
 
        /* Now check on the bitmap superblock */
@@ -820,8 +864,8 @@ static int load_super0(struct supertype *st, int fd, char *devname)
         * valid.  If it doesn't clear the bit.  An --assemble --force
         * should get that written out.
         */
-       if (read(fd, super+1, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),512))
+           != ROUND_UP(sizeof(struct bitmap_super_s),512))
                goto no_bitmap;
 
        uuid_from_super0(st, uuid);
@@ -842,12 +886,14 @@ static struct supertype *match_metadata_desc0(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super0;
+       st->info = NULL;
        st->minor_version = 90;
        st->max_devs = MD_SB_DISKS;
        st->sb = NULL;
-       /* Eliminate pointless leading 0 from some versions of mdadm -D */
-       if (strncmp(arg, "00.", 3) == 0)
+       /* we sometimes get 00.90 */
+       while (arg[0] == '0' && arg[1] == '0')
                arg++;
        if (strcmp(arg, "0") == 0 ||
            strcmp(arg, "0.90") == 0 ||
@@ -921,7 +967,7 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp,
 }
 
 
-void locate_bitmap0(struct supertype *st, int fd)
+static void locate_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -941,7 +987,7 @@ void locate_bitmap0(struct supertype *st, int fd)
        lseek64(fd, offset, 0);
 }
 
-int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd)
 {
        unsigned long long dsize;
        unsigned long long offset;
@@ -950,7 +996,8 @@ int write_bitmap0(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char abuf[4096+512];
+       char *buf = (char*)(((long)(abuf+512))&~511UL);
 
        if (!get_dev_size(fd, NULL, &dsize))
                return 1;
@@ -966,21 +1013,19 @@ int write_bitmap0(struct supertype *st, int fd)
        if (lseek64(fd, offset + 4096, 0)< 0LL)
                return 3;
 
-
-       if (write(fd, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
-       towrite = 64*1024 - MD_SB_BYTES - sizeof(bitmap_super_t);
-       memset(buf, 0xff, sizeof(buf));
+       memset(buf, 0xff, 4096);
+       memcpy(buf,  ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t));
+       towrite = 64*1024;
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -996,6 +1041,48 @@ static void free_super0(struct supertype *st)
        st->sb = NULL;
 }
 
+#ifndef MDASSEMBLE
+static int validate_geometry0(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       /* Decide whether 0.90 metadata can describe the requested array.
+        * Returns 0 to refuse and 1 to accept; when 'subdev' is given the
+        * device is also probed and *freesize is filled in from its size.
+        */
+       unsigned long long ldsize;
+       int have_size;
+       int fd;
+
+       if (level == LEVEL_CONTAINER ||
+           raiddisks > MD_SB_DISKS ||
+           size > (0x7fffffffULL<<10))
+               return 0;
+
+       /* no device to probe: the geometry alone is acceptable */
+       if (subdev == NULL)
+               return 1;
+
+       fd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super0.90 cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+       have_size = get_dev_size(fd, subdev, &ldsize) != 0;
+       close(fd);
+
+       /* refuse devices smaller than MD_RESERVED_SECTORS sectors */
+       if (!have_size || ldsize < MD_RESERVED_SECTORS * 512)
+               return 0;
+       *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9);
+       return 1;
+}
+#endif /* MDASSEMBLE */
+
 struct superswitch super0 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super0,
@@ -1003,16 +1090,16 @@ struct superswitch super0 = {
        .export_examine_super = export_examine_super0,
        .detail_super = detail_super0,
        .brief_detail_super = brief_detail_super0,
-       .export_detail_super = export_detail_super0,
+       .write_init_super = write_init_super0,
+       .validate_geometry = validate_geometry0,
+       .add_to_super = add_to_super0,
 #endif
        .match_home = match_home0,
        .uuid_from_super = uuid_from_super0,
        .getinfo_super = getinfo_super0,
        .update_super = update_super0,
        .init_super = init_super0,
-       .add_to_super = add_to_super0,
        .store_super = store_super0,
-       .write_init_super = write_init_super0,
        .compare_super = compare_super0,
        .load_super = load_super0,
        .match_metadata_desc = match_metadata_desc0,
@@ -1021,6 +1108,4 @@ struct superswitch super0 = {
        .locate_bitmap = locate_bitmap0,
        .write_bitmap = write_bitmap0,
        .free_super = free_super0,
-       .major = 0,
-       .swapuuid = 0,
 };
index 1eb88aac1d4f39371ba5b34fed8e001b19b62b14..4cfd786009a6c9a8b24c944d0fea9f44f04b3321 100644 (file)
--- a/super1.c
+++ b/super1.c
@@ -454,12 +454,6 @@ static void export_detail_super1(struct supertype *st)
                }
        if (len)
                printf("MD_NAME=%.*s\n", len, sb->set_name);
-       printf("MD_UUID=");
-       for (i=0; i<16; i++) {
-               if ((i&3)==0 && i != 0) printf(":");
-               printf("%02x", sb->set_uuid[i]);
-       }
-       printf("\n");
 }
 
 #endif
@@ -491,7 +485,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
        int role;
 
        info->array.major_version = 1;
-       info->array.minor_version = __le32_to_cpu(sb->feature_map);
+       info->array.minor_version = st->minor_version;
        info->array.patch_version = 0;
        info->array.raid_disks = __le32_to_cpu(sb->raid_disks);
        info->array.level = __le32_to_cpu(sb->level);
@@ -529,6 +523,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info)
                info->disk.raid_disk = role;
        }
        info->events = __le64_to_cpu(sb->events);
+       sprintf(info->text_version, "1.%d", st->minor_version);
+       info->safe_mode_delay = 200;
 
        memcpy(info->uuid, sb->set_uuid, 16);
 
@@ -687,7 +683,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
            __le64_to_cpu(sb->data_offset)) {
                /* set data_size to device size less data_offset */
                struct misc_dev_info *misc = (struct misc_dev_info*)
-                       (st->sb + 1024 + sizeof(struct bitmap_super_s));
+                       (st->sb + 1024 + 512);
                printf("Size was %llu\n", (unsigned long long)
                       __le64_to_cpu(sb->data_size));
                sb->data_size = __cpu_to_le64(
@@ -705,15 +701,21 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 static int init_super1(struct supertype *st, mdu_array_info_t *info,
                       unsigned long long size, char *name, char *homehost, int *uuid)
 {
-       struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) +
-                                            sizeof(struct misc_dev_info));
+       struct mdp_superblock_1 *sb;
        int spares;
        int rfd;
        char defname[10];
+
+       if (posix_memalign((void**)&sb, 512, (1024 + 512 + 
+                          sizeof(struct misc_dev_info))) != 0) {
+               fprintf(stderr, Name
+                       ": %s could not allocate superblock\n", __func__);
+               return 0;
+       }
        memset(sb, 0, 1024);
 
        st->sb = sb;
-       if (info->major_version == -1) {
+       if (info == NULL) {
                /* zeroing superblock */
                return 0;
        }
@@ -784,18 +786,42 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
        return 1;
 }
 
+struct devinfo {
+       int fd;
+       char *devname;
+       mdu_disk_info_t disk;
+       struct devinfo *next;
+};
+#ifndef MDASSEMBLE
 /* Add a device to the superblock being created */
-static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk)
+static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
+                         int fd, char *devname)
 {
        struct mdp_superblock_1 *sb = st->sb;
        __u16 *rp = sb->dev_roles + dk->number;
+       struct devinfo *di, **dip;
+
        if ((dk->state & 6) == 6) /* active, sync */
                *rp = __cpu_to_le16(dk->raid_disk);
        else if ((dk->state & ~2) == 0) /* active or idle -> spare */
                *rp = 0xffff;
        else
                *rp = 0xfffe;
+
+       sb->dev_number = __cpu_to_le32(dk->number);
+       sb->sb_csum = calc_sb_1_csum(sb);
+
+       dip = (struct devinfo **)&st->info;
+       while (*dip)
+               dip = &(*dip)->next;
+       di = malloc(sizeof(struct devinfo));
+       di->fd = fd;
+       di->devname = devname;
+       di->disk = *dk;
+       di->next = NULL;
+       *dip = di;
 }
+#endif
 
 static void locate_bitmap1(struct supertype *st, int fd);
 
@@ -851,6 +877,7 @@ static int store_super1(struct supertype *st, int fd)
                return 3;
 
        sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev);
+       sbsize = (sbsize+511)&(~511UL);
 
        if (write(fd, sb, sbsize) != sbsize)
                return 4;
@@ -860,7 +887,8 @@ static int store_super1(struct supertype *st, int fd)
                        (((char*)sb)+1024);
                if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) {
                        locate_bitmap1(st, fd);
-                       if (write(fd, bm, sizeof(*bm)) != sizeof(*bm))
+                       if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) !=
+                           ROUND_UP(sizeof(*bm),512))
                            return 5;
                }
        }
@@ -883,123 +911,133 @@ static unsigned long choose_bm_space(unsigned long devsize)
        return 4*2;
 }
 
-static int write_init_super1(struct supertype *st,
-                            mdu_disk_info_t *dinfo, char *devname)
+#ifndef MDASSEMBLE
+static int write_init_super1(struct supertype *st)
 {
        struct mdp_superblock_1 *sb = st->sb;
        struct supertype refst;
-       int fd = open(devname, O_RDWR | O_EXCL);
        int rfd;
-       int rv;
+       int rv = 0;
        int bm_space;
-
+       struct devinfo *di;
        unsigned long long dsize, array_size;
        long long sb_offset;
 
+       for (di = st->info; di && ! rv ; di = di->next) {
+               if (di->disk.state == 1)
+                       continue;
+               if (di->fd < 0)
+                       continue;
 
-       if (fd < 0) {
-               fprintf(stderr, Name ": Failed to open %s to write superblock\n",
-                       devname);
-               return -1;
-       }
+               Kill(di->devname, 0, 1, 1);
+               Kill(di->devname, 0, 1, 1);
 
-       sb->dev_number = __cpu_to_le32(dinfo->number);
-       if (dinfo->state & (1<<MD_DISK_WRITEMOSTLY))
-               sb->devflags |= __cpu_to_le32(WriteMostly1);
+               sb->dev_number = __cpu_to_le32(di->disk.number);
+               if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY))
+                       sb->devflags |= __cpu_to_le32(WriteMostly1);
 
-       if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
-           read(rfd, sb->device_uuid, 16) != 16) {
-               *(__u32*)(sb->device_uuid) = random();
-               *(__u32*)(sb->device_uuid+4) = random();
-               *(__u32*)(sb->device_uuid+8) = random();
-               *(__u32*)(sb->device_uuid+12) = random();
-       }
-       if (rfd >= 0) close(rfd);
-       sb->events = 0;
-
-       refst =*st;
-       refst.sb = NULL;
-       if (load_super1(&refst, fd, NULL)==0) {
-               struct mdp_superblock_1 *refsb = refst.sb;
-
-               memcpy(sb->device_uuid, refsb->device_uuid, 16);
-               if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
-                       /* same array, so preserve events and dev_number */
-                       sb->events = refsb->events;
-                       /* bugs in 2.6.17 and earlier mean the dev_number
-                        * chosen in Manage must be preserved
-                        */
-                       if (get_linux_version() >= 2006018)
-                               sb->dev_number = refsb->dev_number;
+               if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
+                   read(rfd, sb->device_uuid, 16) != 16) {
+                       *(__u32*)(sb->device_uuid) = random();
+                       *(__u32*)(sb->device_uuid+4) = random();
+                       *(__u32*)(sb->device_uuid+8) = random();
+                       *(__u32*)(sb->device_uuid+12) = random();
+               }
+               if (rfd >= 0) close(rfd);
+               sb->events = 0;
+
+               refst =*st;
+               refst.sb = NULL;
+               if (load_super1(&refst, di->fd, NULL)==0) {
+                       struct mdp_superblock_1 *refsb = refst.sb;
+
+                       memcpy(sb->device_uuid, refsb->device_uuid, 16);
+                       if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
+                               /* same array, so preserve events and
+                                * dev_number */
+                               sb->events = refsb->events;
+                               /* bugs in 2.6.17 and earlier mean the
+                                * dev_number chosen in Manage must be preserved
+                                */
+                               if (get_linux_version() >= 2006018)
+                                       sb->dev_number = refsb->dev_number;
+                       }
+                       free(refsb);
                }
-               free(refsb);
-       }
-
-       if (!get_dev_size(fd, NULL, &dsize))
-               return 1;
-       dsize >>= 9;
 
-       if (dsize < 24) {
-               close(fd);
-               return 2;
-       }
+               if (!get_dev_size(di->fd, NULL, &dsize))
+                       return 1;
+               dsize >>= 9;
 
+               if (dsize < 24) {
+                       close(di->fd);
+                       return 2;
+               }
 
-       /*
-        * Calculate the position of the superblock.
-        * It is always aligned to a 4K boundary and
-        * depending on minor_version, it can be:
-        * 0: At least 8K, but less than 12K, from end of device
-        * 1: At start of device
-        * 2: 4K from start of device.
-        * Depending on the array size, we might leave extra space
-        * for a bitmap.
-        */
-       array_size = __le64_to_cpu(sb->size);
-       /* work out how much space we left for a bitmap */
-       bm_space = choose_bm_space(array_size);
 
-       switch(st->minor_version) {
-       case 0:
-               sb_offset = dsize;
-               sb_offset -= 8*2;
-               sb_offset &= ~(4*2-1);
-               sb->super_offset = __cpu_to_le64(sb_offset);
-               sb->data_offset = __cpu_to_le64(0);
+               /*
+                * Calculate the position of the superblock.
+                * It is always aligned to a 4K boundary and
+                * depending on minor_version, it can be:
+                * 0: At least 8K, but less than 12K, from end of device
+                * 1: At start of device
+                * 2: 4K from start of device.
+                * Depending on the array size, we might leave extra space
+                * for a bitmap.
+                */
+               array_size = __le64_to_cpu(sb->size);
+               /* work out how much space we left for a bitmap */
+               bm_space = choose_bm_space(array_size);
+
+               switch(st->minor_version) {
+               case 0:
+                       sb_offset = dsize;
+                       sb_offset -= 8*2;
+                       sb_offset &= ~(4*2-1);
+                       sb->super_offset = __cpu_to_le64(sb_offset);
+                       sb->data_offset = __cpu_to_le64(0);
                if (sb_offset - bm_space < array_size)
                        bm_space = sb_offset - array_size;
-               sb->data_size = __cpu_to_le64(sb_offset - bm_space);
-               break;
-       case 1:
-               sb->super_offset = __cpu_to_le64(0);
-               if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2;
-               sb->data_offset = __cpu_to_le64(bm_space + 4*2);
-               sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
-               break;
-       case 2:
-               sb_offset = 4*2;
-               sb->super_offset = __cpu_to_le64(4*2);
-               if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
-                       bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2;
-               sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
-               sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space );
-               break;
-       default:
-               return -EINVAL;
-       }
+                       sb->data_size = __cpu_to_le64(sb_offset - bm_space);
+                       break;
+               case 1:
+                       sb->super_offset = __cpu_to_le64(0);
+                       if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size) -4*2;
+                       sb->data_offset = __cpu_to_le64(bm_space + 4*2);
+                       sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2);
+                       break;
+               case 2:
+                       sb_offset = 4*2;
+                       sb->super_offset = __cpu_to_le64(4*2);
+                       if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size)
+                           > dsize)
+                               bm_space = dsize - __le64_to_cpu(sb->size)
+                                       - 4*2 - 4*2;
+                       sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space);
+                       sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2
+                                                     - bm_space );
+                       break;
+               default:
+                       return -EINVAL;
+               }
 
 
-       sb->sb_csum = calc_sb_1_csum(sb);
-       rv = store_super1(st, fd);
-       if (rv)
-               fprintf(stderr, Name ": failed to write superblock to %s\n", devname);
+               sb->sb_csum = calc_sb_1_csum(sb);
+               rv = store_super1(st, di->fd);
+               if (rv)
+                       fprintf(stderr,
+                               Name ": failed to write superblock to %s\n",
+                               di->devname);
 
-       if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
-               rv = st->ss->write_bitmap(st, fd);
-       close(fd);
+               if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
+                       rv = st->ss->write_bitmap(st, di->fd);
+               close(di->fd);
+               di->fd = -1;
+       }
        return rv;
 }
+#endif
 
 static int compare_super1(struct supertype *st, struct supertype *tst)
 {
@@ -1019,9 +1057,14 @@ static int compare_super1(struct supertype *st, struct supertype *tst)
                return 1;
 
        if (!first) {
-               first = malloc(1024+sizeof(bitmap_super_t) +
-                              sizeof(struct misc_dev_info));
-               memcpy(first, second, 1024+sizeof(bitmap_super_t) +
+               if (posix_memalign((void**)&first, 512,
+                              1024 + 512 +
+                              sizeof(struct misc_dev_info)) != 0) {
+                       fprintf(stderr, Name
+                               ": %s could not allocate superblock\n", __func__);
+                       return 1;
+               }
+               memcpy(first, second, 1024 + 512 + 
                       sizeof(struct misc_dev_info));
                st->sb = first;
                return 0;
@@ -1052,13 +1095,16 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        free_super1(st);
 
+       if (st->subarray[0])
+               return 1;
+
        if (st->ss == NULL || st->minor_version == -1) {
                int bestvers = -1;
                struct supertype tst;
                __u64 bestctime = 0;
                /* guess... choose latest ctime */
+               memset(&tst, 0, sizeof(tst));
                tst.ss = &super1;
-               tst.sb = NULL;
                for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) {
                        switch(load_super1(&tst, fd, devname)) {
                        case 0: super = tst.sb;
@@ -1131,8 +1177,13 @@ static int load_super1(struct supertype *st, int fd, char *devname)
                return 1;
        }
 
-       super = malloc(1024 + sizeof(bitmap_super_t) +
-                      sizeof(struct misc_dev_info));
+       if (posix_memalign((void**)&super, 512,
+                      1024 + 512 +
+                      sizeof(struct misc_dev_info)) != 0) {
+               fprintf(stderr, Name ": %s could not allocate superblock\n",
+                       __func__);
+               return 1;
+       }
 
        if (read(fd, super, 1024) != 1024) {
                if (devname)
@@ -1168,7 +1219,7 @@ static int load_super1(struct supertype *st, int fd, char *devname)
 
        bsb = (struct bitmap_super_s *)(((char*)super)+1024);
 
-       misc = (struct misc_dev_info*) (bsb+1);
+       misc = (struct misc_dev_info*) (((char*)super)+1024+512);
        misc->device_size = dsize;
 
        /* Now check on the bitmap superblock */
@@ -1179,8 +1230,8 @@ static int load_super1(struct supertype *st, int fd, char *devname)
         * should get that written out.
         */
        locate_bitmap1(st, fd);
-       if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s))
-           != sizeof(struct bitmap_super_s))
+       if (read(fd, ((char*)super)+1024, 512)
+           != 512)
                goto no_bitmap;
 
        uuid_from_super1(st, uuid);
@@ -1200,26 +1251,30 @@ static struct supertype *match_metadata_desc1(char *arg)
        struct supertype *st = malloc(sizeof(*st));
        if (!st) return st;
 
+       memset(st, 0, sizeof(*st));
        st->ss = &super1;
        st->max_devs = 384;
        st->sb = NULL;
-       /* Eliminate pointless leading 0 from some versions of mdadm -D */
-       if (strncmp(arg, "01.", 3) == 0)
+       /* leading zeros can be safely ignored.  --detail generates them. */
+       while (*arg == '0')
                arg++;
-       if (strcmp(arg, "1.0") == 0) {
+       if (strcmp(arg, "1.0") == 0 ||
+           strcmp(arg, "1.00") == 0) {
                st->minor_version = 0;
                return st;
        }
-       if (strcmp(arg, "1.1") == 0) {
+       if (strcmp(arg, "1.1") == 0 ||
+           strcmp(arg, "1.01") == 0) {
                st->minor_version = 1;
                return st;
        }
-       if (strcmp(arg, "1.2") == 0) {
+       if (strcmp(arg, "1.2") == 0 ||
+           strcmp(arg, "1.02") == 0) {
                st->minor_version = 2;
                return st;
        }
        if (strcmp(arg, "1") == 0 ||
-           strcmp(arg, "default/large") == 0) {
+           strcmp(arg, "default") == 0) {
                st->minor_version = -1;
                return st;
        }
@@ -1413,25 +1468,28 @@ static int write_bitmap1(struct supertype *st, int fd)
        int rv = 0;
 
        int towrite, n;
-       char buf[4096];
+       char abuf[4096+512];
+       char *buf = (char*)(((long)(abuf+512))&~511UL);
 
        locate_bitmap1(st, fd);
 
-       if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) !=
-           sizeof(bitmap_super_t))
-               return -2;
+       memset(buf, 0xff, 4096);
+       memcpy(buf, ((char*)sb)+1024, sizeof(bitmap_super_t));
+
        towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
        towrite = (towrite+7) >> 3; /* bits to bytes */
-       memset(buf, 0xff, sizeof(buf));
+       towrite += sizeof(bitmap_super_t);
+       towrite = ROUND_UP(towrite, 512);
        while (towrite > 0) {
                n = towrite;
-               if (n > sizeof(buf))
-                       n = sizeof(buf);
+               if (n > 4096)
+                       n = 4096;
                n = write(fd, buf, n);
                if (n > 0)
                        towrite -= n;
                else
                        break;
+               memset(buf, 0xff, 4096);
        }
        fsync(fd);
        if (towrite)
@@ -1447,6 +1505,40 @@ static void free_super1(struct supertype *st)
        st->sb = NULL;
 }
 
+#ifndef MDASSEMBLE
+static int validate_geometry1(struct supertype *st, int level,
+                             int layout, int raiddisks,
+                             int chunk, unsigned long long size,
+                             char *subdev, unsigned long long *freesize,
+                             int verbose)
+{
+       unsigned long long ldsize;
+       int fd;
+
+       if (level == LEVEL_CONTAINER)
+               return 0;
+       if (!subdev)
+               return 1;
+
+       fd = open(subdev, O_RDONLY|O_EXCL, 0);
+       if (fd < 0) {
+               if (verbose)
+                       fprintf(stderr, Name ": super1.x cannot open %s: %s\n",
+                               subdev, strerror(errno));
+               return 0;
+       }
+
+       if (!get_dev_size(fd, subdev, &ldsize)) {
+               close(fd);
+               return 0;
+       }
+       close(fd);
+
+       *freesize = avail_size1(st, ldsize >> 9);
+       return 1;
+}
+#endif /* MDASSEMBLE */
+
 struct superswitch super1 = {
 #ifndef MDASSEMBLE
        .examine_super = examine_super1,
@@ -1455,15 +1547,16 @@ struct superswitch super1 = {
        .detail_super = detail_super1,
        .brief_detail_super = brief_detail_super1,
        .export_detail_super = export_detail_super1,
+       .write_init_super = write_init_super1,
+       .validate_geometry = validate_geometry1,
+       .add_to_super = add_to_super1,
 #endif
        .match_home = match_home1,
        .uuid_from_super = uuid_from_super1,
        .getinfo_super = getinfo_super1,
        .update_super = update_super1,
        .init_super = init_super1,
-       .add_to_super = add_to_super1,
        .store_super = store_super1,
-       .write_init_super = write_init_super1,
        .compare_super = compare_super1,
        .load_super = load_super1,
        .match_metadata_desc = match_metadata_desc1,
@@ -1472,7 +1565,6 @@ struct superswitch super1 = {
        .locate_bitmap = locate_bitmap1,
        .write_bitmap = write_bitmap1,
        .free_super = free_super1,
-       .major = 1,
 #if __BYTE_ORDER == BIG_ENDIAN
        .swapuuid = 0,
 #else
diff --git a/sysfs.c b/sysfs.c
index 6350242b6db175d0896cd01d9717ee6f63e03fd5..8bcdaa59cbac61f093025f4aea876bf2c7815844 100644 (file)
--- a/sysfs.c
+++ b/sysfs.c
@@ -25,6 +25,7 @@
 
 #include       "mdadm.h"
 #include       <dirent.h>
+#include       <ctype.h>
 
 int load_sys(char *path, char *buf)
 {
@@ -34,10 +35,10 @@ int load_sys(char *path, char *buf)
                return -1;
        n = read(fd, buf, 1024);
        close(fd);
-       if (n <=0 || n >= 1024)
+       if (n <0 || n >= 1024)
                return -1;
        buf[n] = 0;
-       if (buf[n-1] == '\n')
+       if (n && buf[n-1] == '\n')
                buf[n-1] = 0;
        return 0;
 }
@@ -56,37 +57,41 @@ void sysfs_free(struct mdinfo *sra)
        }
 }
 
-struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
+int sysfs_open(int devnum, char *devname, char *attr)
 {
-       /* Longest possible name in sysfs, mounted at /sys, is
-        *  /sys/block/md_dXXX/md/dev-XXXXX/block/dev
-        *  /sys/block/md_dXXX/md/metadata_version
-        * which is about 41 characters.  50 should do for now
-        */
        char fname[50];
-       char buf[1024];
-       char *base;
-       char *dbase;
-       struct mdinfo *sra;
-       struct mdinfo *dev;
-       DIR *dir;
-       struct dirent *de;
+       int fd;
+       char *mdname = devnum2devname(devnum);
 
-       sra = malloc(sizeof(*sra));
-       if (sra == NULL)
-               return sra;
-       sra->next = NULL;
+       if (!mdname)
+               return -1;
+
+       sprintf(fname, "/sys/block/%s/md/", mdname);
+       if (devname) {
+               strcat(fname, devname);
+               strcat(fname, "/");
+       }
+       strcat(fname, attr);
+       fd = open(fname, O_RDWR);
+       if (fd < 0 && errno == EACCES)
+               fd = open(fname, O_RDONLY);
+       free(mdname);
+       return fd;
+}
 
+void sysfs_init(struct mdinfo *mdi, int fd, int devnum)
+{
        if (fd >= 0) {
                struct stat stb;
                mdu_version_t vers;
-               if (fstat(fd, &stb)) return NULL;
+               if (fstat(fd, &stb))
+                       return;
                if (ioctl(fd, RAID_VERSION, &vers) != 0)
-                       return NULL;
+                       return;
                if (major(stb.st_rdev) == MD_MAJOR)
-                       sprintf(sra->sys_name, "md%d", (int)minor(stb.st_rdev));
+                       sprintf(mdi->sys_name, "md%d", (int)minor(stb.st_rdev));
                else if (major(stb.st_rdev) == get_mdp_major())
-                       sprintf(sra->sys_name, "md_d%d",
+                       sprintf(mdi->sys_name, "md_d%d",
                                (int)minor(stb.st_rdev)>>MdpMinorShift);
                else {
                        /* must be an extended-minor partition. Look at the
@@ -101,23 +106,47 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                                minor(stb.st_rdev));
                        n = readlink(path, link, sizeof(link)-1);
                        if (n <= 0)
-                               return NULL;
+                               return;
                        link[n] = 0;
                        cp = strrchr(link, '/');
                        if (cp) *cp = 0;
                        cp = strchr(link, '/');
                        if (cp && strncmp(cp, "/md", 3) == 0)
-                               strcpy(sra->sys_name, cp+1);
+                               strcpy(mdi->sys_name, cp+1);
                        else
-                               return NULL;
+                               return;
                }
        } else {
                if (devnum >= 0)
-                       sprintf(sra->sys_name, "md%d", devnum);
+                       sprintf(mdi->sys_name, "md%d", devnum);
                else
-                       sprintf(sra->sys_name, "md_d%d",
+                       sprintf(mdi->sys_name, "md_d%d",
                                -1-devnum);
        }
+}
+
+struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
+{
+       /* Longest possible name in sysfs, mounted at /sys, is
+        *  /sys/block/md_dXXX/md/dev-XXXXX/block/dev
+        *  /sys/block/md_dXXX/md/metadata_version
+        * which is about 41 characters.  50 should do for now
+        */
+       char fname[50];
+       char buf[1024];
+       char *base;
+       char *dbase;
+       struct mdinfo *sra;
+       struct mdinfo *dev;
+       DIR *dir = NULL;
+       struct dirent *de;
+
+       sra = malloc(sizeof(*sra));
+       if (sra == NULL)
+               return sra;
+       memset(sra, 0, sizeof(*sra));
+       sysfs_init(sra, fd, devnum);
+
        sprintf(fname, "/sys/block/%s/md/", sra->sys_name);
        base = fname + strlen(fname);
 
@@ -134,10 +163,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        sra->array.major_version = -1;
                        sra->array.minor_version = -2;
                        strcpy(sra->text_version, buf+9);
-               } else
+               } else {
                        sscanf(buf, "%d.%d",
                               &sra->array.major_version,
                               &sra->array.minor_version);
+                       strcpy(sra->text_version, buf);
+               }
        }
        if (options & GET_LEVEL) {
                strcpy(base, "level");
@@ -151,6 +182,18 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        goto abort;
                sra->array.layout = strtoul(buf, NULL, 0);
        }
+       if (options & GET_DISKS) {
+               strcpy(base, "raid_disks");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->array.raid_disks = strtoul(buf, NULL, 0);
+       }
+       if (options & GET_DEGRADED) {
+               strcpy(base, "degraded");
+               if (load_sys(fname, buf))
+                       goto abort;
+               sra->array.failed_disks = strtoul(buf, NULL, 0);
+       }
        if (options & GET_COMPONENT) {
                strcpy(base, "component_size");
                if (load_sys(fname, buf))
@@ -177,6 +220,35 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        goto abort;
                sra->mismatch_cnt = strtoul(buf, NULL, 0);
        }
+       if (options & GET_SAFEMODE) {
+               int scale = 1;
+               int dot = 0;
+               int i;
+               unsigned long msec;
+               size_t len;
+
+               strcpy(base, "safe_mode_delay");
+               if (load_sys(fname, buf))
+                       goto abort;
+
+               /* remove a period, and count digits after it */
+               len = strlen(buf);
+               for (i = 0; i < len; i++) {
+                       if (dot) {
+                               if (isdigit(buf[i])) {
+                                       buf[i-1] = buf[i];
+                                       scale *= 10;
+                               }
+                               buf[i] = 0;
+                       } else if (buf[i] == '.') {
+                               dot=1;
+                               buf[i] = 0;
+                       }
+               }
+               msec = strtoul(buf, NULL, 10);
+               msec = (msec * 1000) / scale;
+               sra->safe_mode_delay = msec;
+       }
 
        if (! (options & GET_DEVS))
                return sra;
@@ -200,14 +272,33 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                dev = malloc(sizeof(*dev));
                if (!dev)
                        goto abort;
-               dev->next = sra->devs;
-               sra->devs = dev;
-               strcpy(dev->sys_name, de->d_name);
 
                /* Always get slot, major, minor */
                strcpy(dbase, "slot");
-               if (load_sys(fname, buf))
-                       goto abort;
+               if (load_sys(fname, buf)) {
+                       /* hmm... unable to read 'slot' maybe the device
+                        * is going away?
+                        */
+                       strcpy(dbase, "block");
+                       if (readlink(fname, buf, sizeof(buf)) < 0 &&
+                           errno != ENAMETOOLONG) {
+                               /* ...yup device is gone */
+                               free(dev);
+                               continue;
+                       } else {
+                               /* slot is unreadable but 'block' link
+                                * still intact... something bad is happening
+                                * so abort
+                                */
+                               free(dev);
+                               goto abort;
+                       }
+                       
+               }
+               dev->next = sra->devs;
+               sra->devs = dev;
+
+               strcpy(dev->sys_name, de->d_name);
                dev->disk.raid_disk = strtoul(buf, &ep, 10);
                if (*ep) dev->disk.raid_disk = -1;
 
@@ -226,7 +317,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        strcpy(dbase, "size");
                        if (load_sys(fname, buf))
                                goto abort;
-                       dev->component_size = strtoull(buf, NULL, 0);
+                       dev->component_size = strtoull(buf, NULL, 0) * 2;
                }
                if (options & GET_STATE) {
                        dev->disk.state = 0;
@@ -247,13 +338,41 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options)
                        dev->errors = strtoul(buf, NULL, 0);
                }
        }
+       closedir(dir);
        return sra;
 
  abort:
+       if (dir)
+               closedir(dir);
        sysfs_free(sra);
        return NULL;
 }
 
+int sysfs_attr_match(const char *attr, const char *str)
+{
+       /* See if attr, read from a sysfs file, matches
+        * str.  They must either be the same, or attr can
+        * have a trailing newline or comma
+        */
+       while (*attr && *str && *attr == *str) {
+               attr++;
+               str++;
+       }
+
+       if (*str || (*attr && *attr != ',' && *attr != '\n'))
+               return 0;
+       return 1;
+}
+
+int sysfs_match_word(const char *word, char **list)
+{
+       int n;
+       for (n=0; list[n]; n++)
+               if (sysfs_attr_match(word, list[n]))
+                       break;
+       return n;
+}
+
 unsigned long long get_component_size(int fd)
 {
        /* Find out the component size of the array.
@@ -290,6 +409,7 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
        char fname[50];
        int n;
        int fd;
+
        sprintf(fname, "/sys/block/%s/md/%s/%s",
                sra->sys_name, dev?dev->sys_name:"", name);
        fd = open(fname, O_WRONLY);
@@ -297,8 +417,11 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
-       if (n != strlen(val))
+       if (n != strlen(val)) {
+               dprintf(Name ": failed to write '%s' to '%s' (%s)\n",
+                       val, fname, strerror(errno));
                return -1;
+       }
        return 0;
 }
 
@@ -333,3 +456,259 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
                return -1;
        return 0;
 }
+
+int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
+{
+       /* Write 'ms' milliseconds to the "safe_mode_delay" attribute of
+        * array 'sra', formatted as seconds with three decimal places.
+        * Returns the result of sysfs_set_str().
+        */
+       unsigned long sec = ms / 1000;
+       unsigned long msec = ms % 1000;
+       char delay[30];
+
+       /* both values are unsigned long, so the conversion must be %lu */
+       snprintf(delay, sizeof(delay), "%lu.%03lu\n", sec, msec);
+       /* the trailing '\n' is needed for kernels older than 2.6.28 */
+       return sysfs_set_str(sra, NULL, "safe_mode_delay", delay);
+}
+
+int sysfs_set_array(struct mdinfo *info, int vers)
+{
+       /* Describe the array in 'info' to the kernel through sysfs.
+        * 'vers' is the md version number supplied by the caller.
+        *
+        * Returns 1 if external metadata (major -1, minor -2) is wanted
+        * but cannot be selected, 0 without writing anything further if
+        * the level is negative, otherwise the OR of the results of the
+        * individual attribute writes (0 when all succeeded).
+        */
+       int rv = 0;
+       char ver[100];
+
+       ver[0] = 0;
+       if (info->array.major_version == -1 &&
+           info->array.minor_version == -2) {
+               /* bounded, so an over-long text_version cannot overrun 'ver' */
+               snprintf(ver, sizeof(ver), "external:%s", info->text_version);
+
+               if ((vers % 100) < 2 ||
+                   sysfs_set_str(info, NULL, "metadata_version",
+                                 ver) < 0) {
+                       fprintf(stderr, Name ": This kernel does not "
+                               "support external metadata.\n");
+                       return 1;
+               }
+       }
+       if (info->array.level < 0)
+               return 0; /* FIXME */
+       rv |= sysfs_set_str(info, NULL, "level",
+                           map_num(pers, info->array.level));
+       rv |= sysfs_set_num(info, NULL, "raid_disks", info->array.raid_disks);
+       rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size);
+       rv |= sysfs_set_num(info, NULL, "layout", info->array.layout);
+       /* component_size is halved before being written to sysfs */
+       rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2);
+       if (info->array.level > 0)
+               rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start);
+       return rv;
+}
+
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd)
+{
+       /* Add device 'sd' to array 'sra' through sysfs:
+        *  - write "major:minor" to the array's "new_dev" attribute,
+        *  - derive sd->sys_name ("dev-" + last component of the
+        *    /sys/dev/block/major:minor link target),
+        *  - write the per-device "offset", "size" and, unless the array
+        *    is a container, "slot" attributes.
+        * Returns the result of the "new_dev" write if that fails, -1 if
+        * the link cannot be read, otherwise the OR of the attribute
+        * write results.
+        */
+       char dv[100];
+       char nm[100];
+       char *dname;
+       int rv;
+
+       sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor);
+       rv = sysfs_set_str(sra, NULL, "new_dev", dv);
+       if (rv)
+               return rv;
+
+       memset(nm, 0, sizeof(nm));
+       sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor);
+       /* readlink() does not terminate and may fill the whole buffer,
+        * so keep one byte back for the '\0' written below.
+        */
+       rv = readlink(dv, nm, sizeof(nm) - 1);
+       if (rv <= 0)
+               return -1;
+       nm[rv] = '\0';
+       dname = strrchr(nm, '/');
+       if (dname)
+               dname++;
+       else
+               dname = nm;     /* link target has no directory part */
+       /* NOTE(review): the size of sd->sys_name is not visible here;
+        * confirm it can hold "dev-" plus the device name.
+        */
+       strcpy(sd->sys_name, "dev-");
+       strcpy(sd->sys_name+4, dname);
+
+       rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
+       rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
+       if (sra->array.level != LEVEL_CONTAINER) {
+               rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+//             rv |= sysfs_set_str(sra, sd, "state", "in_sync");
+       }
+       return rv;
+}
+
+#if 0
+/* This function is compiled out.  The review notes below should be
+ * addressed before it is ever re-enabled.
+ */
+int sysfs_disk_to_sg(int fd)
+{
+       /* from an open block device, try to find and open its corresponding
+        * scsi_generic interface
+        *
+        * Returns an open read-only fd on a temporary character device
+        * node for that interface, or a negative value on failure.
+        */
+       struct stat st;
+       char path[256];
+       char sg_path[256];
+       char sg_major_minor[8];
+       char *c;
+       DIR *dir;
+       struct dirent *de;
+       int major, minor, rv;
+
+       if (fstat(fd, &st))
+               return -1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+
+       /* look for the first "scsi_generic:..." entry */
+       de = readdir(dir);
+       while (de) {
+               if (strncmp("scsi_generic:", de->d_name,
+                           strlen("scsi_generic:")) == 0)
+                       break;
+               de = readdir(dir);
+       }
+       closedir(dir);
+
+       if (!de)
+               return -1;
+
+       /* NOTE(review): de->d_name is read here after closedir(dir) */
+       snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name);
+       fd = open(sg_path, O_RDONLY);
+       if (fd < 0)
+               return fd;
+
+       rv = read(fd, sg_major_minor, sizeof(sg_major_minor));
+       close(fd);
+       if (rv < 0)
+               return -1;
+       else
+               /* NOTE(review): rv == 0 would index element -1 */
+               sg_major_minor[rv - 1] = '\0';
+
+       /* split "major:minor"; NOTE(review): strchr() result is unchecked */
+       c = strchr(sg_major_minor, ':');
+       *c = '\0';
+       c++;
+       major = strtol(sg_major_minor, NULL, 10);
+       minor = strtol(c, NULL, 10);
+       /* make a private device node, open it, and remove the name again */
+       snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d",
+                (int) getpid(), major, minor);
+       if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) {
+                       fd = open(path, O_RDONLY);
+                       unlink(path);
+                       return fd;
+       }
+
+       return -1;
+}
+#endif
+
+int sysfs_disk_to_scsi_id(int fd, __u32 *id)
+{
+       /* from an open block device, try to retrieve its scsi_id
+        *
+        * The device's sysfs "device" directory is searched for an entry
+        * named "scsi_disk:A:B:C:D".  The four decimal fields are packed
+        * into *id as (A << 24) | (B << 16) | (C << 8) | D.
+        * Returns 0 on success and 1 on any failure, including a
+        * malformed entry name; *id is only written on success.
+        */
+       struct stat st;
+       char path[256];
+       char name[256];
+       unsigned int f1, f2, f3, f4;
+       int found = 0;
+       DIR *dir;
+       struct dirent *de;
+
+       if (fstat(fd, &st))
+               return 1;
+
+       snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+                major(st.st_rdev), minor(st.st_rdev));
+
+       dir = opendir(path);
+       if (!dir)
+               return 1;
+
+       while ((de = readdir(dir)) != NULL) {
+               if (strncmp("scsi_disk:", de->d_name,
+                           strlen("scsi_disk:")) == 0) {
+                       /* take a private copy: the dirent must not be
+                        * used once the directory has been closed
+                        */
+                       snprintf(name, sizeof(name), "%s", de->d_name);
+                       found = 1;
+                       break;
+               }
+       }
+       closedir(dir);
+
+       if (!found)
+               return 1;
+
+       /* all four fields are required; anything else is an error */
+       if (sscanf(name, "scsi_disk:%u:%u:%u:%u", &f1, &f2, &f3, &f4) != 4)
+               return 1;
+
+       /* unsigned arithmetic keeps the shifts well defined */
+       *id = (f1 << 24) | (f2 << 16) | (f3 << 8) | f4;
+
+       return 0;
+}
+
+
+int sysfs_unique_holder(int devnum, long rdev)
+{
+       /* Check that devnum is a holder of rdev,
+        * and is the only holder.
+        * we should be locked against races by
+        * an O_EXCL on devnum
+        *
+        * Returns 1 if every entry in rdev's "holders" directory is
+        * devnum and there is at least one, 0 otherwise.  errno is set
+        * to ENOENT when a holder cannot be examined and to EEXIST when
+        * a different holder is found.
+        */
+       DIR *dir;
+       struct dirent *de;
+       char dirname[400];      /* base path + '/' + d_name + "/dev" */
+       size_t l;
+       int found = 0;
+       int failed = 0;
+
+       snprintf(dirname, sizeof(dirname), "/sys/dev/block/%d:%d/holders",
+                major(rdev), minor(rdev));
+       dir = opendir(dirname);
+       errno = ENOENT;
+       if (!dir)
+               return 0;
+       l = strlen(dirname);
+       while ((de = readdir(dir)) != NULL) {
+               char buf[10];
+               int n;
+               int mj, mn;
+               char c;
+               int fd;
+
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* bounded: a long entry name must not overrun 'dirname' */
+               snprintf(dirname+l, sizeof(dirname)-l, "/%s/dev", de->d_name);
+               fd = open(dirname, O_RDONLY);
+               if (fd < 0) {
+                       errno = ENOENT;
+                       failed = 1;
+                       break;
+               }
+               n = read(fd, buf, sizeof(buf)-1);
+               close(fd);
+               if (n <= 0) {
+                       /* a failed read must not be used as an index */
+                       errno = ENOENT;
+                       failed = 1;
+                       break;
+               }
+               buf[n] = 0;
+               if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 ||
+                   c != '\n') {
+                       errno = ENOENT;
+                       failed = 1;
+                       break;
+               }
+               /* any major other than MD_MAJOR is mapped to a negative devnum */
+               if (mj != MD_MAJOR)
+                       mn = -1-(mn>>6);
+
+               if (devnum != mn) {
+                       errno = EEXIST;
+                       failed = 1;
+                       break;
+               }
+               found = 1;
+       }
+       closedir(dir);
+       if (failed)
+               return 0;
+       else
+               return found;
+}
diff --git a/test b/test
index 1a79bab42295102a8731d854d75e3997b32f1bf1..9ceb531e7a3e3fe882f349931d75c7523655fa01 100644 (file)
--- a/test
+++ b/test
@@ -155,6 +155,7 @@ testdev() {
    dsize=$[dvsize/chunk]
    dsize=$[dsize*chunk]
    rasize=$[dsize*2*cnt]
+   if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi
    if [ $rasize -ne `/sbin/blockdev --getsize $dev` ]
    then
      echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`"
@@ -174,6 +175,8 @@ do
   if [ -f "$script" ]
   then
    rm -f $targetdir/stderr
+   # stop all arrays, just in case some script left an array active.
+   mdadm -Ssq
    # source script in a subshell, so it has access to our
    # namespace, but cannot change it.
    if ( set -ex ; . $script )  2> $targetdir/log
index 4f03d7bdc2ee0d2516c4684567d8bff5d80e88f6..55205a36f91702e62387b5e5ffea655ba655d5c7 100644 (file)
@@ -129,3 +129,10 @@ echo "  metadata=1 devices=$dev0,$dev1,$dev2" >> $conf
 mdadm --assemble --scan --config=$conf $md2 
 $tst
 mdadm -S $md2
+
+# Now use incremental assembly.
+mdadm -I --config=$conf $dev0
+mdadm -I --config=$conf $dev1
+mdadm -I --config=$conf $dev2
+$tst
+mdadm -S $md2
index 7553a4f0b9b56fab6854f57c8fbbe1ffcf6eab22..0f2c83b6ed3a159964844747f00a005ff5a1baae 100644 (file)
@@ -113,3 +113,10 @@ echo "  metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf
 mdadm --assemble --scan --config=$conf $md1
 check state U_U
 eval $tst
+
+# And now assemble with -I
+mdadm -Ss
+mdadm -I -c $conf $dev0
+mdadm -I -c $conf $dev1
+mdadm -I -c $conf $dev2
+eval $tst
diff --git a/util.c b/util.c
index a50036c116a76c86c2205ac3c148de345d4e9b93..ab2d7e9f06e3cb1f91a4a9c30529df872ee95670 100644 (file)
--- a/util.c
+++ b/util.c
 
 #include       "mdadm.h"
 #include       "md_p.h"
+#include       <sys/socket.h>
 #include       <sys/utsname.h>
+#include       <sys/wait.h>
+#include       <sys/un.h>
 #include       <ctype.h>
+#include       <dirent.h>
+#include       <signal.h>
 
 /*
  * following taken from linux/blkpg.h because they aren't
@@ -217,8 +222,13 @@ int enough(int level, int raid_disks, int layout, int clean,
        }
 }
 
+const int uuid_match_any[4] = { ~0, ~0, ~0, ~0 };
 int same_uuid(int a[4], int b[4], int swapuuid)
 {
+       if (memcmp(a, uuid_match_any, sizeof(int[4])) == 0 ||
+           memcmp(b, uuid_match_any, sizeof(int[4])) == 0)
+               return 1;
+
        if (swapuuid) {
                /* parse uuids are hostendian.
                 * uuid's from some superblocks are big-ending
@@ -264,6 +274,27 @@ void copy_uuid(void *a, int b[4], int swapuuid)
                memcpy(a, b, 16);
 }
 
+char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep)
+{
+       /* Format info->uuid into 'buf' as "UUID-" followed by four groups
+        * of eight hex digits separated by 'sep', and return 'buf'.
+        * The uuid is first passed through copy_uuid() with the
+        * metadata handler's swapuuid setting.
+        * 'buf' must have room for 5 + 4*8 + 3 + 1 = 41 bytes.
+        */
+       int i, j;
+       char uuid[16];
+       char *c = buf;
+
+       strcpy(c, "UUID-");
+       c += strlen(c);
+       copy_uuid(uuid, info->uuid, st->ss->swapuuid);
+       for (i = 0; i < 4; i++) {
+               if (i)
+                       *c++ = sep;
+               /* within each 4-byte group, print the last byte first */
+               for (j = 3; j >= 0; j--) {
+                       sprintf(c,"%02x", (unsigned char) uuid[j+4*i]);
+                       c+= 2;
+               }
+       }
+       return buf;
+}
+
 #ifndef MDASSEMBLE
 int check_ext2(int fd, char *name)
 {
@@ -389,6 +420,9 @@ int is_standard(char *dev, int *nump)
        /* tests if dev is a "standard" md dev name.
         * i.e if the last component is "/dNN" or "/mdNN",
         * where NN is a string of digits
+        * Returns 1 if a partitionable standard,
+        *   -1 if non-partitionable,
+        *   0 if not a standard name.
         */
        char *d = strrchr(dev, '/');
        int type=0;
@@ -625,7 +659,23 @@ void print_r10_layout(int layout)
 }
 #endif
 
-#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+unsigned long long calc_array_size(int level, int raid_disks, int layout,
+                                  int chunksize, unsigned long long devsize)
+{
+       /* Compute the usable size of an array from the size of one
+        * component device: devsize is rounded down to a whole number
+        * of chunks and multiplied by the number of data disks.
+        * chunksize is shifted right by 9 before masking devsize, so
+        * presumably chunksize is in bytes and devsize in 512-byte
+        * sectors - confirm against callers.
+        * Levels not listed below yield 0.
+        */
+       int data_disks = 0;
+       int chunk_sectors = chunksize >> 9;
+       int near_copies, far_copies;
+
+       switch (level) {
+       case 0: data_disks = raid_disks; break;
+       case 1: data_disks = 1; break;
+       case 4:
+       case 5: data_disks = raid_disks - 1; break;
+       case 6: data_disks = raid_disks - 2; break;
+       case 10:
+               near_copies = layout & 255;
+               far_copies = (layout>>8) & 255;
+               /* a layout with a zero copy count must not divide by zero */
+               if (near_copies && far_copies)
+                       data_disks = raid_disks / near_copies / far_copies;
+               break;
+       default:
+               break;
+       }
+       /* With no chunk size (e.g. 0 for RAID1) there is nothing to
+        * round to; building the mask from -1 would zero devsize.
+        */
+       if (chunk_sectors > 0)
+               devsize &= ~(unsigned long long)(chunk_sectors-1);
+       return data_disks * devsize;
+}
+
 int get_mdp_major(void)
 {
 static int mdp_major = -1;
@@ -654,8 +704,7 @@ static int mdp_major = -1;
        return mdp_major;
 }
 
-
-
+#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
 char *get_md_name(int dev)
 {
        /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */
@@ -710,21 +759,6 @@ void put_md_name(char *name)
                unlink(name);
 }
 
-static int dev2major(int d)
-{
-       if (d >= 0)
-               return MD_MAJOR;
-       else
-               return get_mdp_major();
-}
-
-static int dev2minor(int d)
-{
-       if (d >= 0)
-               return d;
-       return (-1-d) << MdpMinorShift;
-}
-
 int find_free_devnum(int use_partitions)
 {
        int devnum;
@@ -766,19 +800,38 @@ int dev_open(char *dev, int flags)
        if (e > dev && *e == ':' && e[1] &&
            (minor = strtoul(e+1, &e, 0)) >= 0 &&
            *e == 0) {
-               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor);
+               snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
+                        (int)getpid(), major, minor);
                if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) {
-                       fd = open(devname, flags);
+                       fd = open(devname, flags|O_DIRECT);
                        unlink(devname);
                }
        } else
-               fd = open(dev, flags);
+               fd = open(dev, flags|O_DIRECT);
        return fd;
 }
 
-struct superswitch *superlist[] = { &super0, &super1, NULL };
+int open_dev_excl(int devnum)
+{
+       /* Open md device 'devnum' with O_RDWR|O_EXCL through dev_open(),
+        * using its "major:minor" form.  While the open fails with EBUSY
+        * it is retried, up to 25 attempts 200ms apart.
+        * Returns the fd, the failing result of dev_open() for any other
+        * error, or -1 once the attempts are used up.
+        */
+       char devspec[20];
+       int attempt = 0;
+
+       sprintf(devspec, "%d:%d", dev2major(devnum), dev2minor(devnum));
+       while (attempt < 25) {
+               int fd = dev_open(devspec, O_RDWR|O_EXCL);
+
+               if (fd >= 0 || errno != EBUSY)
+                       return fd;
+               usleep(200000);
+               attempt++;
+       }
+       return -1;
+}
+
+struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL };
 
 #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+
 struct supertype *super_by_fd(int fd)
 {
        mdu_array_info_t array;
@@ -789,6 +842,7 @@ struct supertype *super_by_fd(int fd)
        char *verstr;
        char version[20];
        int i;
+       char *subarray = NULL;
 
        sra = sysfs_read(fd, 0, GET_VERSION);
 
@@ -808,40 +862,56 @@ struct supertype *super_by_fd(int fd)
                sprintf(version, "%d.%d", vers, minor);
                verstr = version;
        }
+       if (minor == -2 && is_subarray(verstr)) {
+               char *dev = verstr+1;
+               subarray = strchr(dev, '/');
+               int devnum;
+               if (subarray)
+                       *subarray++ = '\0';
+               devnum = devname2devnum(dev);
+               subarray = strdup(subarray);
+               if (sra)
+                       sysfs_free(sra);
+               sra = sysfs_read(-1, devnum, GET_VERSION);
+               verstr = sra->text_version ? : "-no-metadata-";
+       }
+
        for (i = 0; st == NULL && superlist[i] ; i++)
                st = superlist[i]->match_metadata_desc(verstr);
 
        if (sra)
                sysfs_free(sra);
-       if (st)
+       if (st) {
                st->sb = NULL;
+               if (subarray) {
+                       strncpy(st->subarray, subarray, 32);
+                       st->subarray[31] = 0;
+                       free(subarray);
+               } else
+                       st->subarray[0] = 0;
+       }
        return st;
 }
 #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
 
 
-struct supertype *dup_super(struct supertype *st)
+struct supertype *dup_super(struct supertype *orig)
 {
-       struct supertype *stnew = NULL;
-       char *verstr = NULL;
-       char version[20];
-       int i;
+       /* Allocate a fresh supertype describing the same metadata format
+        * as 'orig': the handler (ss), max_devs, minor_version and
+        * subarray are copied, everything else is zeroed, and sb and
+        * info are left NULL.
+        * Returns NULL if 'orig' is NULL or the allocation fails.
+        */
+       struct supertype *st;
 
+       if (!orig)
+               return orig;
+       st = malloc(sizeof(*st));
        if (!st)
                return st;
-
-       if (st->minor_version == -1)
-               sprintf(version, "%d", st->ss->major);
-       else
-               sprintf(version, "%d.%d", st->ss->major, st->minor_version);
-       verstr = version;
-
-       for (i = 0; stnew == NULL && superlist[i] ; i++)
-               stnew = superlist[i]->match_metadata_desc(verstr);
-
-       if (stnew)
-               stnew->sb = NULL;
-       return stnew;
+       memset(st, 0, sizeof(*st));
+       st->ss = orig->ss;
+       st->max_devs = orig->max_devs;
+       st->minor_version = orig->minor_version;
+       strcpy(st->subarray, orig->subarray);
+       st->sb = NULL;
+       st->info = NULL;
+       return st;
 }
 
 struct supertype *guess_super(int fd)
@@ -856,11 +926,10 @@ struct supertype *guess_super(int fd)
        int i;
 
        st = malloc(sizeof(*st));
-       memset(st, 0, sizeof(*st));
        for (i=0 ; superlist[i]; i++) {
                int rv;
                ss = superlist[i];
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = ss->load_super(st, fd, NULL);
                if (rv == 0) {
                        struct mdinfo info;
@@ -875,7 +944,7 @@ struct supertype *guess_super(int fd)
        }
        if (bestsuper != -1) {
                int rv;
-               st->ss = NULL;
+               memset(st, 0, sizeof(*st));
                rv = superlist[bestsuper]->load_super(st, fd, NULL);
                if (rv == 0) {
                        superlist[bestsuper]->free_super(st);
@@ -923,6 +992,303 @@ void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk)
                        return;
 }
 
+int open_container(int fd)
+{
+       /* 'fd' is a block device.  Find out if it is in use
+        * by a container, and return an open fd on that container.
+        *
+        * Each entry of the device's sysfs "holders" directory is tried
+        * in turn; the first one whose "dev" file yields a major:minor
+        * that dev_open() can open read-only is returned.
+        * Returns -1 if there is no such holder.
+        */
+       char path[256];
+       char *e;
+       size_t room;
+       DIR *dir;
+       struct dirent *de;
+       int dfd, n;
+       char buf[200];
+       int mj, mn;
+       struct stat st;
+
+       if (fstat(fd, &st) != 0)
+               return -1;
+       sprintf(path, "/sys/dev/block/%d:%d/holders",
+               (int)major(st.st_rdev), (int)minor(st.st_rdev));
+       e = path + strlen(path);
+       room = sizeof(path) - (e - path);
+
+       dir = opendir(path);
+       if (!dir)
+               return -1;
+       while ((de = readdir(dir))) {
+               if (de->d_ino == 0)
+                       continue;
+               if (de->d_name[0] == '.')
+                       continue;
+               /* skip entries whose name would overrun 'path' */
+               n = snprintf(e, room, "/%s/dev", de->d_name);
+               if (n < 0 || (size_t)n >= room)
+                       continue;
+               dfd = open(path, O_RDONLY);
+               if (dfd < 0)
+                       continue;
+               n = read(dfd, buf, sizeof(buf));
+               close(dfd);
+               /* a full buffer leaves no room for the terminator */
+               if (n <= 0 || (size_t)n >= sizeof(buf))
+                       continue;
+               buf[n] = 0;
+               if (sscanf(buf, "%d:%d", &mj, &mn) != 2)
+                       continue;
+               sprintf(buf, "%d:%d", mj, mn);
+               dfd = dev_open(buf, O_RDONLY);
+               if (dfd >= 0) {
+                       closedir(dir);
+                       return dfd;
+               }
+       }
+       closedir(dir);
+       return -1;
+}
+
+int add_disk(int mdfd, struct supertype *st,
+            struct mdinfo *sra, struct mdinfo *info)
+{
+       /* Add a device to an array, in one of 2 ways:
+        *  - externally managed metadata: through sysfs_add_disk(); on
+        *    success a copy of 'info' is put at the head of sra->devs
+        *    unless 'info' itself is already on that list,
+        *  - otherwise: with the ADD_NEW_DISK ioctl on 'mdfd'.
+        * Returns 0 on success, non-zero on failure.
+        */
+       int rv;
+#ifndef MDASSEMBLE
+       if (st->ss->external) {
+               rv = sysfs_add_disk(sra, info);
+               if (! rv) {
+                       struct mdinfo *sd2;
+                       for (sd2 = sra->devs; sd2; sd2=sd2->next)
+                               if (sd2 == info)
+                                       break;
+                       if (sd2 == NULL) {
+                               sd2 = malloc(sizeof(*sd2));
+                               if (sd2) {
+                                       *sd2 = *info;
+                                       sd2->next = sra->devs;
+                                       sra->devs = sd2;
+                               } else
+                                       /* out of memory: report it rather
+                                        * than dereference NULL
+                                        */
+                                       rv = -1;
+                       }
+               }
+       } else
+#endif
+               rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk);
+       return rv;
+}
+
+int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
+{
+       /* Tell the kernel about a new array.  Three cases:
+        *  - externally managed metadata: everything goes through
+        *    sysfs_set_array(),
+        *  - md version with (vers % 100) >= 1: SET_ARRAY_INFO with the
+        *    metadata major/minor version filled in,
+        *  - anything older: SET_ARRAY_INFO with a NULL argument.
+        * Returns the result of whichever call was made.
+        */
+       int vers = md_get_version(mdfd);
+       mdu_array_info_t inf;
+
+#ifndef MDASSEMBLE
+       if (st->ss->external)
+               return sysfs_set_array(info, vers);
+#endif
+       if ((vers % 100) < 1)
+               return ioctl(mdfd, SET_ARRAY_INFO, NULL);
+
+       /* can use different versions */
+       memset(&inf, 0, sizeof(inf));
+       inf.major_version = info->array.major_version;
+       inf.minor_version = info->array.minor_version;
+       return ioctl(mdfd, SET_ARRAY_INFO, &inf);
+}
+
+char *devnum2devname(int num)
+{
+       /* Convert a device number to its kernel name: non-negative
+        * numbers are "mdN", negative numbers are "md_dN" with
+        * N = -1-num.  This is the inverse of devname2devnum().
+        * Returns strdup()ed memory that the caller must free, or NULL
+        * if the allocation fails.
+        */
+       char name[100];
+
+       /* 0 is a valid plain device (md0), so it belongs to this branch */
+       if (num >= 0)
+               sprintf(name, "md%d", num);
+       else
+               sprintf(name, "md_d%d", -1-num);
+       return strdup(name);
+}
+
+int devname2devnum(char *name)
+{
+       /* Convert a kernel device name to a device number:
+        * "md_dN" gives -1-N; for any other name the digits after the
+        * first two characters are used as the number.
+        */
+       char *end;
+
+       if (strncmp(name, "md_d", 4) != 0)
+               return strtoul(name+2, &end, 10);
+       return -1-strtoul(name+4, &end, 10);
+}
+
+int stat2devnum(struct stat *st)
+{
+       /* Derive a device number from stat information.
+        * Anything that is not a block device gives -1.  A block device
+        * with major MD_MAJOR gives its minor number; any other block
+        * device gives -1-(minor>>6).
+        */
+       if ((st->st_mode & S_IFMT) != S_IFBLK)
+               return -1;
+
+       if (major(st->st_rdev) != MD_MAJOR)
+               return -1- (minor(st->st_rdev)>>6);
+
+       return minor(st->st_rdev);
+}
+
+int fd2devnum(int fd)
+{
+       /* Device number of the device open on 'fd', as computed by
+        * stat2devnum(); -1 if fstat() fails.
+        */
+       struct stat sb;
+
+       if (fstat(fd, &sb) != 0)
+               return -1;
+       return stat2devnum(&sb);
+}
+
+int mdmon_running(int devnum)
+{
+       /* Report whether the process recorded in
+        * /var/run/mdadm/<devname>.pid exists: returns 1 if kill() with
+        * signal 0 succeeds on that pid, 0 in every other case.
+        */
+       char path[100];
+       char pid[10];
+       char *name;
+       int target;
+       int fd;
+       int n;
+
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);     /* devnum2devname() hands back strdup()ed memory */
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, 9);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       pid[n] = '\0';  /* read() does not terminate the buffer */
+       target = atoi(pid);
+       /* 0 or a negative value would address process groups, not mdmon */
+       if (target <= 0)
+               return 0;
+       if (kill(target, 0) == 0)
+               return 1;
+       return 0;
+}
+
+int signal_mdmon(int devnum)
+{
+       /* Send SIGUSR1 to the process recorded in
+        * /var/run/mdadm/<devname>.pid.  Returns 1 if the signal was
+        * delivered, 0 in every other case.
+        */
+       char path[100];
+       char pid[10];
+       char *name;
+       int target;
+       int fd;
+       int n;
+
+       name = devnum2devname(devnum);
+       if (!name)
+               return 0;
+       snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", name);
+       free(name);     /* devnum2devname() hands back strdup()ed memory */
+       fd = open(path, O_RDONLY, 0);
+
+       if (fd < 0)
+               return 0;
+       n = read(fd, pid, 9);
+       close(fd);
+       if (n <= 0)
+               return 0;
+       pid[n] = '\0';  /* read() does not terminate the buffer */
+       target = atoi(pid);
+       /* never signal pid 0 (our own process group) or a negative pid */
+       if (target <= 0)
+               return 0;
+       if (kill(target, SIGUSR1) == 0)
+               return 1;
+       return 0;
+}
+
+int start_mdmon(int devnum)
+{
+       /* Run mdmon for device 'devnum' and wait for that command to
+        * exit.  Candidates are tried in order: "mdmon" in the directory
+        * of the running executable, "/sbin/mdmon", then plain "mdmon".
+        * Returns 0 if mdmon exited with status 0, or if
+        * env_no_mdmon() says it must not be started; -1 otherwise.
+        */
+       int i;
+       int len;
+       pid_t pid;
+       int status;
+       char pathbuf[1024];
+       char *paths[4] = {
+               pathbuf,
+               "/sbin/mdmon",
+               "mdmon",
+               NULL
+       };
+
+       if (env_no_mdmon())
+               return 0;
+
+       /* keep one byte back: readlink() does not add a terminator */
+       len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf) - 1);
+       if (len > 0) {
+               char *sl;
+               pathbuf[len] = 0;
+               sl = strrchr(pathbuf, '/');
+               if (sl)
+                       sl++;
+               else
+                       sl = pathbuf;
+               /* only replace the last component if "mdmon" fits */
+               if ((size_t)(sl - pathbuf) + sizeof("mdmon") <= sizeof(pathbuf))
+                       strcpy(sl, "mdmon");
+               else
+                       pathbuf[0] = '\0';
+       } else
+               pathbuf[0] = '\0';
+
+       pid = fork();
+       switch(pid) {
+       case 0:
+               /* FIXME yuk. CLOSE_EXEC?? */
+               for (i=3; i < 100; i++)
+                       close(i);
+               /* execl() only returns on failure, so fall to the next path */
+               for (i=0; paths[i]; i++)
+                       if (paths[i][0])
+                               execl(paths[i], "mdmon",
+                                     map_dev(dev2major(devnum),
+                                             dev2minor(devnum),
+                                             1), NULL);
+               exit(1);
+       case -1: fprintf(stderr, Name ": cannot run mdmon. "
+                        "Array remains readonly\n");
+               return -1;
+       default: /* parent - good */
+               /* wait for this child only, not for whichever exits first */
+               if (waitpid(pid, &status, 0) < 0 || status != 0)
+                       return -1;
+       }
+       return 0;
+}
+
+int env_no_mdmon(void)
+{
+       /* Returns 1 when the environment variable MDADM_NO_MDMON is set
+        * to a value that atoi() reads as 1, and 0 otherwise.
+        */
+       const char *setting = getenv("MDADM_NO_MDMON");
+
+       if (setting == NULL)
+               return 0;
+       return atoi(setting) == 1 ? 1 : 0;
+}
+
+#ifndef MDASSEMBLE
+int flush_metadata_updates(struct supertype *st)
+{
+       /* Send every update queued on st->updates to the monitor of
+        * st->container_dev, one message at a time, waiting for a reply
+        * after each; then send a final ack and wait for its reply.
+        * Each update and its buffer are freed once sent.
+        * Returns 0 when the queue was sent, -1 if the queue was empty
+        * or the monitor could not be reached.  st->update_tail is
+        * reset to NULL except when the connection fails.
+        */
+       struct metadata_update *mu;
+       int sfd;
+
+       if (st->updates == NULL) {
+               st->update_tail = NULL;
+               return -1;
+       }
+
+       /* NOTE(review): devnum2devname() returns allocated memory that is
+        * never freed here; confirm connect_monitor() does not keep the
+        * pointer before adding a free().
+        */
+       sfd = connect_monitor(devnum2devname(st->container_dev));
+       if (sfd < 0)
+               return -1;
+
+       while ((mu = st->updates) != NULL) {
+               st->updates = mu->next;
+
+               send_message(sfd, mu, 0);
+               wait_reply(sfd, 0);
+               free(mu->buf);
+               free(mu);
+       }
+       ack(sfd, 0);
+       wait_reply(sfd, 0);
+       close(sfd);
+       st->update_tail = NULL;
+       return 0;
+}
+
+void append_metadata_update(struct supertype *st, void *buf, int len)
+{
+       /* Queue 'buf' (of 'len' bytes) as a new metadata update at the
+        * position st->update_tail points to, and advance the tail to
+        * the new entry's 'next' field.  The pointer is stored, not
+        * copied.
+        * NOTE(review): the malloc() result and st->update_tail are both
+        * used unchecked.
+        */
+       struct metadata_update *entry = malloc(sizeof(*entry));
+
+       entry->next = NULL;
+       entry->space = NULL;
+       entry->len = len;
+       entry->buf = buf;
+
+       *st->update_tail = entry;
+       st->update_tail = &entry->next;
+}
+
+struct superswitch *find_metadata_methods(char *vers)
+{
+       /* Map a metadata name to its handler table: "ddf" gives
+        * &super_ddf, "imsm" gives &super_imsm, anything else NULL.
+        */
+       struct superswitch *methods = NULL;
+
+       if (strcmp(vers, "ddf") == 0)
+               methods = &super_ddf;
+       else if (strcmp(vers, "imsm") == 0)
+               methods = &super_imsm;
+       return methods;
+}
+#endif /* MDASSEMBLE */
+
 #ifdef __TINYC__
 /* tinyc doesn't optimize this check in ioctl.h out ... */
 unsigned int __invalid_size_argument_for_IOC = 0;